In [1]:
import pandas as pd
data_1 = pd.read_csv('dataset_1.csv')

In [2]:
#Converts each categorical data into one-hot, supllied a data. 
def one_hot(dataset):
    dataset_hot = dataset.copy()
    for column in dataset_hot:
        if column != "adsorption_energy" and type(dataset_hot[column][0])==type('str'):
            onehots = pd.get_dummies(dataset_hot[column], column, drop_first=True, dtype=int)
            dataset_hot = pd.concat([dataset_hot.drop(column, axis=1), onehots], axis=1)
    return dataset_hot

In [3]:
#Does max-min normalization for all numerical attributes. 
def scale_dataset(dataset):
    dataset_scaled = dataset.copy()
    for column in dataset_scaled:
        if column != "adsorption_energy" and type(dataset_scaled[column][0])!=type('str'):
            min = dataset_scaled[column].min()
            max = dataset_scaled[column].max()
            if min != max:
                dataset_scaled[column]= (dataset_scaled[column] - min) / (max - min)
    return dataset_scaled

In [4]:
data_1 = one_hot(data_1)
data_1 = scale_dataset(data_1)
data_1

Unnamed: 0,adsorption_energy,adsorbate_num_atoms,atomic_num_1,atomic_num_2,atomic_num_3,atomic_num_4,atomic_num_5,atomic_num_6,atomic_num_6.1,atomic_num_8,...,adsorbate_connectivity_smiles_[O][C][C][C][O],adsorbate_connectivity_smiles_[O][C][C][O],adsorbate_connectivity_smiles_[O][C][O],adsorbate_connectivity_smiles_[O][C][S],adsorbate_connectivity_smiles_[O][N][N]O,adsorbate_connectivity_smiles_[O][O],adsorbate_connectivity_smiles_[P],adsorbate_connectivity_smiles_[S],adsorbate_connectivity_smiles_[Si],adsorbate_connectivity_smiles_[Zn]
0,-4.29,0.000000,0.000000,0,0,0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.56,0.045455,0.000000,0,0,0,0.0,0.0,0.0,0.50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-3.14,0.000000,0.000000,0,0,0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.50,0.045455,0.000000,0,0,0,0.0,0.0,0.0,0.50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-1.67,0.045455,0.000000,0,0,0,0.0,0.0,0.0,0.50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1.72,0.045455,0.083333,0,0,0,0.0,0.1,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,1.66,0.045455,0.083333,0,0,0,0.0,0.1,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,-1.57,0.181818,0.250000,0,0,0,0.0,0.1,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,-2.51,0.136364,0.083333,0,0,0,0.0,0.2,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#parameters
seed = 12345
train_percent = 0.75

#shuffle data
data_1 = data_1.sample(frac=1, random_state=int(seed))
total_rows = data_1.shape[0]

#divide into training and testing set
training_rows = int(train_percent * total_rows)

# create the training set
training = data_1.iloc[:training_rows, :]
testing = data_1.iloc[training_rows:, :]

#divide into attributes and labels
training_y = training["adsorption_energy"]
training_X = training.drop("adsorption_energy", axis='columns')  
    
testing_y = testing["adsorption_energy"]
testing_X = testing.drop("adsorption_energy", axis='columns')

import tensorflow as tf

# convert to tensors
training_X = tf.convert_to_tensor(training_X, dtype=tf.float32)
training_y = tf.convert_to_tensor(training_y, dtype=tf.int32)
testing_X = tf.convert_to_tensor(testing_X, dtype=tf.float32)
testing_y = tf.convert_to_tensor(testing_y, dtype=tf.float32)

In [6]:
# creates a neural network for regression tasks
def create_reg_network(num_neuron, num_out):
    hidden_layer = tf.keras.layers.Dense(num_neuron, activation='sigmoid')  # 128 hidden neurons
    output_layer = tf.keras.layers.Dense(num_out) # 10 output neurons because there are 10 possible labels for mnist1000 

    all_layers = [hidden_layer, output_layer]
    network = tf.keras.models.Sequential(all_layers)

    return network

In [7]:
# trains a neural network with given training data
def train_reg_network(network, training_X, training_y, learn_rate, num_neuron):
    # create the algorithm that learns the weight of the network (with a learning rate of 0.0001)
    optimizer = tf.keras.optimizers.Adam(learning_rate=float(learn_rate))

    # create the loss function function that tells optimizer how much error it has in its predictions
    loss_function = tf.keras.losses.MeanSquaredError()

    # prepare the network for training
    network.compile(optimizer=optimizer, loss=loss_function, metrics=["mean_absolute_error"])

    # create a logger to save the training details to file
    csv_logger = tf.keras.callbacks.CSVLogger("data_0"+"_"+str(learn_rate)+"_"+str(num_neuron)+".csv")

    # train the network for 200 epochs (setting aside 20% of the training data as validation data)
    network.fit(training_X, training_y, validation_split=0.2, epochs=250, callbacks=[csv_logger])

In [8]:
import numpy as np
learn_rate = 0.0001
num_neuron = 500

network=create_reg_network(num_neuron, 1)
train_reg_network(network, training_X, training_y, float(learn_rate), int(num_neuron))
outputs = network.predict(testing_X)
predictions = outputs.flatten() 
performance = np.mean(np.abs(predictions - testing_y))

Epoch 1/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 6.4073 - mean_absolute_error: 1.8157 - val_loss: 3.9435 - val_mean_absolute_error: 1.4550
Epoch 2/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.9137 - mean_absolute_error: 1.5138 - val_loss: 3.4045 - val_mean_absolute_error: 1.4277
Epoch 3/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.8652 - mean_absolute_error: 1.5769 - val_loss: 3.3032 - val_mean_absolute_error: 1.4333
Epoch 4/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.2161 - mean_absolute_error: 1.4593 - val_loss: 3.2279 - val_mean_absolute_error: 1.4154
Epoch 5/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.5915 - mean_absolute_error: 1.5428 - val_loss: 3.1574 - val_mean_absolute_error: 1.3990
Epoch 6/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

In [9]:
def mae_calculator(predictions, testing_y):
    n = len(predictions)
    sum = 0
    for x in range(n):
        sum += abs(predictions[x] - testing_y[x])
    return sum/n

In [11]:
def rmse_calculator(predictions, testing_y):
    n = len(predictions)
    sum = 0
    for x in range(n):
        sum += (predictions[x] - testing_y[x])**2
    return (sum/n)**0.5

In [10]:
# get the output of the network for each testing instance
# NOTE: there will be 10 values per instance, one for each output neuron
predictions = network.predict(testing_X)

# calculate the accuracy of the predictions
mae = mae_calculator(predictions, testing_y)
print("Mean Absolute Error:",float(mae))

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686us/step
Mean Absolute Error: 1.111276626586914


In [12]:
rmse = rmse_calculator(predictions, testing_y)
print("Root Mean Square Error:",float(rmse))

Root Mean Square Error: 1.4551808834075928
