In [1]:
import pandas as pd
import numpy as np
data_3 = pd.read_csv('dataset_3.csv')

In [2]:
#Converts each categorical data into one-hot, supllied a data. 
def one_hot(dataset):
    dataset_hot = dataset.copy()
    for column in dataset_hot:
        if column != "adsorption_energy" and type(dataset_hot[column][0])==type('str'):
            onehots = pd.get_dummies(dataset_hot[column], column, drop_first=True, dtype=int)
            dataset_hot = pd.concat([dataset_hot.drop(column, axis=1), onehots], axis=1)
    return dataset_hot

In [3]:
#Does max-min normalization for all numerical attributes. 
def scale_dataset(dataset):
    dataset_scaled = dataset.copy()
    for column in dataset_scaled:
        if column != "adsorption_energy" and type(dataset_scaled[column][0])!=type('str') and type(dataset_scaled[column][0])!=np.bool_:
            min = dataset_scaled[column].min()
            max = dataset_scaled[column].max()
            if min != max:
                dataset_scaled[column]= (dataset_scaled[column] - min) / (max - min)
    return dataset_scaled

In [4]:
data_3 = one_hot(data_3)
data_3 = scale_dataset(data_3)
data_3.head()

Unnamed: 0,adsorption_energy,fraction,adsorbate_num_atoms,atomic_num_1,atomic_num_2,atomic_num_3,atomic_num_4,atomic_num_5,atomic_num_6,atomic_num_6.1,...,Substrate_Symmetry_2x2,Substrate_Symmetry_2x3,Substrate_Symmetry_2x4,Substrate_Symmetry_3x2,Substrate_Symmetry_3x3,Substrate_Symmetry_4x2,Substrate_Symmetry_4x4,Substrate_Symmetry_c(2x2),Substrate_Symmetry_c(4x4),Substrate_Symmetry_√3x√3
0,-4.29,0.750075,0.0,0.0,0,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.56,0.750075,0.045455,0.0,0,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-3.14,0.50015,0.0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.5,0.50015,0.045455,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.67,0.750075,0.045455,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [5]:
#parameters
seed = 12345
train_percent = 0.75

#shuffle data
data_3 = data_3.sample(frac=1, random_state=int(seed))
total_rows = data_3.shape[0]

#divide into training and testing set
training_rows = int(train_percent * total_rows)

# create the training set
training = data_3.iloc[:training_rows, :]
testing = data_3.iloc[training_rows:, :]

#divide into attributes and labels
training_y = training["adsorption_energy"]
training_X = training.drop("adsorption_energy", axis='columns')  
    
testing_y = testing["adsorption_energy"]
testing_X = testing.drop("adsorption_energy", axis='columns')

import tensorflow as tf

# convert to tensors
training_X = tf.convert_to_tensor(training_X, dtype=tf.float32)
training_y = tf.convert_to_tensor(training_y, dtype=tf.int32)
testing_X = tf.convert_to_tensor(testing_X, dtype=tf.float32)
testing_y = tf.convert_to_tensor(testing_y, dtype=tf.float32)

In [6]:
# creates a neural network for regression tasks
def create_reg_network(num_neuron, num_out):
    hidden_layer = tf.keras.layers.Dense(num_neuron, activation='sigmoid')  
    output_layer = tf.keras.layers.Dense(num_out) 

    all_layers = [hidden_layer, output_layer]
    network = tf.keras.models.Sequential(all_layers)

    return network

In [7]:
# trains a neural network with given training data
def train_reg_network(network, training_X, training_y, learn_rate, num_neuron):
    # create the algorithm that learns the weight of the network (with a learning rate of 0.0001)
    optimizer = tf.keras.optimizers.Adam(learning_rate=float(learn_rate))

    # create the loss function function that tells optimizer how much error it has in its predictions
    loss_function = tf.keras.losses.MeanSquaredError()

    # prepare the network for training
    network.compile(optimizer=optimizer, loss=loss_function, metrics=["mean_absolute_error"])

    # create a logger to save the training details to file
    csv_logger = tf.keras.callbacks.CSVLogger("data_0"+"_"+str(learn_rate)+"_"+str(num_neuron)+".csv")

    # train the network for 200 epochs (setting aside 20% of the training data as validation data)
    network.fit(training_X, training_y, validation_split=0.2, epochs=250, callbacks=[csv_logger])

In [8]:
import numpy as np
learn_rate = 0.0001
num_neuron = 500

network=create_reg_network(num_neuron, 1)
train_reg_network(network, training_X, training_y, float(learn_rate), int(num_neuron))
outputs = network.predict(testing_X)
predictions = outputs.flatten() 
performance = np.mean(np.abs(predictions - testing_y))

Epoch 1/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.2193 - mean_absolute_error: 1.6315 - val_loss: 3.6262 - val_mean_absolute_error: 1.4110
Epoch 2/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3.8133 - mean_absolute_error: 1.5454 - val_loss: 3.3003 - val_mean_absolute_error: 1.4270
Epoch 3/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.4829 - mean_absolute_error: 1.5180 - val_loss: 3.2003 - val_mean_absolute_error: 1.4096
Epoch 4/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.4616 - mean_absolute_error: 1.5067 - val_loss: 3.0913 - val_mean_absolute_error: 1.3803
Epoch 5/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.0337 - mean_absolute_error: 1.4084 - val_loss: 2.9983 - val_mean_absolute_error: 1.3555
Epoch 6/250
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

In [9]:
def mae_calculator(predictions, testing_y):
    n = len(predictions)
    sum = 0
    for x in range(n):
        sum += abs(predictions[x] - testing_y[x])
    return sum/n

In [13]:
def rmse_calculator(predictions, testing_y):
    n = len(predictions)
    sum = 0
    for x in range(n):
        sum += (predictions[x] - testing_y[x])**2
    return (sum/n)**0.5

In [14]:
# get the output of the network for each testing instance
# NOTE: there will be 10 values per instance, one for each output neuron
predictions = network.predict(testing_X)

# calculate the accuracy of the predictions
mae = mae_calculator(predictions, testing_y)
print("Mean Absolute Error:",float(mae))

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 875us/step
Mean Absolute Error: 0.8398232460021973


In [15]:
rmse = rmse_calculator(predictions, testing_y)
print("Root Mean Square Error:",float(rmse))

Root Mean Square Error: 1.1475147008895874
