<a href="https://colab.research.google.com/github/JRopes/CrystalEnergyPredictionWithInvariants/blob/main/DF_T2L_CO_GaussianProcess_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SETUP**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the module
# path and data paths used by later cells all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Directory on the mounted Drive that is put at the front of the module search
# path (presumably where DataImporter_CO and DataPreprocessing live -- confirm).
MODULE_DIR = '/content/drive/MyDrive/Colab_Notebooks/Dissertation/Prediction_Prototyping'

# Keep the directory first in sys.path, but never more than once: a plain
# sys.path.insert() would add another duplicate entry every time the cell is re-run.
sys.path[:] = [MODULE_DIR] + [entry for entry in sys.path if entry != MODULE_DIR]

In [None]:
!pip install ipython-autotime



**IMPORTING LIBRARY DEPENDENCIES**

In [None]:
import numpy as np
import pickle
import math
import pandas as pd

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RationalQuadratic, RBF

In [None]:
import DataImporter_CO
import DataPreprocessing

In [None]:
# Enable the autotime extension: from here on every cell reports its run time
# (the "time: ..." lines in the recorded outputs).
%load_ext autotime

time: 199 µs (started: 2021-05-06 17:49:33 +00:00)


**IMPORTING DATA**

In [None]:
# Input locations on the mounted Drive: the CSV passed as the label file and the
# directory passed as the feature source.
label_file_path = '/content/drive/MyDrive/Colab_Notebooks/Dissertation/Data/T2L_density_energy.csv'
feature_dir_path = '/content/drive/MyDrive/Colab_Notebooks/Dissertation/Data/T2L_CO'

# The project importer returns a 3-tuple; feature_data and label_data feed the
# rest of the notebook.
feature_data, label_data, x_labels = DataImporter_CO.DataFrameImport(
    feature_dir_path,
    label_file_path,
)

File with greatest Domain: T2L_CO_03404.csv || Number of Density Functions: 8
time: 1min 5s (started: 2021-05-06 17:49:33 +00:00)


In [None]:
# Rebind feature_data to the output of the project helper DataPreprocessing.DataFiller.
# Its 2nd and 3rd arguments are two single entries read from the not-yet-processed
# array, at indices [1,1,0] and [1,0,48].
# NOTE(review): what those two entries represent and what DataFiller fills in is not
# visible in this notebook -- confirm against DataPreprocessing.
# NOTE(review): the cell overwrites its own input, so re-running it without first
# re-running the import cell hands already-processed data to DataFiller.
feature_data = DataPreprocessing.DataFiller(feature_data,feature_data[1,1,0],feature_data[1,0,48])

time: 1.56 s (started: 2021-05-06 17:50:38 +00:00)


In [None]:
# Cache the imported features and labels in the working directory; data() reads
# these two files back. The context managers guarantee each file is flushed and
# closed before anything tries to load it.
with open("feature_data_gaussian.p", "wb") as feature_file:
    pickle.dump(feature_data, feature_file)

with open("label_data_gaussian.p", "wb") as label_file:
    pickle.dump(label_data, label_file)

time: 195 ms (started: 2021-05-06 17:50:40 +00:00)


In [None]:
def data(random_state=None):
  """Load the cached features/labels and return a scaled train/test split.

  Reads "feature_data_gaussian.p" and "label_data_gaussian.p" from the working
  directory, flattens every 3-D sample into a single feature row, replaces
  NaNs, splits 90/10 and standardises features and labels.

  Args:
    random_state: Optional seed forwarded to train_test_split so the shuffle
      (and therefore every reported metric) is reproducible. The default,
      None, keeps the original unseeded behaviour.

  Returns:
    (X_train, y_train, X_test, y_test, label_scaler); the y arrays have shape
    (n, 1) and label_scaler is the fitted scaler needed to map predictions
    back to the original label units.

  Raises:
    ValueError: if the cached feature array is not 3-dimensional.
  """
  # NOTE: pickle.load can execute arbitrary code; only load files that this
  # notebook wrote itself.
  with open("feature_data_gaussian.p", "rb") as feature_file:
    feature_data = pickle.load(feature_file)
  with open("label_data_gaussian.p", "rb") as label_file:
    label_data = pickle.load(label_file)

  shape = feature_data.shape
  if len(shape) != 3:
    raise ValueError("expected a 3-D feature array, got shape " + str(shape))

  # Row-major reshape puts element [i, j, z] at column j * shape[2] + z, which is
  # exactly the layout the former triple loop produced. astype() returns a
  # float copy, so the cached array itself is never modified.
  serial_feature_data = feature_data.reshape(shape[0], shape[1] * shape[2]).astype(float)
  serial_feature_data = np.nan_to_num(serial_feature_data)

  # The label value is the second field of each label record.
  pure_label_data = np.array(
      [label_data[i][1] for i in range(len(label_data))], dtype=float)

  X_train, X_test, y_train, y_test = train_test_split(
      serial_feature_data, pure_label_data.reshape(-1, 1),
      test_size=0.1, shuffle=True, random_state=random_state)

  ## Standard Scaler
  # Fit on the training split only: fitting on the full data set would leak
  # test-set statistics into training.
  feature_scaler = preprocessing.StandardScaler()
  label_scaler = preprocessing.StandardScaler()

  X_train = feature_scaler.fit_transform(X_train)
  X_test = feature_scaler.transform(X_test)
  y_train = label_scaler.fit_transform(y_train)
  y_test = label_scaler.transform(y_test)

  return X_train, y_train, X_test, y_test, label_scaler

time: 24 ms (started: 2021-05-06 17:50:40 +00:00)


**ARCHITECTURE**

In [None]:
# Covariance function: a RationalQuadratic term plus an RBF term. The starting
# hyperparameters are written out explicitly at the library defaults (the same
# values shown in the fitted model's repr).
kernel = RationalQuadratic(length_scale=1.0, alpha=1.0) + RBF(length_scale=1.0)

# Every other regressor setting is left at its default.
gpr = GaussianProcessRegressor(
    kernel=kernel,
)

time: 1.94 ms (started: 2021-05-06 17:50:40 +00:00)


**TRAINING**

In [None]:
# Build the scaled train/test split. Note the return order of data():
# train features, train labels, test features, test labels, then the fitted
# label scaler (needed later to undo the label scaling).
X_train, y_train, X_test, y_test, label_scaler = data()

time: 2.57 s (started: 2021-05-06 17:50:40 +00:00)


In [None]:
# Fit the Gaussian process on the training split. This is the slow step of the
# notebook: the recorded run of this cell took about 24 minutes.
gpr.fit(X_train, y_train)

GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=RationalQuadratic(alpha=1, length_scale=1) + RBF(length_scale=1),
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)

time: 24min 15s (started: 2021-05-06 17:50:43 +00:00)


**EVALUATION**

In [None]:
# Predict the held-out samples and force the result into a single column so it
# has the 2-D layout the label scaler works with.
predictions = np.reshape(gpr.predict(X_test), (-1, 1))

time: 6.84 s (started: 2021-05-06 18:14:58 +00:00)


In [None]:
# Undo the label standardisation so predictions and targets are back in the
# original label units before any error is computed.
# NOTE(review): both names are rebound to their own inverse transform, so this cell
# is not idempotent -- running it a second time applies the inverse scaling twice.
predictions = label_scaler.inverse_transform(predictions)
y_test = label_scaler.inverse_transform(y_test)

time: 2 ms (started: 2021-05-06 18:15:05 +00:00)


In [None]:
# Score the predictions against the held-out labels (both in original label units)
# and print a summary plus a breakdown of the absolute errors into ranges.

# Flatten to 1-D first: both arrays arrive as (n, 1) columns, which previously
# turned every metric into a one-element array (printed as "[7.02...]") and made
# math.sqrt() rely on an implicit array-to-scalar conversion.
actual = np.asarray(y_test, dtype=float).ravel()
predicted = np.asarray(predictions, dtype=float).ravel()

if actual.shape != predicted.shape:
    raise ValueError("predictions and y_test must contain the same number of values")

counter = actual.size
if counter == 0:
    raise ValueError("cannot evaluate an empty test set")

# True extremes of the labels (no sentinel start values that could be wrong for
# labels outside +/-999999.99).
max_value = float(actual.max())
min_value = float(actual.min())
label_range = abs(max_value - min_value)

losses = np.abs(predicted - actual)

# These three hold SUMS over the test set; they are divided by `counter` when
# printed below.
average_loss = float(losses.sum())
average_loss_percentage = float((losses / np.abs(actual) * 100).sum())
average_loss_percentage_rel_range = float((losses / label_range * 100).sum())

rms = math.sqrt(float(np.mean(losses ** 2)))

# Bucket i counts errors with edge[i-1] < error <= edge[i]; the final bucket
# collects everything above the last edge. side="left" keeps an error that is
# exactly equal to an edge in the lower bucket.
bucket_edges = np.array([1.0, 2.0, 4.0, 8.0, 10.0])
error_ranges = np.bincount(
    np.searchsorted(bucket_edges, losses, side="left"),
    minlength=len(bucket_edges) + 1,
)

# The buckets are mutually exclusive, so the labels state both bounds.
bucket_labels = [
    "Error <= 1.0 kJ/mol",
    "1.0 < Error <= 2.0 kJ/mol",
    "2.0 < Error <= 4.0 kJ/mol",
    "4.0 < Error <= 8.0 kJ/mol",
    "8.0 < Error <= 10.0 kJ/mol",
    "Error > 10.0 kJ/mol",
]

print()
print("SUMMARY:")
print()
print("Root Mean Squared Error: " + str(rms))
print("Mean Absolute Error: " + str(average_loss / counter))
print("Mean Absolute Percentage Error: " + str(average_loss_percentage / counter) + "%")
print("Mean Absolute Percentage Error relative to Label Range: " + str(average_loss_percentage_rel_range / counter) + "%")
# "Accuracy" here is simply 100 minus the mean absolute percentage error.
print("Accuracy: " + str(100 - (average_loss_percentage / counter)) + "%")
print()
print("BREAKDOWN:")
for bucket_label, bucket_count in zip(bucket_labels, error_ranges):
    print("   " + bucket_label + ": " + str(bucket_count) + " or " + str((bucket_count / counter) * 100) + "% of Test Set")
print("----------------------------------------------------------------------------------------------")


SUMMARY:

Root Mean Squared Error: 9.289373349144867
Mean Absolute Error: [7.02804962]
Mean Absolute Percentage Error: [5.01757692]%
Mean Absolute Percentage Error relative to Label Range: [7.38595063]%
Accuracy: [94.98242308]%

BREAKDOWN:
   Error <= 1.0 kJ/mol: 60 or 10.56338028169014% of Test Set
   Error <= 2.0 kJ/mol: 57 or 10.035211267605634% of Test Set
   Error <= 4.0 kJ/mol: 107 or 18.838028169014084% of Test Set
   Error <= 8.0 kJ/mol: 139 or 24.471830985915492% of Test Set
   Error <= 10.0.0 kJ/mol: 69 or 12.147887323943662% of Test Set
   Error > 10.0 kJ/mol: 136 or 23.943661971830984% of Test Set
----------------------------------------------------------------------------------------------
time: 99.2 ms (started: 2021-05-06 18:15:05 +00:00)
