<a href="https://colab.research.google.com/github/JRopes/CrystalEnergyPredictionWithInvariants/blob/main/AMD_T2L_CON_GaussianProcess_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SETUP**

In [None]:
# Colab-only: mount the user's Google Drive so that the dataset paths under
# /content/drive used further down resolve.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Put the project folder on Drive at the front of the import search path.
# NOTE: re-running this cell inserts the same entry again (harmless, but duplicates accumulate).
sys.path.insert(
    0,
    "/content/drive/MyDrive/Colab_Notebooks/Dissertation/Prediction_Prototyping",
)

In [None]:
!pip install ipython-autotime



**IMPORTING LIBRARY DEPENDENCIES**

In [None]:
import numpy as np
import pickle
import math
import pandas as pd

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RationalQuadratic, RBF, ConstantKernel, Matern, ExpSineSquared

In [None]:
# Enable the autotime extension: every subsequent cell reports its wall-clock
# duration (the "time: ..." lines in the outputs below).
%load_ext autotime

time: 156 µs (started: 2021-05-08 14:55:56 +00:00)


**IMPORTING DATA**

In [None]:
# Location of the feature table on the mounted Drive.
# Despite the "_dir_" in the name this is the path of a single CSV file, not a directory.
feature_dir_path = "/content/drive/MyDrive/Colab_Notebooks/Dissertation/Data/PublishedData/AMDs_T2_1000_CON.csv"

time: 2.22 ms (started: 2021-05-08 14:55:56 +00:00)


In [None]:
# Read the feature CSV into a DataFrame and keep a plain NumPy view of it for slicing.
# NOTE: the name `data` is rebound to a function by a later cell in this notebook,
# so this array is only reachable under that name until that cell has run.
RAW_DATA = pd.read_csv(filepath_or_buffer=feature_dir_path)
data = RAW_DATA.to_numpy()

time: 2.18 s (started: 2021-05-08 14:55:56 +00:00)


In [None]:
# Location of the label table on the mounted Drive.
# Despite the "_dir_" in the name this is the path of a single CSV file, not a directory.
label_dir_path = "/content/drive/MyDrive/Colab_Notebooks/Dissertation/Data/AMDs_T2_1000.csv"

time: 875 µs (started: 2021-05-08 14:55:58 +00:00)


In [None]:
# Read the label CSV into a DataFrame and keep a plain NumPy view of it for slicing.
RAW_LABEL_DATA = pd.read_csv(filepath_or_buffer=label_dir_path)
label_data = RAW_LABEL_DATA.to_numpy()

time: 2.06 s (started: 2021-05-08 14:55:58 +00:00)


In [None]:
# Feature matrix: every column of the feature table except the first one
# (presumably an identifier/index column — TODO confirm against the CSV header).
features = data[:, 1:]

# Target vector: the second column of the label table
# (assumed to be the energy in kJ/mol reported by the evaluation cell — TODO confirm).
labels = label_data[:, 1]

time: 1.84 ms (started: 2021-05-08 14:56:00 +00:00)


In [None]:
# Cache the extracted arrays in the working directory so that `data()` can reload
# them without re-parsing the CSV files.
# The files are opened via `with` so the handles are closed (and the data flushed
# to disk) before anything tries to read them back.
with open("amd_feature_data_forest.p", "wb") as feature_file:
    pickle.dump(features, feature_file)

with open("amd_label_data_forest.p", "wb") as label_file:
    pickle.dump(labels, label_file)

time: 1.54 s (started: 2021-05-08 14:56:00 +00:00)


In [None]:
def data(feature_path="amd_feature_data_forest.p",
         label_path="amd_label_data_forest.p",
         n_features=100,
         test_size=0.1,
         random_state=None):
  """Load the cached arrays, scale them to [0, 1] and split into train/test sets.

  NOTE: defining this function rebinds the notebook-level name ``data``, which an
  earlier cell uses for the raw feature array; re-running that earlier slicing
  cell after this one requires reloading the CSV first.

  Args:
    feature_path: Pickle file holding the feature matrix.
    label_path: Pickle file holding the label vector.
    n_features: Number of leading feature columns to keep.
    test_size: Fraction of the samples held out for testing.
    random_state: Seed forwarded to ``train_test_split``; ``None`` (the default)
      gives a different split on every call, pass an int for reproducibility.

  Returns:
    ``(X_train, y_train, X_test, y_test, label_scaler)`` where the X/y arrays are
    min-max scaled and ``label_scaler`` is the fitted scaler needed to map
    predictions back to the original label units.
  """
  # Only unpickle files written by this notebook: pickle.load executes arbitrary
  # code embedded in the file, so it must never be pointed at untrusted data.
  with open(feature_path, "rb") as feature_file:
    feature_data = pickle.load(feature_file)
  with open(label_path, "rb") as label_file:
    label_data = pickle.load(label_file)

  # Replace NaNs with 0 (np.nan_to_num default), then keep the leading columns.
  feature_data = np.nan_to_num(feature_data)
  feature_data = feature_data[:, :n_features]

  ## MinMax Scaler
  # NOTE(review): both scalers are fitted on the full data set *before* the split,
  # so the test rows influence the scaling (mild data leakage). Kept as-is so the
  # returned values stay comparable with earlier runs.
  feature_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
  X_scaled = feature_scaler.fit_transform(feature_data)

  label_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
  # The scaler expects a 2-D array, hence the column reshape; y stays (n, 1).
  y_scaled = label_scaler.fit_transform(label_data.reshape(-1, 1))

  X_train, X_test, y_train, y_test = train_test_split(
      X_scaled, y_scaled,
      test_size=test_size, shuffle=True, random_state=random_state)

  return X_train, y_train, X_test, y_test, label_scaler

time: 12.3 ms (started: 2021-05-08 14:56:02 +00:00)


**ARCHITECTURE**

In [None]:
# Build the scaled train/test split. The split is shuffled without a fixed seed,
# so re-running this cell produces a different partition (and different metrics).
(X_train, y_train,
 X_test, y_test,
 label_scaler) = data()

time: 1.3 s (started: 2021-05-08 14:56:02 +00:00)


In [None]:
# Rational-quadratic covariance with its initial hyperparameters spelled out
# (these are the library defaults, matching the fitted-model repr shown below);
# they are the starting point for the optimiser run during fitting.
kernel = RationalQuadratic(length_scale=1.0, alpha=1.0)

time: 1.45 ms (started: 2021-05-08 14:56:03 +00:00)


In [None]:
# Gaussian-process regressor using the kernel defined above; all other settings
# are left at the library defaults.
gpr = GaussianProcessRegressor(
    kernel=kernel,
)

time: 1.46 ms (started: 2021-05-08 14:56:03 +00:00)


**TRAINING**

In [None]:
# Fit on the scaled training split (this is the slow step — about 9 minutes in the
# recorded run). The fitted estimator is the cell's last expression, so its repr is shown.
gpr.fit(X=X_train, y=y_train)

GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=RationalQuadratic(alpha=1, length_scale=1),
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)

time: 9min 9s (started: 2021-05-08 14:56:03 +00:00)


**EVALUATING**

In [None]:
# Predictive mean and standard deviation for every test sample.
# Both are expressed in the scaled label space the model was trained on.
mean_predictions, std_predictions = gpr.predict(X=X_test, return_std=True)

time: 10.1 s (started: 2021-05-08 15:05:12 +00:00)


In [None]:
# Turn the standard deviations into a single column, shape (n_samples, 1).
std_predictions = np.reshape(std_predictions, (-1, 1))

time: 1.48 ms (started: 2021-05-08 15:05:22 +00:00)


In [None]:
# Ratio of predictive std to predictive mean, both still in the scaled label space.
#
# Both operands are forced to column shape (n_samples, 1) first. Depending on the
# scikit-learn release, `predict` may hand back the mean as a 1-D array; dividing an
# (n, 1) std by an (n,) mean would then silently broadcast to an (n, n) matrix.
#
# NOTE: a predicted mean of exactly 0 yields inf/nan here (NumPy emits a RuntimeWarning).
scaler = np.divide(
    np.asarray(std_predictions).reshape(-1, 1),
    np.asarray(mean_predictions).reshape(-1, 1),
)

time: 1.46 ms (started: 2021-05-08 15:05:22 +00:00)


In [None]:
# Map predictions and ground truth from the scaled space back to the original label units.
# NOTE: this cell is not idempotent — running it twice inverse-transforms the
# already-unscaled arrays a second time. Re-run the prediction cell first.

# inverse_transform needs a 2-D array; reshape so a 1-D mean (as returned by some
# scikit-learn releases for single-target models) is accepted too.
mean_predictions = label_scaler.inverse_transform(
    np.asarray(mean_predictions).reshape(-1, 1)
)

# NOTE(review): this rescales the std by re-applying the scaled std/mean ratio to the
# unscaled mean. MinMax scaling is affine (shift + scale), so the result is not the
# standard deviation in original units, and it takes the sign of the mean (negative
# for negative labels). The exact conversion would be `std_scaled / label_scaler.scale_`.
# Left unchanged here because consumers of `std_predictions` may rely on this sign
# convention; changing it requires updating them at the same time.
std_predictions = np.multiply(scaler, mean_predictions)

y_test = label_scaler.inverse_transform(y_test)

time: 5.72 ms (started: 2021-05-08 15:05:22 +00:00)


In [None]:
# Evaluation summary on the held-out test set, in original label units (kJ/mol).
#
# Inputs are flattened to 1-D so every metric is a plain scalar (the (n, 1) column
# arrays previously made each accumulator a one-element array, printed as "[x]").
y_true = np.asarray(y_test, dtype=float).ravel()
y_pred = np.asarray(mean_predictions, dtype=float).ravel()

if y_true.size == 0:
    raise ValueError("Cannot evaluate an empty test set.")
if y_true.shape != y_pred.shape:
    raise ValueError(
        "Prediction/label count mismatch: "
        + str(y_pred.shape) + " vs " + str(y_true.shape)
    )

counter = y_true.size
abs_errors = np.abs(y_pred - y_true)

# Spread of the true labels, used to express the error relative to the label range.
max_value = y_true.max()
min_value = y_true.min()
label_range = abs(max_value - min_value)

# Sums are kept under the original names; the means are formed when printing.
average_loss = abs_errors.sum()
# NOTE: a true label of exactly 0 (or a zero label range) makes the corresponding
# percentage inf/nan — NumPy warns instead of raising.
average_loss_percentage = (np.abs(abs_errors / y_true) * 100).sum()
average_loss_percentage_rel_range = (np.abs(abs_errors / label_range) * 100).sum()
rms = math.sqrt(np.mean(abs_errors ** 2))

# Histogram of absolute errors over mutually exclusive buckets:
# (-inf, 1], (1, 2], (2, 4], (4, 8], (8, 10], (10, inf).
# side="left" puts a value equal to an edge into the lower bucket, i.e. "<= edge".
bin_edges = np.array([1.0, 2.0, 4.0, 8.0, 10.0])
error_ranges = np.bincount(
    np.searchsorted(bin_edges, abs_errors, side="left"),
    minlength=bin_edges.size + 1,
)

# Labels state the actual bucket bounds: the counts are per bucket, not cumulative.
bin_labels = (
    "Error <= 1.0 kJ/mol",
    "1.0 < Error <= 2.0 kJ/mol",
    "2.0 < Error <= 4.0 kJ/mol",
    "4.0 < Error <= 8.0 kJ/mol",
    "8.0 < Error <= 10.0 kJ/mol",
    "Error > 10.0 kJ/mol",
)

print()
print("SUMMARY:")
print()
print("Root Mean Squared Error: " + str(rms))
print("Mean Absolute Error: " + str(average_loss / counter))
print("Mean Absolute Percentage Error: " + str(average_loss_percentage / counter) + "%")
print("Mean Absolute Percentage Error relative to Label Range: " + str(average_loss_percentage_rel_range / counter) + "%")
# "Accuracy" here is simply 100 minus the mean absolute percentage error.
print("Accuracy: " + str(100 - (average_loss_percentage / counter)) + "%")
print()
print("BREAKDOWN:")
for bin_label, bin_count in zip(bin_labels, error_ranges):
    print("   " + bin_label + ": " + str(bin_count) + " or " + str((bin_count / counter) * 100) + "% of Test Set")
print("----------------------------------------------------------------------------------------------")


SUMMARY:

Root Mean Squared Error: 6.966903909952713
Mean Absolute Error: [5.30526982]
Mean Absolute Percentage Error: [3.81922028]%
Mean Absolute Percentage Error relative to Label Range: [5.87889735]%
Accuracy: [96.18077972]%

BREAKDOWN:
   Error <= 1.0 kJ/mol: 84 or 14.788732394366196% of Test Set
   Error <= 2.0 kJ/mol: 71 or 12.5% of Test Set
   Error <= 4.0 kJ/mol: 113 or 19.8943661971831% of Test Set
   Error <= 8.0 kJ/mol: 176 or 30.985915492957744% of Test Set
   Error <= 10.0.0 kJ/mol: 47 or 8.274647887323944% of Test Set
   Error > 10.0 kJ/mol: 77 or 13.556338028169016% of Test Set
----------------------------------------------------------------------------------------------
time: 89.4 ms (started: 2021-05-08 15:05:22 +00:00)


**Prediction with Uncertainty**

In [None]:
# Row index (into the test split) of the sample inspected in the next cell.
instance: int = 23

time: 845 µs (started: 2021-05-08 15:05:23 +00:00)


In [None]:
# 95% interval for one test sample, in original label units.
#
# The half-width is computed here from first principles instead of from the
# notebook-level `std_predictions`: that array is a std/mean ratio re-applied to the
# unscaled mean, which is not a standard deviation in label units and is negative
# whenever the predicted label is negative.

# Predictive std of the selected sample in the scaled [0, 1] label space.
_, instance_std_scaled = gpr.predict(X_test[instance:instance + 1], return_std=True)

# MinMaxScaler maps y -> y * scale_ + min_, so a standard deviation converts back to
# original units by dividing by scale_ (the offset does not affect a spread).
instance_std = float(np.ravel(instance_std_scaled)[0]) / float(label_scaler.scale_[0])
half_width_95 = 1.96 * instance_std

print("Prediction: " + str(mean_predictions[instance,0]) + " +- " + str(half_width_95) + " with 95% Confidence || True Label: " + str(y_test[instance]))

Prediction: -138.58976700140823 +- 25.419707622430987 with 95% Confidence || True Label: [-132.7562]
time: 2.99 ms (started: 2021-05-08 15:05:23 +00:00)
