In [1]:
from utils import get_smiles_encodings, load_data, smile_to_hot
from model_regression import LinearRegression_RidgeRegression
from sklearn.linear_model import SGDRegressor
from datetime import datetime
import numpy as np

In [2]:
file_smiles = './dataset/QM9.txt'
file_properties = './dataset/properties_QM9.npz'
smiles, alphabet, largest_molecule_len = get_smiles_encodings(file_smiles)
properties = np.load(file_properties)['properties'].astype(np.float32)

X_train, X_test, y_train, y_test = load_data(smiles, properties)
print("X - train : ", X_train.shape)
print("y - train : ", y_train.shape)
X_hot = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_train])
X_hot_test = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_test])
print("X_hot : ", X_hot.shape)
print("X_hot_test : ", X_hot_test.shape)

X - train :  (92428,)
y - train :  (92428, 4)
X_hot :  (92428, 14)
X_hot_test :  (39612, 14)


In [3]:
model_regression_0 = SGDRegressor()
model_regression_0.fit(X_hot, y_train[:,0])
y_pred_0 = model_regression_0.predict(X_hot_test)

model_regression_1 = SGDRegressor()
model_regression_1.fit(X_hot, y_train[:,1])
y_pred_1 = model_regression_1.predict(X_hot_test)

model_regression_2 = SGDRegressor()
model_regression_2.fit(X_hot, y_train[:,2])
y_pred_2 = model_regression_2.predict(X_hot_test)

model_regression_3 = SGDRegressor()
model_regression_3.fit(X_hot, y_train[:,3])
y_pred_3 = model_regression_3.predict(X_hot_test)

In [4]:
y_test_0 = y_test[:,0]
y_test_1 = y_test[:,1]
y_test_2 = y_test[:,2]
y_test_3 = y_test[:,3]    

In [5]:
sum_0 = 0
for i in range(y_test_0.shape[0]):
    sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))
sum_0 = sum_0 / y_test_0.shape[0]


print("Number of unique values in y_test[:,0] : ", np.unique(y_test_0).shape[0])
print("Min of y_test[:,0] : ", np.min(y_test_0))
print("Max of y_test[:,0] : ", np.max(y_test_0))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_0).shape[0])
print("Min of y_pred : ", np.min(y_pred_0))
print("Max of y_pred : ", np.max(y_pred_0))
print()
print("Error is on avearage : ", sum_0)

Number of unique values in y_test[:,0] :  6400
Min of y_test[:,0] :  -2.8142
Max of y_test[:,0] :  3.7569

Number of unique values in y_pred :  1337
Min of y_pred :  -2.8522280510071707
Max of y_pred :  3.165364309197873

Error is on avearage :  0.3448765325444228


In [6]:
sum_1 = 0
sum_1_5 = 0
for i in range(y_test_1.shape[0]):
    sum_1 = sum_1 + np.abs(np.abs(y_pred_1[i]) - np.abs(y_test_1[i]))
    if(y_pred_1[i] == y_test_1[i]):
        sum_1_5 = sum_1_5 + 1
sum_1 = sum_1 / y_test_1.shape[0]
sum_1_5_r = sum_1_5 / y_test_1.shape[0] * 100

print("Number of unique values in y_test[:,1] : ", np.unique(y_test_1).shape[0])
print("Min of y_test[:,1] : ", np.min(y_test_1))
print("Max of y_test[:,1] : ", np.max(y_test_1))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_1).shape[0])
print("Min of y_pred : ", np.min(y_pred_1))
print("Max of y_pred : ", np.max(y_pred_1))
print()
print("Error is on avearage : ", sum_1)
print("There were ", sum_1_5, " correct predictions, out of ", y_test_1.shape[0], ", which is ", 100 - sum_1_5_r,"% error rate")
print()
print("Since the number of unique values is small, let's round the y_pred, to see if we get a better approximation : ")
print()

y_pred_1_r = np.around(y_pred_1)

sum_1_r = 0
sum_1_5_r = 0
for i in range(y_pred_1_r.shape[0]):
    sum_1_r = sum_1_r + np.abs(np.abs(y_pred_1_r[i]) - np.abs(y_test_3[i]))
    if(y_pred_1_r[i] == y_test_3[i]):
        sum_1_5_r = sum_1_5_r + 1
sum_1_r = sum_1_r / y_test_3.shape[0]
sum_1_5_r_ = sum_1_5_r / y_test_3.shape[0] * 100

print("Number of unique values in y_pred_rounded : ", np.unique(y_pred_1_r).shape[0])
print("Min of y_pred_rounded : ", np.min(y_pred_1_r))
print("Max of y_pred_rounded : ", np.max(y_pred_1_r))
print()
print("Error is on avearage : ", sum_1_r)
print("There were ", sum_1_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_1_5_r_,"% error rate")

Number of unique values in y_test[:,1] :  7
Min of y_test[:,1] :  0.0
Max of y_test[:,1] :  6.0

Number of unique values in y_pred :  1337
Min of y_pred :  -0.9758315077815384
Max of y_pred :  3.0245899473201288

Error is on avearage :  0.7322987430294942
There were  0  correct predictions, out of  39612 , which is  100.0 % error rate

Since the number of unique values is small, let's round the y_pred, to see if we get a better approximation : 

Number of unique values in y_pred_rounded :  5
Min of y_pred_rounded :  -1.0
Max of y_pred_rounded :  3.0

Error is on avearage :  1.1298596384933859
There were  11783  correct predictions, out of  39612 , which is  70.25396344542058 % error rate


In [7]:
sum_2 = 0
for i in range(y_test_2.shape[0]):
    sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))
sum_2 = sum_2 / y_test_2.shape[0]


print("Number of unique values in y_test[:,2] : ", np.unique(y_test_2).shape[0])
print("Min of y_test[:,2] : ", np.min(y_test_2))
print("Max of y_test[:,2] : ", np.max(y_test_2))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_2).shape[0])
print("Min of y_pred : ", np.min(y_pred_2))
print("Max of y_pred : ", np.max(y_pred_2))
print()
print("Error is on avearage : ", sum_2)

Number of unique values in y_test[:,2] :  222
Min of y_test[:,2] :  104.152
Max of y_test[:,2] :  152.037

Number of unique values in y_pred :  1337
Min of y_pred :  104.17306302868457
Max of y_pred :  150.84238673691465

Error is on avearage :  0.019636341478826654


In [8]:
sum_3 = 0
sum_3_5 = 0
for i in range(y_test_3.shape[0]):
    sum_3 = sum_3 + np.abs(np.abs(y_pred_3[i]) - np.abs(y_test_3[i]))
    if(y_pred_3[i] == y_test_3[i]):
        sum_3_5 = sum_3_5 + 1
sum_3 = sum_3 / y_test_1.shape[0]
sum_3_5_r = sum_3_5 / y_test_1.shape[0] * 100

print("Number of unique values in y_test[:,3] : ", np.unique(y_test_3).shape[0])
print("Min of y_test[:,3] : ", np.min(y_test_3))
print("Max of y_test[:,3] : ", np.max(y_test_3))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_3).shape[0])
print("Min of y_pred : ", np.min(y_pred_3))
print("Max of y_pred : ", np.max(y_pred_3))
print()
print("Error is on avearage : ", sum_3)
print("There were ", sum_3_5, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_3_5_r,"% error rate")
print()
print("Since the number of unique values is small, let's round the y_pred, to see if we get a better approximation : ")
print()

y_pred_3_r = np.around(y_pred_3)

sum_3_r = 0
sum_3_5_r = 0
for i in range(y_test_3.shape[0]):
    sum_3_r = sum_3_r + np.abs(np.abs(y_pred_3_r[i]) - np.abs(y_test_3[i]))
    if(y_pred_3_r[i] == y_test_3[i]):
        sum_3_5_r = sum_3_5_r + 1
sum_3_r = sum_3_r / y_test_3.shape[0]
sum_3_5_r_ = sum_3_5_r / y_test_3.shape[0] * 100

print("Number of unique values in y_pred_rounded : ", np.unique(y_pred_3_r).shape[0])
print("Min of y_pred_rounded : ", np.min(y_pred_3_r))
print("Max of y_pred_rounded : ", np.max(y_pred_3_r))
print()
print("Error is on avearage : ", sum_3_r)
print("There were ", sum_3_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_3_5_r_,"% error rate")

Number of unique values in y_test[:,3] :  9
Min of y_test[:,3] :  0.0
Max of y_test[:,3] :  8.0

Number of unique values in y_pred :  1337
Min of y_pred :  -0.11277882231187136
Max of y_pred :  5.978366428615146

Error is on avearage :  0.12943434018428215
There were  0  correct predictions, out of  39612 , which is  100.0 % error rate

Since the number of unique values is small, let's round the y_pred, to see if we get a better approximation : 

Number of unique values in y_pred_rounded :  7
Min of y_pred_rounded :  0.0
Max of y_pred_rounded :  6.0

Error is on avearage :  0.07146824194688478
There were  37004  correct predictions, out of  39612 , which is  6.583863475714438 % error rate
