In [1]:
from utils import get_smiles_encodings, load_data, smile_to_hot
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import RidgeClassifier
from datetime import datetime
import numpy as np

In [2]:
file_smiles = './dataset/QM9.txt'
file_properties = './dataset/properties_QM9.npz'
smiles, alphabet, largest_molecule_len = get_smiles_encodings(file_smiles)
properties = np.load(file_properties)['properties'].astype(np.float32)
properties_txt = ['logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids',
                  'Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves',
                  'Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule',
                  'Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle']

X_train, X_test, y_train, y_test = load_data(smiles, properties)
print("X - train : ", X_train.shape)
print("y - train : ", y_train.shape)
X_hot = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_train])
X_hot_test = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_test])
print("X_hot : ", X_hot.shape)
print("X_hot_test : ", X_hot_test.shape)
X_hot_integer = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[0] for x in X_train])
X_hot_test_integer = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[0] for x in X_test])
print("X_hot_integer : ", X_hot_integer.shape)
print("X_hot_integer_test : ", X_hot_test_integer.shape)

X - train :  (92428,)
y - train :  (92428, 4)
X_hot :  (92428, 14)
X_hot_test :  (39612, 14)
X_hot_integer :  (92428, 22)
X_hot_integer_test :  (39612, 22)


In [3]:
# 0 
model_regression_0 = SGDRegressor()
model_regression_0.fit(X_hot, y_train[:,0])
y_pred_0 = model_regression_0.predict(X_hot_test)
# 1
model_regression_1 = SGDRegressor()
model_regression_1.fit(X_hot, y_train[:,1])
y_pred_1 = model_regression_1.predict(X_hot_test)

model_regression_1_class = RidgeClassifier()
model_regression_1_class.fit(X_hot, y_train[:,1])
y_pred_1_class = model_regression_1_class.predict(X_hot_test)
# 2 
model_regression_2 = SGDRegressor()
model_regression_2.fit(X_hot, y_train[:,2])
y_pred_2 = model_regression_2.predict(X_hot_test)
# 3 
model_regression_3 = SGDRegressor()
model_regression_3.fit(X_hot, y_train[:,3])
y_pred_3 = model_regression_3.predict(X_hot_test)

model_regression_3_class = RidgeClassifier()
model_regression_3_class.fit(X_hot, y_train[:,3])
y_pred_3_class = model_regression_3_class.predict(X_hot_test)

In [4]:
y_test_0 = y_test[:,0]
y_test_1 = y_test[:,1]
y_test_2 = y_test[:,2]
y_test_3 = y_test[:,3]    

In [5]:
sum_0 = 0
for i in range(y_test_0.shape[0]):
    sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))
sum_0 = sum_0 / y_test_0.shape[0]

print(properties_txt[0])
print()
print("Number of unique values in y_test[:,0] : ", np.unique(y_test_0).shape[0])
print("Min of y_test[:,0] : ", np.min(y_test_0))
print("Max of y_test[:,0] : ", np.max(y_test_0))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_0).shape[0])
print("Min of y_pred : ", np.min(y_pred_0))
print("Max of y_pred : ", np.max(y_pred_0))
print()
print("Error is on avearage : ", sum_0)
print("Let's say the target value is : 1.0, then the model would guess a value in the following interval :")
print(" -> [", 1.0 - sum_0,",",1.0 + sum_0,"]")

logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids

Number of unique values in y_test[:,0] :  6400
Min of y_test[:,0] :  -2.8142
Max of y_test[:,0] :  3.7569

Number of unique values in y_pred :  1337
Min of y_pred :  -2.9187293554990483
Max of y_pred :  3.1766602205853713

Error is on avearage :  0.34547616181137525
Let's say the target value is : 1.0, then the model would guess a value in the following interval :
 -> [ 0.6545238381886247 , 1.3454761618113753 ]


In [6]:
print(properties_txt[1])
print()
print("Since this property is discrete, I rounded the values we got, but I tried a classification method as well")
print()

y_pred_1_r = np.around(y_pred_1)

sum_1_r = 0
sum_1_5_r = 0
for i in range(y_pred_1_r.shape[0]):
    sum_1_r = sum_1_r + np.abs(np.abs(y_pred_1_r[i]) - np.abs(y_test_1[i]))
    if(y_pred_1_r[i] == y_test_1[i]):
        sum_1_5_r = sum_1_5_r + 1
sum_1_r = sum_1_r / y_test_3.shape[0]
sum_1_5_r_ = sum_1_5_r / y_test_3.shape[0] * 100

print("Number of unique values in y_pred_rounded : ", np.unique(y_pred_1_r).shape[0])
print("Min of y_pred_rounded : ", np.min(y_pred_1_r))
print("Max of y_pred_rounded : ", np.max(y_pred_1_r))
print()
print("Error is on avearage : ", sum_1_r)
print("There were ", sum_1_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_1_5_r_,"% error rate")
print()
print("Classification method")

sum_1_r = 0
sum_1_5_r = 0
for i in range(y_pred_1_r.shape[0]):
    sum_1_r = sum_1_r + np.abs(np.abs(y_pred_1_class[i]) - np.abs(y_test_3[i]))
    if(y_pred_1_class[i] == y_test_1[i]):
        sum_1_5_r = sum_1_5_r + 1
sum_1_r = sum_1_r / y_test_3.shape[0]
sum_1_5_r_ = sum_1_5_r / y_test_3.shape[0] * 100
print("There were ", sum_1_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_1_5_r_,"% error rate")

Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves

Since this property is discrete, I rounded the values we got, but I tried a classification method as well

Number of unique values in y_pred_rounded :  5
Min of y_pred_rounded :  -1.0
Max of y_pred_rounded :  3.0

Error is on avearage :  0.680349389074018
There were  16980  correct predictions, out of  39612 , which is  57.13420175704332 % error rate

Classification method
There were  12554  correct predictions, out of  39612 , which is  68.30758356053721 % error rate


In [7]:
sum_2 = 0
sum_2_r = 0
for i in range(y_test_2.shape[0]):
    sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))
    sum_2_r = sum_2_r + np.abs(100 - (np.abs(y_pred_2[i]) * 100 / np.abs(y_test_2[i])))
sum_2 = sum_2 / y_test_2.shape[0]
sum_2_r = sum_2_r / y_test_2.shape[0]

print(properties_txt[2])
print()
print("Number of unique values in y_test[:,2] : ", np.unique(y_test_2).shape[0])
print("Min of y_test[:,2] : ", np.min(y_test_2))
print("Max of y_test[:,2] : ", np.max(y_test_2))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_2).shape[0])
print("Min of y_pred : ", np.min(y_pred_2))
print("Max of y_pred : ", np.max(y_pred_2))
print()
print("Error is on avearage : ", sum_2)
print("Error rate is on avearage : ", sum_2_r, "% -> for each guess, the model is of by ", sum_2_r, "%")
print("Let's say the target value is : 150, then the model would guess a value in the following interval :")
print(" -> [", 150 - sum_2,",",150 + sum_2,"]")

Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule

Number of unique values in y_test[:,2] :  222
Min of y_test[:,2] :  104.152
Max of y_test[:,2] :  152.037

Number of unique values in y_pred :  1337
Min of y_pred :  104.18146129934982
Max of y_pred :  150.8290170767943

Error is on avearage :  0.019987376763212793
Error rate is on avearage :  0.015285681200489658 % -> for each guess, the model is of by  0.015285681200489658 %
Let's say the target value is : 150, then the model would guess a value in the following interval :
 -> [ 149.98001262323677 , 150.01998737676323 ]


In [8]:
print(properties_txt[3])
print()
print("Since this property is discrete, I rounded the values we got, but I tried a classification method as well")
print()

y_pred_3_r = np.around(y_pred_3)

sum_3_r = 0
sum_3_5_r = 0
for i in range(y_test_3.shape[0]):
    sum_3_r = sum_3_r + np.abs(np.abs(y_pred_3_r[i]) - np.abs(y_test_3[i]))
    if(y_pred_3_r[i] == y_test_3[i]):
        sum_3_5_r = sum_3_5_r + 1
sum_3_r = sum_3_r / y_test_3.shape[0]
sum_3_5_r_ = sum_3_5_r / y_test_3.shape[0] * 100

print("Number of unique values in y_pred_rounded : ", np.unique(y_pred_3_r).shape[0])
print("Min of y_pred_rounded : ", np.min(y_pred_3_r))
print("Max of y_pred_rounded : ", np.max(y_pred_3_r))
print()
print("Error is on avearage : ", sum_3_r)
print("There were ", sum_3_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_3_5_r_,"% error rate")
print()
print("Classification method")

sum_1_r = 0
sum_1_5_r = 0
for i in range(y_pred_1_r.shape[0]):
    sum_1_r = sum_1_r + np.abs(np.abs(y_pred_3_class[i]) - np.abs(y_test_3[i]))
    if(y_pred_3_class[i] == y_test_3[i]):
        sum_1_5_r = sum_1_5_r + 1
sum_1_r = sum_1_r / y_test_3.shape[0]
sum_1_5_r_ = sum_1_5_r / y_test_3.shape[0] * 100
print("There were ", sum_1_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_1_5_r_,"% error rate")


Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle

Since this property is discrete, I rounded the values we got, but I tried a classification method as well

Number of unique values in y_pred_rounded :  7
Min of y_pred_rounded :  0.0
Max of y_pred_rounded :  6.0

Error is on avearage :  0.07666868625668989
There were  36643  correct predictions, out of  39612 , which is  7.495203473694843 % error rate

Classification method
There were  30834  correct predictions, out of  39612 , which is  22.159951529839432 % error rate


In [12]:
max_iter_ = [100, 500, 1000, 2000]
learning_rate_ = ['constant', 'optimal', 'invscaling', 'adaptive']
best_0 = [100,0,0]
best_1 = [100,0,0]
best_1_int = [100,0,0]
best_2 = [100,0,0]
best_3 = [100,0,0]

for max_iterR in max_iter_:
    for lr in learning_rate_:
        # 0 
        model_regression_0 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_0.fit(X_hot, y_train[:,0])
        y_pred_0 = model_regression_0.predict(X_hot_test)
        
        # 1
        model_regression_1 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_1.fit(X_hot, y_train[:,1])
        y_pred_1 = model_regression_1.predict(X_hot_test)
        y_pred_1 = np.round(y_pred_1)

        # 2 
        model_regression_2 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_2.fit(X_hot, y_train[:,2])
        y_pred_2 = model_regression_2.predict(X_hot_test)
        
        # 3 
        model_regression_3 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_3.fit(X_hot, y_train[:,3])
        y_pred_3 = model_regression_3.predict(X_hot_test)
        y_pred_3 = np.round(y_pred_3)
        
        # calcul res
        sum_0 = 0
        sum_1 = 0
        sum_2 = 0
        sum_3 = 0
        for i in range(y_test_0.shape[0]):

            # 0
            sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))

            # 1
            if(y_pred_1[i] == y_test_1[i]):
                sum_1 = sum_1 + 1

            # 2
            sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))

            # 3
            if(y_pred_3[i] == y_test_3[i]):
                sum_3 = sum_3 + 1

        sum_0 = sum_0 / y_test_0.shape[0]
        sum_1_r = 100 - (sum_1 / y_test_1.shape[0] * 100)
        sum_2 = sum_2 / y_test_2.shape[0]
        sum_3_r = 100 - (sum_3 / y_test_3.shape[0] * 100)
        
        
        # change res if needed
        if sum_0 < best_0[0]:
            best_0 = [sum_0, max_iterR, lr]

        if sum_1_r < best_1[0]:
            best_1 = [sum_1_r, max_iterR, lr]
            
        if sum_2 < best_2[0]:
            best_2 = [sum_2, max_iterR, lr]

        if sum_3_r < best_3[0]:
            best_3 = [sum_3_r, max_iterR, lr]

        print(max_iterR, " ", lr, "   - done")
        
print()                
print(best_0)
print(best_1)
print(best_2)
print(best_3)
        

100   constant    - done




100   optimal    - done
100   invscaling    - done




100   adaptive    - done
500   constant    - done
500   optimal    - done
500   invscaling    - done
500   adaptive    - done
1000   constant    - done
1000   optimal    - done
1000   invscaling    - done
1000   adaptive    - done
2000   constant    - done
2000   optimal    - done
2000   invscaling    - done
2000   adaptive    - done

[0.3440146150363204, 1000, 'invscaling']
[56.381904473391906, 2000, 'invscaling']
[0.02038320193443289, 2000, 'constant']
[6.144602645662928, 100, 'invscaling']

[99.96970614965161, 100, 'invscaling']


In [15]:
print(properties_txt[0])
print()
print("model -> max_iter : ", best_0[1], " - learning rate : ", best_0[2])
print()
print("Error is on avearage : ", best_0[0])
print("Let's say the target value is : 1.0, then the model would guess a value in the following interval :")
print(" -> [", 1.0 - best_0[0],",",1.0 + best_0[0],"]")

logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids
model -> max_iter :  1000  - learning rate :  invscaling

Error is on avearage :  0.3440146150363204
Let's say the target value is : 1.0, then the model would guess a value in the following interval :
 -> [ 0.6559853849636796 , 1.3440146150363204 ]


In [None]:
print(properties_txt[1])
print()
print("Since this property is discrete, I rounded the values we got :")
print("Error is on avearage : ", sum_1_r)
print("There were ", sum_1_5_r, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_1_5_r_,"% error rate")
print()