In [1]:
from utils import get_smiles_encodings, load_data, smile_to_hot
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from datetime import datetime
import numpy as np

In [2]:
file_smiles = './dataset/QM9.txt'
file_properties = './dataset/properties_QM9.npz'
smiles, alphabet, largest_molecule_len = get_smiles_encodings(file_smiles)
properties = np.load(file_properties)['properties'].astype(np.float32)
properties_txt = ['logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids',
                  'Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves',
                  'Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule',
                  'Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle']

X_train, X_test, y_train, y_test = load_data(smiles, properties)
print("X - train : ", X_train.shape)
print("y - train : ", y_train.shape)
X_hot = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_train])
X_hot_test = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_test])
print("X_hot : ", X_hot.shape)
print("X_hot_test : ", X_hot_test.shape)
X_hot_integer = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[0] for x in X_train])
X_hot_test_integer = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[0] for x in X_test])
print("X_hot_integer : ", X_hot_integer.shape)
print("X_hot_integer_test : ", X_hot_test_integer.shape)

X - train :  (92428,)
y - train :  (92428, 4)
X_hot :  (92428, 14)
X_hot_test :  (39612, 14)
X_hot_integer :  (92428, 22)
X_hot_integer_test :  (39612, 22)


In [3]:
model_regression_0 = MLPRegressor()
model_regression_0.fit(X_hot, y_train[:,0])
y_pred_0 = model_regression_0.predict(X_hot_test)

model_regression_1 = MLPClassifier(max_iter=400)
model_regression_1.fit(X_hot_integer, y_train[:,1])
y_pred_1 = model_regression_1.predict(X_hot_test_integer)

model_regression_2 = MLPRegressor()
model_regression_2.fit(X_hot, y_train[:,2])
y_pred_2 = model_regression_2.predict(X_hot_test)

model_regression_3 = MLPClassifier(max_iter=400)
model_regression_3.fit(X_hot_integer, y_train[:,3])
y_pred_3 = model_regression_3.predict(X_hot_test_integer)

In [4]:
y_test_0 = y_test[:,0]
y_test_1 = y_test[:,1]
y_test_2 = y_test[:,2]
y_test_3 = y_test[:,3]    

In [5]:
sum_0 = 0
for i in range(y_test_0.shape[0]):
    sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))
sum_0 = sum_0 / y_test_0.shape[0]

print(properties_txt[0])
print()
print("Number of unique values in y_test[:,0] : ", np.unique(y_test_0).shape[0])
print("Min of y_test[:,0] : ", np.min(y_test_0))
print("Max of y_test[:,0] : ", np.max(y_test_0))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_0).shape[0])
print("Min of y_pred : ", np.min(y_pred_0))
print("Max of y_pred : ", np.max(y_pred_0))
print()
print("Error is on avearage : ", sum_0)
print("Let's say the target value is : 1.0, then the model would guess a value in the following interval :")
print(" -> [", 1.0 - sum_0,",",1.0 + sum_0,"]")

logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids

Number of unique values in y_test[:,0] :  6400
Min of y_test[:,0] :  -2.8142
Max of y_test[:,0] :  3.7569

Number of unique values in y_pred :  1339
Min of y_pred :  -2.516537764728384
Max of y_pred :  3.7080498391464216

Error is on avearage :  0.26377195681773097
Let's say the target value is : 1.0, then the model would guess a value in the following interval :
 -> [ 0.736228043182269 , 1.263771956817731 ]


In [6]:
sum_1 = 0
sum_1_5 = 0
for i in range(y_test_1.shape[0]):
    sum_1 = sum_1 + np.abs(np.abs(y_pred_1[i]) - np.abs(y_test_1[i]))
    if(y_pred_1[i] == y_test_1[i]):
        sum_1_5 = sum_1_5 + 1
sum_1 = sum_1 / y_test_1.shape[0]
sum_1_5_r = sum_1_5 / y_test_1.shape[0] * 100

print(properties_txt[1])
print()
print("Number of unique values in y_test[:,1] : ", np.unique(y_test_1).shape[0])
print("Min of y_test[:,1] : ", np.min(y_test_1))
print("Max of y_test[:,1] : ", np.max(y_test_1))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_1).shape[0])
print("Min of y_pred : ", np.min(y_pred_1))
print("Max of y_pred : ", np.max(y_pred_1))
print()
print("Error is on avearage : ", sum_1)
print("There were ", sum_1_5, " correct predictions, out of ", y_test_1.shape[0], ", which is ", 100 - sum_1_5_r,"% error rate")

Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves

Number of unique values in y_test[:,1] :  7
Min of y_test[:,1] :  0.0
Max of y_test[:,1] :  6.0

Number of unique values in y_pred :  7
Min of y_pred :  0.0
Max of y_pred :  6.0

Error is on avearage :  0.5411743915985056
There were  22391  correct predictions, out of  39612 , which is  43.474199737453304 % error rate


In [7]:
sum_2 = 0
sum_2_r = 0
for i in range(y_test_2.shape[0]):
    sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))
    sum_2_r = sum_2_r + np.abs(100 - (np.abs(y_pred_2[i]) * 100 / np.abs(y_test_2[i])))
sum_2 = sum_2 / y_test_2.shape[0]
sum_2_r = sum_2_r / y_test_2.shape[0]

print(properties_txt[2])
print()
print("Number of unique values in y_test[:,2] : ", np.unique(y_test_2).shape[0])
print("Min of y_test[:,2] : ", np.min(y_test_2))
print("Max of y_test[:,2] : ", np.max(y_test_2))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_2).shape[0])
print("Min of y_pred : ", np.min(y_pred_2))
print("Max of y_pred : ", np.max(y_pred_2))
print()
print("Error is on avearage : ", sum_2)
print("Error rate is on avearage : ", sum_2_r, "% -> for each guess, the model is of by ", sum_2_r, "%")
print("Let's say the target value is : 150, then the model would guess a value in the following interval :")
print(" -> [", 150 - sum_2,",",150 + sum_2,"]")

Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule

Number of unique values in y_test[:,2] :  222
Min of y_test[:,2] :  104.152
Max of y_test[:,2] :  152.037

Number of unique values in y_pred :  1338
Min of y_pred :  104.09959678950202
Max of y_pred :  151.24252066853614

Error is on avearage :  0.041148850219538353
Error rate is on avearage :  0.03234002179561573 % -> for each guess, the model is of by  0.03234002179561573 %
Let's say the target value is : 150, then the model would guess a value in the following interval :
 -> [ 149.95885114978046 , 150.04114885021954 ]


In [8]:
sum_3 = 0
sum_3_5 = 0
for i in range(y_test_3.shape[0]):
    sum_3 = sum_3 + np.abs(np.abs(y_pred_3[i]) - np.abs(y_test_3[i]))
    if(y_pred_3[i] == y_test_3[i]):
        sum_3_5 = sum_3_5 + 1
sum_3 = sum_3 / y_test_1.shape[0]
sum_3_5_r = sum_3_5 / y_test_1.shape[0] * 100

print(properties_txt[3])
print()
print("Number of unique values in y_test[:,3] : ", np.unique(y_test_3).shape[0])
print("Min of y_test[:,3] : ", np.min(y_test_3))
print("Max of y_test[:,3] : ", np.max(y_test_3))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_3).shape[0])
print("Min of y_pred : ", np.min(y_pred_3))
print("Max of y_pred : ", np.max(y_pred_3))
print()
print("Error is on avearage : ", sum_3)
print("There were ", sum_3_5, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_3_5_r,"% error rate")


Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle

Number of unique values in y_test[:,3] :  9
Min of y_test[:,3] :  0.0
Max of y_test[:,3] :  8.0

Number of unique values in y_pred :  9
Min of y_pred :  0.0
Max of y_pred :  8.0

Error is on avearage :  0.22132182167020095
There were  31619  correct predictions, out of  39612 , which is  20.17822881954963 % error rate


In [9]:
lr_ = [0.1, 0.01, 0.001, 0.0001]
couches_ = [1, 5, 10, 15, 20]
neurones_ = [1, 2, 3, 4, 5, 6]
best_0 = [100,0,0,0]
best_1 = [100,0,0,0]
best_2 = [100,0,0,0]
best_3 = [100,0,0,0]

for lr in lr_:
    for couche in couches_:
        for neurone in neurones_:
            
            model_regression_0 = MLPRegressor(hidden_layer_sizes=(couche,neurone), learning_rate_init=lr)
            model_regression_0.fit(X_hot, y_train[:,0])
            y_pred_0 = model_regression_0.predict(X_hot_test)
            
            model_regression_1 = MLPClassifier(max_iter=400, hidden_layer_sizes=(couche,neurone), learning_rate_init=lr)
            model_regression_1.fit(X_hot_integer, y_train[:,1])
            y_pred_1 = model_regression_1.predict(X_hot_test_integer)
            
            model_regression_2 = MLPRegressor(hidden_layer_sizes=(couche,neurone), learning_rate_init=lr)
            model_regression_2.fit(X_hot, y_train[:,2])
            y_pred_2 = model_regression_2.predict(X_hot_test)

            model_regression_3 = MLPClassifier(max_iter=400, hidden_layer_sizes=(couche,neurone), learning_rate_init=lr)
            model_regression_3.fit(X_hot_integer, y_train[:,3])
            y_pred_3 = model_regression_3.predict(X_hot_test_integer)
            
            # calcul res
            sum_0 = 0
            sum_1 = 0
            sum_2 = 0
            sum_3 = 0
            for i in range(y_test_0.shape[0]):
                
                # 0
                sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))
                
                # 1
                if(y_pred_1[i] == y_test_1[i]):
                    sum_1 = sum_1 + 1
                    
                # 2
                sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))
                
                # 3
                if(y_pred_3[i] == y_test_3[i]):
                    sum_3 = sum_3 + 1
                 
            sum_0 = sum_0 / y_test_0.shape[0]
            sum_1_r = 100 - (sum_1 / y_test_1.shape[0] * 100)
            sum_2 = sum_2 / y_test_2.shape[0]
            sum_3_r = 100 - (sum_3 / y_test_3.shape[0] * 100)
            
            # change res if needed
            if sum_0 < best_0[0]:
                best_0 = [sum_0, couche, neurone, lr]
                
            if sum_1_r < best_1[0]:
                best_1 = [sum_1_r, couche, neurone, lr]
                
            if sum_2 < best_2[0]:
                best_2 = [sum_2, couche, neurone, lr]
                
            if sum_3_r < best_3[0]:
                best_3 = [sum_3_r, couche, neurone, lr]
            
            print(lr, " ", couche, " ",neurone, "   - done")
            
print()                
print(best_0)
print(best_1)
print(best_2)
print(best_3)

0.1   1   1    - done
0.1   1   2    - done
0.1   1   3    - done
0.1   1   4    - done
0.1   1   5    - done
0.1   1   6    - done
0.1   5   1    - done
0.1   5   2    - done
0.1   5   3    - done
0.1   5   4    - done
0.1   5   5    - done
0.1   5   6    - done
0.1   10   1    - done
0.1   10   2    - done
0.1   10   3    - done
0.1   10   4    - done
0.1   10   5    - done
0.1   10   6    - done
0.1   15   1    - done
0.1   15   2    - done
0.1   15   3    - done
0.1   15   4    - done
0.1   15   5    - done
0.1   15   6    - done
0.1   20   1    - done
0.1   20   2    - done
0.1   20   3    - done
0.1   20   4    - done
0.1   20   5    - done
0.1   20   6    - done
0.01   1   1    - done
0.01   1   2    - done
0.01   1   3    - done
0.01   1   4    - done
0.01   1   5    - done
0.01   1   6    - done
0.01   5   1    - done
0.01   5   2    - done
0.01   5   3    - done
0.01   5   4    - done
0.01   5   5    - done
0.01   5   6    - done
0.01   10   1    - done
0.01   10   2    - don



0.001   1   2    - done
0.001   1   3    - done
0.001   1   4    - done
0.001   1   5    - done




0.001   1   6    - done
0.001   5   1    - done
0.001   5   2    - done




0.001   5   3    - done
0.001   5   4    - done
0.001   5   5    - done
0.001   5   6    - done
0.001   10   1    - done
0.001   10   2    - done




0.001   10   3    - done
0.001   10   4    - done
0.001   10   5    - done
0.001   10   6    - done




0.001   15   1    - done
0.001   15   2    - done
0.001   15   3    - done
0.001   15   4    - done
0.001   15   5    - done
0.001   15   6    - done
0.001   20   1    - done
0.001   20   2    - done
0.001   20   3    - done
0.001   20   4    - done
0.001   20   5    - done
0.001   20   6    - done




0.0001   1   1    - done




0.0001   1   2    - done
0.0001   1   3    - done
0.0001   1   4    - done
0.0001   1   5    - done




0.0001   1   6    - done
0.0001   5   1    - done




0.0001   5   2    - done




0.0001   5   3    - done




0.0001   5   4    - done




0.0001   5   5    - done




0.0001   5   6    - done




0.0001   10   1    - done




0.0001   10   2    - done




0.0001   10   3    - done




0.0001   10   4    - done




0.0001   10   5    - done




0.0001   10   6    - done




0.0001   15   1    - done




0.0001   15   2    - done




0.0001   15   3    - done




0.0001   15   4    - done




0.0001   15   5    - done




0.0001   15   6    - done




0.0001   20   1    - done




0.0001   20   2    - done




0.0001   20   3    - done




0.0001   20   4    - done




0.0001   20   5    - done




0.0001   20   6    - done

[0.26404827656252755, 20, 6, 0.001]
[45.78915480157528, 15, 4, 0.001]
[2.9858224688390417e-06, 1, 5, 0.001]
[35.03483792790064, 20, 5, 0.001]


In [None]:
best_0 = [100,0,0,0]
best_1 = [100,0,0,0]
best_2 = [100,0,0,0]
best_3 = [100,0,0,0]

model_regression_0 = MLPRegressor(hidden_layer_sizes=(20,6), learning_rate_init=0.001)
model_regression_0.fit(X_hot, y_train[:,0])
y_pred_0 = model_regression_0.predict(X_hot_test)

model_regression_1 = MLPClassifier(max_iter=400, hidden_layer_sizes=(15,4), learning_rate_init=0.001)
model_regression_1.fit(X_hot_integer, y_train[:,1])
y_pred_1 = model_regression_1.predict(X_hot_test_integer)

model_regression_2 = MLPRegressor(hidden_layer_sizes=(1,5), learning_rate_init=0.001)
model_regression_2.fit(X_hot, y_train[:,2])
y_pred_2 = model_regression_2.predict(X_hot_test)

model_regression_3 = MLPClassifier(max_iter=400, hidden_layer_sizes=(20,5), learning_rate_init=0.001)
model_regression_3.fit(X_hot_integer, y_train[:,3])
y_pred_3 = model_regression_3.predict(X_hot_test_integer)

# calcul res
sum_0 = 0
sum_1 = 0
sum_2 = 0
sum_3 = 0
for i in range(y_test_0.shape[0]):

    # 0
    sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))

    # 1
    if(y_pred_1[i] == y_test_1[i]):
        sum_1 = sum_1 + 1

    # 2
    sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))

    # 3
    if(y_pred_3[i] == y_test_3[i]):
        sum_3 = sum_3 + 1
        
if sum_0 < best_0[0]:
    best_0 = [sum_0, 20, 6, 0.001]

if sum_1_r < best_1[0]:
    best_1 = [sum_1_r, 15, 4, 0.001]

if sum_2 < best_2[0]:
    best_2 = [sum_2, 1, 5, 0.001]

if sum_3_r < best_3[0]:
    best_3 = [sum_3_r, 20, 5, 0.001]

In [17]:
model_regression_1 = MLPClassifier(max_iter=2000, hidden_layer_sizes=(200,15), learning_rate_init=0.001, alpha=1e-5)
model_regression_1.fit(X_hot_integer, y_train[:,1])
y_pred_1 = model_regression_1.predict(X_hot_test_integer)

model_regression_3 = MLPClassifier(max_iter=2000, hidden_layer_sizes=(200,15), learning_rate_init=0.001, alpha=1e-5)
model_regression_3.fit(X_hot, y_train[:,3])
y_pred_3 = model_regression_3.predict(X_hot_test)

# calcul res
sum_1 = 0
sum_3 = 0
for i in range(y_test_0.shape[0]):

    # 1
    if(y_pred_1[i] == y_test_1[i]):
        sum_1 = sum_1 + 1

    # 3
    if(y_pred_3[i] == y_test_3[i]):
        sum_3 = sum_3 + 1

sum_1_r = 100 - (sum_1 / y_test_1.shape[0] * 100)
sum_3_r = 100 - (sum_3 / y_test_3.shape[0] * 100)

if sum_1_r < best_1[0]:
    best_1 = [sum_1_r, 200, 15, 0.001]
if sum_3_r < best_3[0]:
    best_3 = [sum_3_r, 200, 15, 0.001]
print(sum_1_r)
print(sum_3_r)

38.637281631828735
5.462991012824389


In [18]:
model_regression_0 = MLPRegressor(max_iter=2000, hidden_layer_sizes=(200,20), learning_rate_init=0.001)
model_regression_0.fit(X_hot, y_train[:,0])
y_pred_0 = model_regression_0.predict(X_hot_test)

for i in range(y_test_0.shape[0]):
    sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))
sum_0 = sum_0 / y_test_0.shape[0]

if sum_0 < best_0[0]:
    best_0 = [sum_0, 200, 20, 0.001]
print(sum_0)

0.26348173629438465


In [19]:
import json #ne jamais faire ça avec les import 

def load_dict_errors(path="all_errors.json"):
    with open(path, "r") as f:
        dict_errors = json.load(f)
        
    return dict_errors

dict_errors_0_2 = load_dict_errors(path="all_errors_0_2.json")
dict_errors_1_3 = load_dict_errors(path="all_errors_1_3.json")
print("dict_errors_0_2", dict_errors_0_2)
print("dict_errors_1_3", dict_errors_1_3)


def save_dict_errors(dict_errors, path="all_errors.json"):
    with open(path, "w") as f:
        json.dump(dict_errors, f)

dict_errors_0_2['neural_network'] = [best_0[0], best_2[0]]
dict_errors_1_3['neural_network'] = [best_1[0], best_3[0]]

print("dict_errors, after changes : ",dict_errors_0_2)
print("dict_errors, after changes : ",dict_errors_1_3)

save_dict_errors(dict_errors_0_2, path="all_errors_0_2.json")
save_dict_errors(dict_errors_1_3, path="all_errors_1_3.json")

dict_errors_0_2 {'lienar_regression': [0.34613815200664955, 0.016720012631161744]}
dict_errors_1_3 {'GradBoostRegressor': [64.05887104917701, 6.879228516611121], 'SVR': [57.472483085933554, 7.290719983843275], 'lienar_regression': [56.68231848934666, 5.364535999192171]}
dict_errors, after changes :  {'lienar_regression': [0.34613815200664955, 0.016720012631161744], 'neural_network': [0.26404827656252755, 2.9858224688390417e-06]}
dict_errors, after changes :  {'GradBoostRegressor': [64.05887104917701, 6.879228516611121], 'SVR': [57.472483085933554, 7.290719983843275], 'lienar_regression': [56.68231848934666, 5.364535999192171], 'neural_network': [45.78915480157528, 35.03483792790064]}
