In [1]:
from utils import get_smiles_encodings, load_data, smile_to_hot
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import RidgeClassifier
from datetime import datetime
import numpy as np

In [2]:
file_smiles = './dataset/QM9.txt'
file_properties = './dataset/properties_QM9.npz'
smiles, alphabet, largest_molecule_len = get_smiles_encodings(file_smiles)
properties = np.load(file_properties)['properties'].astype(np.float32)
properties_txt = ['logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids',
                  'Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves',
                  'Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule',
                  'Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle']

X_train, X_test, y_train, y_test = load_data(smiles, properties)
print("X - train : ", X_train.shape)
print("y - train : ", y_train.shape)
X_hot = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_train])
X_hot_test = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[1].sum(axis=0) for x in X_test])
print("X_hot : ", X_hot.shape)
print("X_hot_test : ", X_hot_test.shape)
X_hot_integer = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[0] for x in X_train])
X_hot_test_integer = np.array([smile_to_hot(x, largest_molecule_len, alphabet)[0] for x in X_test])
print("X_hot_integer : ", X_hot_integer.shape)
print("X_hot_integer_test : ", X_hot_test_integer.shape)

X - train :  (92428,)
y - train :  (92428, 4)
X_hot :  (92428, 14)
X_hot_test :  (39612, 14)
X_hot_integer :  (92428, 22)
X_hot_integer_test :  (39612, 22)


In [3]:
y_test_0 = y_test[:,0]
y_test_1 = y_test[:,1]
y_test_2 = y_test[:,2]
y_test_3 = y_test[:,3]    

In [17]:
max_iter_ = [100, 500, 1000, 2000, 5000]
learning_rate_ = ['constant', 'optimal', 'invscaling', 'adaptive']
best_0 = [100,0,0]
best_1 = [100,0,0]
best_2 = [100,0,0]
best_3 = [100,0,0]

for max_iterR in max_iter_:
    for lr in learning_rate_:
        # 0 
        model_regression_0 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_0.fit(X_hot, y_train[:,0])
        y_pred_0 = model_regression_0.predict(X_hot_test)
        
        # 1
        model_regression_1 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_1.fit(X_hot, y_train[:,1])
        y_pred_1 = model_regression_1.predict(X_hot_test)
        y_pred_1 = np.round(y_pred_1)

        # 2 
        model_regression_2 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_2.fit(X_hot, y_train[:,2])
        y_pred_2 = model_regression_2.predict(X_hot_test)
        
        # 3 
        model_regression_3 = SGDRegressor(max_iter = max_iterR, learning_rate = lr)
        model_regression_3.fit(X_hot, y_train[:,3])
        y_pred_3 = model_regression_3.predict(X_hot_test)
        y_pred_3 = np.round(y_pred_3)
        
        # calcul res
        sum_0 = 0
        sum_1 = 0
        sum_2 = 0
        sum_3 = 0
        for i in range(y_test_0.shape[0]):

            # 0
            sum_0 = sum_0 + np.abs(np.abs(y_pred_0[i]) - np.abs(y_test_0[i]))

            # 1
            if(y_pred_1[i] == y_test_1[i]):
                sum_1 = sum_1 + 1

            # 2
            sum_2 = sum_2 + np.abs(np.abs(y_pred_2[i]) - np.abs(y_test_2[i]))

            # 3
            if(y_pred_3[i] == y_test_3[i]):
                sum_3 = sum_3 + 1

        sum_0 = sum_0 / y_test_0.shape[0]
        sum_1_r = 100 - (sum_1 / y_test_1.shape[0] * 100)
        sum_2 = sum_2 / y_test_2.shape[0]
        sum_3_r = 100 - (sum_3 / y_test_3.shape[0] * 100)
        
        
        # change res if needed
        if sum_0 < best_0[0]:
            best_0 = [sum_0, max_iterR, lr]

        if sum_1_r < best_1[0]:
            best_1 = [sum_1_r, max_iterR, lr]
            
        if sum_2 < best_2[0]:
            best_2 = [sum_2, max_iterR, lr]

        if sum_3_r < best_3[0]:
            best_3 = [sum_3_r, max_iterR, lr]

        print(max_iterR, " ", lr, "   - done")
        
print()                
print(best_0)
print(best_1)
print(best_2)
print(best_3)
        

100   constant    - done
100   optimal    - done
100   invscaling    - done
100   adaptive    - done
500   constant    - done
500   optimal    - done
500   invscaling    - done
500   adaptive    - done
1000   constant    - done
1000   optimal    - done
1000   invscaling    - done
1000   adaptive    - done
2000   constant    - done
2000   optimal    - done
2000   invscaling    - done
2000   adaptive    - done
5000   constant    - done
5000   optimal    - done
5000   invscaling    - done
5000   adaptive    - done

[0.3430473929364795, 500, 'invscaling']
[56.67474502675957, 500, 'invscaling']
[0.013206376984375394, 2000, 'constant']
[5.3948298495405425, 100, 'invscaling']


In [5]:
print(properties_txt[0])
print()
print("model -> max_iter : ", best_0[1], " - learning rate : ", best_0[2])
print()
print("Error is on avearage : ", best_0[0])
print("Let's say the target value is : 1.0, then the model would guess a value in the following interval :")
print(" -> [", 1.0 - best_0[0],",",1.0 + best_0[0],"]")

logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids

model -> max_iter :  2000  - learning rate :  invscaling

Error is on avearage :  0.34321702049967723
Let's say the target value is : 1.0, then the model would guess a value in the following interval :
 -> [ 0.6567829795003228 , 1.3432170204996772 ]


In [6]:
print(properties_txt[1])
print()
print("model -> max_iter : ", best_1[1], " - learning rate : ", best_1[2])
print()
print("Error rate is : ", best_1[0], "%")

Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves

model -> max_iter :  2000  - learning rate :  invscaling

Error rate is :  56.280924972230636 %


In [18]:
print(properties_txt[2])
print()
print("model -> max_iter : ", best_2[1], " - learning rate : ", best_2[2])
print()
print("Error is on avearage : ", best_2[0])
print("Let's say the target value is : 150, then the model would guess a value in the following interval :")
print(" -> [", 150.0 - best_2[0],",",150.0 + best_2[0],"]")

Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule

model -> max_iter :  2000  - learning rate :  constant

Error is on avearage :  0.013206376984375394
Let's say the target value is : 150, then the model would guess a value in the following interval :
 -> [ 149.98679362301561 , 150.01320637698439 ]


In [8]:
print(properties_txt[3])
print()
print("model -> max_iter : ", best_3[1], " - learning rate : ", best_3[2])
print()
print("Error rate is : ", best_3[0], "%")

Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle

model -> max_iter :  500  - learning rate :  invscaling

Error rate is :  5.710390790669493 %


In [9]:
import json #ne jamais faire ça avec les import 

def load_dict_errors(path="all_errors.json"):
    with open(path, "r") as f:
        dict_errors = json.load(f)
        
    return dict_errors

dict_errors_0_2 = load_dict_errors(path="all_errors_0_2.json")
dict_errors_1_3 = load_dict_errors(path="all_errors_1_3.json")
print("dict_errors_0_2", dict_errors_0_2)
print("dict_errors_1_3", dict_errors_1_3)


def save_dict_errors(dict_errors, path="all_errors.json"):
    with open(path, "w") as f:
        json.dump(dict_errors, f)

dict_errors_0_2['lienar_regression'] = [best_0[0], best_2[0]]
dict_errors_1_3['lienar_regression'] = [best_1[0], best_3[0]]

print("dict_errors, after changes : ",dict_errors_0_2)
print("dict_errors, after changes : ",dict_errors_1_3)

save_dict_errors(dict_errors_0_2, path="all_errors_0_2.json")
save_dict_errors(dict_errors_1_3, path="all_errors_1_3.json")


dict_errors_0_2 {'lienar_regression': [0.34613815200664955, 0.016720012631161744], 'neural_network': [0.26404827656252755, 2.9858224688390417e-06]}
dict_errors_1_3 {'GradBoostRegressor': [64.05887104917701, 6.879228516611121], 'SVR': [57.472483085933554, 7.290719983843275], 'lienar_regression': [56.68231848934666, 5.364535999192171], 'neural_network': [45.78915480157528, 35.03483792790064], 'neer_neighboor': [54.907603756437446, 7.081187518933646]}
dict_errors, after changes :  {'lienar_regression': [0.34321702049967723, 0.017421293770233204], 'neural_network': [0.26404827656252755, 2.9858224688390417e-06]}
dict_errors, after changes :  {'GradBoostRegressor': [64.05887104917701, 6.879228516611121], 'SVR': [57.472483085933554, 7.290719983843275], 'lienar_regression': [56.280924972230636, 5.710390790669493], 'neural_network': [45.78915480157528, 35.03483792790064], 'neer_neighboor': [54.907603756437446, 7.081187518933646]}
