In [1]:
from utils import get_smiles_encodings, load_data, smile_to_hot
from sklearn.neighbors import KNeighborsClassifier
from datetime import datetime
import numpy as np

In [2]:
# Load the QM9 SMILES strings and their pre-computed properties, split them
# into train/test sets, and build two feature matrices per split.
file_smiles = './dataset/QM9.txt'
file_properties = './dataset/properties_QM9.npz'

smiles, alphabet, largest_molecule_len = get_smiles_encodings(file_smiles)

# np.load on an .npz returns an archive object that keeps the file open;
# the context manager closes it as soon as the array has been materialised.
with np.load(file_properties) as properties_archive:
    properties = properties_archive['properties'].astype(np.float32)

# Descriptions of the property columns; the evaluation cells pair
# properties_txt[k] with column k of y_train / y_test.
properties_txt = ['logP: represents a measure of the tendency of a compound to move from the aqueous phase into lipids',
                  'Number of rotatable bonds (RBN): the number of bonds which allow free rotation around themselves',
                  'Molecular weight (MW): the weight of a molecule based on the atomic masses of all atoms in the molecule',
                  'Number of the rings (RN): the number of connected sets of atoms and bonds in which every atom and bond is a member of a cycle']

X_train, X_test, y_train, y_test = load_data(smiles, properties)
print("X - train : ", X_train.shape)
print("y - train : ", y_train.shape)

# Encode every molecule exactly once and derive both feature sets from that
# single result (previously each SMILES string was encoded twice).
# NOTE(review): assumes smile_to_hot is deterministic, so one call yields the
# same elements [0] and [1] that two separate calls would.
train_encodings = [smile_to_hot(x, largest_molecule_len, alphabet) for x in X_train]
test_encodings = [smile_to_hot(x, largest_molecule_len, alphabet) for x in X_test]

# Element [1] of each encoding, summed over axis 0 -> one vector per molecule.
X_hot = np.array([encoding[1].sum(axis=0) for encoding in train_encodings])
X_hot_test = np.array([encoding[1].sum(axis=0) for encoding in test_encodings])
print("X_hot : ", X_hot.shape)
print("X_hot_test : ", X_hot_test.shape)

# Element [0] of each encoding, used as-is (presumably the integer-encoded
# sequence -- confirm against utils.smile_to_hot).
X_hot_integer = np.array([encoding[0] for encoding in train_encodings])
X_hot_test_integer = np.array([encoding[0] for encoding in test_encodings])
print("X_hot_integer : ", X_hot_integer.shape)
print("X_hot_integer_test : ", X_hot_test_integer.shape)

X - train :  (92428,)
y - train :  (92428, 4)
X_hot :  (92428, 14)
X_hot_test :  (39612, 14)
X_hot_integer :  (92428, 22)
X_hot_integer_test :  (39612, 22)


In [3]:
# One target vector per property: column k of y_train / y_test becomes
# y_train_k / y_test_k.
y_train_0, y_train_1, y_train_2, y_train_3 = (y_train[:, col] for col in range(4))
y_test_0, y_test_1, y_test_2, y_test_3 = (y_test[:, col] for col in range(4))

# Number of neighbours used by every KNeighborsClassifier in this notebook.
nn = 5

In [4]:
# k-NN classifier for property column 1, trained on the X_hot features.
# fit() returns the fitted estimator, so construction and fitting are chained.
nearest_neighboors_1 = KNeighborsClassifier(n_neighbors=nn).fit(X_hot, y_train_1)
y_pred_1 = nearest_neighboors_1.predict(X_hot_test)

In [5]:
# k-NN classifier for property column 1, trained on the X_hot_integer features.
# The recorded run of this cell ended in a KeyboardInterrupt, so the neighbour
# search is spread over all CPU cores (n_jobs=-1) to make it finish sooner.
# Parallelism only affects run time, not the predictions.
nearest_neighboors_1_int = KNeighborsClassifier(n_neighbors=nn, n_jobs=-1)
nearest_neighboors_1_int.fit(X_hot_integer, y_train_1)
y_pred_1_int = nearest_neighboors_1_int.predict(X_hot_test_integer)

KeyboardInterrupt: 

In [None]:
# k-NN classifier for property column 3, trained on the X_hot features.
# fit() returns the fitted estimator, so construction and fitting are chained.
nearest_neighboors_3 = KNeighborsClassifier(n_neighbors=nn).fit(X_hot, y_train_3)
y_pred_3 = nearest_neighboors_3.predict(X_hot_test)

In [None]:
# k-NN classifier for property column 3, trained on the X_hot_integer features.
# These are the same feature matrices whose column-1 prediction run was
# interrupted in the recorded notebook, so the neighbour search is spread over
# all CPU cores (n_jobs=-1). Parallelism only affects run time, not the
# predictions.
nearest_neighboors_3_int = KNeighborsClassifier(n_neighbors=nn, n_jobs=-1)
nearest_neighboors_3_int.fit(X_hot_integer, y_train_3)
y_pred_3_int = nearest_neighboors_3_int.predict(X_hot_test_integer)

In [None]:
# Evaluate the X_hot-based predictions for property column 1.
#
# sum_1     : mean absolute error |prediction - truth| over the test set.
#             The difference is taken before the absolute value, so a sign
#             mismatch between prediction and truth is counted in full.
#             Accumulated in float64 to avoid float32 round-off.
# sum_1_5   : number of predictions exactly equal to the true value.
# sum_1_5_r : share of exact matches, in percent.
sum_1 = np.mean(np.abs(y_pred_1 - y_test_1), dtype=np.float64)
sum_1_5 = int(np.count_nonzero(y_pred_1 == y_test_1))
sum_1_5_r = sum_1_5 / y_test_1.shape[0] * 100

print(properties_txt[1])
print()
print("DATATYPE : X - hot")
print()
print("Number of unique values in y_test[:,1] : ", np.unique(y_test_1).shape[0])
print("Min of y_test[:,1] : ", np.min(y_test_1))
print("Max of y_test[:,1] : ", np.max(y_test_1))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_1).shape[0])
print("Min of y_pred : ", np.min(y_pred_1))
print("Max of y_pred : ", np.max(y_pred_1))
print()
print("Error is on avearage : ", sum_1)
print("There were ", sum_1_5, " correct predictions, out of ", y_test_1.shape[0], ", which is ", 100 - sum_1_5_r,"% error rate")

In [None]:
# Evaluate the X_hot_integer-based predictions for property column 1.
#
# sum_1     : mean absolute error |prediction - truth| over the test set,
#             accumulated in float64 to avoid float32 round-off.
# sum_1_5   : number of predictions exactly equal to the true value.
# sum_1_5_r : share of exact matches, in percent.
sum_1 = np.mean(np.abs(y_pred_1_int - y_test_1), dtype=np.float64)
sum_1_5 = int(np.count_nonzero(y_pred_1_int == y_test_1))
sum_1_5_r = sum_1_5 / y_test_1.shape[0] * 100

print(properties_txt[1])
print()
print("DATATYPE : X - hot - integer")
print()
print("Number of unique values in y_test[:,1] : ", np.unique(y_test_1).shape[0])
print("Min of y_test[:,1] : ", np.min(y_test_1))
print("Max of y_test[:,1] : ", np.max(y_test_1))
print()
# Report the statistics of the predictions evaluated in this cell
# (y_pred_1_int), not those of the X_hot-based model.
print("Number of unique values in y_pred : ", np.unique(y_pred_1_int).shape[0])
print("Min of y_pred : ", np.min(y_pred_1_int))
print("Max of y_pred : ", np.max(y_pred_1_int))
print()
print("Error is on avearage : ", sum_1)
print("There were ", sum_1_5, " correct predictions, out of ", y_test_1.shape[0], ", which is ", 100 - sum_1_5_r,"% error rate")

In [None]:
# Evaluate the X_hot-based predictions for property column 3.
#
# sum_3     : mean absolute error |prediction - truth| over the test set,
#             accumulated in float64 to avoid float32 round-off.
# sum_3_5   : number of predictions exactly equal to the true value.
# sum_3_5_r : share of exact matches, in percent (normalised by the size of
#             y_test_3, the vector actually being evaluated).
sum_3 = np.mean(np.abs(y_pred_3 - y_test_3), dtype=np.float64)
sum_3_5 = int(np.count_nonzero(y_pred_3 == y_test_3))
sum_3_5_r = sum_3_5 / y_test_3.shape[0] * 100

print(properties_txt[3])
print()
print("DATATYPE : X - hot")
print()
print("Number of unique values in y_test[:,3] : ", np.unique(y_test_3).shape[0])
print("Min of y_test[:,3] : ", np.min(y_test_3))
print("Max of y_test[:,3] : ", np.max(y_test_3))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_3).shape[0])
print("Min of y_pred : ", np.min(y_pred_3))
print("Max of y_pred : ", np.max(y_pred_3))
print()
print("Error is on avearage : ", sum_3)
print("There were ", sum_3_5, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_3_5_r,"% error rate")

In [None]:
# Evaluate the X_hot_integer-based predictions for property column 3.
#
# NOTE: the result names sum_1 / sum_1_5 / sum_1_5_r are kept unchanged so
# that anything reading them after this cell keeps working, but here they
# hold the metrics for property column 3.
#
# sum_1     : mean absolute error |prediction - truth| over the test set,
#             accumulated in float64 to avoid float32 round-off.
# sum_1_5   : number of predictions exactly equal to the true value.
# sum_1_5_r : share of exact matches, in percent.
sum_1 = np.mean(np.abs(y_pred_3_int - y_test_3), dtype=np.float64)
sum_1_5 = int(np.count_nonzero(y_pred_3_int == y_test_3))
sum_1_5_r = sum_1_5 / y_test_3.shape[0] * 100

print(properties_txt[3])
print()
print("DATATYPE : X - hot - integer")
print()
# Describe the ground truth and predictions evaluated in this cell
# (column 3 / y_pred_3_int), not the column-1 data.
print("Number of unique values in y_test[:,3] : ", np.unique(y_test_3).shape[0])
print("Min of y_test[:,3] : ", np.min(y_test_3))
print("Max of y_test[:,3] : ", np.max(y_test_3))
print()
print("Number of unique values in y_pred : ", np.unique(y_pred_3_int).shape[0])
print("Min of y_pred : ", np.min(y_pred_3_int))
print("Max of y_pred : ", np.max(y_pred_3_int))
print()
print("Error is on avearage : ", sum_1)
print("There were ", sum_1_5, " correct predictions, out of ", y_test_3.shape[0], ", which is ", 100 - sum_1_5_r,"% error rate")