In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import csv
import sklearn.model_selection
import sklearn.ensemble
import rdkit.Chem.Descriptors
from rdkit import Chem

In [2]:
# Read the tested-molecules CSV file into a pandas dataframe.
# Later cells read the 'SMILES' and 'ALDH1_inhibition' columns from it.
DataFrame = pd.read_csv(filepath_or_buffer='tested_molecules-1.csv')

In [3]:
# Randomly select 20% of the data to be the test set. The other 80% will be the training set.
# Divide the training set into the molecules (X) and the labels (y).
#
# SPLIT_SEED controls the shuffling of the split:
#   - None (default) draws a fresh random split on every run, as before;
#   - an integer gives the same split on every run, which makes a result reproducible.
SPLIT_SEED = None
Train, Test = sklearn.model_selection.train_test_split(
    DataFrame,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
X = Train['SMILES']
y = Train['ALDH1_inhibition']
print(X)
print(y)

887       CCN(c1cccc(C)c1)S(=O)(=O)c1ccc2c(c1)CCN2C(C)=O
172      CC(C)c1ccc(/C=N/NC(=O)C(=O)NCCC[NH+]2CCOCC2)cc1
75     CC(C)OC(=O)C[C@]12CN3C[NH+](C1)C[C@@](CC(=O)OC...
647                   N#Cc1ccccc1Sc1ccccc1C(=O)Nc1ccccc1
493                       CCC(C)(C)NC(=O)Nc1cc(Cl)ccc1OC
                             ...                        
830                    O=C(Nc1ccccc1N1CCCCC1)c1cccc(F)c1
332                             Cc1nc(N)nc(COc2ccccc2)n1
410                          CC1(C)CN(c2cccc(C#N)c2)C1=O
0      COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...
223    Cn1c(=O)n(C)c2cc(NS(=O)(=O)c3ccc(C(C)(C)C)cc3)...
Name: SMILES, Length: 800, dtype: object
887    0
172    1
75     1
647    0
493    0
      ..
830    0
332    0
410    0
0      1
223    1
Name: ALDH1_inhibition, Length: 800, dtype: int64


In [4]:
# Create the feature matrix for the training set: one row per molecule in X and
# one column per RDKit partial-charge descriptor.
# Every SMILES string is parsed once and both descriptors are computed from that Mol.
train_descriptor_rows = []
for molecule in X:
    mol = Chem.MolFromSmiles(molecule)
    if mol is None:
        # MolFromSmiles returns None for a SMILES it cannot parse. Skipping the row
        # would misalign the features with the labels in y, so stop with a clear message.
        raise ValueError('Could not parse SMILES: {!r}'.format(molecule))
    train_descriptor_rows.append({
        'MaxAbsPartialCharge': rdkit.Chem.Descriptors.MaxAbsPartialCharge(mol),
        'MinAbsPartialCharge': rdkit.Chem.Descriptors.MinAbsPartialCharge(mol),
    })

# Rows are in the same order as X; the frame gets a fresh 0..n-1 index.
feature_matrix_train = pd.DataFrame(
    train_descriptor_rows,
    columns=['MaxAbsPartialCharge', 'MinAbsPartialCharge'],
)
print(feature_matrix_train)

     MaxAbsPartialCharge  MinAbsPartialCharge
0               0.312060             0.263818
1               0.369753             0.328837
2               0.462925             0.306844
3               0.321963             0.256310
4               0.494577             0.319256
..                   ...                  ...
795             0.369860             0.255323
796             0.485614             0.223136
797             0.310804             0.233977
798             0.496477             0.249868
799             0.328070             0.294998

[800 rows x 2 columns]


In [6]:
# Train a random forest using the feature matrix and the labels.
#
# FOREST_SEED controls the randomness inside the forest:
#   - None (default) trains a differently randomised forest on every run, as before;
#   - an integer gives the same forest for the same training data.
FOREST_SEED = None
forest = sklearn.ensemble.RandomForestClassifier(random_state=FOREST_SEED)
# fit() returns the fitted estimator, so `predictor` refers to the same object as `forest`.
predictor = forest.fit(feature_matrix_train, y)

In [7]:
# Store the correct labels for the test set in a plain Python list.
# tolist() is used instead of a `for y in ...` loop so that the training
# labels stored in `y` are not overwritten by a loop variable.
y_test = Test['ALDH1_inhibition']
test_correct_values = y_test.tolist()
print(test_correct_values)

[0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]


In [8]:
# Create the feature matrix for the test set: one row per molecule in X_test and
# one column per RDKit partial-charge descriptor.
# Every SMILES string is parsed once and both descriptors are computed from that Mol.
X_test = Test['SMILES']
test_descriptor_rows = []
for molecule in X_test:
    mol = Chem.MolFromSmiles(molecule)
    if mol is None:
        # MolFromSmiles returns None for a SMILES it cannot parse. Skipping the row
        # would misalign the features with the test labels, so stop with a clear message.
        raise ValueError('Could not parse SMILES: {!r}'.format(molecule))
    test_descriptor_rows.append({
        'MaxAbsPartialCharge': rdkit.Chem.Descriptors.MaxAbsPartialCharge(mol),
        'MinAbsPartialCharge': rdkit.Chem.Descriptors.MinAbsPartialCharge(mol),
    })

# Rows are in the same order as X_test; the frame gets a fresh 0..n-1 index.
feature_matrix_test = pd.DataFrame(
    test_descriptor_rows,
    columns=['MaxAbsPartialCharge', 'MinAbsPartialCharge'],
)
print(feature_matrix_test)

     MaxAbsPartialCharge  MinAbsPartialCharge
0               0.507967             0.130266
1               0.397811             0.323515
2               0.507966             0.281549
3               0.293174             0.265672
4               0.742162             0.415988
..                   ...                  ...
195             0.316213             0.144016
196             0.507243             0.454165
197             0.345008             0.321447
198             0.488917             0.162102
199             0.454828             0.289185

[200 rows x 2 columns]


In [11]:
# Apply the random forest to the test matrix and compare the predictions with the correct labels.
predictions = forest.predict(feature_matrix_test)

# Each prediction must have exactly one matching label, otherwise the counts are meaningless.
if len(predictions) != len(test_correct_values):
    raise ValueError(
        'Number of predictions ({}) does not match number of labels ({})'.format(
            len(predictions), len(test_correct_values)
        )
    )

TP = 0
TN = 0
FP = 0
FN = 0
# zip pairs every prediction with its own label, so a value other than 0/1 is
# simply not counted and can no longer shift the alignment of the pairs after it.
for prediction, correct_value in zip(predictions, test_correct_values):
    if prediction == 0 and correct_value == 0:
        TN += 1
    elif prediction == 0 and correct_value == 1:
        FN += 1
    elif prediction == 1 and correct_value == 0:
        FP += 1
    elif prediction == 1 and correct_value == 1:
        TP += 1
print('True positive:', TP)
print('True negative:', TN)
print('False positive:', FP)
print('False negative:', FN)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0]
[0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0

In [42]:
# Report the confusion-matrix counts of the first iteration.
# Prints whatever TP/TN/FP/FN currently hold, so run it right after the counting cell.
print('run 1:')
for caption, tally in (
    ('True positive:', TP),
    ('True negative:', TN),
    ('False positive:', FP),
    ('False negative:', FN),
):
    print(caption, tally)

run 1:
True positive: 13
True negative: 118
False positive: 23
False negative: 46


In [51]:
# Report the confusion-matrix counts of the second iteration.
# Prints whatever TP/TN/FP/FN currently hold, so run it right after the counting cell.
print('run 2:')
for caption, tally in (
    ('True positive:', TP),
    ('True negative:', TN),
    ('False positive:', FP),
    ('False negative:', FN),
):
    print(caption, tally)

run 2:
True positive: 17
True negative: 103
False positive: 39
False negative: 41


In [None]:
# Report the confusion-matrix counts of the third iteration.
# Prints whatever TP/TN/FP/FN currently hold, so run it right after the counting cell.
print('run 3:')
for caption, tally in (
    ('True positive:', TP),
    ('True negative:', TN),
    ('False positive:', FP),
    ('False negative:', FN),
):
    print(caption, tally)

In [None]:
# Report the confusion-matrix counts of the fourth iteration.
# Prints whatever TP/TN/FP/FN currently hold, so run it right after the counting cell.
print('run 4:')
for caption, tally in (
    ('True positive:', TP),
    ('True negative:', TN),
    ('False positive:', FP),
    ('False negative:', FN),
):
    print(caption, tally)

In [None]:
# Report the confusion-matrix counts of the fifth iteration.
# Prints whatever TP/TN/FP/FN currently hold, so run it right after the counting cell.
print('run 5:')
for caption, tally in (
    ('True positive:', TP),
    ('True negative:', TN),
    ('False positive:', FP),
    ('False negative:', FN),
):
    print(caption, tally)