In [1]:
# Import required libraries for data manipulation and analysis
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

In [2]:
#Import required sklearn functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance
from collections import defaultdict

In [3]:
#Import sklearn classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
#Import library to oversample 
from imblearn.over_sampling import RandomOverSampler

In [5]:
#Import RDKit and Mordred libraries
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

In [6]:
#Allows figures to be visualized in jupyter notebook
%matplotlib inline

In [7]:
# Raise pandas' display limits so wide descriptor tables can be inspected while monitoring the code
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200

In [8]:
# Create the Mordred descriptor calculator shared by every featurisation cell below.
# It is built from the `descriptors` module imported above; ignore_3D=True asks
# Mordred to skip its 3D descriptors.
calc = Calculator(descriptors, ignore_3D=True)

In [9]:
# Read the training/test input file (path is relative to the notebook's working directory).
# Later cells read its 'Substrate' column (InChI strings parsed with RDKit) and its
# 'Product_Ratio' column (the response variable).
data = pd.read_csv('BorylationTrainingTest.csv')

In [10]:
# Parse every substrate InChI into an RDKit molecule so Mordred can describe it
reactants = data['Substrate']
reactants_mol_list = [Chem.MolFromInchi(inchi) for inchi in reactants]

# Grid image of the parsed reactants, four molecules per row
reactants_img = Draw.MolsToGridImage(reactants_mol_list, molsPerRow=4)

  % (maxMols))


In [11]:
# Compute the Mordred descriptors for all reactants as a pandas table and summarise it
reactant_data = calc.pandas(reactants_mol_list)
reactant_data.info()

100%|██████████| 642/642 [00:58<00:00, 11.00it/s]


<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 642 entries, 0 to 641
Columns: 1613 entries, ABC to mZagreb2
dtypes: bool(2), float64(900), int64(324), object(387)
memory usage: 7.9+ MB


In [12]:
# Place the Mordred descriptors side by side with the experimental, atomic-charge
# and JChem for Excel parameters already present in the input file
add_reactants = pd.concat([data, reactant_data], axis="columns")

In [13]:
# Coerce every column to numeric (non-numeric entries become NaN), then replace the NaNs with 0
final_data = add_reactants.apply(pd.to_numeric, errors='coerce').fillna(0)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 642 entries, 0 to 641
Columns: 1682 entries, Substrate to mZagreb2
dtypes: bool(2), float64(1329), int64(351)
memory usage: 8.2 MB


In [14]:
# Separate the dataset into the response variable (Product Ratio) and the feature variables
# Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
y = final_data['Product_Ratio']
X = final_data.drop(columns='Product_Ratio')

In [15]:
# Final reduced feature set; the training and validation cells select these columns by name.
final_selected_features = [
    # columns named after atomic charges / site parameters
    'Hirshfeld Heavy Atom Charge',
    'Hirshfeld Carbon Charge',
    'Hirshfeld Hydrogen Charge',
    'ESP Heavy Atom Charge',
    'NPA Hydrogen Charge',
    'Mulliken Heavy Charge',
    'Mulliken Hydrogen Charge',
    'Steric Effect Index',
    'Atomic_Polarizability',
    'Distance Degree',
    'Dreiding Energy',
    # remaining descriptor columns (presumably Mordred descriptor names - TODO confirm)
    'AATS0d',
    'AATS0i',
    'ATSC1d',
    'AATSC2dv',
    'AATSC1pe',
    'GATS2pe',
    'BCUTZ-1l',
    'BCUTse-1l',
]

In [16]:
# Restrict the feature matrix to the final reduced feature set.
# The shared `final_selected_features` list is reused rather than repeating the
# column names, so the training matrix and the validation matrices are always
# selected with exactly the same columns in the same order.
final = X[final_selected_features]

In [17]:
# Load the borylation validation dataset and build its feature matrix from the final reduced features
unknownSubstrates = pd.read_csv('BorylationValidation.csv')

# Parse the validation substrates' InChIs into RDKit molecules
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inchi) for inchi in New_Substrate]

# Mordred descriptors, coerced to numeric with non-numeric entries replaced by 0
New_Substrate_data = (
    calc.pandas(New_Substrate_mol_list)
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Combine with the atomic charges and JChem parameters from the file, then keep the model's columns
XnewSec = pd.concat([unknownSubstrates, New_Substrate_data], axis=1)
Xnew = XnewSec[final_selected_features]


100%|██████████| 59/59 [00:09<00:00,  6.15it/s]


In [18]:
#This cell details the machine learning algorithms tested during initial model testing

def _fit_and_report(title, model):
    """Fit `model` on the oversampled training data and report on the test set.

    Prints `title` followed by sklearn's classification report for the
    predictions on X_test, and returns those predictions. Reads X_train,
    y_resampled, X_test and y_test from the notebook namespace at call time.
    """
    model.fit(X_train, y_resampled)
    predictions = model.predict(X_test)
    print(title)
    print(classification_report(y_test, predictions))
    return predictions

# 80/20 training/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(final, y, test_size=0.20, random_state=37)

# Random over-sampling is applied to the training portion only
ros = RandomOverSampler(random_state=10)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

X_train = X_resampled[final_selected_features]
X_test = X_test[final_selected_features]

# Multi-layer perceptron
MLP = MLPClassifier(alpha=1, max_iter=1000)
pred_MLP = _fit_and_report("MLP Classifier Results", MLP)
print("  ")

# Support vector machine, linear kernel
SVM1 = SVC(kernel="linear", C=0.025)
pred_SVM1 = _fit_and_report("SVM1 Classifier Results", SVM1)
print("  ")

# Support vector machine, default kernel with gamma=2
SVM2 = SVC(gamma=2, C=1)
pred_SVM2 = _fit_and_report("SVM2 Classifier Results", SVM2)
print("  ")

# Gaussian process with a scaled RBF kernel
Gauss = GaussianProcessClassifier(1.0 * RBF(1.0))
pred_Gauss = _fit_and_report("Gaussian Process Classifier Results", Gauss)
print("  ")

# Decision tree
DT = DecisionTreeClassifier(max_depth=9)
pred_DT = _fit_and_report("Decision Tree Classifier Results", DT)
print("  ")

# AdaBoost
ada = AdaBoostClassifier()
pred_ada = _fit_and_report("Ada Boost Classifier Results", ada)
print("  ")

# Gaussian naive Bayes
NB = GaussianNB()
pred_NB = _fit_and_report("Naive Bayes Classifier Results", NB)
print("  ")

# Random forest (no trailing spacer line after the last report)
rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
pred_rfc = _fit_and_report("Random Forest Classifier Results", rfc)

MLP Classifier Results
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       109
           1       0.06      0.05      0.05        20

    accuracy                           0.72       129
   macro avg       0.44      0.45      0.44       129
weighted avg       0.71      0.72      0.71       129

  
SVM1 Classifier Results
              precision    recall  f1-score   support

           0       0.92      0.43      0.59       109
           1       0.21      0.80      0.33        20

    accuracy                           0.49       129
   macro avg       0.56      0.62      0.46       129
weighted avg       0.81      0.49      0.55       129

  
SVM2 Classifier Results
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       109
           1       0.00      0.00      0.00        20

    accuracy                           0.80       129
   macro avg       0.42      0.47      0.44       1

In [19]:
#This cell runs the models ten times, evaluates classification via sklearn metrics and writes the results to a csv file
# NOTE: it reuses X_train, y_resampled, X_test and y_test exactly as left by the
# initial model-testing cell, so that cell must have been run first.

N_RUNS = 10
RESULTS_CSV = "10Runs_ClassificationScreening.csv"
RESULT_COLUMNS = ["True Neg", "False Pos", "False Neg", "True Pos", 'acc', 'precision 0',
                  'precision 1', 'recall 0', 'recall 1', 'F1 0', 'F1 1']


def _make_classifiers():
    """Return fresh, unfitted instances of the screened classifiers.

    The dict is keyed by a short label and ordered in the order the models are
    evaluated within each run.
    """
    return {
        "MLP": MLPClassifier(alpha=1, max_iter=1000),
        "SVM1": SVC(kernel="linear", C=0.025),
        "SVM2": SVC(gamma=2, C=1),
        "Gauss": GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DT": DecisionTreeClassifier(max_depth=9),
        "ada": AdaBoostClassifier(),
        "NB": GaussianNB(),
        "rfc": RandomForestClassifier(n_estimators=800, max_depth=9),
    }


def _score_row(y_true, y_pred):
    """Return one flat metrics row laid out like RESULT_COLUMNS.

    The row holds the flattened confusion matrix, the accuracy, and the
    per-class precision, recall and F1. Every metric is computed from the same
    `y_pred`, so one model's score can never be mixed with another model's
    predictions.
    """
    cf_matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    F1 = f1_score(y_true, y_pred, average=None)
    # axis=None flattens every piece (including the scalar accuracy) before joining
    return np.concatenate((cf_matrix, acc, prec, recall, F1), axis=None)


# Collect plain rows per model and build each DataFrame once at the end;
# DataFrame.append no longer exists in pandas >= 2.0.
run_rows = {label: [] for label in _make_classifiers()}
for run in range(N_RUNS):
    for label, model in _make_classifiers().items():
        model.fit(X_train, y_resampled)
        run_rows[label].append(_score_row(y_test, model.predict(X_test)))

screening_results = {
    label: pd.DataFrame(rows, columns=RESULT_COLUMNS)
    for label, rows in run_rows.items()
}

# Keep the per-model result names used elsewhere in the notebook
MLPResults_df = screening_results["MLP"]
SVM1Results_df = screening_results["SVM1"]
SVM2Results_df = screening_results["SVM2"]
GaussResults_df = screening_results["Gauss"]
DTResults_df = screening_results["DT"]
adaResults_df = screening_results["ada"]
NBResults_df = screening_results["NB"]
rfcResults_df = screening_results["rfc"]

#Print the classification evaluations for each model
for results_df in screening_results.values():
    print(results_df)

#Write the results onto a CSV file: the first table creates/overwrites the file,
#the remaining tables are appended below it (each with its own header row)
for position, results_df in enumerate(screening_results.values()):
    results_df.to_csv(RESULTS_CSV, index=False, mode="w" if position == 0 else "a")

   True Neg  False Pos  False Neg  True Pos       acc  precision 0  \
0      63.0       46.0       11.0       9.0  0.558140     0.851351   
1      77.0       32.0       11.0       9.0  0.666667     0.875000   
2      42.0       67.0        7.0      13.0  0.426357     0.857143   
3      70.0       39.0       12.0       8.0  0.604651     0.853659   
4      52.0       57.0        8.0      12.0  0.496124     0.866667   
5      12.0       97.0        2.0      18.0  0.232558     0.857143   
6      70.0       39.0       10.0      10.0  0.620155     0.875000   
7      98.0       11.0       18.0       2.0  0.775194     0.844828   
8      94.0       15.0       17.0       3.0  0.751938     0.846847   
9      61.0       48.0       10.0      10.0  0.550388     0.859155   

   precision 1  recall 0  recall 1      F1 0      F1 1  
0     0.163636  0.577982      0.45  0.688525  0.240000  
1     0.219512  0.706422      0.45  0.781726  0.295082  
2     0.162500  0.385321      0.65  0.531646  0.260000  
3

In [20]:
#This cell trains the random forest ten times, evaluates classification via sklearn metrics,
#predicts the validation set with each fitted model and writes the results to a csv file
# NOTE: Xnew and unknownSubstrates are whatever the most recently run validation-loading cell produced.
result_columns = ["True Neg", "False Pos", "False Neg", "True Pos", 'acc', 'precision 0',
                  'precision 1', 'recall 0', 'recall 1', 'F1 0', 'F1 1']

# The split and the over-sampling are both seeded, so they give the same data on
# every run; they are computed once here instead of inside the loop.
X_train, X_test, y_train, y_test = train_test_split(final, y, test_size=0.20, random_state=37)

ros = RandomOverSampler(random_state=10)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

X_train = X_resampled[final_selected_features]
X_test = X_test[final_selected_features]

metric_rows = []              # one flat metrics row per run
validation_predictions = {}   # run number -> predictions for Xnew

for x in range(1, 11):
    # Only the (unseeded) random forest differs between runs
    rfc = RandomForestClassifier(n_estimators=800, max_depth=9)
    rfc.fit(X_train, y_resampled)

    #Make model predictions on the test set
    pred_rfc = rfc.predict(X_test)

    #Confusion matrix and sklearn classification metrics, flattened into one row
    cf_matrix = confusion_matrix(y_test, pred_rfc)
    acc = accuracy_score(y_test, pred_rfc)
    prec = precision_score(y_test, pred_rfc, average=None)
    recall = recall_score(y_test, pred_rfc, average=None)
    F1 = f1_score(y_test, pred_rfc, average=None)
    metric_rows.append(np.concatenate((cf_matrix, acc, prec, recall, F1), axis=None))

    #Evaluate the model on the validation set
    ynew = rfc.predict(Xnew)
    validation_predictions[x] = ynew

# Build the result tables once; DataFrame.append no longer exists in pandas >= 2.0
totalResults_df = pd.DataFrame(metric_rows, columns=result_columns)

# One column per run (labelled 1..10), one row per validation substrate
total_val_results_transposed = pd.DataFrame(validation_predictions)
val_tot = total_val_results_transposed.T

#Validation labels next to the ten sets of predictions
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

#Write the results onto a CSV file: metrics first, validation predictions appended below
totalResults_df.to_csv("10Runs_FullResults.csv", index=False)
Val_results.to_csv("10Runs_FullResults.csv", index=False, mode="a")

Val_results

Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,1,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# Check the non-selective validation compounds for borylation using the reduced feature matrix
unknownSubstrates = pd.read_csv('NonselectiveValidation.csv')

# Parse the validation substrates' InChIs into RDKit molecules
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inchi) for inchi in New_Substrate]
New_Substrate_img = Draw.MolsToGridImage(New_Substrate_mol_list, molsPerRow=4)

# Mordred descriptors as a pandas table, coerced to numeric with non-numeric entries replaced by 0
New_Substrate_data = (
    calc.pandas(New_Substrate_mol_list)
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Combine with the atomic charges and JChem parameters from the file, then keep the model's columns
XnewSec = pd.concat([unknownSubstrates, New_Substrate_data], axis=1)
Xnew = XnewSec[final_selected_features]

# Predict with `rfc` as it currently exists in the notebook namespace
# (i.e. the random forest fitted by whichever earlier cell ran last)
ynew = rfc.predict(Xnew)
prediction_df = pd.DataFrame(ynew, columns=['Prediction'])
unknownSubstrates_prod = unknownSubstrates['selectivity']

# Show the 'selectivity' column from the file beside the predicted class
results = pd.concat([unknownSubstrates_prod, prediction_df], axis=1)
results

100%|██████████| 16/16 [00:02<00:00,  5.68it/s]


Unnamed: 0,selectivity,Prediction
0,83,1
1,0,0
2,0,0
3,17,0
4,80,1
5,20,1
6,0,0
7,0,0
8,0,0
9,0,0


In [22]:
# Load the staurosporine validation dataset and build its feature matrix from the final reduced features
unknownSubstrates = pd.read_csv('Staurosporine3-29.csv')

# Parse the validation substrates' InChIs into RDKit molecules
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inchi) for inchi in New_Substrate]

# Mordred descriptors, coerced to numeric with non-numeric entries replaced by 0
New_Substrate_data = (
    calc.pandas(New_Substrate_mol_list)
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Combine with the atomic charges and JChem parameters from the file.
# The columns are selected with the shared `final_selected_features` list (as the
# other validation cells do) instead of a hand-copied list, so this matrix always
# matches the columns and column order the model was trained on.
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[final_selected_features]


100%|██████████| 15/15 [00:23<00:00,  1.59s/it]


In [23]:
# Train the random forest ten times and predict the currently loaded validation
# set (Xnew / unknownSubstrates from the preceding loading cell) with each model.
matrix_list = []  # never filled in the visible cells; kept so the name stays defined

# The split and the over-sampling are both seeded, so they give the same data on
# every run; they are computed once here instead of inside the loop.
X_train, X_test, y_train, y_test = train_test_split(final, y, test_size=0.20, random_state=37)

ros = RandomOverSampler(random_state=10)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

validation_predictions = {}   # run number -> predictions for Xnew

for x in range(1, 11):
    # Only the (unseeded) random forest differs between runs
    rfc = RandomForestClassifier(n_estimators=800, max_depth=9)
    rfc.fit(X_resampled, y_resampled)

    #Evaluate the model on the validation set
    ynew = rfc.predict(Xnew)
    validation_predictions[x] = ynew

# One column per run (labelled 1..10), one row per validation substrate.
# Built in one step; DataFrame.append no longer exists in pandas >= 2.0.
total_val_results_transposed = pd.DataFrame(validation_predictions)
val_tot = total_val_results_transposed.T

#Validation labels next to the ten sets of predictions
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

Val_results

Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0
