In [1]:
# Import required libraries for data manipulation and analysis
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import shap
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

In [2]:
#Import required sklearn functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance
from collections import defaultdict

In [3]:
#Import sklearn classifiers
from sklearn.ensemble import RandomForestClassifier

In [4]:
#Import library to oversample 
from imblearn.over_sampling import RandomOverSampler

In [5]:
#Import RDKit and Mordred libraries
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

In [6]:
#Allows figures to be visualized in jupyter notebook
%matplotlib inline

In [7]:
#Functions used in the study

#Remove those numbers from analysis data
def filter_rows_by_values1(df, col, values):
    return df[~df[col].isin(values)]

#Remove those numbers from analysis data
def filter_rows_by_values2(df, col, values):
    return df[df[col].isin(values)]

def classificationMetrics(results, y_test, pred):
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, average=None, zero_division=0)
    recall = recall_score(y_test, pred, average=None)
    F1 = f1_score(y_test, pred, average=None)           
    #Calculate confusion matrix
    cf_matrix = confusion_matrix(y_test, pred)
    cf_matrix = np.reshape(cf_matrix,(1,4))
    comb = np.concatenate((x, cf_matrix, acc, prec, recall, F1), axis=None)
    comb = [comb]
    results = results.append(pd.DataFrame(comb, columns=results.columns), ignore_index=True)
    return results


In [8]:
# Sets Pandas Display to Monitor Code
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [9]:
# Create Mordred Calculator
calc = Calculator(descriptors, ignore_3D=True)

In [10]:
# Read Training/Test data input File
data = pd.read_csv('BorylationTrainingTest 1-10-25.csv')

#group the compounds by numbers
data['grouped'] = data.groupby('Substrate', sort=False).ngroup()

data = data[['Product_Ratio','grouped','Hirshfeld Heavy Atom Charge','CM5 Charge','Hirshfeld Carbon Charge','Hirshfeld Hydrogen Charge',
       'ESP Heavy Atom Charge','ESP Carbon Charge','ESP Hydrogen Charge','NPA Carbon Charge','NPA Hydrogen Charge','MBS Heavy Atom Charge',
       'MBS Carbon Charge','MBS Hydrogen Charge','Mulliken Heavy Charge','Mulliken Carbon Charge','Mulliken Hydrogen Charge']]

data.head()

Unnamed: 0,Product_Ratio,grouped,Hirshfeld Heavy Atom Charge,CM5 Charge,Hirshfeld Carbon Charge,...,MBS Carbon Charge,MBS Hydrogen Charge,Mulliken Heavy Charge,Mulliken Carbon Charge,Mulliken Hydrogen Charge
0,1,0,-0.003389,-0.000578,-0.101759,...,-0.25419,0.080765,-0.22032,-0.621212,0.132089
1,0,0,0.003094,0.000249,-0.057714,...,-0.138428,0.073874,0.044556,-0.224805,0.13468
2,0,0,-5.2e-05,-3.2e-05,-0.059257,...,-0.141245,0.070512,0.012779,-0.261495,0.137137
3,0,0,0.000299,0.000312,-0.058954,...,-0.141973,0.070891,0.162986,-0.115385,0.139186
4,1,1,0.002028,0.013777,-0.102309,...,-0.255789,0.08557,-0.086611,-0.497717,0.137539


In [11]:
#Loads validation dataset for borlation using the final reduced features 
unknownSubstrates=pd.read_csv('validation1-9-25.csv')

# Convert validation substrates Inchi's to Mordred and combine into Dataframe with atomic charges and JChem paramters
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = []
for inChi_New_Substrate in New_Substrate:
  New_Substrate_mol = Chem.MolFromInchi(inChi_New_Substrate)
  New_Substrate_mol_list.append(New_Substrate_mol)

New_Substrate_data = []
New_Substrate_data = calc.pandas(New_Substrate_mol_list)
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce')
New_Substrate_data.fillna(0, inplace=True)                                                                  
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[['Hirshfeld Heavy Atom Charge','CM5 Charge','Hirshfeld Carbon Charge','Hirshfeld Hydrogen Charge',
       'ESP Heavy Atom Charge','ESP Carbon Charge','ESP Hydrogen Charge','NPA Carbon Charge','NPA Hydrogen Charge','MBS Heavy Atom Charge',
       'MBS Carbon Charge','MBS Hydrogen Charge','Mulliken Heavy Charge','Mulliken Carbon Charge','Mulliken Hydrogen Charge']]

val_tot = pd.DataFrame()

chargesResults_df = pd.DataFrame(columns =  ['x',"True Neg","False Pos","False Neg","True Pos",'acc', 'precision 0',
                                   'precision 1','recall 0', 'recall 1', 'F1 0', 'F1 1'])

chargesmaxacc_comb = pd.DataFrame()
val_tot = pd.DataFrame()

for_range = range(1, 11)
for x in for_range:

    #Get numbers to represent compounds
    arr = np.arange(0, 188,  dtype=int)

    #Get 20% of numbers, without replacement
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False ) 
   
    #Seperate training (80%) and test data (20%)
    training_data = filter_rows_by_values1(data, "grouped", set_numbers)
    test_data = filter_rows_by_values2(data, "grouped", set_numbers)
    
    # Seperate dataset as response variable (Product Ratio) and feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio' , axis = 1)
    training_X = training_X.drop('grouped' , axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop('Product_Ratio' , axis = 1)
    test_X = test_X.drop('grouped' , axis = 1)
    test_y = test_data['Product_Ratio']
   
    #Apply over-sampling to dataset
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)  
    
    #Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
    rfc.fit(X_resampled, y_resampled)
    pred_charges = rfc.predict(test_X)
      
    chargesResults_df = classificationMetrics(chargesResults_df, test_y, pred_charges)
    
    #Evaluate the model on validation set
    ynew = rfc.predict(Xnew)
    validation_prediction_df = pd.DataFrame(ynew, columns = [(x)])
    validation_prediction_df.merge(validation_prediction_df, on=x)
    val_pred_T = validation_prediction_df.T
    val_tot = val_tot.append(val_pred_T)
       
    #Determine the mean accuracy of the different dendrogram settings
    acc_mean = chargesResults_df['acc'].mean()
    acc_std = chargesResults_df['acc'].std()
    precision_0_mean = chargesResults_df['precision 0'].mean()
    precision_0_std = chargesResults_df['precision 0'].std()
    precision_1_mean = chargesResults_df['precision 1'].mean()
    precision_1_std = chargesResults_df['precision 1'].std()
    recall_0_mean = chargesResults_df['recall 0'].mean()
    recall_0_std = chargesResults_df['recall 0'].std()
    recall_1_mean = chargesResults_df['recall 1'].mean()
    recall_1_std = chargesResults_df['recall 1'].std()
    F1_0_mean = chargesResults_df['F1 0'].mean()
    F1_0_std = chargesResults_df['F1 0'].std()
    F1_1_mean = chargesResults_df['F1 1'].mean()
    F1_1_std = chargesResults_df['F1 1'].std()
    true_neg_mean = chargesResults_df['True Neg'].mean()
    true_neg_std = chargesResults_df['True Neg'].std()
    false_pos_mean = chargesResults_df['False Pos'].mean()
    false_pos_std = chargesResults_df['False Pos'].std()        
    false_neg_mean = chargesResults_df['False Neg'].mean()
    false_neg_std = chargesResults_df['False Neg'].std()      
    true_pos_mean = chargesResults_df['True Pos'].mean() 
    true_pos_std = chargesResults_df['True Pos'].std()   

#chargesaverage_df = pd.concat([acc_mean , precision_0_mean, precision_1_mean, recall_0_mean, recall_1_mean, F1_0_mean, F1_1_mean])    
chargeAve = {'metric' : ['accuracy', 'precision 0', 'precision 1', 'recall 0', 'recall 1', 'F1 0','F1 1', 'True Neg', 'False Pos','False Neg', 'True Pos'],
             'average' : [acc_mean, precision_0_mean, precision_1_mean, recall_0_mean,
                          recall_1_mean, F1_0_mean, F1_1_mean, true_neg_mean, false_pos_mean, false_neg_mean, true_pos_mean],
             'std' :[acc_std, precision_0_std, precision_1_std, recall_0_std,
                          recall_1_std, F1_0_std, F1_1_std, true_neg_std, false_pos_std, false_neg_std, true_pos_std]}
                       
chargeResults = pd.DataFrame(chargeAve)
print(chargeResults.T)

#Print the validation evaluations for model
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)    
    
print(Val_results)

chargeResults.T.to_csv("chargesOnly.csv")
Val_results.to_csv("chargesOnly.csv", index=False, mode="a")

100%|██████████| 81/81 [00:11<00:00,  7.20it/s]


                0            1            2          3          4   ...  \
metric    accuracy  precision 0  precision 1   recall 0   recall 1  ...   
average   0.809037     0.901415     0.527033   0.856589   0.621622  ...   
std      0.0287775    0.0242189    0.0846274  0.0429867  0.0882699  ...   

                6         7          8          9         10  
metric        F1 1  True Neg  False Pos  False Neg  True Pos  
average   0.562481     128.7       21.8         14        23  
std      0.0502798    10.133    7.16163    3.26599   3.26599  

[3 rows x 11 columns]
    Product_Ratio  1  2  3  4  ...  6  7  8  9  10
0               0  0  0  0  0  ...  0  0  0  0   0
1               0  0  0  0  0  ...  0  0  0  0   0
2               0  0  0  0  0  ...  0  0  0  0   0
3               0  0  0  0  0  ...  0  0  0  0   0
4               0  0  0  0  0  ...  0  0  0  0   0
..            ... .. .. .. ..  ... .. .. .. ..  ..
76              1  1  1  0  1  ...  1  1  0  1   0
77              