In [1]:
# Import required libraries for data manipulation and analysis
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

In [2]:
#Import required sklearn functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance
from collections import defaultdict

In [3]:
#Import sklearn classifiers
from sklearn.ensemble import RandomForestClassifier

In [4]:
#Import library to oversample 
from imblearn.over_sampling import RandomOverSampler

In [5]:
#Import RDKit and Mordred libraries
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

In [6]:
#Allows figures to be visualized in jupyter notebook
%matplotlib inline

In [7]:
#Functions used in the study

#Drop every row whose value in column `col` appears in `values`
def filter_rows_by_values1(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is NOT contained in ``values``."""
    is_listed = df[col].isin(values)
    return df.loc[~is_listed]

#Keep only the rows whose value in column `col` appears in `values` (complement of filter_rows_by_values1)
def filter_rows_by_values2(df, col, values):
    """Return only the rows of ``df`` whose ``col`` entry is contained in ``values``."""
    is_listed = df[col].isin(values)
    return df.loc[is_listed]

#Get Mordred calcs
def get_Mordred(data_input):
    """Append Mordred descriptors for every substrate and return an all-numeric frame.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain a 'Substrate' column holding InChI strings.

    Returns
    -------
    pandas.DataFrame
        ``data_input`` joined column-wise with the descriptor table produced by
        the module-level Mordred calculator ``calc``. Every column is coerced
        with ``pd.to_numeric(errors='coerce')`` and the resulting NaNs are
        replaced by 0, so text columns (including 'Substrate') come back as 0.

    Raises
    ------
    ValueError
        If RDKit cannot build a molecule from one of the InChI strings.
    """
    reactants = data_input['Substrate']

    reactants_mol_list = []
    for row_label, inchi in reactants.items():
        mol = Chem.MolFromInchi(inchi)
        if mol is None:
            # Fail here with the offending row instead of handing None to the
            # descriptor calculator.
            raise ValueError(f"Could not parse InChI at row {row_label!r}: {inchi!r}")
        reactants_mol_list.append(mol)

    # `calc` is the module-level Mordred Calculator created in a later cell.
    reactant_data = calc.pandas(reactants_mol_list)

    # The descriptors are computed from a plain list, so they do not carry the
    # row labels of data_input. Copy the labels over so the column-wise concat
    # pairs rows by position even when data_input has a non-default index.
    reactant_data.index = data_input.index

    #Joins Mordred parameters with experimental, atomic charges, and JChem for Excel parameters
    add_reactants = pd.concat((data_input, reactant_data), axis=1)

    #Force any non-numeric entries to NaN and replace them with 0
    int_data = add_reactants.apply(pd.to_numeric, errors='coerce')
    return int_data.fillna(0)

#Remove zero-variance features
def remove_zero_varience(values):
    """Return ``values`` restricted to the columns a default VarianceThreshold keeps."""
    selector = VarianceThreshold()
    selector.fit(values)
    keep_mask = selector.get_support()
    return values.loc[:, keep_mask]

def remove_95correlated(correlated, threshold=0.95):
    """Drop columns that correlate too strongly with an earlier column.

    Parameters
    ----------
    correlated : pandas.DataFrame
        Feature table; ``DataFrame.corr()`` is computed over its columns.
    threshold : float, default 0.95
        A column is dropped when its correlation with any column to its left
        is strictly greater than this value. Only positive correlation is
        tested (no absolute value is taken), matching the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``correlated`` without the dropped columns; rows are untouched.
    """
    corr_matrix = correlated.corr()

    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself. The builtin ``bool`` is used because
    # the ``np.bool`` alias was removed from NumPy (1.24+).
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper_tri.columns
               if (upper_tri[column] > threshold).any()]

    return correlated.drop(columns=to_drop)

def remove_nonimportant(X_values, y_values):
    """Keep the features that a random forest ranks as most important.

    A seeded forest (``random_state=42``) is fitted first and its 100th-largest
    feature importance becomes the cut-off. A second, unseeded forest
    (800 trees, ``max_depth=30``) is then fitted inside ``SelectFromModel`` with
    that cut-off, so the number of selected features depends on the second
    forest and is not guaranteed to be exactly 100.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table.
    y_values : array-like
        Class labels aligned with the rows of ``X_values``.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the selected columns. The selected column
        names are also printed.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_values, y_values)
    importances = forest.feature_importances_

    # With fewer than 100 features, fall back to the smallest importance as the
    # cut-off instead of raising IndexError on [-100].
    n_keep = min(100, len(importances))
    threshold = np.sort(importances)[-n_keep]

    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(X_values, y_values)

    # Select the final feature set
    selected_feat = X_values.columns[sel.get_support()]

    # Prints the names of the final selected features
    print(selected_feat)
    return X_values[selected_feat]

def dendrogram(X_values, y):
    """Keep one representative feature per cluster of rank-correlated features.

    Features are clustered with Ward linkage on the distance ``1 - |rho|``,
    where ``rho`` is the Spearman correlation between columns. The tree is cut
    with ``fcluster(..., criterion="distance")`` at the value ``y``, and the
    first column (in column order) of each cluster is retained.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table whose columns are clustered.
    y : float
        Distance cut handed to ``fcluster``.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the representative columns.
    """
    rank_corr = spearmanr(X_values).correlation
    # Average with the transpose so the matrix is exactly symmetric
    rank_corr = (rank_corr + rank_corr.T) / 2
    np.fill_diagonal(rank_corr, 1)
    condensed = squareform(1 - np.abs(rank_corr))
    linkage_matrix = hierarchy.ward(condensed)

    cluster_of_feature = hierarchy.fcluster(linkage_matrix, y, criterion="distance")

    # Remember the first column position seen for every cluster id; dict
    # insertion order keeps the clusters in order of first appearance.
    first_position = {}
    for position, cluster_id in enumerate(cluster_of_feature):
        if cluster_id not in first_position:
            first_position[cluster_id] = position

    representatives = list(first_position.values())
    kept_columns = X_values.columns[representatives]
    return X_values[kept_columns]

def classificationMetrics(results, y_test, pred):
    """Append one row of binary-classification metrics to ``results``.

    The row holds, in this order: the module-level ``x`` and ``y`` values, the
    flattened 2x2 confusion matrix, accuracy, per-class precision, per-class
    recall and per-class F1. It is labelled with ``results.columns``.

    NOTE: ``x`` and ``y`` are not parameters. They are read from module scope,
    where the calling cell defines them as its loop variables. The function
    raises NameError if they do not exist.

    Parameters
    ----------
    results : pandas.DataFrame
        Accumulator whose columns name the metrics; it is not modified.
    y_test : array-like
        True labels. The reshape to (1, 4) requires exactly two classes across
        ``y_test`` and ``pred``.
    pred : array-like
        Predicted labels.

    Returns
    -------
    pandas.DataFrame
        A new frame consisting of ``results`` plus the new row.
    """
    acc = accuracy_score(y_test, pred)
    # zero_division=0 on all three metrics: same values as the default, but
    # without warnings when a class is never predicted or never present.
    prec = precision_score(y_test, pred, average=None, zero_division=0)
    recall = recall_score(y_test, pred, average=None, zero_division=0)
    F1 = f1_score(y_test, pred, average=None, zero_division=0)
    #Calculate confusion matrix
    cf_matrix = np.reshape(confusion_matrix(y_test, pred), (1, 4))

    # x, y: loop variables of the calling cell (module-level names)
    comb = np.concatenate((x, y, cf_matrix, acc, prec, recall, F1), axis=None)
    new_row = pd.DataFrame([comb], columns=results.columns)

    # DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
    # An empty accumulator is skipped so concat is not fed an empty frame.
    if results.empty:
        return new_row
    return pd.concat([results, new_row], ignore_index=True)


In [8]:
# Raise the pandas display limits so up to 100 rows and 100 columns are shown
# before a rendered frame is truncated.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [9]:
# Create the shared Mordred calculator (built with ignore_3D=True).
# get_Mordred and the validation cells read this module-level `calc`, so this
# cell must run before any of them.
calc = Calculator(descriptors, ignore_3D=True)

In [10]:
# Read the training/test data file into `full`
full = pd.read_csv('BorylationTrainingTest 8-29-24.csv')

In [11]:
# Give every distinct 'Substrate' string an integer id in 'grouped'
# (sort=False numbers the groups in order of first appearance). Later cells
# split train/test on this id, so all rows of one compound stay on one side.
full['grouped'] = full.groupby('Substrate', sort=False).ngroup()
full[['Substrate','grouped']]

Unnamed: 0,Substrate,grouped
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
4,InChI=1S/C12H27N/c1-4-7-10-13(11-8-5-2)12-9-6-...,1
...,...,...
1028,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200
1029,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200
1030,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200
1031,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200


In [12]:
# Convert substrates to Mordred features.
# NOTE: this overwrites `full`. get_Mordred coerces every column to numeric and
# fills NaN with 0, so the 'Substrate' text is replaced by 0 afterwards.
# Re-run the read_csv and grouping cells before re-running this one.
full = get_Mordred(full)

100%|██████████| 1033/1033 [01:31<00:00, 11.31it/s]


In [13]:
final_selected_features = ['Hirshfeld Heavy Atom Charge', 'Hirshfeld Carbon Charge', 'Hirshfeld Hydrogen Charge', 'ESP Heavy Atom Charge', 'ESP Hydrogen Charge', 
                           'NPA Hydrogen Charge', 'Mulliken Heavy Charge', 'Mulliken Hydrogen Charge', 'Steric Effect Index', 'Atomic_Polarizability', 'Distance Degree', 'Dreiding Energy',
                           'AATS2d', 'AATS4p', 'AATS5p', 'AATSC2d', 'AATSC3v', 'AATSC5v', 'MATS4s', 'MATS1pe', 'GATS5Z', 'BCUTse-1l']

# Load the validation dataset for borylation
unknownSubstrates=pd.read_csv('Ligand_validation.csv')

# Convert the validation substrates' InChIs to Mordred descriptors and combine
# them with the atomic charges and JChem parameters already in the CSV
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inchi) for inchi in New_Substrate]

New_Substrate_data = calc.pandas(New_Substrate_mol_list)
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce').fillna(0)
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[final_selected_features]

# Compound ids taken from the data itself. The previous hard-coded
# np.arange(0, 200) stopped at 199, so the compound with id 200 could never be
# held out.
arr = full['grouped'].unique()
n_held_out = int(len(arr)*0.20)

# NOTE: neither np.random.choice nor the classifier is seeded, so the
# predictions differ between executions of this cell.
validation_runs = []
for_range = range(1, 11)
for x in for_range:
    # Hold out 20% of the compounds, without replacement
    set_numbers = np.random.choice(arr, n_held_out, replace=False)

    # Train on the remaining 80% of the compounds
    training_data = filter_rows_by_values1(full, "grouped", set_numbers)

    # Separate the response variable (Product Ratio) from the feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio' , axis = 1)
    training_y = training_data['Product_Ratio']

    #Apply over-sampling to training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)
    X_train = X_resampled[final_selected_features]

    #Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
    rfc.fit(X_train, y_resampled)

    #Evaluate the model on validation set: one row per run, labelled by x
    ynew = rfc.predict(Xnew)
    validation_runs.append(pd.DataFrame(ynew, columns = [(x)]).T)

# DataFrame.append was removed in pandas 2.0; collect the rows and concatenate once
val_tot = pd.concat(validation_runs)

#Print the validation evaluations for model
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

Val_results

100%|██████████| 16/16 [00:01<00:00, 13.12it/s]


Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,1,1,1,1,1,1,1,1,1,1,1
1,0,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1
5,0,1,1,1,1,1,1,1,1,1,1
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,0,1,1,1,1,1,1,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,1


In [14]:
# Read the ligand training/test data file into `ligand`
ligand = pd.read_csv('BorylationTrainingTest_ligand 8-29-24.csv')

In [15]:
# Give every distinct 'Substrate' string an integer id in 'grouped'
# (sort=False numbers the groups in order of first appearance). Later cells
# split train/test on this id, so all rows of one compound stay on one side.
ligand['grouped'] = ligand.groupby('Substrate', sort=False).ngroup()
ligand[['Substrate','grouped']]

Unnamed: 0,Substrate,grouped
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
4,InChI=1S/C12H27N/c1-4-7-10-13(11-8-5-2)12-9-6-...,1
...,...,...
1088,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200
1089,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200
1090,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200
1091,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,200


In [16]:
# Convert substrates to Mordred features.
# NOTE: this overwrites `ligand`. get_Mordred coerces every column to numeric
# and fills NaN with 0, so the 'Substrate' text is replaced by 0 afterwards.
# Re-run the read_csv and grouping cells before re-running this one.
ligand = get_Mordred(ligand)

100%|██████████| 1093/1093 [01:34<00:00, 11.56it/s]


In [17]:
ligResults_df = pd.DataFrame(columns =  ['x', 'y',  "True Neg","False Pos","False Neg","True Pos",'acc', 'precision 0',
                                   'precision 1','recall 0', 'recall 1', 'F1 0', 'F1 1'])

ligmaxacc_comb = pd.DataFrame()
val_tot = pd.DataFrame()
# `prod` and `test_index_total` are not used in this cell; kept as defined.
prod = pd.DataFrame()
test_index_total = pd.DataFrame()
model_columns = pd.DataFrame()

# Dendrogram distance cuts to compare, and the number of forests trained per cut
dendrogram_thresholds = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55]
repeats_per_threshold = 10

# (column in ligResults_df, prefix in the summary table), in summary column order
summary_metrics = [('acc', 'acc'),
                   ('precision 0', 'precision_0'), ('precision 1', 'precision_1'),
                   ('recall 0', 'recall_0'), ('recall 1', 'recall_1'),
                   ('F1 0', 'F1_0'), ('F1 1', 'F1_1'),
                   ('True Neg', 'true_neg'), ('False Pos', 'false_pos'),
                   ('False Neg', 'false_neg'), ('True Pos', 'true_pos')]

# Compound ids taken from the data itself. The previous hard-coded
# np.arange(0, 200) stopped at 199, so the compound with id 200 could never be
# placed in the test set.
arr = ligand['grouped'].unique()

# Rows are collected in lists and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0).
maxacc_rows = []
prediction_rows = []
model_column_rows = []

# NOTE: the loop variables must stay named `x` and `y`, because
# classificationMetrics reads those two module-level names.
for_range = range(1, 11)
for x in for_range:
    # Hold out 20% of the compounds, without replacement
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False )

    # Separate training (80%) and test data (20%)
    training_data = filter_rows_by_values1(ligand, "grouped", set_numbers)
    test_data = filter_rows_by_values2(ligand, "grouped", set_numbers)

    #Remove features that dont change
    training_data = remove_zero_varience(training_data)

    #Remove features that are more than 95% correlated
    training_data = remove_95correlated(training_data)

    # Separate the response variable (Product Ratio) from the feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop(['Product_Ratio','grouped'], axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop(['Product_Ratio','grouped'], axis = 1)
    test_y = test_data['Product_Ratio']

    # Remove features that are considered less important: the 100th-largest
    # importance of a seeded forest is the cut-off for SelectFromModel
    forest = RandomForestClassifier(random_state=42)
    forest.fit(training_X, training_y)
    importances = forest.feature_importances_

    # min() guards against IndexError when fewer than 100 features remain
    threshold = np.sort(importances)[-min(100, len(importances))]
    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(training_X, training_y)

    # Select the reduced features set
    selected_feat= training_X.columns[(sel.get_support())]

    reduced1_X = training_X[selected_feat]
    test_X = test_X[selected_feat]

    #Apply over-sampling to dataset
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(reduced1_X, training_y)

    for y in dendrogram_thresholds:
        # The clustering depends only on X_resampled and y, so it is computed
        # once per threshold rather than once per repeat.
        X_train = dendrogram(X_resampled, y)

        # Select the test columns into a separate variable. Narrowing test_X
        # itself made each threshold depend on the columns left by the previous
        # one, which only worked because the thresholds ascend.
        test_X_selected = test_X[X_train.columns]

        # Record which features this (x, y) model uses
        training_columns_list = X_train.columns.tolist()
        training_columns_list = (x, y, training_columns_list)
        training_columns_list = (pd.DataFrame(training_columns_list).T)
        model_column_rows.append(training_columns_list)

        for repeat in range(repeats_per_threshold):
            #Random Forest Classifier
            rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
            rfc.fit(X_train, y_resampled)
            pred_rfc = rfc.predict(test_X_selected)
            ligResults_df = classificationMetrics(ligResults_df, test_y, pred_rfc)

            # Keep the line-by-line predictions, labelled by (x, y)
            prediction_df = pd.DataFrame(pred_rfc,  columns = [(x,y)])
            prediction_rows.append(prediction_df.T)

    # Mean and standard deviation of every metric per dendrogram threshold.
    # ligResults_df accumulates over all runs so far, so these statistics cover
    # runs 1..x, not run x alone.
    results_by_threshold = ligResults_df.groupby('y')
    summary_columns = []
    summary_names = []
    for metric_column, prefix in summary_metrics:
        summary_columns.append(results_by_threshold[metric_column].mean())
        summary_names.append(prefix + '_mean')
        summary_columns.append(results_by_threshold[metric_column].std())
        summary_names.append(prefix + '_std')
    average_df = pd.concat(summary_columns, axis=1, keys=summary_names)

    # Keep the threshold(s) with the highest mean accuracy, tagged with the run number
    rfcmaxacc = average_df[average_df.acc_mean == average_df.acc_mean.max()]
    rfcmaxacc_copy  = rfcmaxacc.copy()
    rfcmaxacc_copy['x_col'] = x
    maxacc_rows.append(rfcmaxacc_copy)

ligmaxacc_comb = pd.concat(maxacc_rows)
val_tot = pd.concat(prediction_rows)
model_columns = pd.concat(model_column_rows)

# Write the best-threshold summary to a CSV file, then append the feature list
# used by each (x, y) model to the same file
ligmaxacc_comb.to_csv("10Runs_lig.csv")
model_columns = model_columns.rename(columns = {0:'x', 1:'y', 2: 'features'})
model_columns = model_columns.drop_duplicates(subset = ['x','y'])
model_columns.to_csv("10Runs_lig.csv", mode="a")

In [18]:
# Display, for each run (x_col), the dendrogram threshold row(s) with the highest mean accuracy
ligmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,true_neg_mean,true_neg_std,false_pos_mean,false_pos_std,false_neg_mean,false_neg_std,true_pos_mean,true_pos_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0.35,0.848585,0.006836,0.937138,0.004333,0.611871,0.013314,0.865868,0.005785,0.784444,0.014999,0.900089,0.004617,0.687462,0.013299,144.6,0.966092,22.4,0.966092,9.7,0.674949,35.3,0.674949,1
0.35,0.847239,0.009692,0.935187,0.004934,0.610632,0.020418,0.866063,0.010112,0.777449,0.017067,0.899275,0.006735,0.683883,0.017098,142.9,2.403944,22.1,1.68273,9.9,0.718185,34.6,0.994723,2
0.3,0.852268,0.01151,0.934976,0.005401,0.615228,0.019952,0.874638,0.018919,0.764411,0.031861,0.903671,0.009319,0.681075,0.012255,149.7,10.68402,21.3,2.4516,10.433333,1.330889,33.9,1.709003,3
0.3,0.856555,0.013043,0.925619,0.017258,0.64117,0.05062,0.892191,0.035045,0.714771,0.092103,0.907896,0.011195,0.669043,0.025903,151.25,9.625687,18.25,5.785703,12.275,3.464009,31.225,4.943203,4
0.3,0.845244,0.025898,0.915185,0.026209,0.620378,0.062251,0.887959,0.032534,0.680317,0.10805,0.900764,0.017688,0.643192,0.057707,146.26,13.249274,18.34,5.180852,13.48,3.94989,29.32,5.863899,5
0.3,0.842718,0.024405,0.919283,0.025654,0.610092,0.061475,0.880059,0.034726,0.698453,0.106777,0.898532,0.016981,0.645009,0.052906,147.1,12.234572,20.066667,6.144769,12.85,3.878778,30.483333,5.958875,6
0.3,0.838804,0.024708,0.917765,0.024081,0.595189,0.067978,0.877074,0.033083,0.687959,0.102385,0.896341,0.016714,0.632714,0.057874,149.528571,12.814984,21.042857,6.19826,13.371429,3.826572,30.057143,5.620577,7
0.3,0.836951,0.023727,0.917633,0.02251,0.587003,0.067365,0.875051,0.031515,0.685298,0.095948,0.895287,0.015956,0.627404,0.055997,150.1,12.083675,21.525,5.955478,13.45,3.582305,29.8,5.297241,8
0.3,0.840003,0.024079,0.919605,0.021963,0.589819,0.064244,0.877332,0.030499,0.689912,0.091477,0.897481,0.016341,0.631529,0.054218,151.433333,12.008471,21.233333,5.690619,13.2,3.455008,29.8,4.994829,9
0.3,0.84249,0.024146,0.92117,0.021384,0.591765,0.061567,0.879296,0.029685,0.692967,0.087404,0.8993,0.016514,0.634363,0.052285,154.05,13.866153,21.15,5.437199,13.11,3.296447,29.99,4.776827,10


In [19]:
final_selected_features = ['Active Catalyst-Ligand', 'Ligand', 'Buried_Vol', 'PyramidalizationAR', 'SASA_area', 'Sterimol_L', 'Hirshfeld Heavy Atom Charge',
                           'Hirshfeld Carbon Charge', 'Hirshfeld Hydrogen Charge', 'ESP Heavy Atom Charge', 'ESP Hydrogen Charge', 'NPA Hydrogen Charge', 'Mulliken Heavy Charge', 'Mulliken Hydrogen Charge', 'Steric Effect Index', 'Atomic_Polarizability', 'Distance Degree', 'Dreiding Energy', 'MaxZ', 'Sigma Electronegativity', 'AATS3v',
                           'AATSC2d', 'MATS2c', 'MATS5Z', 'MATS1pe', 'GATS5v', 'GATS5se', 'BCUTZ-1l']

# Load the validation dataset for borylation
unknownSubstrates=pd.read_csv('Ligand_validation.csv')

# Convert the validation substrates' InChIs to Mordred descriptors and combine
# them with the atomic charges and JChem parameters already in the CSV
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inchi) for inchi in New_Substrate]

New_Substrate_data = calc.pandas(New_Substrate_mol_list)
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce').fillna(0)
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[final_selected_features]

# Compound ids taken from the data itself. The previous hard-coded
# np.arange(0, 200) stopped at 199, so the compound with id 200 could never be
# held out.
arr = ligand['grouped'].unique()
n_held_out = int(len(arr)*0.20)

# NOTE: neither np.random.choice nor the classifier is seeded, so the
# predictions differ between executions of this cell.
validation_runs = []
for_range = range(1, 11)
for x in for_range:
    # Hold out 20% of the compounds, without replacement
    set_numbers = np.random.choice(arr, n_held_out, replace=False)

    # Train on the remaining 80% of the compounds
    training_data = filter_rows_by_values1(ligand, "grouped", set_numbers)

    # Separate the response variable (Product Ratio) from the feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio', axis = 1)
    training_X = training_X.drop('grouped', axis = 1)
    training_y = training_data['Product_Ratio']

    #Apply over-sampling to training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)
    X_train = X_resampled[final_selected_features]

    #Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=800,max_depth=15)
    rfc.fit(X_train, y_resampled)

    #Evaluate the model on validation set: one row per run, labelled by x
    ynew = rfc.predict(Xnew)
    validation_runs.append(pd.DataFrame(ynew, columns = [(x)]).T)

# DataFrame.append was removed in pandas 2.0; collect the rows and concatenate once
val_tot = pd.concat(validation_runs)

#Print the validation evaluations for model
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

# Val_results is appended to "10Runs_lig.csv" by the next cell

Val_results

100%|██████████| 16/16 [00:04<00:00,  3.55it/s]


Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,1,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,1,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1
5,0,1,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0
9,1,1,1,1,1,1,1,1,1,1,1


In [20]:
# Append the validation predictions to "10Runs_lig.csv". mode="a" adds to the
# existing file, so re-running this cell writes the table again.
Val_results.to_csv("10Runs_lig.csv", mode="a")

In [21]:
selected_features = ['Active Catalyst-Ligand', 'Ligand', 'Buried_Vol', 'PyramidalizationAR', 'SASA_area', 'Sterimol_L', 'Hirshfeld Heavy Atom Charge',
                           'Hirshfeld Carbon Charge', 'Hirshfeld Hydrogen Charge', 'ESP Heavy Atom Charge', 'ESP Hydrogen Charge', 'NPA Hydrogen Charge', 'Mulliken Heavy Charge', 'Mulliken Hydrogen Charge', 'Steric Effect Index', 'Atomic_Polarizability', 'Distance Degree', 'Dreiding Energy', 'MaxZ', 'Sigma Electronegativity', 'AATS3v',
                           'AATSC2d', 'MATS2c', 'MATS5Z', 'MATS1pe', 'GATS5v', 'GATS5se', 'BCUTZ-1l']


# Load the validation dataset for borylation
unknownSubstrates=pd.read_csv('validation8-26-24.csv')

# Convert the validation substrates' InChIs to Mordred descriptors and combine
# them with the atomic charges and JChem parameters already in the CSV
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inchi) for inchi in New_Substrate]

New_Substrate_data = calc.pandas(New_Substrate_mol_list)
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce').fillna(0)
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[selected_features]

# Compound ids taken from the data itself. The previous hard-coded
# np.arange(0, 200) stopped at 199, so the compound with id 200 could never be
# held out.
arr = ligand['grouped'].unique()
n_held_out = int(len(arr)*0.20)

# NOTE: neither np.random.choice nor the classifier is seeded, so the
# predictions differ between executions of this cell.
validation_runs = []
for_range = range(1, 11)
for x in for_range:
    # Hold out 20% of the compounds, without replacement
    set_numbers = np.random.choice(arr, n_held_out, replace=False)

    # Separate training (80%) and test data (20%)
    training_data = filter_rows_by_values1(ligand, "grouped", set_numbers)
    test_data = filter_rows_by_values2(ligand, "grouped", set_numbers)

    # Separate the response variable (Product Ratio) from the feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio' , axis = 1)
    training_y = training_data['Product_Ratio']
    # test_X / test_y are not used in this cell; kept as defined
    test_X = test_data.drop('Product_Ratio' , axis = 1)
    test_y = test_data['Product_Ratio']

    #Apply over-sampling to training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)
    X_train = X_resampled[selected_features]

    #Random Forest Classifier
    # NOTE(review): cell In [19] uses max_depth=15 with the same feature list;
    # confirm that 9 is intended here.
    rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
    rfc.fit(X_train, y_resampled)

    #Evaluate the model on validation set: one row per run, labelled by x
    ynew = rfc.predict(Xnew)
    validation_runs.append(pd.DataFrame(ynew, columns = [(x)]).T)

# DataFrame.append was removed in pandas 2.0; collect the rows and concatenate once
val_tot = pd.concat(validation_runs)

#Print the validation evaluations for model
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

# Append the results to the CSV file (mode="a" adds to the existing file)

Val_results.to_csv("10Runs_lig.csv", mode="a")

Val_results

100%|██████████| 81/81 [00:10<00:00,  7.55it/s]


Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
5,1,1,1,1,1,1,1,1,1,1,0
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,1
9,0,0,0,0,0,0,0,0,0,0,0
