In [1]:
# Import required libraries for data manipulation and analysis
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

In [2]:
#Import required sklearn functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance
from collections import defaultdict

In [3]:
#Import sklearn classifiers
from sklearn.ensemble import RandomForestClassifier

In [4]:
#Import library to oversample 
from imblearn.over_sampling import RandomOverSampler

In [5]:
#Import RDKit and Mordred libraries
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

In [6]:
#Allows figures to be visualized in jupyter notebook
%matplotlib inline

In [7]:
#Functions used in the study

#Drop the rows whose value in `col` is one of `values`
def filter_rows_by_values1(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is NOT contained in ``values``.

    Used in this notebook to build the training split by excluding the
    held-out compound ids.
    """
    is_listed = df[col].isin(values)
    return df[~is_listed]

#Keep only the rows whose value in `col` is one of `values`
def filter_rows_by_values2(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry IS contained in ``values``.

    Counterpart of ``filter_rows_by_values1``; used in this notebook to build
    the test split from the held-out compound ids.
    """
    is_listed = df[col].isin(values)
    return df[is_listed]

#Get Mordred calcs
def get_Mordred(data_input):
    """Append Mordred descriptors for every row's substrate and return an all-numeric frame.

    Relies on the notebook-level Mordred calculator ``calc`` (it must be
    created before this function is called).

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain a 'Substrate' column holding InChI strings.

    Returns
    -------
    pandas.DataFrame
        ``data_input`` columns followed by the Mordred descriptor columns.
        Every column is passed through ``pd.to_numeric(errors='coerce')`` and
        the resulting NaNs are replaced by 0, so text columns (including
        'Substrate' itself) come back as zeros.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more InChI strings.
    """
    reactants = data_input['Substrate']

    # Parse each InChI into an RDKit molecule
    reactants_mol_list = [Chem.MolFromInchi(inChi_reactants) for inChi_reactants in reactants]

    # MolFromInchi signals a parse failure by returning None; stop here with a
    # clear message instead of failing later inside the descriptor calculation.
    unparsed_positions = [pos for pos, mol in enumerate(reactants_mol_list) if mol is None]
    if unparsed_positions:
        raise ValueError(
            "Could not parse the InChI at row position(s): %s" % unparsed_positions
        )

    # One row of descriptors per molecule, in input order
    reactant_data = calc.pandas(reactants_mol_list)

    # The descriptor frame is numbered 0..n-1; give it the caller's index so the
    # column-wise concat below pairs rows by position even when data_input has
    # been filtered or re-indexed (otherwise rows misalign and get zero-filled).
    reactant_data.index = data_input.index

    #Joins Mordred parameters with experimental, atomic charges, and JChem for Excel parameters
    add_reactants = pd.concat((data_input, reactant_data), axis=1)

    #Force any non-numeric entries to NaN, then replace them with 0
    int_data = add_reactants.apply(pd.to_numeric, errors='coerce')
    output = int_data.fillna(0)

    return output

#Remove zero variance
def remove_zero_varience(values):
    """Return ``values`` restricted to the columns kept by a default ``VarianceThreshold``.

    The selector is fitted on ``values`` and its support mask is used to slice
    the columns, so the result is still a DataFrame with the original labels.
    """
    selector = VarianceThreshold()
    selector.fit(values)
    keep_mask = selector.get_support()
    return values.loc[:, keep_mask]

def remove_95correlated(correlated, threshold=0.95):
    """Drop every column that is highly correlated with a column to its left.

    Parameters
    ----------
    correlated : pandas.DataFrame
        Feature table; ``DataFrame.corr()`` is computed on it.
    threshold : float, default 0.95
        A column is dropped when its correlation with any earlier column is
        strictly greater than this value. The signed correlation is compared,
        so strongly negative correlations are not removed.

    Returns
    -------
    pandas.DataFrame
        ``correlated`` without the dropped columns.
    """
    corr_matrix = correlated.corr()

    # Keep only the strict upper triangle so each pair is examined once and a
    # column is never compared with itself. The builtin ``bool`` is used
    # because the ``np.bool`` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper_tri.columns
               if (upper_tri[column] > threshold).any()]

    return correlated.drop(to_drop, axis=1)

def remove_nonimportant(X_values, y_values):
    """Keep roughly the 100 most important features according to random forests.

    A default ``RandomForestClassifier(random_state=42)`` is fitted first and
    the 100th-largest feature importance becomes the cut-off. A second, larger
    forest (800 trees, max_depth=30, unseeded) is then fitted inside
    ``SelectFromModel`` with that cut-off. Because the two forests differ, the
    number of selected features is not guaranteed to be exactly 100.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table.
    y_values : array-like
        Class labels aligned with ``X_values``.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the selected columns. The selected column
        names are also printed.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_values, y_values)
    importances = forest.feature_importances_

    # Cut-off = importance of the 100th best feature. With fewer than 100
    # features, fall back to the smallest importance instead of raising
    # IndexError on the [-100] lookup.
    n_keep = min(100, len(importances))
    threshold = np.sort(importances)[-n_keep]

    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(X_values, y_values)

    # Select the final features set
    selected_feat = X_values.columns[sel.get_support()]

    # Prints the names of the final selected features
    print(selected_feat)

    return X_values[selected_feat]

def dendrogram(X_values, y):
    """Pick one representative feature per cluster of rank-correlated features.

    Features are clustered with Ward linkage on the distance
    ``1 - |Spearman correlation|``; the tree is cut at distance ``y`` and the
    first (left-most) column of every resulting cluster is kept.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table whose columns are clustered.
    y : float
        Distance threshold passed to ``fcluster(..., criterion="distance")``.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the representative columns.
    """
    rank_corr = spearmanr(X_values).correlation
    # Average with the transpose so the matrix is exactly symmetric, and pin
    # the diagonal to 1 so every self-distance is exactly 0 for squareform.
    rank_corr = (rank_corr + rank_corr.T) / 2
    np.fill_diagonal(rank_corr, 1)
    condensed_distances = squareform(1 - np.abs(rank_corr))
    linkage_matrix = hierarchy.ward(condensed_distances)

    cluster_labels = hierarchy.fcluster(linkage_matrix, y, criterion="distance")

    # Column positions grouped by cluster label, in column order
    positions_by_cluster = defaultdict(list)
    for position, label in enumerate(cluster_labels):
        positions_by_cluster[label].append(position)

    representatives = [positions[0] for positions in positions_by_cluster.values()]
    kept_columns = X_values.columns[representatives]
    return X_values[kept_columns]

def classificationMetrics(results, y_test, pred, run=None, cutoff=None):
    """Append one row of binary-classification metrics to ``results``.

    Parameters
    ----------
    results : pandas.DataFrame
        Accumulator with 13 columns in the order: run id, cutoff, the four
        confusion-matrix cells (TN, FP, FN, TP), accuracy, precision for class
        0 and 1, recall for class 0 and 1, F1 for class 0 and 1.
    y_test : array-like
        True labels (0 / 1).
    pred : array-like
        Predicted labels (0 / 1).
    run, cutoff : optional
        Values stored in the first two columns. When omitted, the notebook
        globals ``x`` and ``y`` (the loop variables of the calling cell) are
        used, which is how existing callers invoke this function.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``results`` plus the added row (index reset).
    """
    if run is None:
        run = x        # legacy behaviour: outer-loop run number from the notebook
    if cutoff is None:
        cutoff = y     # legacy behaviour: dendrogram cutoff from the notebook

    # Always report both classes so every array has a fixed length even when
    # a class is missing from this particular split.
    class_labels = [0, 1]

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, labels=class_labels, average=None, zero_division=0)
    recall = recall_score(y_test, pred, labels=class_labels, average=None, zero_division=0)
    F1 = f1_score(y_test, pred, labels=class_labels, average=None, zero_division=0)

    #Calculate confusion matrix (flattened row-wise: TN, FP, FN, TP)
    cf_matrix = confusion_matrix(y_test, pred, labels=class_labels)
    cf_matrix = np.reshape(cf_matrix, (1, 4))

    comb = np.concatenate((run, cutoff, cf_matrix, acc, prec, recall, F1), axis=None)
    new_row = pd.DataFrame([comb], columns=results.columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([results, new_row], ignore_index=True)


In [8]:
# Raise the pandas display limits to 100 rows / 100 columns so intermediate frames can be inspected
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [9]:
# Create the Mordred descriptor calculator shared by the whole notebook:
# get_Mordred() and the validation cells call calc.pandas(...) on lists of RDKit molecules.
# NOTE(review): ignore_3D=True presumably restricts the set to descriptors that
# need no 3D conformer - confirm against the Mordred documentation.
calc = Calculator(descriptors, ignore_3D=True)

In [10]:
# Read the training/test input file for the substrate-only model.
# Later cells expect at least the 'Substrate' (InChI string) and 'Product_Ratio' columns.
full = pd.read_csv('BorylationTrainingTest 9-26-24.csv')

In [11]:
# Give every distinct substrate an integer id, numbered in order of first
# appearance (sort=False), so that all rows of one compound share the same id.
# The ids are used later to split train/test by compound rather than by row.
full['grouped'] = full.groupby('Substrate', sort=False).ngroup()
# Show the substrate -> id mapping as the cell output
full[['Substrate','grouped']]

Unnamed: 0,Substrate,grouped
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
4,"InChI=1S/C7H15N/c1-2-8-6-4-3-5-7-8/h2-7H2,1H3",1
...,...,...
1022,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1023,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1024,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1025,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198


In [12]:
# Append Mordred descriptors computed from each substrate and coerce the whole table to numbers.
# NOTE: `full` is overwritten in place, so after this cell the 'Substrate' column
# holds zeros (get_Mordred coerces non-numeric entries to NaN and fills them with 0);
# re-run the cells from the CSV load before executing this cell a second time.
full = get_Mordred(full)

100%|██████████| 1027/1027 [01:45<00:00,  9.74it/s]


In [13]:
# Final reduced feature set used by the substrate-only model
final_selected_features = ['Hirshfeld Heavy Atom Charge', 'Hirshfeld Carbon Charge', 'Hirshfeld Hydrogen Charge', 'ESP Heavy Atom Charge', 'ESP Hydrogen Charge', 
                           'NPA Hydrogen Charge', 'Mulliken Heavy Charge', 'Mulliken Hydrogen Charge', 'Steric Effect Index', 'Atomic_Polarizability', 'Distance Degree', 'Dreiding Energy', 'AATS0i', 'AATS4i', 
                           'ATSC1d', 'AATSC5s', 'AATSC5v', 'AATSC1pe', 'MATS3Z', 'GATS3c', 'BCUTZ-1l', 'IC2', 'SIC1']

# Load the validation dataset for borylation
unknownSubstrates=pd.read_csv('Ligand_validation.csv')

# Convert the validation substrates' InChIs to Mordred descriptors and combine
# them with the atomic-charge and JChem parameters already present in the file
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inChi_New_Substrate) for inChi_New_Substrate in New_Substrate]

New_Substrate_data = calc.pandas(New_Substrate_mol_list)
# Non-numeric descriptor values become NaN and are then replaced by 0
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce').fillna(0)
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[final_selected_features]

# Every compound id present in the data. (A fixed np.arange(0, 198) left out
# the last compound, id 198, so it could never be held out.)
arr = full['grouped'].unique()

# Predictions of each run, keyed by run number
validation_predictions = {}

# NOTE(review): neither the hold-out draw nor the forest is seeded, so the
# predictions differ from one execution of this cell to the next.
for_range = range(1, 11)
for x in for_range:
    #Get 20% of the compound ids, without replacement
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False ) 
    
    #Separate training (80%) and test data (20%) by compound
    training_data = filter_rows_by_values1(full, "grouped", set_numbers)
    test_data = filter_rows_by_values2(full, "grouped", set_numbers)
   
    # Separate dataset as response variable (Product Ratio) and feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio' , axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop('Product_Ratio' , axis = 1)
    test_y = test_data['Product_Ratio']
   
    #Apply over-sampling to training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)    
    X_train = X_resampled[final_selected_features]

    #Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
    rfc.fit(X_train, y_resampled)

    #Evaluate the model on validation set
    ynew = rfc.predict(Xnew)
    validation_predictions[x] = ynew

# One row per run, one column per validation sample. Built in one step because
# DataFrame.append was removed in pandas 2.0.
val_tot = pd.DataFrame.from_dict(validation_predictions, orient='index')

#Show the true label next to the prediction of every run
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

Val_results

100%|██████████| 16/16 [00:01<00:00, 11.54it/s]


Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,1,1,1,1,1,1,1,1,1,1,1
1,0,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1
5,0,1,1,0,1,1,0,0,1,1,1
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,0,1,1,1,1,1,1,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,1


In [14]:
# Read the training/test input file for the ligand-aware model.
# Later cells expect at least the 'Substrate' (InChI string) and 'Product_Ratio' columns.
ligand = pd.read_csv('BorylationTrainingTest_ligand 9-26-24.csv')

In [15]:
# Give every distinct substrate an integer id, numbered in order of first
# appearance (sort=False), so that all rows of one compound share the same id.
# The ids are used later to split train/test by compound rather than by row.
ligand['grouped'] = ligand.groupby('Substrate', sort=False).ngroup()
# Show the substrate -> id mapping as the cell output
ligand[['Substrate','grouped']]

Unnamed: 0,Substrate,grouped
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
4,"InChI=1S/C7H15N/c1-2-8-6-4-3-5-7-8/h2-7H2,1H3",1
...,...,...
1082,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1083,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1084,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1085,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198


In [16]:
# Append Mordred descriptors computed from each substrate and coerce the whole table to numbers.
# NOTE: `ligand` is overwritten in place, so after this cell the 'Substrate' column
# holds zeros (get_Mordred coerces non-numeric entries to NaN and fills them with 0);
# re-run the cells from the CSV load before executing this cell a second time.
ligand = get_Mordred(ligand)

100%|██████████| 1087/1087 [01:48<00:00, 10.00it/s]


In [17]:
# One row of metrics per fitted model; 'x' is the run number, 'y' the dendrogram cutoff
ligResults_df = pd.DataFrame(columns =  ['x', 'y',  "True Neg","False Pos","False Neg","True Pos",'acc', 'precision 0',
                                   'precision 1','recall 0', 'recall 1', 'F1 0', 'F1 1'])

prod = pd.DataFrame()
test_index_total = pd.DataFrame()

# Metric columns of ligResults_df to summarise, and the matching
# mean/std column names of the summary table (same order).
metric_columns = ['acc', 'precision 0', 'precision 1', 'recall 0', 'recall 1',
                  'F1 0', 'F1 1', 'True Neg', 'False Pos', 'False Neg', 'True Pos']
summary_columns = ['acc_mean' , 'acc_std', 'precision_0_mean', 'precision_0_std', 
                   'precision_1_mean', 'precision_1_std', 'recall_0_mean', 'recall_0_std', 
                   'recall_1_mean','recall_1_std', 'F1_0_mean', 'F1_0_std', 
                   'F1_1_mean', 'F1_1_std', 'true_neg_mean', 'true_neg_std',
                   'false_pos_mean', 'false_pos_std','false_neg_mean', 'false_neg_std',
                   'true_pos_mean', 'true_pos_std']

# Dendrogram distance cutoffs to evaluate; each is repeated 10 times so the
# (unseeded) forest is refitted 10 times per cutoff and run.
dendrogram_cutoffs = [0.35]*10 + [0.40]*10 + [0.45]*10 + [0.50]*10

# Pieces collected inside the loops and concatenated once afterwards
# (DataFrame.append was removed in pandas 2.0).
best_summary_rows = []
prediction_rows = []
feature_rows = []

# Every compound id present in the data. (A fixed np.arange(0, 198) left out
# the last compound, id 198, so it could never be held out.)
arr = ligand['grouped'].unique()

# NOTE(review): the hold-out draw and most forests are unseeded, so results
# differ from one execution of this cell to the next.
for_range = range(1, 11)
for x in for_range:
    #Get 20% of the compound ids, without replacement
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False ) 
    
    #Separate training (80%) and test data (20%) by compound
    training_data = filter_rows_by_values1(ligand, "grouped", set_numbers)
    test_data = filter_rows_by_values2(ligand, "grouped", set_numbers)
  
    #Remove features that dont change
    training_data = remove_zero_varience(training_data)
    
    #Remove features that are more than 95% correlated
    training_data = remove_95correlated(training_data)
    
    # Separate dataset as response variable (Product Ratio) and feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop(['Product_Ratio','grouped'], axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop(['Product_Ratio','grouped'], axis = 1)
    test_y = test_data['Product_Ratio']
   
    #Remove features that are considered less important: the importance of the
    #100th best feature of a first forest is the cut-off for SelectFromModel
    forest = RandomForestClassifier(random_state=42)
    forest.fit(training_X, training_y)
    importances = forest.feature_importances_
    
    # Guard the lookup so fewer than 100 remaining features cannot raise IndexError
    n_keep = min(100, len(importances))
    threshold = np.sort(importances)[-n_keep] 
    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(training_X, training_y)
     
    # Select the reduced features set 
    selected_feat= training_X.columns[(sel.get_support())]
    
    reduced1_X = training_X[selected_feat]
    test_X = test_X[selected_feat]
    
    #Apply over-sampling to dataset
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(reduced1_X, training_y) 
    
    for y in dendrogram_cutoffs:
    
        #Make final training and test set for this cutoff.
        #The test columns are taken from the full reduced test set every time,
        #instead of narrowing test_X cumulatively from one cutoff to the next.
        X_train = dendrogram(X_resampled, y)
        test_X_cutoff = test_X[X_train.columns]
        training_columns_list = X_train.columns.tolist()
        training_columns_list = (x, y, training_columns_list)
        training_columns_list = (pd.DataFrame(training_columns_list).T)
        feature_rows.append(training_columns_list)

        #Random Forest Classifier
        rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
        rfc.fit(X_train, y_resampled)
        pred_rfc = rfc.predict(test_X_cutoff)
        # classificationMetrics reads the loop variables x and y from the notebook namespace
        ligResults_df = classificationMetrics(ligResults_df, test_y, pred_rfc)

        #Keep the per-sample predictions (same model and data, so no second predict call)
        ynew = pred_rfc
        prediction_df = pd.DataFrame(ynew,  columns = [(x,y)])
        prediction_rows.append(prediction_df.T)

    #Mean and standard deviation of every metric per dendrogram cutoff, over all
    #models fitted so far (runs 1..x). Computed once per run: only the state
    #after the last cutoff of the run is ever used.
    results_by_cutoff = ligResults_df.groupby('y')
    summary_parts = []
    for metric in metric_columns:
        summary_parts.append(results_by_cutoff[metric].mean())
        summary_parts.append(results_by_cutoff[metric].std())
    average_df = pd.concat(summary_parts, axis=1)
    average_df.columns = summary_columns

    #Cutoff(s) with the highest mean accuracy so far, tagged with the run number
    rfcmaxacc = average_df[average_df.acc_mean == average_df.acc_mean.max()]
    rfcmaxacc_copy  = rfcmaxacc.copy()
    rfcmaxacc_copy['x_col'] = x
    best_summary_rows.append(rfcmaxacc_copy)

ligmaxacc_comb = pd.concat(best_summary_rows)
val_tot = pd.concat(prediction_rows)
model_columns = pd.concat(feature_rows)

#Write the summary to a CSV file (overwrites any existing file), then append
#the feature list used for every (run, cutoff) combination to the same file
ligmaxacc_comb.to_csv("10Runs_lig.csv")
model_columns = model_columns.rename(columns = {0:'x', 1:'y', 2: 'features'})
model_columns = model_columns.drop_duplicates(subset = ['x','y'])
model_columns.to_csv("10Runs_lig.csv", mode="a")

In [18]:
# Display the best-mean-accuracy dendrogram cutoff recorded after each run (x_col = run number)
ligmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,true_neg_mean,true_neg_std,false_pos_mean,false_pos_std,false_neg_mean,false_neg_std,true_pos_mean,true_pos_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0.4,0.825,0.006844,0.895001,0.004239,0.584158,0.016188,0.881081,0.007263,0.6175,0.016874,0.887972,0.00462,0.600252,0.014079,130.4,1.074968,17.6,1.074968,15.3,0.674949,24.7,0.674949,1
0.4,0.831086,0.009707,0.898617,0.0068,0.590561,0.018431,0.886593,0.008459,0.620288,0.023701,0.892549,0.006688,0.604931,0.019044,133.0,2.828427,17.0,1.123903,15.0,0.973329,24.5,0.945905,2
0.4,0.836702,0.011919,0.90611,0.012595,0.578385,0.024996,0.888562,0.008314,0.62543,0.027156,0.897216,0.008953,0.600562,0.02031,148.166667,21.948346,18.5,2.460025,15.1,1.09387,25.233333,1.50134,3
0.4,0.843722,0.01647,0.908847,0.012042,0.587017,0.028699,0.89629,0.015612,0.620323,0.026803,0.902477,0.012252,0.602638,0.020486,154.8,22.224034,17.7,2.603745,15.275,1.061868,24.975,1.423025,4
0.4,0.842875,0.015047,0.906199,0.012369,0.593906,0.029751,0.897422,0.014263,0.617234,0.027854,0.901731,0.011155,0.604677,0.020649,151.62,20.844311,17.18,2.568967,15.46,1.146601,24.94,1.376331,5
0.4,0.836824,0.019613,0.905816,0.011313,0.584227,0.035387,0.888556,0.024194,0.625473,0.031456,0.896946,0.015017,0.602899,0.019645,148.3,20.427881,18.366667,3.612439,15.216667,1.180228,25.45,1.701694,6
0.4,0.830178,0.024517,0.900777,0.016316,0.572028,0.044665,0.884938,0.024158,0.609639,0.049137,0.892648,0.017512,0.589132,0.038823,146.228571,19.570883,18.771429,3.498121,15.885714,1.996685,24.828571,2.21308,7
0.4,0.829617,0.023112,0.902144,0.015843,0.569093,0.042849,0.882678,0.023507,0.615701,0.049658,0.892153,0.016508,0.590262,0.036992,146.9125,18.386287,19.3375,3.628103,15.7375,1.953535,25.2625,2.406446,8
0.4,0.831508,0.022477,0.906056,0.018638,0.571665,0.041143,0.880795,0.02284,0.632969,0.067917,0.893037,0.015787,0.599111,0.043079,147.711111,17.47309,19.844444,3.720309,15.133333,2.522706,26.311111,3.74959,9
0.35,0.831819,0.022126,0.90721,0.018559,0.573185,0.048674,0.879694,0.022657,0.640061,0.072579,0.893016,0.015237,0.603056,0.050164,146.52,16.111656,19.98,3.966985,14.87,2.78036,26.63,3.797008,10


In [19]:
# Final reduced feature set used by the ligand-aware model
final_selected_features = ['Active Catalyst-Ligand', 'Ligand', 'Buried_Vol', 'SASA_area', 'Sterimol_L', 'Hirshfeld Heavy Atom Charge', 'Hirshfeld Carbon Charge', 'Hirshfeld Hydrogen Charge', 
                           'ESP Heavy Atom Charge', 'ESP Hydrogen Charge', 'Mulliken Heavy Charge', 'Mulliken Hydrogen Charge', 'Steric Effect Index', 'Atomic_Polarizability', 'Distance Degree', 
                           'Dreiding Energy', 'AATS1Z', 'AATS3i', 'GATS3c', 'GATS4c', 'GATS5c', 'RPCG']

# Load the validation dataset for borylation
unknownSubstrates=pd.read_csv('Ligand_validation.csv')

# Convert the validation substrates' InChIs to Mordred descriptors and combine
# them with the atomic-charge and JChem parameters already present in the file
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inChi_New_Substrate) for inChi_New_Substrate in New_Substrate]

New_Substrate_data = calc.pandas(New_Substrate_mol_list)
# Non-numeric descriptor values become NaN and are then replaced by 0
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce').fillna(0)
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[final_selected_features]

# Every compound id present in the data. (A fixed np.arange(0, 198) left out
# the last compound, id 198, so it could never be held out.)
arr = ligand['grouped'].unique()

# Predictions of each run, keyed by run number
validation_predictions = {}

# NOTE(review): neither the hold-out draw nor the forest is seeded, so the
# predictions differ from one execution of this cell to the next.
for_range = range(1, 11)
for x in for_range:
    #Get 20% of the compound ids, without replacement
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False ) 
    
    #Separate training (80%) and test data (20%) by compound
    training_data = filter_rows_by_values1(ligand, "grouped", set_numbers)
    test_data = filter_rows_by_values2(ligand, "grouped", set_numbers)
   
    # Separate dataset as response variable (Product Ratio) and feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio', axis = 1)
    training_X = training_X.drop('grouped', axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop('Product_Ratio', axis = 1)
    test_X = test_X.drop('grouped', axis = 1)
    test_y = test_data['Product_Ratio']
    
    #Apply over-sampling to training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)    
    X_train = X_resampled[final_selected_features]

    #Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=800,max_depth=15)
    rfc.fit(X_train, y_resampled)

    #Evaluate the model on validation set
    ynew = rfc.predict(Xnew)
    validation_predictions[x] = ynew

# One row per run, one column per validation sample. Built in one step because
# DataFrame.append was removed in pandas 2.0.
val_tot = pd.DataFrame.from_dict(validation_predictions, orient='index')

#Show the true label next to the prediction of every run
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)


Val_results

100%|██████████| 16/16 [00:01<00:00,  8.74it/s]


Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,1,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,1,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1
5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0
9,1,1,1,1,1,1,1,1,1,1,1


In [None]:
# Append the validation predictions currently held in Val_results to 10Runs_lig.csv
# (mode="a" adds to the end of the file instead of overwriting it)
Val_results.to_csv("10Runs_lig.csv", mode="a")

In [20]:
# Reduced feature set used by the ligand-aware model
selected_features = ['Active Catalyst-Ligand', 'Ligand', 'Buried_Vol', 'SASA_area', 'Sterimol_L', 'Hirshfeld Heavy Atom Charge', 'Hirshfeld Carbon Charge', 'Hirshfeld Hydrogen Charge', 
                           'ESP Heavy Atom Charge', 'ESP Hydrogen Charge', 'Mulliken Heavy Charge', 'Mulliken Hydrogen Charge', 'Steric Effect Index', 'Atomic_Polarizability', 'Distance Degree', 
                           'Dreiding Energy', 'AATS1Z', 'AATS3i', 'GATS3c', 'GATS4c', 'GATS5c', 'RPCG']

# Load the validation dataset for borylation
unknownSubstrates=pd.read_csv('validation8-26-24.csv')

# Convert the validation substrates' InChIs to Mordred descriptors and combine
# them with the atomic-charge and JChem parameters already present in the file
New_Substrate = unknownSubstrates['Substrate']
New_Substrate_mol_list = [Chem.MolFromInchi(inChi_New_Substrate) for inChi_New_Substrate in New_Substrate]

New_Substrate_data = calc.pandas(New_Substrate_mol_list)
# Non-numeric descriptor values become NaN and are then replaced by 0
New_Substrate_data = New_Substrate_data.apply(pd.to_numeric, errors='coerce').fillna(0)
XnewSec = pd.concat((unknownSubstrates, New_Substrate_data), axis=1)
Xnew = XnewSec[selected_features]

# Every compound id present in the data. (A fixed np.arange(0, 198) left out
# the last compound, id 198, so it could never be held out.)
arr = ligand['grouped'].unique()

# Predictions of each run, keyed by run number
validation_predictions = {}

# NOTE(review): neither the hold-out draw nor the forest is seeded, so the
# predictions differ from one execution of this cell to the next.
for_range = range(1, 11)
for x in for_range:
    #Get 20% of the compound ids, without replacement
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False ) 
    
    #Separate training (80%) and test data (20%) by compound
    training_data = filter_rows_by_values1(ligand, "grouped", set_numbers)
    test_data = filter_rows_by_values2(ligand, "grouped", set_numbers)
   
    # Separate dataset as response variable (Product Ratio) and feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio' , axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop('Product_Ratio' , axis = 1)
    test_y = test_data['Product_Ratio']
   
    #Apply over-sampling to training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(training_X, training_y)    
    X_train = X_resampled[selected_features]

    #Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
    rfc.fit(X_train, y_resampled)
    
    #Evaluate the model on validation set
    ynew = rfc.predict(Xnew)
    validation_predictions[x] = ynew

# One row per run, one column per validation sample. Built in one step because
# DataFrame.append was removed in pandas 2.0.
val_tot = pd.DataFrame.from_dict(validation_predictions, orient='index')

#Show the true label next to the prediction of every run
unknownSubstrates_prod = unknownSubstrates['Product_Ratio']
total_val_results_transposed = val_tot.T
Val_results = pd.concat((unknownSubstrates_prod, total_val_results_transposed), axis=1)

#Append the results to the CSV file written by the model-selection cell

Val_results.to_csv("10Runs_lig.csv", mode="a")

Val_results

100%|██████████| 81/81 [00:12<00:00,  6.37it/s]


Unnamed: 0,Product_Ratio,1,2,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,1,1,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
5,1,1,1,1,1,1,1,1,1,1,1
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,1
9,0,0,0,0,0,0,0,0,0,0,0
