In [1]:
# Import required libraries for data manipulation and analysis
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

In [2]:
#Import required sklearn functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance
from collections import defaultdict

In [3]:
#Import sklearn classifiers
from sklearn.ensemble import RandomForestClassifier

In [4]:
#Import library to oversample 
from imblearn.over_sampling import RandomOverSampler

In [5]:
#Import RDKit and Mordred libraries
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

In [6]:
#Allows figures to be visualized in jupyter notebook
%matplotlib inline

In [7]:
#Functions used in the study

#Drop the rows whose value in `col` is one of `values` (e.g. the compound numbers held out for testing)
def filter_rows_by_values1(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is NOT one of ``values``.

    ``df`` itself is left untouched; the original row labels are preserved.
    """
    is_listed = df[col].isin(values)
    return df.loc[~is_listed]

#Keep only the rows whose value in `col` is one of `values` (e.g. the compound numbers held out for testing)
def filter_rows_by_values2(df, col, values):
    """Return only the rows of ``df`` whose ``col`` entry is one of ``values``.

    ``df`` itself is left untouched; the original row labels are preserved.
    """
    is_listed = df[col].isin(values)
    return df.loc[is_listed]

#Get Mordred calcs
def get_Mordred(data_input):
    """Append Mordred descriptors of each substrate to ``data_input``.

    Every entry of the ``'Substrate'`` column is parsed as an InChI string
    with RDKit, the notebook-level Mordred calculator ``calc`` computes the
    descriptors of the resulting molecules, and the descriptor columns are
    joined to the right of the input columns. All columns are then coerced
    to numeric: anything that cannot be converted (which includes the InChI
    text itself) becomes NaN and is finally replaced by 0.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Table holding a ``'Substrate'`` column of InChI strings.

    Returns
    -------
    pandas.DataFrame
        A new, all-numeric frame with the input columns followed by the
        Mordred descriptor columns. ``data_input`` is not modified.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more substrate strings.
    """
    reactants = data_input['Substrate']

    # One RDKit molecule per row, in row order.
    reactants_mol_list = [Chem.MolFromInchi(inchi) for inchi in reactants]

    # Fail early with the offending row labels instead of erroring later
    # inside the descriptor calculation.
    unparsed = [label for label, mol in zip(data_input.index, reactants_mol_list)
                if mol is None]
    if unparsed:
        raise ValueError(
            "Could not parse the 'Substrate' InChI for row(s): %s" % unparsed
        )

    # Descriptor table, one row per molecule.
    reactant_data = calc.pandas(reactants_mol_list)

    # Give the descriptor rows the same labels as the input rows so that the
    # column-wise concat below pairs them row by row even when data_input
    # does not carry a default 0..n-1 index (concat aligns on index labels).
    reactant_data.index = data_input.index

    # Join the Mordred descriptors to the columns already present in the input.
    add_reactants = pd.concat((data_input, reactant_data), axis=1)

    # Force any non-numeric entries to NaN, then replace them with 0.
    int_data = add_reactants.apply(pd.to_numeric, errors='coerce')
    return int_data.fillna(0)

#Remove zero-variance features
def remove_zero_varience(values):
    """Return ``values`` restricted to the columns kept by a default VarianceThreshold.

    A ``VarianceThreshold`` with default settings is fitted on ``values`` and
    its support mask is used to select columns; the input frame itself is
    not modified.
    """
    selector = VarianceThreshold()
    selector.fit(values)
    return values.loc[:, selector.get_support()]

def remove_95correlated(correlated, threshold=0.95):
    """Drop columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``correlated`` is computed with
    ``DataFrame.corr()`` and only the strict upper triangle is inspected, so
    for every correlated pair the column that appears first is kept and the
    later one is dropped.

    Only correlations *greater than* ``threshold`` are considered; strongly
    negative correlations are not treated as redundant.

    Parameters
    ----------
    correlated : pandas.DataFrame
        Feature table.
    threshold : float, default 0.95
        Correlation above which the later column of a pair is dropped.

    Returns
    -------
    pandas.DataFrame
        ``correlated`` without the dropped columns (the input is not modified).
    """
    corr_matrix = correlated.corr()

    # Strict upper triangle (k=1 excludes the diagonal). The builtin `bool` is
    # used because the `np.bool` alias no longer exists in current NumPy.
    above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_tri = corr_matrix.where(above_diagonal)

    # Masked-out cells are NaN and therefore never compare greater than threshold.
    to_drop = [column for column in upper_tri.columns
               if (upper_tri[column] > threshold).any()]

    return correlated.drop(to_drop, axis=1)

def remove_nonimportant(X_values, y_values, n_features=100):
    """Keep roughly the ``n_features`` most important columns of ``X_values``.

    A first random forest (``random_state=42``) is fitted and the importance
    of its ``n_features``-th best feature is taken as a cut-off. That cut-off
    is then handed to ``SelectFromModel`` wrapped around a second, separately
    fitted forest (800 trees, ``max_depth=30``, unseeded). Because the two
    forests are fitted independently, the number of columns returned is not
    guaranteed to be exactly ``n_features``.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table.
    y_values : array-like
        Class labels, one per row of ``X_values``.
    n_features : int, default 100
        Rank used to derive the importance cut-off. When ``X_values`` has
        fewer columns than this, the least important feature sets the cut-off.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the selected columns.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError("n_features must be at least 1")

    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_values, y_values)
    importances = forest.feature_importances_

    # Clamp the rank so that tables with fewer than n_features columns do not
    # index past the start of the sorted importances.
    rank = min(n_features, len(importances))
    threshold = np.sort(importances)[-rank]

    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(X_values, y_values)

    # Names of the columns whose importance reached the cut-off.
    selected_feat = X_values.columns[sel.get_support()]

    # Prints the names of the final selected features
    print(selected_feat)

    return X_values[selected_feat]

def dendrogram(X_values, y):
    """Keep one representative column per cluster of rank-correlated features.

    The Spearman correlation between the columns of ``X_values`` is turned
    into a distance (``1 - |correlation|``), the columns are clustered with
    Ward linkage, and the tree is cut at distance ``y``. From every resulting
    cluster only the column that comes first in ``X_values`` is kept.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table whose columns are clustered.
    y : float
        Distance threshold passed to ``fcluster`` (``criterion="distance"``).

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the representative columns, ordered by the
        first appearance of each cluster.
    """
    rank_corr = spearmanr(X_values).correlation
    # Symmetrise and pin the diagonal so the matrix is a valid input for squareform.
    rank_corr = (rank_corr + rank_corr.T) / 2
    np.fill_diagonal(rank_corr, 1)

    condensed_distances = squareform(1 - np.abs(rank_corr))
    linkage = hierarchy.ward(condensed_distances)
    cluster_of_feature = hierarchy.fcluster(linkage, y, criterion="distance")

    # Remember, per cluster id, the position of the first column assigned to it.
    first_member = {}
    for position, cluster_id in enumerate(cluster_of_feature):
        first_member.setdefault(cluster_id, position)

    representatives = list(first_member.values())
    kept_columns = X_values.columns[representatives]
    return X_values[kept_columns]

def classificationMetrics(results, y_test, pred, run_id=None, threshold=None):
    """Append one row of classification metrics to ``results``.

    The row holds, in order: the run identifier, the dendrogram threshold,
    the accuracy, and the per-class precision, recall and F1 values
    (``average=None``). Its width must match ``results.columns``, so both
    classes are assumed to appear in ``y_test``/``pred``.

    Parameters
    ----------
    results : pandas.DataFrame
        Accumulated metrics; its columns define the layout of the new row.
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels.
    run_id, threshold : optional
        Values stored in the first two columns. When omitted, the
        notebook-level loop variables ``x`` and ``y`` are used, which is how
        existing callers rely on this function.

    Returns
    -------
    pandas.DataFrame
        A new frame consisting of ``results`` plus the new row, re-indexed
        from 0. ``results`` itself is not modified.
    """
    # Backward-compatible fallback to the globals the original version read.
    if run_id is None:
        run_id = x
    if threshold is None:
        threshold = y

    acc = accuracy_score(y_test, pred)
    # zero_division=0 on every metric: a class that is never predicted or
    # never present scores 0 without emitting a warning.
    prec = precision_score(y_test, pred, average=None, zero_division=0)
    recall = recall_score(y_test, pred, average=None, zero_division=0)
    F1 = f1_score(y_test, pred, average=None, zero_division=0)

    # axis=None flattens the scalars and per-class arrays into one 1-D row.
    comb = np.concatenate((run_id, threshold, acc, prec, recall, F1), axis=None)
    new_row = pd.DataFrame([comb], columns=results.columns)

    # Skipping the concat for an empty accumulator keeps the numeric dtypes of
    # the first row instead of mixing them with the empty object columns.
    if results.empty:
        return new_row
    # DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
    return pd.concat([results, new_row], ignore_index=True)
        

In [8]:
# Limit rendered DataFrames to 10 rows and 10 columns so cell outputs stay short.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10

In [9]:
# Create the Mordred calculator that get_Mordred reads as a notebook-level
# global; ignore_3D=True is passed so that (per its name) 3D descriptors are
# left out -- this cell must run before get_Mordred is called.
calc = Calculator(descriptors, ignore_3D=True)

In [10]:
# Read the training/test input file; the file name carries no directory, so it
# is resolved relative to the notebook's working directory.
data = pd.read_csv('BorylationTrainingTest 1-10-25.csv')
# Show the loaded table (truncated by the display options set earlier).
data

Unnamed: 0,Substrate,Product,Boronic Ester,Active Catalyst-Ligand,Catalyst,...,Ring Atom Count,Rot Bond Count,Sigma Electronegativity,Wiener Index,Product_Ratio
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-6-7-8-9-10-11-12-15-16-1...,2,6,1,...,0,5,7.387931,84,1
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-7-8-9-10-11-12(2)15-16-1...,2,6,1,...,0,5,7.470532,84,0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-7-9-10-11-12(8-2)15-16-1...,2,6,1,...,0,5,7.493894,84,0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-7-9-11-12(10-8-2)15-16-1...,2,6,1,...,0,5,7.496149,84,0
4,"InChI=1S/C7H15N/c1-2-8-6-4-3-5-7-8/h2-7H2,1H3","InChI=1S/C13H26BNO2/c1-12(2)13(3,4)17-14(16-12...",2,7,3,...,6,1,7.937482,64,1
...,...,...,...,...,...,...,...,...,...,...,...
966,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,InChI=1S/C21H29BO2/c1-14-8-10-17(12-19-15(2)9-...,2,5,6,...,10,1,7.447806,331,0
967,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,InChI=1S/C21H29BO2/c1-13(2)17-12-18-14(3)9-10-...,2,5,6,...,10,1,8.374628,331,0
968,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,InChI=1S/C21H29BO2/c1-13(2)16-11-18-14(3)9-10-...,2,5,6,...,10,1,8.291631,331,0
969,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,InChI=1S/C21H29BO2/c1-14(2)16-9-10-17(18-11-8-...,2,5,6,...,10,1,7.959360,331,0


In [11]:
# Number the compounds: rows sharing the same 'Substrate' string receive the
# same integer id, numbered in order of first appearance (sort=False). The id
# is later used to split whole compounds into training and test sets.
data['grouped'] = data.groupby('Substrate', sort=False).ngroup()
# Show each substrate next to its group id as a visual check.
data[['Substrate','grouped']]

Unnamed: 0,Substrate,grouped
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
4,"InChI=1S/C7H15N/c1-2-8-6-4-3-5-7-8/h2-7H2,1H3",1
...,...,...
966,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,188
967,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,188
968,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,188
969,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,188


In [12]:
# Convert substrates to Mordred features.
# NOTE(review): this rebinds `data` to the all-numeric output of get_Mordred
# (the InChI text columns come back as 0), so running this cell a second time
# on its own result is not expected to work -- re-run the read_csv and
# grouping cells first.
data = get_Mordred(data)

100%|██████████| 971/971 [02:15<00:00,  7.17it/s]


In [13]:
# Y-randomization control. For each of the runs in `for_range` the target
# labels are shuffled, whole compounds are split 80/20 into training and test
# sets, the features are reduced, the training set is over-sampled, and a
# random forest is trained and scored at several dendrogram distance
# thresholds. The best-accuracy summary of every run is written to CSV.
rfcResults_df = pd.DataFrame(columns =  ['x', 'y', 'acc', 'precision 0',
                                   'precision 1','recall 0', 'recall 1', 'F1 0', 'F1 1'])

# Per-iteration pieces are gathered in lists and concatenated once after the
# loop: DataFrame.append no longer exists in pandas 2.x, and growing a frame
# row by row is quadratic.
val_tot_rows = []
prod_rows = []
test_index_frames = []
model_column_rows = []
rfcmaxacc_rows = []

# Metric columns of rfcResults_df that are summarised per threshold.
metric_columns = ['acc', 'precision 0', 'precision 1', 'recall 0', 'recall 1', 'F1 0', 'F1 1']

# Dendrogram distance thresholds; each one is evaluated 10 times per run.
cluster_thresholds = [t for t in (0.35, 0.40, 0.45, 0.50) for _ in range(10)]

# Ids of all compounds present in the data. Taken from the data rather than a
# hard-coded range so that the highest id can also be drawn into a test set.
arr = data['grouped'].unique()

for_range = range(1, 11)

# NOTE: the loop variables must stay named `x` and `y` -- classificationMetrics
# reads them as notebook-level globals when building a result row.
for x in for_range:

    # Randomize the target values.
    # NOTE(review): this shuffles `data` in place with an unseeded RNG, so the
    # original labels are gone after this cell and runs are not reproducible.
    data['Product_Ratio'] = data['Product_Ratio'].sample(frac=1).values

    # Draw 20% of the compound ids, without replacement, as the test compounds
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False)

    # Separate training (80%) and test data (20%) by compound
    training_data = filter_rows_by_values1(data, "grouped", set_numbers).drop('grouped', axis = 1)
    test_data = filter_rows_by_values2(data, "grouped", set_numbers).drop('grouped', axis = 1)

    # Separate response variable (Product Ratio) and feature variables
    # Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio', axis = 1)
    training_y = training_data['Product_Ratio']
    test_X = test_data.drop('Product_Ratio', axis = 1)
    test_y = test_data['Product_Ratio']

    # Remove features that are considered less important: the importance of the
    # 100th best feature of a first forest is used as the cut-off for
    # SelectFromModel (clamped when fewer than 100 features exist).
    forest = RandomForestClassifier(random_state=42)
    forest.fit(training_X, training_y)
    importances = forest.feature_importances_
    rank = min(100, len(importances))
    threshold = np.sort(importances)[-rank]
    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(training_X, training_y)

    # Select the reduced features set
    selected_feat = training_X.columns[sel.get_support()]
    reduced1_X = training_X[selected_feat]
    test_X = test_X[selected_feat]

    # Apply over-sampling to the training set
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(reduced1_X, training_y)

    for y in cluster_thresholds:

        # Final training set for this threshold; the test set is restricted to
        # the same columns in a separate variable so that test_X keeps every
        # selected feature for the following thresholds.
        X_train = dendrogram(X_resampled, y)
        test_X_clustered = test_X[X_train.columns]

        # One row recording (run, threshold, list of model columns)
        training_columns_list = pd.DataFrame((x, y, X_train.columns.tolist())).T
        model_column_rows.append(training_columns_list)

        # Random Forest Classifier
        rfc = RandomForestClassifier(n_estimators=800,max_depth=9)
        rfc.fit(X_train, y_resampled)
        pred_rfc = rfc.predict(test_X_clustered)
        rfcResults_df = classificationMetrics(rfcResults_df, test_y, pred_rfc)

        # Keep the line-by-line predictions, labelled by (run, threshold)
        prediction_df = pd.DataFrame(pred_rfc, columns = [(x,y)])
        val_tot_rows.append(prediction_df.T)

    # Mean and standard deviation of every metric per dendrogram threshold,
    # over all results gathered so far (this run and the previous ones).
    stat_series = []
    stat_names = []
    for metric in metric_columns:
        per_threshold = rfcResults_df.groupby('y')[metric]
        label = metric.replace(' ', '_')
        stat_series += [per_threshold.mean(), per_threshold.std()]
        stat_names += [f'{label}_mean', f'{label}_std']
    rfcaverage_df = pd.concat(stat_series, axis=1)
    rfcaverage_df.columns = stat_names

    # Row(s) of the threshold with the highest mean accuracy, tagged with the run
    rfcmaxacc = rfcaverage_df[rfcaverage_df.acc_mean == rfcaverage_df.acc_mean.max()]
    rfcmaxacc_copy  = rfcmaxacc.copy()
    rfcmaxacc_copy['x_col'] = x
    rfcmaxacc_rows.append(rfcmaxacc_copy)

    # Remember which rows formed the test set and what their (shuffled) labels were
    test_index_frames.append(pd.DataFrame(test_data.index.values, columns = [x]))
    test_y = test_y.rename(x).reset_index(drop = True)
    prod_rows.append(test_y.to_frame().T)

val_tot = pd.concat(val_tot_rows)
model_columns = pd.concat(model_column_rows)
test_index_total = pd.concat(test_index_frames, axis = 1)
rfcmaxacc_comb = pd.concat(rfcmaxacc_rows)

prod = pd.concat(prod_rows).T
total_results = val_tot.T

rfcmaxacc_comb.to_csv("10Runs_rfc_shuffle_y.csv")

In [14]:
# Display the per-run summary collected in rfcmaxacc_comb: for each run (x_col),
# the threshold row(s) with the highest mean accuracy.
rfcmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,...,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.35,0.714286,0.005577,0.77547,0.001753,0.203215,...,0.828993,0.003928,0.131787,0.008866,1
0.35,0.750412,0.037562,0.804225,0.029584,0.224026,...,0.853436,0.02539,0.142258,0.017368,2
0.35,0.746441,0.031242,0.803662,0.024031,0.188542,...,0.851505,0.020923,0.120668,0.036144,3
0.35,0.755183,0.031087,0.807009,0.021548,0.216509,...,0.857223,0.0207,0.130132,0.035626,4
0.4,0.756088,0.030335,0.814051,0.022942,0.217866,...,0.857281,0.020845,0.14499,0.045842,5
0.4,0.757595,0.027914,0.809239,0.02361,0.271027,...,0.857739,0.01905,0.165873,0.06378,6
0.4,0.753504,0.027811,0.802868,0.02691,0.265003,...,0.85526,0.018724,0.15659,0.063667,7
0.35,0.754612,0.026352,0.804842,0.028417,0.252077,...,0.856388,0.017515,0.146703,0.057264,8
0.4,0.754292,0.024862,0.805161,0.024252,0.22947,...,0.856389,0.016821,0.136312,0.069339,9
0.4,0.755319,0.023828,0.803739,0.023401,0.212674,...,0.85737,0.016251,0.124673,0.074792,10
