In [1]:
# Import required libraries for data manipulation and analysis
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

In [2]:
#Import required sklearn functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from collections import defaultdict

In [3]:
#Import sklearn classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [4]:
#Import library to oversample 
from imblearn.over_sampling import RandomOverSampler

In [5]:
#Import RDKit and Mordred libraries
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import Calculator, descriptors

In [6]:
#Allows figures to be visualized in jupyter notebook
%matplotlib inline

In [7]:
#Functions used in the study

#Remove those numbers from analysis data
def filter_rows_by_values1(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is NOT one of ``values``."""
    is_listed = df[col].isin(values)
    return df.loc[~is_listed]

#Keep only the rows whose value appears in the given list
def filter_rows_by_values2(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry IS one of ``values``."""
    is_listed = df[col].isin(values)
    return df.loc[is_listed]

#Get Mordred calcs
def get_Mordred(data_input):
    """Append Mordred descriptors of every substrate to ``data_input`` and make the frame numeric.

    Relies on the notebook-level Mordred calculator ``calc``, which must be
    defined before this function is called.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain a 'Substrate' column holding InChI strings.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data_input`` followed by the descriptor columns.
        Every column is passed through ``pd.to_numeric(errors='coerce')`` and
        the resulting NaN values are replaced by 0, so text columns
        (including 'Substrate' itself) come back as all zeros.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the InChI strings.
    """
    reactants = data_input['Substrate']

    # Convert each InChI string into an RDKit molecule.
    reactants_mol_list = [Chem.MolFromInchi(inchi) for inchi in reactants]

    # MolFromInchi signals a parse failure by returning None. Stop here with a
    # readable message instead of handing None to the descriptor calculator.
    unparsed = [inchi for inchi, mol in zip(reactants, reactants_mol_list) if mol is None]
    if unparsed:
        raise ValueError(
            f"{len(unparsed)} substrate InChI string(s) could not be parsed, e.g. {unparsed[0]!r}"
        )

    # One row of descriptors per molecule, in the same order as ``reactants``.
    reactant_data = calc.pandas(reactants_mol_list)

    # pd.concat(axis=1) pairs rows by index label, not by position. Give the
    # descriptor rows the labels of their source rows so a filtered or
    # re-indexed ``data_input`` is not mis-aligned or padded with NaN rows.
    reactant_data.index = data_input.index

    #Joins Mordred parameters with experimental, atomic charges, and JChem for Excel parameters
    add_reactants = pd.concat((data_input, reactant_data), axis=1)

    #Force any non-numeric entries to NaN and replace them with 0
    int_data = add_reactants.apply(pd.to_numeric, errors='coerce')
    return int_data.fillna(0)

#Remove zero-variance features
def remove_zero_varience(values):
    """Return ``values`` restricted to the columns a default VarianceThreshold keeps.

    The selector is fitted on ``values`` itself; with scikit-learn's default
    settings this discards columns whose value never changes.
    """
    selector = VarianceThreshold().fit(values)
    return values.loc[:, selector.get_support()]

def remove_95correlated(correlated, threshold=0.95):
    """Drop every column that is highly correlated with a column to its left.

    Parameters
    ----------
    correlated : pandas.DataFrame
        Feature table; correlations come from ``DataFrame.corr()``.
    threshold : float, default 0.95
        A column is dropped when its correlation with any earlier column is
        strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``correlated`` without the dropped columns.

    Notes
    -----
    The comparison uses the signed correlation, so strongly *negatively*
    correlated columns are kept.
    """
    corr_matrix = correlated.corr()

    # Strict upper triangle: every pair is inspected once and the diagonal
    # (self-correlation of 1) is excluded. The builtin ``bool`` replaces the
    # ``np.bool`` alias, which no longer exists in NumPy >= 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper_tri.columns
               if (upper_tri[column] > threshold).any()]

    return correlated.drop(to_drop, axis=1)

def remove_nonimportant(X_values, y_values, n_features=100):
    """Keep the columns of ``X_values`` that random forests rank as most important.

    A first forest (random_state=42) supplies the importance cut-off: the
    ``n_features``-th largest importance it finds. ``SelectFromModel`` then
    fits a second forest (800 trees, max_depth=30) and keeps every column
    whose importance in that second forest reaches the cut-off.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table.
    y_values : array-like
        Class labels, one per row of ``X_values``.
    n_features : int, default 100
        Rank used to derive the importance cut-off. Capped at the number of
        available columns.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the selected columns. The selected column
        names are also printed.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError("n_features must be at least 1")

    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_values, y_values)
    importances = forest.feature_importances_

    # Indexing [-100] unconditionally raised IndexError whenever fewer than
    # 100 columns were passed in; cap the rank at the number of columns.
    n_keep = min(n_features, len(importances))
    threshold = np.sort(importances)[-n_keep]

    # NOTE(review): the cut-off comes from the seeded forest above but is
    # compared against the importances of this second, unseeded forest, so
    # the number of selected columns need not equal n_keep and can vary
    # between runs.
    sel = SelectFromModel(RandomForestClassifier(n_estimators=800, max_depth=30),
                          threshold=threshold)
    sel.fit(X_values, y_values)

    # Select the final feature set
    selected_feat = X_values.columns[sel.get_support()]

    # Prints the names of the final selected features
    print(selected_feat)
    return X_values[selected_feat]

def dendrogram(X_values, y):
    """Keep one representative column per cluster of rank-correlated features.

    Columns of ``X_values`` are clustered with Ward linkage on the distance
    ``1 - |Spearman correlation|``. The tree is cut at distance ``y`` and the
    first column (in column order) of every resulting cluster is kept.

    Parameters
    ----------
    X_values : pandas.DataFrame
        Feature table.
    y : float
        Distance at which the cluster tree is cut.

    Returns
    -------
    pandas.DataFrame
        ``X_values`` restricted to the representative columns.
    """
    rank_corr = spearmanr(X_values).correlation
    # Average with the transpose and pin the diagonal so the matrix is exactly
    # symmetric with unit self-correlation.
    rank_corr = (rank_corr + rank_corr.T) / 2
    np.fill_diagonal(rank_corr, 1)

    linkage = hierarchy.ward(squareform(1 - np.abs(rank_corr)))
    cluster_of_column = hierarchy.fcluster(linkage, y, criterion="distance")

    # Remember the first column position seen for each cluster id; dict
    # insertion order keeps the clusters in order of first appearance.
    first_position = {}
    for position, cluster_id in enumerate(cluster_of_column):
        first_position.setdefault(cluster_id, position)

    representatives = X_values.columns[list(first_position.values())]
    return X_values[representatives]

def classificationMetrics(results, y_test, pred):
    """Score one set of predictions and return ``results`` with the scores added as a new row.

    The new row is laid out as (x, y, accuracy, per-class precision,
    per-class recall, per-class F1) and is labelled with ``results.columns``,
    so its length must equal the number of columns of ``results``.

    NOTE: ``x`` and ``y`` are not parameters. They are read from the
    enclosing (notebook-global) scope, so this function only works while
    variables with exactly those names exist there.

    Parameters
    ----------
    results : pandas.DataFrame
        Accumulated results so far (may be empty).
    y_test : array-like
        True class labels.
    pred : array-like
        Predicted class labels.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the previous rows plus the new one; ``results``
        itself is not modified.
    """
    acc = accuracy_score(y_test, pred)
    # average=None yields one score per class. zero_division=0 is passed to
    # all three metrics so an undefined score is reported as 0 consistently.
    prec = precision_score(y_test, pred, average=None, zero_division=0)
    recall = recall_score(y_test, pred, average=None, zero_division=0)
    F1 = f1_score(y_test, pred, average=None, zero_division=0)

    # axis=None flattens the scalars and the per-class arrays into one 1-D row.
    comb = np.concatenate((x, y, acc, prec, recall, F1), axis=None)
    new_row = pd.DataFrame([comb], columns=results.columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # An empty ``results`` is skipped so that its placeholder object-dtype
    # columns cannot turn the numeric scores into object dtype.
    if results.empty:
        return new_row
    return pd.concat([results, new_row], ignore_index=True)


In [8]:
# Limit how much of a DataFrame is rendered in cell outputs:
# at most 10 rows and 200 columns.
for option_name, option_value in (('display.max_rows', 10),
                                  ('display.max_columns', 200)):
    pd.set_option(option_name, option_value)

In [9]:
# Create Mordred Calculator
# get_Mordred() reads this notebook-level name, so this cell has to run
# before get_Mordred() is called.
# NOTE(review): ignore_3D=True presumably restricts the calculator to
# descriptors that do not need 3D coordinates; confirm against the Mordred docs.
calc = Calculator(descriptors, ignore_3D=True)

In [10]:
# Read Training/Test data input File
# The path is relative, so the CSV has to be in the notebook's working directory.
data = pd.read_csv('BorylationTrainingTest 9-26-24.csv')
# Bare last expression: shows the first rows as the cell output.
data.head()

Unnamed: 0,Substrate,Product,Boronic Ester,Active Catalyst-Ligand,Catalyst,Ligand,Buried_Vol,PyramidalizationAR,PyramidalizationG,SASA_area,SASA_vol,Sterimol_L,SterimolB_1,Sterimol_B_5,Buried_Sterimol_L,Buried_SterimolB_1,Buried_Sterimol_B_5,Hirshfeld Heavy Atom Charge,CM5 Charge,Hirshfeld Carbon Charge,Hirshfeld Hydrogen Charge,ESP Heavy Atom Charge,ESP Carbon Charge,ESP Hydrogen Charge,NPA Carbon Charge,NPA Hydrogen Charge,MBS Heavy Atom Charge,MBS Carbon Charge,MBS Hydrogen Charge,Mulliken Heavy Charge,Mulliken Carbon Charge,Mulliken Hydrogen Charge,Solvent,Temp,Aliphatic Atom Count,Aliphatic Bond Count,Aliphatic Ring Count,Aromatic Atom Count,Aromatic Bond Count,Aromatic Ring Count,Steric Effect Index,Atomic_Polarizability,Balaban Index,Chain Atom Count,Distance Degree,Dreiding Energy,Eccentricity,Harary Index,Hydrogen Acceptor Count,Hydrogen Acceptor Site Count,Heteroatom Aromatic Ring Count,Hydrogen Donor Count,Hydrogen Donor Site Count,Hyper Wiener Index,Largest Ring Size,Max Projection Area,Max Projection Radius,MaxZ,Moleculare Polarizability,Platt Index,Refractivity,Ring Atom Count,Rot Bond Count,Sigma Electronegativity,Wiener Index,Product_Ratio
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-6-7-8-9-10-11-12-15-16-1...,2,6,1,1,43.1,0.683535,10.223844,735.602381,1441.376195,5.941608,4.79083,7.558608,5.941608,4.79083,7.2781,-0.003389,-0.000578,-0.101759,0.032223,-0.078389,-0.182602,0.032537,-0.57256,0.19272,-0.008908,-0.25419,0.080765,-0.22032,-0.621212,0.132089,2,150,8,7,0,0,0,0,1.193207,1.116291,2.53006,8,28,12.13,7,13.742857,0,0,0,0,0,210,0,52.554143,6.587896,5.713901,15.52291,12,38.6102,0,5,7.387931,84,1
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-7-8-9-10-11-12(2)15-16-1...,2,6,1,1,43.1,0.683535,10.223844,735.602381,1441.376195,5.941608,4.79083,7.558608,5.941608,4.79083,7.2781,0.003094,0.000249,-0.057714,0.030404,0.098246,0.182934,-0.042344,-0.38255,0.18757,0.009321,-0.138428,0.073874,0.044556,-0.224805,0.13468,2,150,8,7,0,0,0,0,2.190292,1.116291,2.53006,8,22,12.13,6,13.742857,0,0,0,0,0,210,0,52.554143,6.587896,5.713901,15.52291,12,38.6102,0,5,7.470532,84,0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-7-9-10-11-12(8-2)15-16-1...,2,6,1,1,43.1,0.683535,10.223844,735.602381,1441.376195,5.941608,4.79083,7.558608,5.941608,4.79083,7.2781,-5.2e-05,-3.2e-05,-0.059257,0.029602,-0.011526,0.182934,-0.012016,-0.37899,0.18768,-0.000222,-0.141245,0.070512,0.012779,-0.261495,0.137137,2,150,8,7,0,0,0,0,2.310662,1.116291,2.53006,8,18,12.13,5,13.742857,0,0,0,0,0,210,0,52.554143,6.587896,5.713901,15.52291,12,38.6102,0,5,7.493894,84,0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",InChI=1S/C14H29BO2/c1-7-9-11-12(10-8-2)15-16-1...,2,6,1,1,43.1,0.683535,10.223844,735.602381,1441.376195,5.941608,4.79083,7.558608,5.941608,4.79083,7.2781,0.000299,0.000312,-0.058954,0.029626,-0.008331,0.037009,-0.02267,-0.37717,0.18811,-0.000191,-0.141973,0.070891,0.162986,-0.115385,0.139186,2,150,8,7,0,0,0,0,2.339699,1.116291,2.53006,8,16,12.13,4,13.742857,0,0,0,0,0,210,0,52.554143,6.587896,5.713901,15.52291,12,38.6102,0,5,7.496149,84,0
4,"InChI=1S/C7H15N/c1-2-8-6-4-3-5-7-8/h2-7H2,1H3","InChI=1S/C13H26BNO2/c1-12(2)13(3,4)17-14(16-12...",2,7,3,1,41.0,1.059976,9.242505,814.372698,1707.085805,6.090976,5.055852,9.142,6.090976,5.055852,7.624077,0.002028,0.013777,-0.102309,0.036494,-0.043709,-0.156454,0.023226,-0.58186,0.20119,0.006751,-0.255789,0.08557,-0.086611,-0.497717,0.137539,3,150,8,8,1,0,0,0,2.188081,1.116291,2.125016,2,16,21.2,4,15.783333,1,1,0,0,0,122,6,43.497048,4.404837,6.112425,14.445701,18,36.8787,6,1,7.937482,64,1


In [11]:
#group the compounds by numbers
# Give every distinct 'Substrate' string an integer id, numbered in order of
# first appearance (sort=False). Rows of the same substrate share an id, which
# is what the train/test split later selects on.
# NOTE: this adds the 'grouped' column to `data` in place.
data['grouped'] = data.groupby('Substrate', sort=False).ngroup()
# Bare last expression: shows the substrate/id pairs as the cell output.
data[['Substrate','grouped']]

Unnamed: 0,Substrate,grouped
0,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
1,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
2,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
3,"InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",0
4,"InChI=1S/C7H15N/c1-2-8-6-4-3-5-7-8/h2-7H2,1H3",1
...,...,...
1022,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1023,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1024,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198
1025,InChI=1S/C15H18/c1-10(2)13-7-5-11(3)14-8-6-12(...,198


In [12]:
#Convert substrates to Mordred features
# NOTE: `data` is overwritten with the all-numeric frame returned by
# get_Mordred(), in which the 'Substrate' text has been coerced to 0.
# Re-running this cell therefore requires re-running the load and grouping
# cells first.
data = get_Mordred(data)

100%|██████████| 1027/1027 [01:51<00:00,  9.24it/s]


In [13]:
# Column layout shared by all per-classifier result tables: the two loop
# variables of the screening (x, y), the accuracy, then precision / recall /
# F1 for class 0 and class 1.
result_columns = ['x', 'y', 'acc', 'precision 0', 'precision 1',
                  'recall 0', 'recall 1', 'F1 0', 'F1 1']

# One empty results table per classifier (each an independent DataFrame).
(MLPResults_df, SVMResults_df, GaussResults_df,
 DTResults_df, NBResults_df, LRResults_df) = (
    pd.DataFrame(columns=result_columns) for _ in range(6))

# One empty accumulator per classifier for the best-accuracy summary rows.
(MLPmaxacc_comb, SVMmaxacc_comb, Gaussmaxacc_comb,
 DTmaxacc_comb, NBmaxacc_comb, LRmaxacc_comb) = (
    pd.DataFrame() for _ in range(6))

model_columns = pd.DataFrame()

# Values taken by the outer loop variable x: 1 .. 10.
for_range = range(1, 11)
def _summarize_by_cutoff(results_df):
    """Mean and standard deviation of every metric in ``results_df`` per value of its 'y' column.

    Returns a frame indexed by 'y' with the columns acc_mean, acc_std,
    precision_0_mean, precision_0_std, ..., F1_1_mean, F1_1_std.
    """
    by_cutoff = results_df.groupby('y')
    pieces = []
    names = []
    for metric in ('acc', 'precision 0', 'precision 1',
                   'recall 0', 'recall 1', 'F1 0', 'F1 1'):
        label = metric.replace(' ', '_')
        pieces += [by_cutoff[metric].mean(), by_cutoff[metric].std()]
        names += [label + '_mean', label + '_std']
    average_df = pd.concat(pieces, axis=1)
    average_df.columns = names
    return average_df


#Get numbers to represent compounds
# Taken from the data instead of the former hard-coded np.arange(0, 198):
# that range stopped at 197 although the 'grouped' ids run up to at least
# 198, so the last substrate could never be drawn into a test set.
arr = data['grouped'].unique()

# Dendrogram cut-offs to screen. Each value is listed ten times, so every
# classifier is re-fitted ten times per cut-off on the same split.
cutoff_schedule = [cutoff for cutoff in (0.35, 0.40, 0.45, 0.50) for _ in range(10)]

# The loop variables must keep the names `x` and `y`: classificationMetrics()
# reads them from the notebook's global scope.
for x in for_range:
    #Get 20% of numbers, without replacement
    # NOTE(review): no random seed is set here (nor for several of the models
    # below), so the splits and scores differ from run to run.
    set_numbers = np.random.choice(arr, int(len(arr)*0.20), replace=False)

    #Separate training and test data by substrate, so that all rows of one
    #substrate end up on the same side of the split
    training_data = filter_rows_by_values1(data, "grouped", set_numbers)
    training_data = training_data.drop('grouped', axis = 1)
    test_data = filter_rows_by_values2(data, "grouped", set_numbers)
    test_data = test_data.drop('grouped', axis = 1)

    #Remove features that dont change
    training_data = remove_zero_varience(training_data)

    #Remove features that are more than 95% correlated
    training_data = remove_95correlated(training_data)

    # Separate dataset as response variable (Product Ratio) and feature variables
    #Note: Product Ratio is described as "0" for non-borylating sites and "1" for borylating sites
    training_X = training_data.drop('Product_Ratio' , axis = 1)
    training_y = training_data['Product_Ratio']
    test_y = test_data['Product_Ratio']

    #Remove features that are considered less important
    # feature_names, std and the timing values are not used again in this
    # cell; they are still assigned in case other cells read them.
    feature_names = [f"feature {i}" for i in range(training_X.shape[1])]
    forest = RandomForestClassifier(random_state=42)
    forest.fit(training_X, training_y)

    start_time = time.time()
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    elapsed_time = time.time() - start_time

    # Importance of the 100th-ranked feature of the seeded forest, used as the
    # cut-off for a second, larger forest.
    threshold = np.sort(importances)[-100]
    sel = SelectFromModel(RandomForestClassifier(n_estimators = 800, max_depth=30),threshold=threshold)
    sel.fit(training_X, training_y)

    # Select the reduced features set
    selected_feat = training_X.columns[(sel.get_support())]

    reduced1_X = training_X[selected_feat]
    # Test features restricted to the selected columns. Kept under its own name
    # so that each cut-off below picks its columns from the full selection
    # rather than from whatever the previous cut-off left behind.
    test_X_selected = test_data.drop('Product_Ratio' , axis = 1)[selected_feat]

    #Apply over-sampling to dataset
    ros = RandomOverSampler(random_state=10)
    X_resampled, y_resampled = ros.fit_resample(reduced1_X, training_y)

    for y in cutoff_schedule:

        #Make final training and test set and save them as df's
        X_train = dendrogram(X_resampled, y)
        test_X = test_X_selected[X_train.columns]
        training_columns_list = X_train.columns.tolist()
        training_columns_list = (x, y, training_columns_list)
        training_columns_list = (pd.DataFrame(training_columns_list).T)

        #MLP Classifier
        MLP = MLPClassifier(alpha=1, max_iter=1000)
        MLP.fit(X_train, y_resampled)
        pred_MLP = MLP.predict(test_X)
        MLPResults_df = classificationMetrics(MLPResults_df, test_y, pred_MLP)

        #SVM Classifier
        SVM = SVC(kernel="linear", C=0.025)
        SVM.fit(X_train, y_resampled)
        pred_SVM = SVM.predict(test_X)
        SVMResults_df = classificationMetrics(SVMResults_df, test_y, pred_SVM)

        #Gaussian Process Classifier
        Gauss = GaussianProcessClassifier(1.0 * RBF(1.0))
        Gauss.fit(X_train, y_resampled)
        pred_Gauss = Gauss.predict(test_X)
        GaussResults_df = classificationMetrics(GaussResults_df, test_y, pred_Gauss)

        #Decision Tree Classifier
        DT = DecisionTreeClassifier(max_depth=9)
        DT.fit(X_train, y_resampled)
        pred_DT = DT.predict(test_X)
        DTResults_df = classificationMetrics(DTResults_df, test_y, pred_DT)

        #Naive Bayes Classifier
        NB = GaussianNB()
        NB.fit(X_train, y_resampled)
        pred_NB = NB.predict(test_X)
        NBResults_df = classificationMetrics(NBResults_df, test_y, pred_NB)

        #Logistic Classifier
        clf = LogisticRegression(solver = 'liblinear', penalty = "l1", C=0.01)
        clf.fit(X_train, y_resampled)
        pred_LR = clf.predict(test_X)
        LRResults_df = classificationMetrics(LRResults_df, test_y, pred_LR)

    #Determine the mean accuracy of the different dendrogram settings
    # The result tables accumulate over all splits processed so far, so these
    # are running statistics up to and including split x. They are computed
    # once per split, after the last cut-off; computing them after every
    # single fit only ever had its final value used. For each classifier the
    # cut-off row(s) with the highest mean accuracy are tagged with x and
    # added to that classifier's accumulator.
    MLPaverage_df = _summarize_by_cutoff(MLPResults_df)
    MLPmaxacc = MLPaverage_df[MLPaverage_df.acc_mean == MLPaverage_df.acc_mean.max()]
    MLPmaxacc_copy = MLPmaxacc.copy()
    MLPmaxacc_copy['x_col'] = x

    SVMaverage_df = _summarize_by_cutoff(SVMResults_df)
    SVMmaxacc = SVMaverage_df[SVMaverage_df.acc_mean == SVMaverage_df.acc_mean.max()]
    SVMmaxacc_copy = SVMmaxacc.copy()
    SVMmaxacc_copy['x_col'] = x

    Gaussaverage_df = _summarize_by_cutoff(GaussResults_df)
    Gaussmaxacc = Gaussaverage_df[Gaussaverage_df.acc_mean == Gaussaverage_df.acc_mean.max()]
    Gaussmaxacc_copy = Gaussmaxacc.copy()
    Gaussmaxacc_copy['x_col'] = x

    DTaverage_df = _summarize_by_cutoff(DTResults_df)
    DTmaxacc = DTaverage_df[DTaverage_df.acc_mean == DTaverage_df.acc_mean.max()]
    DTmaxacc_copy = DTmaxacc.copy()
    DTmaxacc_copy['x_col'] = x

    NBaverage_df = _summarize_by_cutoff(NBResults_df)
    NBmaxacc = NBaverage_df[NBaverage_df.acc_mean == NBaverage_df.acc_mean.max()]
    NBmaxacc_copy = NBmaxacc.copy()
    NBmaxacc_copy['x_col'] = x

    LRaverage_df = _summarize_by_cutoff(LRResults_df)
    LRmaxacc = LRaverage_df[LRaverage_df.acc_mean == LRaverage_df.acc_mean.max()]
    LRmaxacc_copy = LRmaxacc.copy()
    LRmaxacc_copy['x_col'] = x

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    MLPmaxacc_comb = pd.concat([MLPmaxacc_comb, MLPmaxacc_copy])
    SVMmaxacc_comb = pd.concat([SVMmaxacc_comb, SVMmaxacc_copy])
    Gaussmaxacc_comb = pd.concat([Gaussmaxacc_comb, Gaussmaxacc_copy])
    DTmaxacc_comb = pd.concat([DTmaxacc_comb, DTmaxacc_copy])
    NBmaxacc_comb = pd.concat([NBmaxacc_comb, NBmaxacc_copy])
    LRmaxacc_comb = pd.concat([LRmaxacc_comb, LRmaxacc_copy])

#print(model_columns)
#model_columns.to_csv("model_columns.csv", index=False)

# Write every classifier's summary table into one CSV file.
# The first table is written with to_csv's default mode, which starts the
# file afresh; each later table is appended (mode="a") below it.
# Every table is written with its own header row, in the order
# MLP, SVM, Gauss, DT, NB, LR.
screening_csv = "10Runs_ClassificationScreening.csv"
summary_tables = [
    MLPmaxacc_comb,
    SVMmaxacc_comb,
    Gaussmaxacc_comb,
    DTmaxacc_comb,
    NBmaxacc_comb,
    LRmaxacc_comb,
]
for position, table in enumerate(summary_tables):
    if position == 0:
        table.to_csv(screening_csv)
    else:
        table.to_csv(screening_csv, mode="a")


In [14]:
# Display the accumulated MLP summary table (the first table written to the screening CSV).
MLPmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.35,0.625806,0.141616,0.861996,0.045822,0.325825,0.064448,0.644898,0.252299,0.553846,0.285578,0.703247,0.173564,0.365015,0.067691,1
0.35,0.650342,0.111328,0.856391,0.037826,0.312011,0.06327,0.687388,0.198797,0.498718,0.238,0.740855,0.133933,0.34931,0.059533,2
0.35,0.639409,0.098395,0.851045,0.033278,0.315916,0.053787,0.667097,0.17702,0.528205,0.210323,0.730719,0.117352,0.369293,0.057821,3
0.35,0.635029,0.104918,0.856295,0.035217,0.309148,0.050657,0.65881,0.184766,0.539103,0.216677,0.725279,0.124975,0.367031,0.055443,4
0.35,0.626622,0.113353,0.861922,0.03804,0.298902,0.052039,0.645448,0.195885,0.550769,0.232145,0.715769,0.133536,0.359988,0.058454,5
0.35,0.625076,0.113527,0.862047,0.037282,0.297159,0.049677,0.643075,0.195084,0.552564,0.22736,0.714198,0.134855,0.360656,0.054654,6
0.4,0.624923,0.115156,0.859443,0.034932,0.299784,0.073688,0.644737,0.196212,0.543223,0.222606,0.714558,0.137495,0.355441,0.058601,7
0.4,0.622211,0.111081,0.857842,0.034788,0.297119,0.069637,0.641186,0.190406,0.544551,0.218459,0.712809,0.132781,0.356016,0.057657,8
0.4,0.611556,0.117466,0.861323,0.037962,0.293754,0.067015,0.622923,0.200947,0.563818,0.228743,0.698348,0.146207,0.357966,0.058692,9
0.4,0.611533,0.115926,0.861911,0.036952,0.289943,0.064938,0.623116,0.197793,0.562564,0.226127,0.699558,0.143262,0.355671,0.057348,10


In [15]:
# Display the accumulated SVM summary table (index y, one x_col value per row in the saved output).
SVMmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.35,0.612903,0.0,0.886598,0.0,0.314607,0.0,0.585034,0.0,0.717949,0.0,0.704918,0.0,0.4375,0.0,1
0.35,0.638964,0.026738,0.872403,0.014563,0.302231,0.012697,0.643127,0.059602,0.615385,0.105229,0.738365,0.034316,0.403935,0.034437,2
0.35,0.626756,0.02787,0.85938,0.022133,0.303709,0.010495,0.630771,0.051413,0.606838,0.086058,0.726161,0.032858,0.403793,0.027875,3
0.35,0.633002,0.026413,0.860805,0.019249,0.301115,0.01013,0.641288,0.048019,0.596154,0.076539,0.733857,0.031385,0.399336,0.025276,4
0.35,0.639112,0.026601,0.866088,0.020219,0.300152,0.009245,0.647888,0.044867,0.6,0.068724,0.740333,0.030906,0.399469,0.022552,5
0.35,0.626981,0.036551,0.867573,0.018728,0.295126,0.01412,0.629079,0.058913,0.615385,0.071596,0.727616,0.040194,0.397639,0.020962,6
0.35,0.615269,0.044466,0.864082,0.019341,0.286493,0.024983,0.615519,0.063929,0.611722,0.066819,0.717098,0.045329,0.3888,0.029177,7
0.35,0.598845,0.060326,0.869167,0.022583,0.284218,0.024121,0.587048,0.096516,0.641026,0.099934,0.695318,0.071814,0.391126,0.027962,8
0.35,0.600727,0.057088,0.868272,0.021429,0.285159,0.022883,0.590254,0.091388,0.638177,0.0945,0.697855,0.068043,0.391745,0.026403,9
0.35,0.609051,0.059663,0.866313,0.021159,0.283309,0.022401,0.605795,0.098508,0.615385,0.112919,0.707454,0.07071,0.384893,0.032456,10


In [16]:
# Display the accumulated Gauss summary table.
# NOTE(review): in the saved output recall_1_mean stays below 0.03 while
# recall_0_mean is above 0.98, so the high acc_mean appears to come almost
# entirely from class 0 -- confirm before comparing classifiers on accuracy.
Gaussmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.35,0.784946,0.0,0.79235,0.0,0.333333,0.0,0.986395,0.0,0.025641,0.0,0.878788,0.0,0.047619,0.0,1
0.35,0.793951,0.009239,0.79964,0.00748,0.166667,0.170996,0.990148,0.003851,0.012821,0.013154,0.884749,0.006116,0.02381,0.024428,2
0.35,0.784661,0.015312,0.789956,0.015188,0.111111,0.159821,0.990907,0.003303,0.008547,0.012294,0.879016,0.009618,0.015873,0.022832,3
0.35,0.789988,0.016176,0.79396,0.014861,0.083333,0.146176,0.99318,0.0049,0.00641,0.011244,0.882402,0.010202,0.011905,0.020882,4
0.35,0.795542,0.01828,0.798719,0.016378,0.066667,0.134687,0.994544,0.005168,0.005128,0.010361,0.885871,0.011486,0.009524,0.019241,5
0.35,0.796455,0.016786,0.799103,0.014951,0.055556,0.125274,0.995454,0.005136,0.004274,0.009636,0.886479,0.010557,0.007937,0.017896,6
0.35,0.797676,0.015811,0.799945,0.01398,0.047619,0.117485,0.996103,0.005013,0.003663,0.009037,0.887263,0.009952,0.006803,0.016784,7
0.35,0.796757,0.014978,0.798742,0.013452,0.041667,0.110935,0.99659,0.004861,0.003205,0.008533,0.886715,0.009414,0.005952,0.015848,8
0.35,0.796532,0.014126,0.798297,0.012737,0.037037,0.105343,0.996969,0.004705,0.002849,0.008103,0.886595,0.008876,0.005291,0.015049,9
0.35,0.798483,0.014628,0.800071,0.013208,0.033333,0.100504,0.997272,0.004553,0.002564,0.007731,0.887805,0.009173,0.004762,0.014358,10


In [17]:
# Display the accumulated DT summary table (index y, one x_col value per row in the saved output).
DTmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.4,0.687097,0.0034,0.86883,0.006152,0.35354,0.007423,0.711565,0.003513,0.594872,0.023562,0.782351,0.001799,0.443462,0.012304,1
0.4,0.69946,0.014768,0.889389,0.022216,0.361208,0.012707,0.712795,0.009353,0.648718,0.061189,0.791219,0.010856,0.463547,0.0248,2
0.45,0.704956,0.017566,0.887669,0.018145,0.382147,0.036632,0.71767,0.01429,0.65812,0.052303,0.793524,0.011593,0.482766,0.0385,3
0.35,0.705604,0.019611,0.872993,0.008685,0.365218,0.043518,0.736334,0.021415,0.583974,0.052416,0.798734,0.014739,0.449058,0.047054,4
0.35,0.712054,0.022586,0.880228,0.016963,0.36732,0.039978,0.740039,0.021204,0.599487,0.058123,0.803951,0.017361,0.455079,0.044906,5
0.35,0.717443,0.024404,0.878693,0.0163,0.371614,0.038769,0.749926,0.029972,0.586752,0.062468,0.808889,0.019644,0.453989,0.042471,6
0.35,0.719808,0.023569,0.876482,0.016171,0.370314,0.036523,0.756459,0.032242,0.571429,0.069593,0.811624,0.019529,0.448103,0.042437,7
0.35,0.721028,0.022429,0.876093,0.015241,0.374038,0.035778,0.758075,0.030581,0.572436,0.065375,0.812429,0.018487,0.451254,0.040779,8
0.35,0.724306,0.0233,0.876882,0.014712,0.379531,0.037482,0.761703,0.030816,0.574929,0.062687,0.814865,0.01888,0.456067,0.041213,9
0.35,0.7298,0.027887,0.88105,0.019089,0.385206,0.040009,0.76559,0.031709,0.585897,0.069568,0.818923,0.021843,0.463725,0.046223,10


In [18]:
# Display the accumulated NB summary table (index y, one x_col value per row in the saved output).
NBmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.5,0.532258,0.0,0.983871,0.0,0.306452,0.0,0.414966,0.0,0.974359,0.0,0.583732,0.0,0.466258,0.0,1
0.5,0.576474,0.045364,0.949078,0.035696,0.306287,0.000169,0.500166,0.087413,0.871795,0.105229,0.648743,0.0667,0.452107,0.014518,2
0.5,0.585096,0.038757,0.92637,0.043609,0.315302,0.012968,0.520313,0.076459,0.82906,0.105039,0.660891,0.056746,0.454844,0.012393,3
0.5,0.594294,0.037112,0.921192,0.038686,0.312793,0.012018,0.538383,0.073155,0.807692,0.098026,0.674772,0.054657,0.449342,0.0144,4
0.5,0.603473,0.037948,0.911196,0.039988,0.303893,0.020935,0.562135,0.081008,0.758974,0.131664,0.689655,0.057287,0.432201,0.036936,5
0.5,0.615989,0.044638,0.905836,0.038394,0.308799,0.022055,0.584157,0.088972,0.735043,0.131565,0.704013,0.06143,0.432239,0.033661,6
0.5,0.613705,0.04166,0.900939,0.037502,0.303932,0.023667,0.585,0.082299,0.721612,0.126091,0.703969,0.056805,0.425436,0.035364,7
0.5,0.621669,0.044334,0.898941,0.03545,0.310461,0.028132,0.596909,0.083193,0.714744,0.119251,0.712126,0.057359,0.430292,0.035489,8
0.5,0.623355,0.042043,0.897591,0.033619,0.311679,0.02673,0.599755,0.078796,0.712251,0.112575,0.714281,0.054387,0.431262,0.033549,9
0.5,0.622812,0.039897,0.89965,0.032475,0.309923,0.025892,0.598161,0.074865,0.717949,0.108112,0.714231,0.051567,0.430689,0.031857,10


In [19]:
# Display the accumulated LR summary table.
# The selection step keeps every row whose acc_mean equals the maximum, so
# tied y values each contribute a row; the saved output therefore shows
# several rows sharing one x_col value (e.g. four rows for x_col == 1).
LRmaxacc_comb

Unnamed: 0_level_0,acc_mean,acc_std,precision_0_mean,precision_0_std,precision_1_mean,precision_1_std,recall_0_mean,recall_0_std,recall_1_mean,recall_1_std,F1_0_mean,F1_0_std,F1_1_mean,F1_1_std,x_col
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.35,0.456989,0.000000,0.838235,0.000000,0.237288,0.000000,0.387755,0.000000,0.717949,0.000000,0.530233,0.000000,0.356688,0.000000,1
0.40,0.456989,0.000000,0.859375,0.000000,0.245902,0.000000,0.374150,0.000000,0.769231,0.000000,0.521327,0.000000,0.372671,0.000000,1
0.45,0.456989,0.000000,0.859375,0.000000,0.245902,0.000000,0.374150,0.000000,0.769231,0.000000,0.521327,0.000000,0.372671,0.000000,1
0.50,0.456989,0.000000,0.859375,0.000000,0.245902,0.000000,0.374150,0.000000,0.769231,0.000000,0.521327,0.000000,0.372671,0.000000,1
0.40,0.519135,0.063760,0.851427,0.008155,0.242269,0.003727,0.482807,0.111480,0.653846,0.118382,0.608334,0.089267,0.351690,0.021526,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.45,0.522015,0.062163,0.851233,0.012215,0.243651,0.014412,0.486575,0.100736,0.655678,0.092457,0.613201,0.081768,0.353667,0.022293,7
0.40,0.525312,0.058755,0.852587,0.011972,0.247285,0.016585,0.489528,0.094472,0.660256,0.087263,0.616679,0.076977,0.358371,0.024310,8
0.45,0.525312,0.058755,0.852587,0.011972,0.247285,0.016585,0.489528,0.094472,0.660256,0.087263,0.616679,0.076977,0.358371,0.024310,8
0.40,0.530102,0.057008,0.850628,0.012581,0.247906,0.015725,0.498418,0.092529,0.649573,0.087651,0.623400,0.075000,0.357353,0.023086,9
