In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, \
  ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv(r'..\10_fold_cross_validation\train_10folds.csv')
data

Unnamed: 0,MaxPartialCharge,FpDensityMorgan2,BCUT2D_CHGLO,BCUT2D_MRHI,PEOE_VSA12,PEOE_VSA6,SMR_VSA3,SlogP_VSA3,SlogP_VSA8,EState_VSA6,NumHAcceptors,NumSaturatedCarbocycles,fr_bicyclic,TARGET,Kfold
0,0.335201,1.714286,-2.072068,5.975550,0.000000,12.132734,9.551078,0.000000,38.974594,43.638476,6,0,1,1.0,0
1,0.226791,2.000000,-2.362642,7.150190,5.948339,0.000000,9.967957,0.000000,10.440599,11.336786,6,0,0,1.0,1
2,0.211302,1.772727,-2.089721,7.912349,9.837253,29.297126,4.983979,9.837253,10.902925,24.265468,4,0,1,0.0,1
3,0.226898,1.869565,-2.357502,6.433493,5.948339,35.334614,15.284746,0.000000,11.126903,12.263211,5,0,0,0.0,0
4,0.158370,1.870968,-2.421374,7.991366,0.000000,23.362825,19.935914,6.420822,0.000000,18.460054,9,0,1,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2734,0.152613,2.000000,-2.415835,6.432407,0.000000,35.334614,9.967957,11.343745,11.257379,6.066367,6,0,0,1.0,3
2735,0.335203,1.700000,-2.001320,6.164147,0.000000,30.331835,4.983979,0.000000,10.902925,36.528679,2,0,1,0.0,6
2736,0.264178,1.833333,-2.414068,5.737009,5.948339,38.112943,9.551078,11.784535,11.126903,11.614772,7,0,0,1.0,8
2737,0.338995,1.815789,-2.065301,7.218862,5.907180,59.014740,24.544948,18.386966,43.634305,6.196844,9,0,2,0.0,5


In [3]:
data.columns

Index(['MaxPartialCharge', 'FpDensityMorgan2', 'BCUT2D_CHGLO', 'BCUT2D_MRHI',
       'PEOE_VSA12', 'PEOE_VSA6', 'SMR_VSA3', 'SlogP_VSA3', 'SlogP_VSA8',
       'EState_VSA6', 'NumHAcceptors', 'NumSaturatedCarbocycles',
       'fr_bicyclic', 'TARGET', 'Kfold'],
      dtype='object')

In [4]:
def run(fold, data):
    # load the full training data with folds
    df = data
    # all columns are features except target and kfold columns
    features = [
        f for f in df.columns if f not in ("TARGET", "Kfold")
    ]
    # get training data using folds
    df_train = df[df.Kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.Kfold == fold].reset_index(drop=True)
    # get training data
    X_train = df_train[features].values
    # get validation data
    X_valid = df_valid[features].values
    # initialize Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train, df_train.TARGET)
    valid_preds = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(df_valid.TARGET.values, valid_preds)
    y_pred = model.predict(X_valid)
    y_true = df_valid.TARGET.values
    accuracy = accuracy_score(y_true,y_pred)
    precision_1 = precision_score(y_true,y_pred,pos_label=1)
    precision_0 = precision_score(y_true,y_pred,pos_label=0)
    recall_1 = recall_score(y_true,y_pred,pos_label=1)
    recall_0 = recall_score(y_true,y_pred,pos_label=0)
    f1score = f1_score(y_true,y_pred)
    kappa = cohen_kappa_score(y_true,y_pred)
    MCC = matthews_corrcoef(y_true,y_pred)
    print(f"Fold = {fold}, AUC = {auc}, Accuracy = {accuracy}, \
          Precision_1 = {precision_1}, Precision_0 = {precision_0}\
          Recall_1 = {recall_1}, Recall_0 = {recall_0}, F1Score = {f1score}, kappa = {kappa}, MCC = {MCC}")
    
    return auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model

In [7]:
aucs, accuracies, precisions_1, precisions_0, recalls_1, recalls_0, f1scores, kappas, MCCs = [], [], [], [], [], [], [], [], []

for fold_ in range(10):
    auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model = run(fold_, data)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions_1.append(precision_1)
    precisions_0.append(precision_0)
    recalls_1.append(recall_1)
    recalls_0.append(recall_0)
    f1scores.append(f1score)
    kappas.append(kappa)
    MCCs.append(MCC)
    # filename = 'XGBoost_' + str(fold_) + '.pkl'
    # joblib.dump(model, filename)
print("\n")
print(f"Mean Scores: AUC = {np.mean(np.array(aucs))}, \
      Accuracy = {np.mean(np.array(accuracies))}, \
      Precision_1 = {np.mean(np.array(precisions_1))}, Precision_0 = {np.mean(np.array(precisions_0))}\
      Recall_1 = {np.mean(np.array(recalls_1))}, Recall_0 = {np.mean(np.array(recalls_0))}\
      F1Score = {np.mean(np.array(f1scores))} \
      Kappa = {np.mean(np.array(kappas))} \
      MCC = {np.mean(np.array(MCCs))}")

Fold = 0, AUC = 0.8591065292096219, Accuracy = 0.7992700729927007,           Precision_1 = 0.723404255319149, Precision_0 = 0.8388888888888889          Recall_1 = 0.7010309278350515, Recall_0 = 0.8531073446327684, F1Score = 0.7120418848167538, kappa = 0.5580385946389818, MCC = 0.5582008165095612
Fold = 1, AUC = 0.8764051488147242, Accuracy = 0.8065693430656934,           Precision_1 = 0.782051282051282, Precision_0 = 0.8163265306122449          Recall_1 = 0.6288659793814433, Recall_0 = 0.903954802259887, F1Score = 0.6971428571428572, kappa = 0.557498933512097, MCC = 0.5646486818015339
Fold = 2, AUC = 0.8630671559205544, Accuracy = 0.7883211678832117,           Precision_1 = 0.7142857142857143, Precision_0 = 0.825136612021858          Recall_1 = 0.6701030927835051, Recall_0 = 0.8531073446327684, F1Score = 0.6914893617021276, kappa = 0.5306279165928289, MCC = 0.5312545447330203
Fold = 3, AUC = 0.8747743025219873, Accuracy = 0.8211678832116789,           Precision_1 = 0.8, Precision_0 = 0

In [8]:
fold_metrics = pd.DataFrame(columns=['Accuracy','AUC','Precision_1','Precision_0','Recall_1','Recall_0','F1score','Kappa','MCC'])
fold_metrics['Accuracy'] = np.array(accuracies)
fold_metrics['AUC'] = np.array(aucs)
fold_metrics['Precision_1'] = np.array(precisions_1)
fold_metrics['Precision_0'] = np.array(precisions_0)
fold_metrics['Recall_1'] = np.array(recalls_1)
fold_metrics['Recall_0'] = np.array(recalls_0)
fold_metrics['F1score'] = np.array(f1scores)
fold_metrics['Kappa'] = np.array(kappas)
fold_metrics['MCC'] = np.array(MCCs)
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.79927,0.859107,0.723404,0.838889,0.701031,0.853107,0.712042,0.558039,0.558201
1,0.806569,0.876405,0.782051,0.816327,0.628866,0.903955,0.697143,0.557499,0.564649
2,0.788321,0.863067,0.714286,0.825137,0.670103,0.853107,0.691489,0.530628,0.531255
3,0.821168,0.874774,0.8,0.829897,0.659794,0.909605,0.723164,0.59288,0.598884
4,0.824818,0.880016,0.795181,0.837696,0.680412,0.903955,0.733333,0.60407,0.608139
5,0.839416,0.906867,0.791209,0.863388,0.742268,0.892655,0.765957,0.643925,0.644685
6,0.79562,0.858815,0.741176,0.820106,0.649485,0.875706,0.692308,0.540296,0.542937
7,0.784672,0.846904,0.686275,0.843023,0.721649,0.819209,0.703518,0.534631,0.535047
8,0.835766,0.909235,0.752475,0.884393,0.791667,0.859551,0.771574,0.643497,0.644003
9,0.787546,0.861582,0.686275,0.847953,0.729167,0.819209,0.707071,0.540644,0.541255


In [9]:
fold_metrics.loc[10,:] = [np.mean(np.array(accuracies)), np.mean(np.array(aucs)), np.mean(np.array(precisions_1)),
                               np.mean(np.array(precisions_0)), np.mean(np.array(recalls_1)), np.mean(np.array(recalls_0)),
                            np.mean(np.array(f1scores)), np.mean(np.array(kappas)), np.mean(np.array(MCCs))]

fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.79927,0.859107,0.723404,0.838889,0.701031,0.853107,0.712042,0.558039,0.558201
1,0.806569,0.876405,0.782051,0.816327,0.628866,0.903955,0.697143,0.557499,0.564649
2,0.788321,0.863067,0.714286,0.825137,0.670103,0.853107,0.691489,0.530628,0.531255
3,0.821168,0.874774,0.8,0.829897,0.659794,0.909605,0.723164,0.59288,0.598884
4,0.824818,0.880016,0.795181,0.837696,0.680412,0.903955,0.733333,0.60407,0.608139
5,0.839416,0.906867,0.791209,0.863388,0.742268,0.892655,0.765957,0.643925,0.644685
6,0.79562,0.858815,0.741176,0.820106,0.649485,0.875706,0.692308,0.540296,0.542937
7,0.784672,0.846904,0.686275,0.843023,0.721649,0.819209,0.703518,0.534631,0.535047
8,0.835766,0.909235,0.752475,0.884393,0.791667,0.859551,0.771574,0.643497,0.644003
9,0.787546,0.861582,0.686275,0.847953,0.729167,0.819209,0.707071,0.540644,0.541255


In [10]:
fold_metrics.loc[11,:] = [np.std(np.array(accuracies)), np.std(np.array(aucs)), np.std(np.array(precisions_1)),
                               np.std(np.array(precisions_0)), np.std(np.array(recalls_1)), np.std(np.array(recalls_0)),
                            np.std(np.array(f1scores)), np.std(np.array(kappas)), np.std(np.array(MCCs))]

fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.79927,0.859107,0.723404,0.838889,0.701031,0.853107,0.712042,0.558039,0.558201
1,0.806569,0.876405,0.782051,0.816327,0.628866,0.903955,0.697143,0.557499,0.564649
2,0.788321,0.863067,0.714286,0.825137,0.670103,0.853107,0.691489,0.530628,0.531255
3,0.821168,0.874774,0.8,0.829897,0.659794,0.909605,0.723164,0.59288,0.598884
4,0.824818,0.880016,0.795181,0.837696,0.680412,0.903955,0.733333,0.60407,0.608139
5,0.839416,0.906867,0.791209,0.863388,0.742268,0.892655,0.765957,0.643925,0.644685
6,0.79562,0.858815,0.741176,0.820106,0.649485,0.875706,0.692308,0.540296,0.542937
7,0.784672,0.846904,0.686275,0.843023,0.721649,0.819209,0.703518,0.534631,0.535047
8,0.835766,0.909235,0.752475,0.884393,0.791667,0.859551,0.771574,0.643497,0.644003
9,0.787546,0.861582,0.686275,0.847953,0.729167,0.819209,0.707071,0.540644,0.541255


In [11]:
fold_metrics.index = ['Fold_0','Fold_1','Fold_2','Fold_3','Fold_4','Fold_5','Fold_6','Fold_7','Fold_8','Fold_9','Mean','Std']
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
Fold_0,0.79927,0.859107,0.723404,0.838889,0.701031,0.853107,0.712042,0.558039,0.558201
Fold_1,0.806569,0.876405,0.782051,0.816327,0.628866,0.903955,0.697143,0.557499,0.564649
Fold_2,0.788321,0.863067,0.714286,0.825137,0.670103,0.853107,0.691489,0.530628,0.531255
Fold_3,0.821168,0.874774,0.8,0.829897,0.659794,0.909605,0.723164,0.59288,0.598884
Fold_4,0.824818,0.880016,0.795181,0.837696,0.680412,0.903955,0.733333,0.60407,0.608139
Fold_5,0.839416,0.906867,0.791209,0.863388,0.742268,0.892655,0.765957,0.643925,0.644685
Fold_6,0.79562,0.858815,0.741176,0.820106,0.649485,0.875706,0.692308,0.540296,0.542937
Fold_7,0.784672,0.846904,0.686275,0.843023,0.721649,0.819209,0.703518,0.534631,0.535047
Fold_8,0.835766,0.909235,0.752475,0.884393,0.791667,0.859551,0.771574,0.643497,0.644003
Fold_9,0.787546,0.861582,0.686275,0.847953,0.729167,0.819209,0.707071,0.540644,0.541255


# Bayesian Optimization

In [12]:
from hyperopt import hp
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [13]:
class BayesianOPT:
    def __init__(self, data, space):
        self.space = space
        self.data = data
        self.trials = Trials()
    
    def _split_data(self, fold):
        df = self.data
        # all columns are features except target and kfold columns
        features = [
            f for f in df.columns if f not in ("TARGET", "Kfold")
        ]
        # get training data using folds
        df_train = df[df.Kfold != fold].reset_index(drop=True)
        # get validation data using folds
        df_valid = df[df.Kfold == fold].reset_index(drop=True)
        # get training data
        X_train = df_train[features].values
        # get validation data
        X_test = df_valid[features].values
        y_train = df_train.TARGET.values
        y_test = df_valid.TARGET.values
        
        return X_train, y_train, X_test, y_test
    
    def _objective(self,space):
        clf= LogisticRegression(**space)
        
        
        accuracies = []
        for fold_ in range(10):
            X_train, y_train, X_test, y_test = self._split_data(fold_)
            evaluation = [( X_train, y_train), ( X_test, y_test)]
        

            clf.fit(X_train, y_train)


            pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, pred>0.5)
            accuracies.append(accuracy)
        
        final_accuracy = np.mean(np.array(accuracies))
        #print ("SCORE:", final_accuracy)
        return {'loss': -final_accuracy, 'status': STATUS_OK }
    
    def search_hyperparameters(self):

        best_hyperparams = fmin(fn = self._objective,
                                space = self.space,
                                algo = tpe.suggest,
                                max_evals = 500,
                                trials = self.trials)
        return best_hyperparams

In [16]:
space = {
    'C': hp.loguniform('C', np.log(0.01), np.log(10)),
    'penalty': hp.choice('penalty', ['l2'])
}

In [17]:
bayes_opt = BayesianOPT(data, space)
best_params = bayes_opt.search_hyperparameters()
print(best_params)

100%|█████████████████████████████████████████████| 500/500 [22:35<00:00,  2.71s/trial, best loss: -0.8148899762038448]
{'C': 3.6918690818779667, 'penalty': 0}


In [14]:
print(best_params)

{'colsample_bytree': 0.9611638174070645, 'gamma': 0.5344801495975495, 'learning_rate': 0.048882041230470526, 'max_depth': 14.0, 'min_child_weight': 0.0, 'reg_alpha': 0.0, 'reg_lambda': 1.164764580743543}


# Train the model with optimized hyperparameters

In [16]:
def run(fold, data, best_hyper_parameters):
    # load the full training data with folds
    df = data
    # all columns are features except target and kfold columns
    features = [
        f for f in df.columns if f not in ("TARGET", "Kfold")
    ]
    # get training data using folds
    df_train = df[df.Kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.Kfold == fold].reset_index(drop=True)
    # get training data
    X_train = df_train[features].values
    # get validation data
    X_valid = df_valid[features].values
    # initialize Logistic Regression model
    model = XGBClassifier(n_jobs = -1,
                          colsample_bytree= best_hyper_parameters["colsample_bytree"], 
                          gamma= best_hyper_parameters["gamma"], 
                          max_depth= int(best_hyper_parameters["max_depth"]), 
                          min_child_weight= best_hyper_parameters["min_child_weight"], 
                          reg_alpha= best_hyper_parameters["reg_alpha"], 
                          reg_lambda= best_hyper_parameters["reg_lambda"])
    
    model.fit(X_train, df_train.TARGET.values)
    valid_preds = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(df_valid.TARGET.values, valid_preds)
    y_pred = model.predict(X_valid)
    y_true = df_valid.TARGET.values
    accuracy = accuracy_score(y_true,y_pred)
    precision_1 = precision_score(y_true,y_pred,pos_label=1)
    precision_0 = precision_score(y_true,y_pred,pos_label=0)
    recall_1 = recall_score(y_true,y_pred,pos_label=1)
    recall_0 = recall_score(y_true,y_pred,pos_label=0)
    f1score = f1_score(y_true,y_pred)
    kappa = cohen_kappa_score(y_true,y_pred)
    MCC = matthews_corrcoef(y_true,y_pred)
    print(f"Fold = {fold}, AUC = {auc}, Accuracy = {accuracy}, \
          Precision_1 = {precision_1}, Precision_0 = {precision_0}\
          Recall_1 = {recall_1}, Recall_0 = {recall_0}, F1Score = {f1score}, kappa = {kappa}, MCC = {MCC}")
    
    return auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model

In [17]:
aucs, accuracies, precisions_1, precisions_0, recalls_1, recalls_0, f1scores, kappas, MCCs = [], [], [], [], [], [], [], [], []

for fold_ in range(10):
    auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model = run(fold_, data, best_params)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions_1.append(precision_1)
    precisions_0.append(precision_0)
    recalls_1.append(recall_1)
    recalls_0.append(recall_0)
    f1scores.append(f1score)
    kappas.append(kappa)
    MCCs.append(MCC)
    filename = 'XGBoost_' + str(fold_) + '.pkl'
    joblib.dump(model, filename)
print("\n")
print(f"Mean Scores: AUC = {np.mean(np.array(aucs))}, \
      Accuracy = {np.mean(np.array(accuracies))}, \
      Precision_1 = {np.mean(np.array(precisions_1))}, Precision_0 = {np.mean(np.array(precisions_0))}\
      Recall_1 = {np.mean(np.array(recalls_1))}, Recall_0 = {np.mean(np.array(recalls_0))}\
      F1Score = {np.mean(np.array(f1scores))} \
      Kappa = {np.mean(np.array(kappas))} \
      MCC = {np.mean(np.array(MCCs))}")

Fold = 0, AUC = 0.9680820082707206, Accuracy = 0.9598540145985401,           Precision_1 = 0.9479166666666666, Precision_0 = 0.9662921348314607          Recall_1 = 0.9381443298969072, Recall_0 = 0.9717514124293786, F1Score = 0.9430051813471502, kappa = 0.9120205499445385, MCC = 0.9120497223728334
Fold = 1, AUC = 0.9729163026384763, Accuracy = 0.9343065693430657,           Precision_1 = 0.9438202247191011, Precision_0 = 0.9297297297297298          Recall_1 = 0.865979381443299, Recall_0 = 0.9717514124293786, F1Score = 0.9032258064516129, kappa = 0.8536411656478129, MCC = 0.8554529191182063
Fold = 2, AUC = 0.9726250800862019, Accuracy = 0.9233576642335767,           Precision_1 = 0.8958333333333334, Precision_0 = 0.9382022471910112          Recall_1 = 0.8865979381443299, Recall_0 = 0.943502824858757, F1Score = 0.8911917098445595, kappa = 0.8320392317123008, MCC = 0.8320658458108835
Fold = 3, AUC = 0.9630147358611452, Accuracy = 0.927007299270073,           Precision_1 = 0.8888888888888888