In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, \
  ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef

In [2]:
# Load the training set together with its TARGET and Kfold columns.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated form only resolved on Windows.
data = pd.read_csv('../../10_fold_cross_validation/train_10folds.csv')
data

Unnamed: 0,MaxPartialCharge,FpDensityMorgan2,BCUT2D_CHGLO,BCUT2D_MRHI,PEOE_VSA12,PEOE_VSA6,SMR_VSA3,SlogP_VSA3,SlogP_VSA8,EState_VSA6,NumHAcceptors,NumSaturatedCarbocycles,fr_bicyclic,TARGET,Kfold
0,0.335201,1.714286,-2.072068,5.975550,0.000000,12.132734,9.551078,0.000000,38.974594,43.638476,6,0,1,1.0,0
1,0.226791,2.000000,-2.362642,7.150190,5.948339,0.000000,9.967957,0.000000,10.440599,11.336786,6,0,0,1.0,1
2,0.211302,1.772727,-2.089721,7.912349,9.837253,29.297126,4.983979,9.837253,10.902925,24.265468,4,0,1,0.0,1
3,0.226898,1.869565,-2.357502,6.433493,5.948339,35.334614,15.284746,0.000000,11.126903,12.263211,5,0,0,0.0,0
4,0.158370,1.870968,-2.421374,7.991366,0.000000,23.362825,19.935914,6.420822,0.000000,18.460054,9,0,1,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2734,0.152613,2.000000,-2.415835,6.432407,0.000000,35.334614,9.967957,11.343745,11.257379,6.066367,6,0,0,1.0,3
2735,0.335203,1.700000,-2.001320,6.164147,0.000000,30.331835,4.983979,0.000000,10.902925,36.528679,2,0,1,0.0,6
2736,0.264178,1.833333,-2.414068,5.737009,5.948339,38.112943,9.551078,11.784535,11.126903,11.614772,7,0,0,1.0,8
2737,0.338995,1.815789,-2.065301,7.218862,5.907180,59.014740,24.544948,18.386966,43.634305,6.196844,9,0,2,0.0,5


In [3]:
# Standardise every column except the last five, writing the result back into `data`.
# Per the frame displayed above, the five excluded columns are three integer-valued
# descriptors followed by TARGET and Kfold.
# NOTE(review): the scaler is fitted on all rows before the folds are separated, so
# validation rows contribute to the scaling statistics -- confirm this is acceptable.
scaler = StandardScaler()
unscaled_block = data.iloc[:, :-5]
# NOTE(review): `model` here holds whatever fit() returns (presumably the scaler
# itself -- confirm); later cells rebind the same name to classifiers.
model = scaler.fit(unscaled_block)
data.iloc[:, :-5] = model.transform(unscaled_block)
data

Unnamed: 0,MaxPartialCharge,FpDensityMorgan2,BCUT2D_CHGLO,BCUT2D_MRHI,PEOE_VSA12,PEOE_VSA6,SMR_VSA3,SlogP_VSA3,SlogP_VSA8,EState_VSA6,NumHAcceptors,NumSaturatedCarbocycles,fr_bicyclic,TARGET,Kfold
0,0.585376,-0.250569,1.152690,-0.895736,-0.763312,-0.829512,-0.045451,-0.928526,2.571709,1.347435,6,0,1,1.0,0
1,-0.812543,0.773954,-0.620110,-0.129854,0.246980,-1.356805,0.007064,-0.928526,0.098776,-0.519958,6,0,0,1.0,1
2,-1.012262,-0.041007,1.044985,0.367084,0.907490,-0.083541,-0.620773,-0.192245,0.138844,0.227462,4,0,1,0.0,1
3,-0.811158,0.306237,-0.588754,-0.597150,0.246980,0.178850,0.676825,-0.928526,0.158255,-0.466400,5,0,0,0.0,0
4,-1.694810,0.311266,-0.978435,0.418605,-0.763312,-0.341448,1.262737,-0.447952,-0.806071,-0.108155,9,0,1,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2734,-1.769042,0.773954,-0.944643,-0.597859,-0.763312,0.178850,0.007064,-0.079490,0.169563,-0.824646,6,0,0,1.0,3
2735,0.585410,-0.301795,1.584323,-0.772768,-0.763312,-0.038572,-0.620773,-0.928526,0.138844,0.936410,2,0,1,0.0,6
2736,-0.330450,0.176316,-0.933859,-1.051268,0.246980,0.299597,-0.045451,-0.046499,0.158255,-0.503887,7,0,0,1.0,8
2737,0.634297,0.113406,1.193971,-0.085079,0.239989,1.207997,1.843341,0.447668,2.975548,-0.817103,9,0,2,0.0,5


In [4]:
def run(fold, data, random_state=42):
    """Fit an SVC on all folds but one and score it on the held-out fold.

    Parameters
    ----------
    fold : int
        Value of the ``Kfold`` column that marks the validation rows.
    data : pandas.DataFrame
        Frame holding the feature columns together with ``TARGET`` and
        ``Kfold``; every other column is used as a feature.
    random_state : int or None, default 42
        Seed passed to ``SVC``. With ``probability=True`` the probability
        estimates depend on an internally shuffled cross-validation, so a
        fixed seed is needed for the AUC to be repeatable between runs.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(auc, accuracy, precision_1, precision_0, recall_1, recall_0,
        f1score, kappa, MCC, model)`` where ``model`` is the fitted ``SVC``.

    Raises
    ------
    ValueError
        If no row of ``data`` belongs to ``fold``.
    """
    # every column is a feature except the target and the fold label
    features = [f for f in data.columns if f not in ("TARGET", "Kfold")]

    # rows outside the requested fold train the model, rows inside validate it
    df_train = data[data.Kfold != fold].reset_index(drop=True)
    df_valid = data[data.Kfold == fold].reset_index(drop=True)
    if df_valid.empty:
        # fail with a clear message instead of an opaque error from scikit-learn
        raise ValueError(f"No rows with Kfold == {fold!r}; nothing to validate on")

    X_train = df_train[features].values
    y_train = df_train.TARGET.values
    X_valid = df_valid[features].values
    y_true = df_valid.TARGET.values

    # support vector classifier; probability=True is required for predict_proba
    model = SVC(probability=True, random_state=random_state)
    model.fit(X_train, y_train)

    # column 1 of predict_proba is used as the positive-class score
    # (presumably TARGET == 1 -- confirm against model.classes_)
    valid_preds = model.predict_proba(X_valid)[:, 1]
    # named fold_auc so the imported sklearn.metrics.auc is not shadowed
    fold_auc = roc_auc_score(y_true, valid_preds)

    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_true, y_pred)
    precision_1 = precision_score(y_true, y_pred, pos_label=1)
    precision_0 = precision_score(y_true, y_pred, pos_label=0)
    recall_1 = recall_score(y_true, y_pred, pos_label=1)
    recall_0 = recall_score(y_true, y_pred, pos_label=0)
    f1score = f1_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    MCC = matthews_corrcoef(y_true, y_pred)

    # the leading whitespace of the continuation lines is part of the printed text
    print(f"Fold = {fold}, AUC = {fold_auc}, Accuracy = {accuracy}, \
          Precision_1 = {precision_1}, Precision_0 = {precision_0}\
          Recall_1 = {recall_1}, Recall_0 = {recall_0}, F1Score = {f1score}, kappa = {kappa}, MCC = {MCC}")

    return fold_auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model

In [5]:
# One list per metric, filled fold by fold; later cells read these lists by name.
aucs, accuracies, precisions_1, precisions_0, recalls_1, recalls_0, f1scores, kappas, MCCs = [], [], [], [], [], [], [], [], []

for fold_ in range(10):
    # The AUC is unpacked into `fold_auc` rather than `auc`, so the `auc` function
    # imported from sklearn.metrics is not overwritten with a float at notebook scope.
    (fold_auc, accuracy, precision_1, precision_0,
     recall_1, recall_0, f1score, kappa, MCC, model) = run(fold_, data)
    aucs.append(fold_auc)
    accuracies.append(accuracy)
    precisions_1.append(precision_1)
    precisions_0.append(precision_0)
    recalls_1.append(recall_1)
    recalls_0.append(recall_0)
    f1scores.append(f1score)
    kappas.append(kappa)
    MCCs.append(MCC)
    # keep each fold's fitted classifier on disk as SVM_<fold>.pkl
    joblib.dump(model, f'SVM_{fold_}.pkl')
print("\n")
# the leading whitespace of the continuation lines is part of the printed text
print(f"Mean Scores: AUC = {np.mean(aucs)}, \
      Accuracy = {np.mean(accuracies)}, \
      Precision_1 = {np.mean(precisions_1)}, Precision_0 = {np.mean(precisions_0)}\
      Recall_1 = {np.mean(recalls_1)}, Recall_0 = {np.mean(recalls_0)}\
      F1Score = {np.mean(f1scores)} \
      Kappa = {np.mean(kappas)} \
      MCC = {np.mean(MCCs)}")

Fold = 0, AUC = 0.9149630147358612, Accuracy = 0.8613138686131386,           Precision_1 = 0.8314606741573034, Precision_0 = 0.8756756756756757          Recall_1 = 0.7628865979381443, Recall_0 = 0.9152542372881356, F1Score = 0.7956989247311828, kappa = 0.6910202385898273, MCC = 0.6924868481744613
Fold = 1, AUC = 0.9122255227444814, Accuracy = 0.8759124087591241,           Precision_1 = 0.8795180722891566, Precision_0 = 0.8743455497382199          Recall_1 = 0.7525773195876289, Recall_0 = 0.943502824858757, F1Score = 0.8111111111111111, kappa = 0.7195496417604913, MCC = 0.7243959545122349
Fold = 2, AUC = 0.909255052711282, Accuracy = 0.8540145985401459,           Precision_1 = 0.8131868131868132, Precision_0 = 0.8743169398907104          Recall_1 = 0.7628865979381443, Recall_0 = 0.903954802259887, F1Score = 0.7872340425531914, kappa = 0.6762951148916061, MCC = 0.6770937640708392
Fold = 3, AUC = 0.9083231405440038, Accuracy = 0.8467153284671532,           Precision_1 = 0.7894736842105263

In [6]:
# Per-fold metrics table: one row per fold, one column per metric,
# built in a single constructor call from the lists filled during cross-validation.
fold_metrics = pd.DataFrame({
    'Accuracy': np.array(accuracies),
    'AUC': np.array(aucs),
    'Precision_1': np.array(precisions_1),
    'Precision_0': np.array(precisions_0),
    'Recall_1': np.array(recalls_1),
    'Recall_0': np.array(recalls_0),
    'F1score': np.array(f1scores),
    'Kappa': np.array(kappas),
    'MCC': np.array(MCCs),
})
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.861314,0.914963,0.831461,0.875676,0.762887,0.915254,0.795699,0.69102,0.692487
1,0.875912,0.912226,0.879518,0.874346,0.752577,0.943503,0.811111,0.71955,0.724396
2,0.854015,0.909255,0.813187,0.874317,0.762887,0.903955,0.787234,0.676295,0.677094
3,0.846715,0.908323,0.789474,0.877095,0.773196,0.887006,0.78125,0.663292,0.663377
4,0.850365,0.90908,0.785714,0.886364,0.793814,0.881356,0.789744,0.673601,0.673622
5,0.890511,0.929699,0.860215,0.906077,0.824742,0.926554,0.842105,0.758363,0.758757
6,0.883212,0.917584,0.857143,0.896175,0.804124,0.926554,0.829787,0.741036,0.741911
7,0.843066,0.911701,0.759615,0.894118,0.814433,0.858757,0.78607,0.662388,0.66339
8,0.875912,0.931355,0.816327,0.909091,0.833333,0.898876,0.824742,0.728713,0.728806
9,0.857143,0.907015,0.806452,0.883333,0.78125,0.898305,0.793651,0.684449,0.684651


In [7]:
# Add the across-fold mean of every metric as row 10.
# The lists are visited in the same order as the columns of fold_metrics.
fold_metrics.loc[10, :] = [
    np.mean(np.array(scores))
    for scores in (accuracies, aucs, precisions_1, precisions_0, recalls_1,
                   recalls_0, f1scores, kappas, MCCs)
]

fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.861314,0.914963,0.831461,0.875676,0.762887,0.915254,0.795699,0.69102,0.692487
1,0.875912,0.912226,0.879518,0.874346,0.752577,0.943503,0.811111,0.71955,0.724396
2,0.854015,0.909255,0.813187,0.874317,0.762887,0.903955,0.787234,0.676295,0.677094
3,0.846715,0.908323,0.789474,0.877095,0.773196,0.887006,0.78125,0.663292,0.663377
4,0.850365,0.90908,0.785714,0.886364,0.793814,0.881356,0.789744,0.673601,0.673622
5,0.890511,0.929699,0.860215,0.906077,0.824742,0.926554,0.842105,0.758363,0.758757
6,0.883212,0.917584,0.857143,0.896175,0.804124,0.926554,0.829787,0.741036,0.741911
7,0.843066,0.911701,0.759615,0.894118,0.814433,0.858757,0.78607,0.662388,0.66339
8,0.875912,0.931355,0.816327,0.909091,0.833333,0.898876,0.824742,0.728713,0.728806
9,0.857143,0.907015,0.806452,0.883333,0.78125,0.898305,0.793651,0.684449,0.684651


In [8]:
# Add the across-fold standard deviation of every metric as row 11.
# np.std is called with its defaults, i.e. the population form (ddof=0).
# The lists are visited in the same order as the columns of fold_metrics.
fold_metrics.loc[11, :] = [
    np.std(np.array(scores))
    for scores in (accuracies, aucs, precisions_1, precisions_0, recalls_1,
                   recalls_0, f1scores, kappas, MCCs)
]

fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.861314,0.914963,0.831461,0.875676,0.762887,0.915254,0.795699,0.69102,0.692487
1,0.875912,0.912226,0.879518,0.874346,0.752577,0.943503,0.811111,0.71955,0.724396
2,0.854015,0.909255,0.813187,0.874317,0.762887,0.903955,0.787234,0.676295,0.677094
3,0.846715,0.908323,0.789474,0.877095,0.773196,0.887006,0.78125,0.663292,0.663377
4,0.850365,0.90908,0.785714,0.886364,0.793814,0.881356,0.789744,0.673601,0.673622
5,0.890511,0.929699,0.860215,0.906077,0.824742,0.926554,0.842105,0.758363,0.758757
6,0.883212,0.917584,0.857143,0.896175,0.804124,0.926554,0.829787,0.741036,0.741911
7,0.843066,0.911701,0.759615,0.894118,0.814433,0.858757,0.78607,0.662388,0.66339
8,0.875912,0.931355,0.816327,0.909091,0.833333,0.898876,0.824742,0.728713,0.728806
9,0.857143,0.907015,0.806452,0.883333,0.78125,0.898305,0.793651,0.684449,0.684651


In [9]:
# Label the ten fold rows Fold_0 .. Fold_9, followed by the two summary rows.
fold_metrics.index = [f'Fold_{i}' for i in range(10)] + ['Mean', 'Std']
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
Fold_0,0.861314,0.914963,0.831461,0.875676,0.762887,0.915254,0.795699,0.69102,0.692487
Fold_1,0.875912,0.912226,0.879518,0.874346,0.752577,0.943503,0.811111,0.71955,0.724396
Fold_2,0.854015,0.909255,0.813187,0.874317,0.762887,0.903955,0.787234,0.676295,0.677094
Fold_3,0.846715,0.908323,0.789474,0.877095,0.773196,0.887006,0.78125,0.663292,0.663377
Fold_4,0.850365,0.90908,0.785714,0.886364,0.793814,0.881356,0.789744,0.673601,0.673622
Fold_5,0.890511,0.929699,0.860215,0.906077,0.824742,0.926554,0.842105,0.758363,0.758757
Fold_6,0.883212,0.917584,0.857143,0.896175,0.804124,0.926554,0.829787,0.741036,0.741911
Fold_7,0.843066,0.911701,0.759615,0.894118,0.814433,0.858757,0.78607,0.662388,0.66339
Fold_8,0.875912,0.931355,0.816327,0.909091,0.833333,0.898876,0.824742,0.728713,0.728806
Fold_9,0.857143,0.907015,0.806452,0.883333,0.78125,0.898305,0.793651,0.684449,0.684651


# Final Model

In [10]:
# Inputs for the final model, selected by column name so the split matches the
# feature definition used in run() and does not depend on column position.
X = data.drop(columns=["TARGET", "Kfold"])
y = data["TARGET"]

In [11]:
# Fit the final classifier on every row (no held-out fold).
# A fixed random_state makes the probability estimates, which depend on an
# internally shuffled cross-validation, repeatable between runs.
model = SVC(probability=True, random_state=42)
model.fit(X, y)

SVC(probability=True)

In [12]:
# Persist the final classifier next to the notebook.
# NOTE(review): the fitted StandardScaler is not saved alongside it -- confirm that
# consumers of this file can reproduce the same scaling before predicting.
joblib.dump(value=model, filename='SVM_Final.pkl')

['SVM_Final.pkl']