In [32]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost.sklearn import XGBClassifier 

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler

from utils import data_handler, plotter
import time


#handle warnings
import warnings
warnings.filterwarnings("ignore")

In [33]:
def test(clf, X_test,y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    X_test : array-like
        Features of the evaluation samples.
    y_test : array-like
        True binary labels of the evaluation samples.

    Returns
    -------
    numpy.ndarray
        ``[auc, accuracy, sensitivity, specificity]``. Index 1 of the
        confusion matrix (and column 1 of ``predict_proba``) is treated as
        the positive class.
    """
    y_pred = clf.predict(X_test)
    c_mat = metrics.confusion_matrix(y_test,y_pred)
    # Row = true class, column = predicted class; unpacking also makes a
    # non-2x2 matrix fail loudly instead of indexing out of range.
    tn, fp, fn, tp = c_mat.ravel()
    accuracy = (tn + tp)/c_mat.sum()
    sensitivity = tp/(fn + tp)
    specificity = tn/(tn + fp)

    # Compute the probabilities once and pick the positive-class column
    # explicitly, rather than relying on roc_auc_score raising on a
    # two-column input and retrying inside a bare ``except``.
    proba = np.asarray(clf.predict_proba(X_test))
    if proba.ndim == 2 and proba.shape[1] > 1:
        y_score = proba[:,1]
    else:
        # Already a single score per sample; pass it through unchanged.
        y_score = proba
    auc = metrics.roc_auc_score(y_true=y_test, y_score=y_score)

    return np.array([auc,accuracy,sensitivity,specificity])

# Data Handling

Data Analysis - There are 183 "Can" and 117 "Cannot" samples, so always guessing the majority class gives 61% accuracy

Samples of the two classes are kept separate so that stratified outer cross validation can be performed later

In [34]:
# Load the feature table and the labels through the project helper, keep the
# feature names for later reference, and expose both tables as plain arrays.
X_df, Y_df = data_handler.load_XY()
feature_list = X_df.columns
X, Y = X_df.values, Y_df.values

loading MoS2 dataset...


# Model Selection

Generalization - Nested Cross Validation
1. Outer CV: 10-fold cross validation (10 repetitions)
2. Inner CV: 10-fold cross validation

In [None]:
# --- run options ---------------------------------------------------------
save_csv = True    # export the per-fold results at the end of the run
verbose = False    # forwarded to the grid searches; also gates per-fold prints
n_jobs = 4         # parallel workers for each grid search

# --- cross-validation layout ---------------------------------------------
Ntrials = 10         # repetitions of the outer cross validation
outter_nsplit = 10   # folds of the outer (evaluation) split
inner_nsplit = 10    # folds of the inner (tuning) split

# One result row per (trial, outer fold) pair.
tot_count = Ntrials * outter_nsplit
print('start  ', str(Ntrials), ' trials...')

# --- per-model result buffers --------------------------------------------
# Four columns each, filled row by row with the array returned by `test`.
mlp_mat, nb_mat, svm_mat, xgb_mat = (
    np.zeros((tot_count, 4)) for _ in range(4)
)


# Nested cross validation.
# Outer loop: `Ntrials` repetitions of a stratified `outter_nsplit`-fold split.
# Inner loop: every outer-training fold is tuned by a grid search (scored on
# ROC AUC) and the refitted best model is scored by `test` on the held-out fold.
for i in range(Ntrials):
    init_time = time.time()
    print("trial = ",i)

    # Reshuffled for every trial, seeded with the trial index so each trial
    # uses a different but reproducible partition.
    outer_cv = StratifiedKFold(n_splits=outter_nsplit, shuffle=True, random_state=i)

    for j, (train_ind, test_ind) in enumerate(outer_cv.split(X, Y)):
        count = i * outter_nsplit + j  # row of the result buffers for this fold
        print(str(count), "  / ",str(tot_count))
        X_train, Y_train = X[train_ind], Y[train_ind]
        X_test, Y_test = X[test_ind], Y[test_ind]

        # The inner splitter is not shuffled, so it is already deterministic.
        # `random_state` must not be passed together with shuffle=False: it
        # has no effect and current scikit-learn raises ValueError for it.
        inner_cv = StratifiedKFold(n_splits=inner_nsplit, shuffle=False)

        # NB
        nb_clf = GaussianNB()
        tuned_parameters = dict(var_smoothing=[1e-12,1e-11,1e-10,1e-9,1e-8,1e-7,1e-6])
        nb_cv = GridSearchCV(nb_clf, tuned_parameters, cv=inner_cv,
                             scoring='roc_auc', verbose=verbose, n_jobs=n_jobs)
        nb_cv.fit(X_train, Y_train)
        nb_mat[count] = test(nb_cv,X_test,Y_test)

        # MLP (inputs standardised inside the pipeline, so the scaler is
        # fitted on the training part of every inner fold only)
        mlp_clf = Pipeline([
            ('sc', StandardScaler()),
            ('clf', MLPClassifier()),
        ])
        # NOTE(review): per the scikit-learn docs `early_stopping` only takes
        # effect for the 'sgd'/'adam' solvers - confirm it is wanted with 'lbfgs'.
        tuned_parameters = dict(
            clf__hidden_layer_sizes=[[5],[10],[20],[5,5],[10,10],[20,20],[5,5,5],[10,10,10],[20,20,20]],
            clf__alpha=[1e-4,1e-3, 1e-2, 1e-1,1],  # L2 penalty (regularization term) parameter.
            clf__early_stopping=[True],
            clf__solver=['lbfgs'],
        )
        mlp_cv = GridSearchCV(mlp_clf, tuned_parameters, cv=inner_cv,
                              scoring='roc_auc', verbose=verbose, n_jobs=n_jobs)
        mlp_cv.fit(X_train, Y_train)
        mlp_mat[count] = test(mlp_cv,X_test,Y_test)

        # SVM - rbf (probability=True so that `test` can call predict_proba)
        svm_clf = Pipeline([
            ('sc', StandardScaler()),
            ('clf', SVC(probability=True)),
        ])
        tuned_parameters = dict(
            clf__kernel=['rbf'],
            clf__gamma=[1e-2,1e-1,'auto', 1, 1e1,1e2],
            clf__C=[1e-2,1e-1,1,1e1,1e2,1e3,1e4],
        )
        svm_cv = GridSearchCV(svm_clf, tuned_parameters, cv=inner_cv,
                              scoring='roc_auc', verbose=verbose, n_jobs=n_jobs)
        svm_cv.fit(X_train, Y_train)
        svm_mat[count] = test(svm_cv,X_test,Y_test)

        # XGBoost
        # NOTE(review): `silent` and `seed` look like legacy XGBoost arguments
        # (newer releases use `verbosity` / `random_state`) - confirm against
        # the installed version before changing them.
        # `n_jobs=4` here is the model's own thread count, separate from the
        # grid-search worker count below.
        xgb_clf = XGBClassifier(objective="binary:logistic", min_child_weight=1,
                                tree_method='exact', silent=True, n_jobs=4,
                                random_state=3, seed=3)
        tuned_parameters = dict(
            learning_rate=[0.01,0.1],
            n_estimators=[100, 300, 500],
            colsample_bylevel=[0.5,0.7,0.9],
            gamma=[0,0.2,0.4],
            max_depth=[3,5,7],
            reg_lambda=[0.1,1,10],
            subsample=[0.4,0.7,1],
        )
        xgb_cv = GridSearchCV(xgb_clf, tuned_parameters, cv=inner_cv,
                              scoring='roc_auc', verbose=verbose, n_jobs=n_jobs)
        xgb_cv.fit(X_train, Y_train)
        xgb_mat[count] = test(xgb_cv,X_test,Y_test)

        if verbose:
            # Per-fold [auc, accuracy, sensitivity, specificity] of each model.
            print(nb_mat[count])
            print(mlp_mat[count])
            print(svm_mat[count])
            print(xgb_mat[count])

    # Wall-clock duration of the whole trial, in minutes.
    print((time.time()-init_time)/60, ' min')

        
# Wrap every raw result buffer in a labelled DataFrame (one row per outer
# fold, one column per metric in the order returned by `test`).
metric_columns = ['AUROC','Accuracy','Sensitivity','Specificity']
svm_results, nb_results, mlp_results, xgb_results = (
    pd.DataFrame(data=result_mat, columns=metric_columns)
    for result_mat in (svm_mat, nb_mat, mlp_mat, xgb_mat)
)

# Optionally export the four tables through the project helper.
if save_csv:
    for frame, csv_title in (
        (svm_results, '[model_selection_clf]mos2_svm_results'),
        (nb_results, '[model_selection_clf]mos2_nb_results'),
        (mlp_results, '[model_selection_clf]mos2_mlp_results'),
        (xgb_results, '[model_selection_clf]mos2_xgb_results'),
    ):
        data_handler.save_csv(frame, title=csv_title)


print('end')

trial =  0
0   /  5
1   /  5


# Determining Best Model
1. Comparing 4 performance metrics 
    - visualizing by boxplots
2. Bayesian correlated t-test
    - to verify that the best model differs recognizably from the other three candidates

In [None]:
np.random.seed(44)

# Column-wise mean and standard deviation of the four metrics, per model.
# Each entry: (heading, result frame, label printed before the std block).
summary_rows = (
    ('->>>XGBoost_mean : \n', xgb_results, '\n  std = \n'),
    ('->>>SVM_mean : \n', svm_results, ' \n std =\n'),
    ('->>>NB_mean : \n', nb_results, ' \n std =\n'),
    ('->>>MLP_mean : \n', mlp_results, '\n  std =\n'),
)
for heading, frame, std_label in summary_rows:
    print(heading, frame.mean(axis=0), std_label, frame.std(axis=0))

Boxplot - AUROC/acc/sen/spe of 4 candidate classifiers

In [None]:

# Result frames in the same order as the labels passed to the plot helper.
data = [xgb_results, mlp_results, svm_results, nb_results]

plotter.plot_boxplots(
    data=data,
    ylabels=['XGBoost-C','MLP-C','SVM-C','NB-C'],
    xmin=-0.025,
    toSaveFig=True,
    title='[model_selection_clf]mos2_',
)


## Bayesian correlated t-test

In [None]:
# Value passed as the `rope` argument of plotter.plot_ttest in the cells below.
# NOTE(review): presumably the half-width of the region of practical
# equivalence for the AUROC difference - confirm against plotter.plot_ttest.
rope=0.0

In [None]:
# XGBoost vs SVM: per-fold AUROC of both models side by side
# (column 0 = SVM, column 1 = XGBoost, matching the order of `names`).
names = ("SVM-C", "XGBoost-C")
x = np.column_stack((svm_results['AUROC'], xgb_results['AUROC'])).astype('float')
title = '{} vs {} on MoS2 dataset'.format(names[1], names[0])
left, within, right = plotter.plot_ttest(
    x,
    rope=rope,
    names=names,
    verbose=True,
    runs=Ntrials,
    title=title,
    toSaveFig=True,
)

In [None]:
# XGBoost vs MLP: per-fold AUROC of both models side by side
# (column 0 = MLP, column 1 = XGBoost, matching the order of `names`).
names = ("MLP-C", "XGBoost-C")
x = np.column_stack((mlp_results['AUROC'], xgb_results['AUROC'])).astype('float')
title = '{} vs {} on MoS2 dataset'.format(names[1], names[0])
left, within, right = plotter.plot_ttest(
    x,
    rope=rope,
    runs=Ntrials,
    verbose=True,
    names=names,
    title=title,
    toSaveFig=True,
)

In [None]:
# XGBoost vs Naive Bayes: per-fold AUROC of both models side by side
# (column 0 = NB, column 1 = XGBoost, matching the order of `names`).
names = ("NB-C", "XGBoost-C")
x = np.column_stack((nb_results['AUROC'], xgb_results['AUROC'])).astype('float')
title = '{} vs {} on MoS2 dataset'.format(names[1], names[0])
left, within, right = plotter.plot_ttest(
    x,
    rope=rope,
    runs=Ntrials,
    verbose=True,
    names=names,
    title=title,
    toSaveFig=True,
)