In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import class_weight

from collections import Counter
import copy

from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

# Classifiers

In [None]:
# Shared settings for every estimator that accepts them.
n_jobs = 6
random_state = 42

# Registry of candidate classifiers, keyed by a short name.
# Insertion order is the order in which they are evaluated later.
clfs = {}

## RF
clf = RandomForestClassifier(n_estimators=150,
                             min_samples_leaf=4,
                             max_features=0.2,
                             criterion='entropy',
                             random_state=random_state, n_jobs=n_jobs, class_weight='balanced')
clfs['rf'] = clf

## SVM
# probability=True is needed so that predict_proba is available.
clf = SVC(kernel='rbf',
          gamma='scale',
          C=100,
          random_state=random_state, class_weight='balanced', probability=True)
clfs['svm'] = clf

## Bagging decision tree
# No base estimator is given, so BaggingClassifier uses its default.
clf = BaggingClassifier(n_estimators=500,
                        max_features=20,
                        random_state=random_state, n_jobs=n_jobs)
clfs['bagg_tree'] = clf

## Bagging SVM
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, but it is the first parameter under both names.
base = clfs['svm']
clf = BaggingClassifier(base,
                        n_estimators=500,
                        max_features=20,
                        random_state=random_state, n_jobs=n_jobs)
clfs['bagg_svm'] = clf

## MLP
clf = MLPClassifier(solver='adam',
                    learning_rate='constant',
                    hidden_layer_sizes=(20,),
                    alpha=0.01,
                    activation='relu',
                    random_state=random_state, max_iter=3000)
clfs['mlp'] = clf

# Last expression of the cell, so the notebook displays the number of
# registered classifiers (including the MLP).
len(clfs)

# Classification with 5-fold cross validation

In [None]:
def classification(X, y, folder_out, n_jobs=1, random_state=1, cv=StratifiedKFold()):
    """Cross-validate every classifier registered in the module-level ``clfs``.

    For each ``(key, clf)`` in ``clfs`` and each train/test split yielded by
    ``cv.split(X, y)``, a ``StandardScaler`` is fitted on the training fold
    only, a deep copy of ``clf`` is fitted on the scaled training data, and
    the copy is scored on the scaled test fold. Per-fold scores are averaged
    per classifier and accumulated in ``mean_eval_weight`` (one row per
    classifier, plus a ``'classifier'`` column holding the key).

    Parameters
    ----------
    X : array-like
        Transformed data of the selected features. Converted with
        ``np.asarray`` so that positional row indexing by fold works.
    y : array-like
        Outcome labels. The per-class scores are computed with
        ``pos_label=0`` and with the default positive label, and column 1 of
        ``predict_proba`` is used as the score, so binary 0/1 labels are
        expected.
    folder_out : str
        Folder to save outputs.
        NOTE(review): not referenced in the code visible here.
    n_jobs, random_state : int
        NOTE(review): accepted but not referenced in the code visible here;
        the estimators in ``clfs`` carry their own settings.
    cv : cross-validator
        Object exposing ``split(X, y)``. The default ``StratifiedKFold()``
        instance is created once, when the function is defined.

    NOTE(review): the code visible here neither returns nor writes the
    collected results (``mean_eval_weight``, ``df_proba_weight``).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # The summary table must exist before the first classifier is summarised;
    # appending to it without creating it first raised UnboundLocalError.
    mean_eval_rows = []
    mean_eval_weight = pd.DataFrame()

    for key, clf in clfs.items():
        print(key)
        # Collect per-fold results in lists and build the frames once per
        # classifier (DataFrame.append no longer exists in pandas >= 2.0).
        fold_metrics = []
        fold_probas = []

        # cross validation
        for train_index, test_index in cv.split(X, y):
            # split data
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            # Test sample weights derived from the class distribution of the
            # training labels. `classes` and `y` are passed by keyword, which
            # scikit-learn >= 1.0 requires.
            classes = np.unique(y_train)
            cls_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
            cls_weight_dict = dict(zip(classes.tolist(), cls_weights))
            test_sample_weight = class_weight.compute_sample_weight(cls_weight_dict, y_test)

            # The scores below are NOT sample-weighted: `sample_weight` stays
            # None, so `test_sample_weight` is computed but not applied.
            sample_weight = None

            ave = 'weighted'

            # scaling: statistics come from the training fold only
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Fit a private copy so the registered estimator stays unfitted.
            clf_weight = copy.deepcopy(clf)
            clf_weight.fit(X_train_scaled, y_train)
            y_pred_weight = clf_weight.predict(X_test_scaled)
            y_probs_weight = clf_weight.predict_proba(X_test_scaled)[:, 1]

            # area under ROC curve
            fp_rate_weight, tp_rate_weight, _ = metrics.roc_curve(
                y_test, y_probs_weight, sample_weight=sample_weight)
            auc_weight = metrics.auc(fp_rate_weight, tp_rate_weight)

            # predicted probability next to the true label, per test sample
            fold_probas.append(pd.DataFrame({'proba': y_probs_weight, 'true': y_test}))

            # area under precision-recall curve
            # NOTE(review): `auc_pr_weight` and `accuracy_weight` are computed
            # but are not part of the recorded metric set below.
            precision_weight_, recall_weight_, _ = metrics.precision_recall_curve(
                y_test, y_probs_weight, sample_weight=sample_weight)
            auc_pr_weight = metrics.auc(recall_weight_, precision_weight_)
            accuracy_weight = metrics.accuracy_score(y_test, y_pred_weight, sample_weight=sample_weight)

            precision_weight = metrics.precision_score(y_test, y_pred_weight, sample_weight=sample_weight, average=ave)
            precision_1_weight = metrics.precision_score(y_test, y_pred_weight, sample_weight=sample_weight)
            precision_0_weight = metrics.precision_score(y_test, y_pred_weight, pos_label=0, sample_weight=sample_weight)
            f1_weight = metrics.f1_score(y_test, y_pred_weight, sample_weight=sample_weight, average=ave)
            f1_1_weight = metrics.f1_score(y_test, y_pred_weight, sample_weight=sample_weight)
            f1_0_weight = metrics.f1_score(y_test, y_pred_weight, pos_label=0, sample_weight=sample_weight)
            mcc_weight = metrics.matthews_corrcoef(y_test, y_pred_weight, sample_weight=sample_weight)
            recall_weight = metrics.recall_score(y_test, y_pred_weight, sample_weight=sample_weight, average=ave)
            recall_1_weight = metrics.recall_score(y_test, y_pred_weight, sample_weight=sample_weight)
            recall_0_weight = metrics.recall_score(y_test, y_pred_weight, pos_label=0, sample_weight=sample_weight)
            balanced_accuracy_weight = metrics.balanced_accuracy_score(y_test, y_pred_weight, sample_weight=sample_weight)

            fold_metrics.append({'auroc': auc_weight,
                                 'precision': precision_weight,
                                 'precision_1': precision_1_weight,
                                 'precision_0': precision_0_weight,
                                 'f1': f1_weight,
                                 'f1_1': f1_1_weight,
                                 'f1_0': f1_0_weight,
                                 'mcc': mcc_weight,
                                 'recall': recall_weight,
                                 'recall_1': recall_1_weight,
                                 'recall_0': recall_0_weight,
                                 'balanced accuracy': balanced_accuracy_weight})

        # One row per fold / one row per test sample for this classifier.
        eval_metrics_weight = pd.DataFrame(fold_metrics)
        df_proba_weight = pd.concat(fold_probas, ignore_index=True) if fold_probas else pd.DataFrame()

        # Average over folds and add this classifier's row to the summary.
        mean_weight = eval_metrics_weight.mean(axis=0)
        mean_weight['classifier'] = key
        mean_eval_rows.append(mean_weight.to_dict())
        mean_eval_weight = pd.DataFrame(mean_eval_rows)
        