In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE

from collections import Counter
import copy

from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

# Classifiers

In [None]:
# Shared settings for every estimator below.
n_jobs = 6          # worker count handed to the estimators that accept n_jobs
random_state = 42   # common seed so repeated runs build the same models

# Registry of classifiers, keyed by a short name.
# `classification()` iterates over this dict and uses the key in output file names.
clfs = {}

## RF
# class_weight='balanced' re-weights classes inversely to their frequency.
clf = RandomForestClassifier(n_estimators=250, min_samples_leaf=1, max_features='log2', criterion='gini',
                              random_state=random_state, n_jobs=n_jobs, class_weight='balanced')
clfs['rf'] = clf

## SVM
# probability=True is required because the evaluation calls predict_proba().
clf = SVC(kernel='rbf', gamma='scale', C=10,
          random_state=random_state, class_weight='balanced', probability=True)
clfs['svm'] = clf

## Bagging decision tree
# No base estimator is given, so BaggingClassifier falls back to its default (a decision tree).
clf = BaggingClassifier(n_estimators=1000, max_features=10,
                        random_state=random_state, n_jobs=n_jobs)
clfs['bagg_tree'] = clf

## Bagging SVM
base = SVC(random_state=random_state, class_weight='balanced', probability=True)
# The base estimator is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, but it is the first positional
# parameter under both names, so this form works on old and new releases alike.
clf = BaggingClassifier(base, n_estimators=500, max_features=20,
                        random_state=random_state, n_jobs=n_jobs)
clfs['bagg_svm'] = clf

# Cell output: number of registered classifiers.
len(clfs)

# Classification function

In [None]:
def classification(X, y, folder_out, n_jobs=1, random_state=1, cv=StratifiedKFold()):
    """Cross-validate every classifier in the module-level ``clfs`` dict on SMOTE-oversampled folds.

    For each fold the training part is standardised (scaler fitted on the training
    part only), oversampled with SMOTE, and used to fit a fresh copy of the
    classifier. The untouched test part is then scored, with each test sample
    weighted by the balanced class weights derived from the fold's training labels.

    Two kinds of CSV files are written into ``folder_out``:

    * ``evaluation_<key>_smote_all.csv`` -- one row of metrics per fold, per classifier;
    * ``mean_eval_smote.csv`` -- the per-classifier mean over folds.

    Parameters
    ----------
    X : array
        Transformed data of the selected features. It is indexed with integer
        index arrays (``X[train_index]``), so a NumPy array is expected.
    y : array
        Outcome labels, indexed the same way as ``X``. The per-class metrics use
        ``pos_label=0`` and the default ``pos_label=1``, so binary 0/1 labels are expected.
    folder_out : str
        Folder to store the output CSV files in.
    n_jobs : int, default 1
        Forwarded to ``SMOTE``.
    random_state : int, default 1
        Seed forwarded to ``SMOTE``.
    cv : cross-validation splitter, default ``StratifiedKFold()``
        Any object exposing ``split(X, y)`` that yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    pandas.DataFrame
        The per-classifier mean metrics (the content written to ``mean_eval_smote.csv``).
    """
    metric_columns = ["auroc", "auprc", "accuracy",
                      "balanced accuracy", "mcc",
                      "precision", "precision_0", "precision_1",
                      "recall", "recall_0", "recall_1",
                      "f1", "f1_0", "f1_1"]

    # One record (dict) per classifier; turned into a DataFrame once at the end
    # instead of growing a frame row by row (DataFrame.append was removed in pandas 2.0).
    mean_records = []

    for key, clf in clfs.items():
        print(key)

        # One record (dict) of metrics per cross-validation fold.
        fold_records = []

        # cross validation -- use the splitter passed in by the caller
        for train_index, test_index in cv.split(X, y):

            # split data
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            # Test sample weights based on the class distribution of the train labels.
            # `classes` and `y` must be passed by keyword in current scikit-learn.
            classes = np.unique(y_train)
            cls_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
            # Map each label to its own weight rather than assuming positions 0 and 1.
            cls_weight_dict = dict(zip(classes, cls_weights))
            sample_weight = class_weight.compute_sample_weight(cls_weight_dict, y_test)

            # scaling: statistics come from the training part only to avoid leakage
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train_scaled = scaler.transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # oversampling the training dataset only; the test part keeps its real distribution
            # NOTE(review): newer imbalanced-learn releases deprecate SMOTE's `n_jobs`
            # argument -- confirm against the installed version.
            smote = SMOTE(random_state=random_state, n_jobs=n_jobs)
            X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

            # fit an independent copy so the estimator stored in `clfs` is never mutated
            clf_smote = copy.deepcopy(clf)
            clf_smote.fit(X_train_smote, y_train_smote)

            # hard predictions and probability of class 1 (second column of predict_proba)
            y_pred_smote = clf_smote.predict(X_test_scaled)
            y_probs_smote = clf_smote.predict_proba(X_test_scaled)[:, 1]

            # area under the ROC curve
            fp_rate_smote, tp_rate_smote, _ = metrics.roc_curve(
                y_test, y_probs_smote, sample_weight=sample_weight)

            fold_records.append({
                'auroc': metrics.auc(fp_rate_smote, tp_rate_smote),
                # 'auprc' and 'accuracy' were declared as columns but never filled in before.
                'auprc': metrics.average_precision_score(
                    y_test, y_probs_smote, sample_weight=sample_weight),
                'accuracy': metrics.accuracy_score(
                    y_test, y_pred_smote, sample_weight=sample_weight),
                'balanced accuracy': metrics.balanced_accuracy_score(
                    y_test, y_pred_smote, sample_weight=sample_weight),
                'mcc': metrics.matthews_corrcoef(
                    y_test, y_pred_smote, sample_weight=sample_weight),
                'precision': metrics.precision_score(
                    y_test, y_pred_smote, sample_weight=sample_weight, average='weighted'),
                'precision_0': metrics.precision_score(
                    y_test, y_pred_smote, pos_label=0, sample_weight=sample_weight),
                'precision_1': metrics.precision_score(
                    y_test, y_pred_smote, sample_weight=sample_weight),
                'recall': metrics.recall_score(
                    y_test, y_pred_smote, sample_weight=sample_weight, average='weighted'),
                'recall_0': metrics.recall_score(
                    y_test, y_pred_smote, pos_label=0, sample_weight=sample_weight),
                'recall_1': metrics.recall_score(
                    y_test, y_pred_smote, sample_weight=sample_weight),
                'f1': metrics.f1_score(
                    y_test, y_pred_smote, sample_weight=sample_weight, average='weighted'),
                'f1_0': metrics.f1_score(
                    y_test, y_pred_smote, pos_label=0, sample_weight=sample_weight),
                'f1_1': metrics.f1_score(
                    y_test, y_pred_smote, sample_weight=sample_weight),
            })

        # per-fold metrics for this classifier
        eval_metrics_smote = pd.DataFrame(fold_records, columns=metric_columns)
        file_out = 'evaluation_'+key+'_smote_all.csv'
        path_out = os.path.join(folder_out, file_out)
        eval_metrics_smote.to_csv(path_out)

        # average over folds, tagged with the classifier name
        mean_smote = eval_metrics_smote.mean(axis=0).to_dict()
        mean_smote['classifier'] = key
        mean_records.append(mean_smote)

    # summary across classifiers
    mean_eval_smote = pd.DataFrame(mean_records)
    file_out = 'mean_eval_smote.csv'
    path_out = os.path.join(folder_out, file_out)
    mean_eval_smote.to_csv(path_out)

    return mean_eval_smote
        