# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
#!python3 -m pip install pickle5
import pickle

from sklearn.linear_model import SGDClassifier as sgd
from sklearn.kernel_approximation import RBFSampler as rbf_s
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit as sss

## 0.1 Load data

In [2]:
## 0.1 Load data

df_train_res = pd.read_pickle("df_train_to_colab.pkl")

# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files produced by this project. The context manager guarantees the file
# handle is closed once the column list has been read.
with open("cols_selected_boruta_resampled_nn.pkl", "rb") as cols_file:
    cols_selected_boruta_resampled = pickle.load(cols_file)

In [3]:
# Target column name, kept as a list so it can be appended to the feature list.
resp = ['response']

# Work on a copy so the loaded feature list itself is left unmodified,
# then append the target column to obtain "features + response".
cols_selected_boruta_resampled_full = cols_selected_boruta_resampled.copy()
cols_selected_boruta_resampled_full += resp

In [4]:
# Independent copy of the training frame restricted to the selected features plus 'response'.
df7 = df_train_res.loc[:, cols_selected_boruta_resampled_full].copy()

## 0.2 Helper functions

In [5]:
## 0.2 Helper Functions

def metric_scores(y_true, y_pred):
    """Score binary predictions and return all metrics as a one-row DataFrame.

    Metrics without an explicit ``pos_label`` use scikit-learn's default
    positive class; the ``precision_0`` and ``specificity`` columns are the
    precision and recall computed with ``pos_label=0``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``accuracy``,
        ``balanced_accuracy``, ``precision``, ``precision_0``, ``recall``,
        ``specificity``, ``F1``, ``F1_weighted`` and ``G_mean``, where
        ``G_mean`` is ``sqrt(recall * specificity)``.
    """
    # Recall of each class is needed twice (own column + G-mean); compute once.
    recall = recall_score(y_true, y_pred)
    specificity = recall_score(y_true, y_pred, pos_label=0)

    return pd.DataFrame({'accuracy': accuracy_score(y_true, y_pred),
                         'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
                         'precision': precision_score(y_true, y_pred),
                         'precision_0': precision_score(y_true, y_pred, pos_label=0),
                         'recall': recall,
                         'specificity': specificity,
                         'F1': f1_score(y_true, y_pred),
                         'F1_weighted': f1_score(y_true, y_pred, average='weighted'),
                         'G_mean': np.sqrt(recall * specificity)},
                        index=[0])


def cross_validation(training_data, kfolds, model, model_name, verbose=False, random_state=None):
    """Evaluate ``model`` on repeated stratified shuffle splits and summarise the scores.

    The ``response`` column of ``training_data`` is the target; every other
    column is used as a predictor. For each split the estimator is re-fitted
    on the training rows and scored on the held-out rows with
    ``metric_scores``.

    Note that the splitter is ``StratifiedShuffleSplit``: ``kfolds`` is the
    number of random stratified re-splits, not a partition into disjoint
    folds, so held-out sets of different splits may overlap.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Predictors plus a ``response`` column.
    kfolds : int
        Number of shuffle splits to draw.
    model : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. The passed object itself is fitted; no copy is made.
    model_name : str
        Label stored in the ``Model name`` column of the result.
    verbose : bool, default False
        When True, print the metric table of every split.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the splitter. Pass an integer to make the splits,
        and therefore the reported scores, reproducible.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) holding ``Model name`` and, for every metric
        produced by ``metric_scores``, the string ``'<mean>+/-<std>'`` with
        both values rounded to 4 decimals (``np.std`` default, i.e. ddof=0).

    Raises
    ------
    ValueError
        If the splitter yields no split at all (e.g. ``kfolds`` is 0).
    """
    xtraining = training_data.drop(['response'], axis=1)
    ytraining = training_data.response

    cv = sss(n_splits=kfolds, random_state=random_state)

    # One single-row score table per split; aggregated column-wise afterwards.
    fold_tables = []
    for split_no, (train_index, prim_val_index) in enumerate(cv.split(xtraining, ytraining), start=1):
        X_training, X_prim_val = xtraining.iloc[train_index], xtraining.iloc[prim_val_index]
        y_training, y_prim_val = ytraining.iloc[train_index], ytraining.iloc[prim_val_index]

        m = model.fit(X_training, y_training)
        yhat = m.predict(X_prim_val)

        score_table = metric_scores(y_prim_val, yhat)
        fold_tables.append(score_table)

        if verbose:
            print(f'{model_name} - split {split_no}/{kfolds}')
            print(score_table.to_string(index=False))

    if not fold_tables:
        raise ValueError('cross_validation produced no splits; kfolds must be >= 1')

    scores = pd.concat(fold_tables, ignore_index=True)

    summary = {'Model name': model_name}
    for metric in scores.columns:
        values = scores[metric].to_numpy()
        summary[metric] = (np.round(np.mean(values), 4).astype(str)
                           + '+/-'
                           + np.round(np.std(values), 4).astype(str))

    return pd.DataFrame(summary, index=[0])

# 7.0 Machine Learning Models

In [6]:
### 7.6.2 Approximation kernel - rbf

# Split the selected-feature frame into predictors and target.
# (drop() already returns a new frame, so no explicit copy is needed.)
df7_data = df7.drop(['response'], axis=1)
df7_target = df7.response

# Map the predictors onto 500 random features approximating an RBF kernel.
# NOTE(review): with the default gamma, RBFSampler.fit only draws random
# weights from the number of input features, so transforming before the CV
# split should not leak validation data - confirm against the sklearn docs.
data_rbf = rbf_s(random_state=30, n_components=500).fit_transform(df7_data)

# Reuse the original row index: pd.concat(axis=1) aligns rows on index labels,
# so a default RangeIndex here would misalign features and target (and
# introduce NaNs) whenever df7 does not carry a plain 0..n-1 index.
data_rbf_df = pd.DataFrame(data_rbf, index=df7_data.index)

df7_rbf = pd.concat([data_rbf_df, df7_target], axis=1)

In [7]:
# SGD classifier with hinge loss, evaluated on the RBF-transformed features
# over 5 stratified shuffle splits.
model = sgd(
    loss='hinge',
    random_state=30,
    n_jobs=-1,
)
sgd_rbf_hinge_cv = cross_validation(
    training_data=df7_rbf,
    kfolds=5,
    model=model,
    model_name='SGD Classifier - rbf, hinge',
)
sgd_rbf_hinge_cv

Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,"SGD Classifier - rbf, hinge",0.7956+/-0.0016,0.8038+/-0.0017,0.7187+/-0.0012,0.9098+/-0.0032,0.922+/-0.003,0.6855+/-0.0017,0.8077+/-0.0017,0.7939+/-0.0016,0.795+/-0.0016


In [8]:
# Persist the hinge-loss summary table so it can be reloaded without re-running the CV.
sgd_rbf_hinge_cv.to_pickle(path='sgd_rbf_hinge_cv.pkl')

In [9]:
# SGD classifier with perceptron loss and a constant learning rate of 1.0,
# evaluated on the RBF-transformed features over 5 stratified shuffle splits.
model = sgd(
    loss='perceptron',
    eta0=1.0,
    learning_rate='constant',
    random_state=30,
    n_jobs=-1,
)
sgd_rbf_perc_cv = cross_validation(
    training_data=df7_rbf,
    kfolds=5,
    model=model,
    model_name='SGD Classifier - rbf, perceptron',
)
sgd_rbf_perc_cv

Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,"SGD Classifier - rbf, perceptron",0.7312+/-0.0355,0.733+/-0.0455,0.7086+/-0.0466,0.8162+/-0.1319,0.758+/-0.2154,0.7079+/-0.1366,0.7091+/-0.0911,0.7227+/-0.0408,0.7109+/-0.0535


In [10]:
# Persist the perceptron-loss summary table so it can be reloaded without re-running the CV.
sgd_rbf_perc_cv.to_pickle(path='sgd_rbf_perc_cv.pkl')