# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.kernel_approximation import RBFSampler as rbf_s
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit as sss

import xgboost as xgb

from imblearn.ensemble import BalancedRandomForestClassifier as bal_rf

## 0.1 Load data

In [2]:
# Training frame and validation frame produced by earlier steps of the project.
df_train_res = pd.read_pickle("data_in_progress/df_train_res_nn.pkl")

df_val = pd.read_pickle("data_in_progress/df_val.pkl")

# Open the pickle through a context manager so the file handle is closed as
# soon as the list is loaded (the bare open() call left it dangling).
# NOTE(review): pickle can execute arbitrary code on load; only load files
# produced by this project.
with open("data_in_progress/cols_selected_boruta_resampled_nn.pkl", "rb") as cols_file:
    cols_selected_boruta_resampled = pickle.load(cols_file)

In [3]:
# Show the feature names loaded from the pickle above.
cols_selected_boruta_resampled

['age', 'region_code', 'previously_insured', 'annual_premium', 'vintage']

In [4]:
# Selected feature columns followed by the target column; used to subset the
# training and validation frames.
resp = ['response']
cols_selected_boruta_resampled_full = [*cols_selected_boruta_resampled, *resp]

## 0.2 Helper Functions

In [5]:
def metric_scores(y_true, y_pred):
    """Return a one-row DataFrame of classification metrics.

    Columns, in order: accuracy, balanced_accuracy, precision, precision_0,
    recall, specificity, F1, F1_weighted, G_mean.

    * ``precision_0`` and ``specificity`` are precision and recall computed
      with ``pos_label=0``.
    * ``G_mean`` is the square root of ``recall * specificity``.

    Labels are assumed to be binary 0/1, since ``pos_label=0`` is used for
    the negative class (TODO confirm against callers).
    """
    # Recall of each class is reused for the geometric mean below.
    sensitivity = recall_score(y_true, y_pred)
    specificity = recall_score(y_true, y_pred, pos_label=0)

    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'precision_0': precision_score(y_true, y_pred, pos_label=0),
        'recall': sensitivity,
        'specificity': specificity,
        'F1': f1_score(y_true, y_pred),
        'F1_weighted': f1_score(y_true, y_pred, average='weighted'),
        'G_mean': np.sqrt(sensitivity * specificity),
    }
    return pd.DataFrame(scores, index=[0])


def imb_scores(y_true, y_pred):
    """Return imbalanced-learn's classification report for the predictions.

    Thin wrapper around ``imblearn.metrics.classification_report_imbalanced``.
    """
    # ``imb_cl_rep`` was never imported anywhere in the notebook, so calling
    # this helper raised NameError. Import it locally under the alias the
    # original body used.
    from imblearn.metrics import classification_report_imbalanced as imb_cl_rep

    return imb_cl_rep(y_true, y_pred)


def cross_validation(training_data, kfolds, model, model_name, verbose=False, random_state=None):
    """Summarise classification metrics over repeated stratified shuffle splits.

    For every split, ``model`` is fitted on the training part and scored on
    the held-out part with ``metric_scores``. The splitter is
    ``StratifiedShuffleSplit``, so held-out parts of different splits are
    drawn independently and may overlap (it is not a partitioning k-fold).

    NOTE(review): if ``training_data`` was resampled before being passed in,
    the held-out parts contain resampled rows too, so these scores may differ
    from scores on data with the original class distribution.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Feature columns plus a ``response`` column holding the target.
    kfolds : int
        Number of shuffle splits to evaluate.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every split.
    model_name : str
        Label stored in the ``'Model name'`` column of the result.
    verbose : bool, default False
        When True, print the metric table of each split as it completes.
    random_state : int or None, default None
        Seed forwarded to the splitter; pass an int for reproducible splits.
        The default keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A single row with ``'Model name'`` and, for each metric, the string
        ``'<mean>+/-<std>'`` (both rounded to 4 decimals, ``np.std`` default).
    """
    metric_names = ('accuracy', 'balanced_accuracy', 'precision', 'precision_0',
                    'recall', 'specificity', 'F1', 'F1_weighted', 'G_mean')

    xtraining = training_data.drop(['response'], axis=1)
    ytraining = training_data.response

    cv = sss(n_splits=kfolds, random_state=random_state)
    fold_scores = {name: [] for name in metric_names}

    for split_no, (train_index, prim_val_index) in enumerate(cv.split(xtraining, ytraining), start=1):
        X_training, X_prim_val = xtraining.iloc[train_index], xtraining.iloc[prim_val_index]
        y_training, y_prim_val = ytraining.iloc[train_index], ytraining.iloc[prim_val_index]

        fitted = model.fit(X_training, y_training)
        yhat = fitted.predict(X_prim_val)

        score_table = metric_scores(y_prim_val, yhat)
        if verbose:
            print(f'{model_name} - split {split_no}/{kfolds}')
            print(score_table.to_string(index=False))

        # metric_scores returns a one-row frame; keep the scalar of each metric.
        for name in metric_names:
            fold_scores[name].append(score_table[name].iloc[0])

    summary = {'Model name': model_name}
    for name in metric_names:
        mean_txt = np.round(np.mean(fold_scores[name]), 4).astype(str)
        std_txt = np.round(np.std(fold_scores[name]), 4).astype(str)
        summary[name] = mean_txt + '+/-' + std_txt

    return pd.DataFrame(summary, index=[0])

# 7.0 Machine Learning Model

## 7.0.1 Separate train and validation data

In [6]:
# Working copy of the training frame restricted to the selected features and the target.
df7 = df_train_res.loc[:, cols_selected_boruta_resampled_full].copy()

In [7]:
# Split the training frame into the target and the feature matrix.
y_train_res = df7['response']
X_train_res = df7.drop(columns=['response'])

In [8]:
# Working copy of the validation frame with the same columns as the training frame.
df7_val = df_val.loc[:, cols_selected_boruta_resampled_full].copy()

In [9]:
# Split the validation frame into the target and the feature matrix.
y_val = df7_val['response']
X_val = df7_val.drop(columns=['response'])

## 7.1 Logistic Classifier

In [10]:
# Logistic regression (SAGA solver): fit on the training set, score on validation.
logreg = LogReg(solver='saga', random_state=30, n_jobs=-1)
logreg = logreg.fit(X_train_res, y_train_res)
yhat_logreg = logreg.predict(X_val)

score_table_logreg = metric_scores(y_true=y_val, y_pred=yhat_logreg)
score_table_logreg

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.580095,0.759293,0.225524,0.999113,0.996682,0.521905,0.36782,0.646695,0.72123


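Recall is very high (about 0.997) while precision is low (about 0.226): the model flags most customers as positive, so there are many false positives.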

## 7.2 SGD Classifier

### 7.2.1 Hinge - linear SVC

In [11]:
# SGD with hinge loss: fit on the training set, score on validation.
sgd_svc = sgd(loss='hinge', random_state=30, n_jobs=-1)
sgd_svc = sgd_svc.fit(X_train_res, y_train_res)
yhat_sgd_svc = sgd_svc.predict(X_val)

score_table_sgd_svc = metric_scores(y_true=y_val, y_pred=yhat_sgd_svc)
score_table_sgd_svc

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.580095,0.759293,0.225524,0.999113,0.996682,0.521905,0.36782,0.646695,0.72123


The documentation says that SGDClassifier with loss='hinge' is equivalent to an SVC with a linear kernel (a linear SVM), yet every metric here is identical to the logistic regression's. Both models therefore produce the same confusion matrix on the validation set.

### 7.2.2 Perceptron

In [12]:
# SGD with perceptron loss and a constant learning rate (eta0=1.0).
# Variant without the explicit learning-rate settings, kept for reference:
#sgd_per = sgd(loss='perceptron', random_state=30, n_jobs=-1).fit(X_train_res, y_train_res)
sgd_per = sgd(loss='perceptron', learning_rate='constant', eta0=1.0, random_state=30, n_jobs=-1)
sgd_per = sgd_per.fit(X_train_res, y_train_res)
yhat_sgd_per = sgd_per.predict(X_val)

score_table_sgd_per = metric_scores(y_true=y_val, y_pred=yhat_sgd_per)
score_table_sgd_per

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.733017,0.601518,0.210194,0.906518,0.427317,0.775718,0.281782,0.768102,0.575741


## 7.3 SGD with kernel approximation

### 7.3.1 RBFSampler

In [13]:
# Fit the RBF feature map once, on the training data only, and reuse that same
# fitted map for the validation data. Refitting a second sampler on X_val gave
# the same projection only because the seed and settings happened to match.
rbf_feature_map = rbf_s(random_state=30, n_components=500)

X_train_rbf = rbf_feature_map.fit_transform(X_train_res)

X_val_rbf = rbf_feature_map.transform(X_val)

#### 7.3.1.1 hinge loss

In [14]:
# Hinge-loss SGD trained on the RBF-approximated features.
sgd_rbf_hinge = sgd(loss='hinge', random_state=30, n_jobs=-1)
sgd_rbf_hinge = sgd_rbf_hinge.fit(X_train_rbf, y_train_res)
yhat_sgd_rbf_hinge = sgd_rbf_hinge.predict(X_val_rbf)

score_table_sgd_rbf_hinge = metric_scores(y_true=y_val, y_pred=yhat_sgd_rbf_hinge)
score_table_sgd_rbf_hinge

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.581145,0.758878,0.225669,0.998488,0.994327,0.52343,0.367852,0.647723,0.721429


#### 7.3.1.2 perceptron loss

In [15]:
# Perceptron-loss SGD (constant learning rate, eta0=1.0) on the RBF-approximated features.
sgd_rbf_perc = sgd(loss='perceptron', learning_rate='constant', eta0=1.0, random_state=30, n_jobs=-1)
sgd_rbf_perc = sgd_rbf_perc.fit(X_train_rbf, y_train_res)
yhat_sgd_rbf_perc = sgd_rbf_perc.predict(X_val_rbf)

score_table_sgd_rbf_perc = metric_scores(y_true=y_val, y_pred=yhat_sgd_rbf_perc)
score_table_sgd_rbf_perc

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.592336,0.75135,0.226344,0.990279,0.962,0.5407,0.366465,0.658664,0.721216


## 7.4 XGBoost classifier

In [16]:
# eval_metric is set on the estimator instead of being passed to fit(): the
# fit() keyword is deprecated and no longer accepted by newer XGBoost releases.
# NOTE(review): use_label_encoder is kept for older XGBoost versions; newer
# ones ignore it (possibly with a warning). Drop it once the version is pinned.
xgb_cl = xgb.XGBClassifier(objective='binary:logistic',
                           use_label_encoder=False,
                           eval_metric='logloss',
                           n_estimators=100,
                           eta=0.01,
                           max_depth=10,
                           n_jobs=-1,
                           subsample=0.7,
                           colsample_bytree=0.9).fit(X_train_res, y_train_res)

yhat_xgb = xgb_cl.predict(X_val)

score_table_xgb = metric_scores(y_val, yhat_xgb)
score_table_xgb

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.654535,0.715653,0.233482,0.957157,0.796617,0.634689,0.361122,0.713974,0.711059


## 7.5 Balanced Random Forest

In [17]:
# Balanced random forest: fit on the training set, score on validation.
brf_clf = bal_rf(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
brf_clf = brf_clf.fit(X_train_res, y_train_res)
yhat_brf = brf_clf.predict(X_val)

score_table_brf = metric_scores(y_true=y_val, y_pred=yhat_brf)
score_table_brf

Unnamed: 0,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,0.622078,0.741131,0.23159,0.976354,0.898844,0.583418,0.368289,0.686012,0.724156


## 7.6 Cross-Validation

### 7.6.1 Linear models with linear kernels

In [18]:
# Logistic classifier: 5 stratified shuffle splits of the training frame.
model = LogReg(solver='saga', random_state=30, n_jobs=-1)
log_cv = cross_validation(training_data=df7, kfolds=5, model=model,
                          model_name='Logistic Classifier')
log_cv

Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,Logistic Classifier,0.7439+/-0.0015,0.7603+/-0.0014,0.6453+/-0.0013,0.9991+/-0.0002,0.9995+/-0.0001,0.5212+/-0.0028,0.7842+/-0.0009,0.7312+/-0.0017,0.7218+/-0.0019


In [19]:
# SGD classifier with hinge loss: 5 stratified shuffle splits of the training frame.
model = sgd(loss='hinge', random_state=30, n_jobs=-1)
sgd_hinge_cv = cross_validation(training_data=df7, kfolds=5, model=model,
                                model_name='SGD Classifier - hinge')
sgd_hinge_cv

Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,SGD Classifier - hinge,0.7452+/-0.0017,0.7616+/-0.0016,0.6464+/-0.0015,0.9992+/-0.0002,0.9995+/-0.0001,0.5236+/-0.0032,0.7851+/-0.0011,0.7327+/-0.002,0.7234+/-0.0022


In [20]:
# SGD classifier with perceptron loss: 5 stratified shuffle splits of the training frame.
model = sgd(loss='perceptron', learning_rate='constant', eta0=1.0, random_state=30, n_jobs=-1)
sgd_perc_cv = cross_validation(training_data=df7, kfolds=5, model=model,
                               model_name='SGD Classifier - perceptron')
sgd_perc_cv

Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,SGD Classifier - perceptron,0.6981+/-0.0541,0.7032+/-0.0674,0.6473+/-0.0045,0.8264+/-0.1501,0.7772+/-0.2612,0.6292+/-0.1265,0.6794+/-0.1433,0.6847+/-0.0646,0.6698+/-0.0853


### 7.6.2 Approximation kernel - rbf

In [21]:
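# rbf - this cell and the cross-validation cells in 7.6.2-7.6.4 are left commented out;
# their result tables are read back from pickle files in section 7.6.5 instead.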
#df7_data = df7.copy().drop(['response'], axis=1)
#df7_target = df7.response

#data_rbf = rbf_s(random_state=30, n_components=500).fit_transform(df7_data)
#data_rbf_df = pd.DataFrame(data_rbf)

#df7_rbf = pd.concat([data_rbf_df, df7_target], axis=1)

In [22]:
# SGD Classifier - rbf, hinge loss
#model = sgd(loss='hinge', random_state=30, n_jobs=-1)
#sgd_rbf_hinge_cv = cross_validation(df7_rbf, 5, model, 'SGD Classifier - rbf, hinge')
#sgd_rbf_hinge_cv

In [23]:
# SGD Classifier - rbf, perceptron loss
#model = sgd(loss='perceptron', eta0=1.0, learning_rate='constant', random_state=30, n_jobs=-1)
#sgd_rbf_perc_cv = cross_validation(df7_rbf, 5, model, 'SGD Classifier - rbf, perceptron')
#sgd_rbf_perc_cv

### 7.6.3 XGBoost

In [24]:
# XGBoost Classifier
#model = xgb.XGBClassifier(objective='binary:logistic',
#                              use_label_encoder=False,
#                              n_estimators=100,
#                              eta=0.01,
#                              max_depth=10,
#                              n_jobs=-1,
#                              subsample=0.7,
#                              colsample_bytree=0.9)
#xgb_cv = cross_validation(df7, 5, model, 'XGBoost Classifier')
#xgb_cv

### 7.6.4 Balanced Random Forest

In [25]:
# Balanced Random Forest Classifier
#model = bal_rf(n_estimators = 100, max_depth=10, random_state=42, n_jobs=-1)
#bal_rf_cv = cross_validation(df7, 5, model, 'Balanced Random Forest Classifier')
#bal_rf_cv

### 7.6.5 Gather results

In [26]:
# Reload the cross-validation tables stored as pickles, in this order:
# SGD-rbf (hinge), SGD-rbf (perceptron), XGBoost, balanced random forest.
sgd_rbf_hinge_cv, sgd_rbf_perc_cv, xgb_cv, bal_rf_cv = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data_in_progress/sgd_rbf_hinge_cv.pkl',
        'data_in_progress/sgd_rbf_perc_cv.pkl',
        'data_in_progress/xgb_cv.pkl',
        'data_in_progress/bal_rf_cv.pkl',
    )
)

In [27]:
cv_results = pd.concat([log_cv, sgd_hinge_cv, sgd_perc_cv, sgd_rbf_hinge_cv, sgd_rbf_perc_cv, xgb_cv, bal_rf_cv])
# The metric columns hold '<mean>+/-<std>' strings. Rank on the numeric mean
# parsed from the text instead of comparing the raw strings, and avoid
# inplace=True so the cell stays a plain assignment.
cv_results = cv_results.sort_values(
    ['F1'],
    ascending=False,
    key=lambda col: col.str.partition('+/-')[0].astype(float),
)
cv_results

Unnamed: 0,Model name,accuracy,balanced_accuracy,precision,precision_0,recall,specificity,F1,F1_weighted,G_mean
0,XGBoost Classifier,0.8196+/-0.0011,0.8239+/-0.0011,0.7644+/-0.0013,0.8843+/-0.001,0.8855+/-0.001,0.7622+/-0.0016,0.8205+/-0.001,0.8196+/-0.0011,0.8216+/-0.0011
0,Balanced Random Forest Classifier,0.8109+/-0.0013,0.8178+/-0.0013,0.7392+/-0.0014,0.9093+/-0.0018,0.9178+/-0.0017,0.7178+/-0.0019,0.8189+/-0.0013,0.81+/-0.0014,0.8117+/-0.0014
0,"SGD Classifier - rbf, hinge",0.7956+/-0.0016,0.8038+/-0.0017,0.7187+/-0.0012,0.9098+/-0.0032,0.922+/-0.003,0.6855+/-0.0017,0.8077+/-0.0017,0.7939+/-0.0016,0.795+/-0.0016
0,SGD Classifier - hinge,0.7452+/-0.0017,0.7616+/-0.0016,0.6464+/-0.0015,0.9992+/-0.0002,0.9995+/-0.0001,0.5236+/-0.0032,0.7851+/-0.0011,0.7327+/-0.002,0.7234+/-0.0022
0,Logistic Classifier,0.7439+/-0.0015,0.7603+/-0.0014,0.6453+/-0.0013,0.9991+/-0.0002,0.9995+/-0.0001,0.5212+/-0.0028,0.7842+/-0.0009,0.7312+/-0.0017,0.7218+/-0.0019
0,"SGD Classifier - rbf, perceptron",0.7312+/-0.0355,0.733+/-0.0455,0.7086+/-0.0466,0.8162+/-0.1319,0.758+/-0.2154,0.7079+/-0.1366,0.7091+/-0.0911,0.7227+/-0.0408,0.7109+/-0.0535
0,SGD Classifier - perceptron,0.6981+/-0.0541,0.7032+/-0.0674,0.6473+/-0.0045,0.8264+/-0.1501,0.7772+/-0.2612,0.6292+/-0.1265,0.6794+/-0.1433,0.6847+/-0.0646,0.6698+/-0.0853


## 7.7 Export Data

In [28]:
# Persist the combined cross-validation table.
pd.to_pickle(cv_results, 'data_in_progress/cv_results.pkl')