# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.svm import SVC
from sklearn.kernel_approximation import RBFSampler as rbf_s
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score

import xgboost as xgb

from imblearn.ensemble import BalancedRandomForestClassifier as bal_rf

## 0.1 Load data

In [2]:
# Load the pickled training/validation frames and the list of selected feature columns.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
df_train_res = pd.read_pickle("df_train_res.pkl")

df_val = pd.read_pickle("df_val.pkl")

# Context manager so the file handle is closed as soon as the list has been read.
with open("cols_selected_boruta_resampled.pkl", "rb") as cols_file:
    cols_selected_boruta_resampled = pickle.load(cols_file)

In [3]:
# Inspect the loaded list of selected feature columns.
cols_selected_boruta_resampled

['age', 'region_code', 'previously_insured', 'annual_premium', 'vintage']

In [4]:
# Selected feature columns followed by the target column; the original list is left untouched.
resp = ['response']
cols_selected_boruta_resampled_full = [*cols_selected_boruta_resampled, *resp]

## 0.2 Helper Functions

In [5]:
def metric_scores(y_true, y_pred):
    """Build a one-row table of classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``accuracy``,
        ``balanced_accuracy``, ``precision``, ``recall`` and ``F1``.
    """
    # Column name -> scoring function; insertion order fixes the column order.
    scorers = {
        'accuracy': accuracy_score,
        'balanced_accuracy': balanced_accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'F1': f1_score,
    }
    scores = {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}
    return pd.DataFrame(scores, index=[0])

# 7.0 Machine Learning Model

## 7.0.1 Separate train and validation data

In [6]:
# Independent copy of the training frame restricted to the selected features + target.
df7 = df_train_res.loc[:, cols_selected_boruta_resampled_full].copy()

In [7]:
# Split the training frame into the feature matrix and the target vector.
y_train_res = df7['response']
X_train_res = df7.drop(columns=['response'])

In [8]:
# Independent copy of the validation frame restricted to the same columns as the training frame.
df7_val = df_val.loc[:, cols_selected_boruta_resampled_full].copy()

In [9]:
# Split the validation frame into the feature matrix and the target vector.
y_val = df7_val['response']
X_val = df7_val.drop(columns=['response'])

## 7.1 Logistic Classifier

In [10]:
# Logistic regression (SAGA solver) trained on the resampled training set.
logreg = LogReg(solver='saga', random_state=30, n_jobs=-1)
logreg.fit(X_train_res, y_train_res)

# Score the predictions on the validation set.
yhat_logreg = logreg.predict(X_val)
score_table_logreg = metric_scores(y_val, yhat_logreg)

score_table_logreg

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,F1
0,0.580095,0.759293,0.225524,0.996682,0.36782


There are many false positives: recall is almost 1.0, but precision is only about 0.23.

## 7.2 Support Vector Machines

### 7.2.1 Linear Kernel

In [11]:
# NOTE(review): this linear-kernel SVC experiment is disabled. Presumably it was too slow to
# train on the resampled training set — confirm, then either delete the cell or re-enable it.
#svc_l = SVC(kernel='linear', random_state=30, verbose=True).fit(X_train_res, y_train_res)

#yhat_svc_l = svc_l.predict(X_val)

#score_table_svc_l = metric_scores(y_val, yhat_svc_l)
#score_table_svc_l

### 7.2.2 Radial Basis Function kernel

In [12]:
# NOTE(review): this RBF-kernel SVC experiment is disabled. Presumably it was too slow to
# train on the resampled training set — confirm, then either delete the cell or re-enable it.
#svc_r = SVC(kernel='rbf', random_state=30, verbose=True).fit(X_train_res, y_train_res)

#yhat_svc_r = svc_r.predict(X_val)

#score_table_svc_r = metric_scores(y_val, yhat_svc_r)
#score_table_svc_r

## 7.3 SGD Classifier

### 7.3.1 Hinge - linear SVC

In [13]:
# Linear model trained by stochastic gradient descent with the hinge loss.
sgd_svc = sgd(loss='hinge', random_state=30, n_jobs=-1)
sgd_svc.fit(X_train_res, y_train_res)

# Score the predictions on the validation set.
yhat_sgd_svc = sgd_svc.predict(X_val)
score_table_sgd_svc = metric_scores(y_val, yhat_sgd_svc)

score_table_sgd_svc

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,F1
0,0.580095,0.759293,0.225524,0.996682,0.36782


The documentation says that the SGDClassifier with loss='hinge' is equivalent to an SVC classifier with a linear kernel, but the result here is exactly the same as that of the logistic regression.

### 7.3.2 Perceptron

In [14]:
# Linear model trained by stochastic gradient descent with the perceptron loss.
sgd_per = sgd(loss='perceptron', random_state=30, n_jobs=-1)
sgd_per.fit(X_train_res, y_train_res)

# Score the predictions on the validation set.
yhat_sgd_per = sgd_per.predict(X_val)
score_table_sgd_per = metric_scores(y_val, yhat_sgd_per)

score_table_sgd_per

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,F1
0,0.811301,0.539117,0.199117,0.178548,0.188272


### 7.3.3 with RBFSampler

In [15]:
# Approximate an RBF kernel with RBFSampler features, then train a hinge-loss SGD
# classifier on the transformed data.
# The feature map is fitted ONCE on the training data and only applied (`transform`) to the
# validation data. Fitting a second sampler on the validation set relies on both fits
# happening to produce the same projection, and puts a fitting step on held-out data.
rbf_feature_map = rbf_s(random_state=30, n_components=500)

X_train_rbf = rbf_feature_map.fit_transform(X_train_res)

X_val_rbf = rbf_feature_map.transform(X_val)

sgd_rbf = sgd(loss='hinge', random_state=30, n_jobs=-1).fit(X_train_rbf, y_train_res)

yhat_sgd_rbf = sgd_rbf.predict(X_val_rbf)

score_table_sgd_rbf = metric_scores(y_val, yhat_sgd_rbf)
score_table_sgd_rbf

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,F1
0,0.580777,0.758945,0.225596,0.994969,0.367798


It requires transforming the validation data as well, and it doesn't produce a better result. I don't like it.

## 7.4 XGBoost classifier

In [16]:
# Gradient-boosted trees trained on the resampled training set.
# `eval_metric` is given to the constructor: passing it to `fit()` is deprecated and is
# rejected by newer xgboost releases. No `eval_set` is used, so the fitted model is unchanged.
# NOTE(review): `use_label_encoder` only matters on older xgboost releases; newer ones
# ignore it with a warning — drop it once the installed version is confirmed.
xgb_cl = xgb.XGBClassifier(objective='binary:logistic',
                           use_label_encoder=False,
                           eval_metric='logloss',
                           n_estimators=100,
                           eta=0.01,
                           max_depth=10,
                           n_jobs=-1,
                           subsample=0.7,
                           colsample_bytree=0.9)
xgb_cl.fit(X_train_res, y_train_res)

yhat_xgb = xgb_cl.predict(X_val)

score_table_xgb = metric_scores(y_val, yhat_xgb)
score_table_xgb

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,F1
0,0.630527,0.731948,0.231189,0.866303,0.364977


## 7.5 Balanced Random Forest

In [17]:
# Balanced random forest (imbalanced-learn) trained on the resampled training set.
brf_clf = bal_rf(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
brf_clf.fit(X_train_res, y_train_res)

# Score the predictions on the validation set.
yhat_brf = brf_clf.predict(X_val)
score_table_brf = metric_scores(y_val, yhat_brf)

score_table_brf

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,F1
0,0.621081,0.742359,0.231682,0.903019,0.368755
