# 0.0 Imports

In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scikitplot as skplt
from sklearn.model_selection import StratifiedShuffleSplit as sss

from imblearn.ensemble import BalancedBaggingClassifier

## 0.1 Load data

In [2]:
# Project root used to build every data path below.
# Set the PA004_HOME environment variable to point the notebook at another
# checkout; when it is unset, the original local path is used unchanged.
# os.path.join(..., "") guarantees the trailing separator that the plain
# string concatenations (home_path + "interim/...") rely on.
home_path = os.path.join(
    os.environ.get(
        "PA004_HOME",
        "/home/marcos/Documentos/comunidade_DS/pa004_health_insurance_cross_sell/",
    ),
    "",
)

In [3]:
# Load the train and validation frames from the pickles stored under `interim/`.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced by this project or come from a trusted source.
df_tree_train = pd.read_pickle(home_path + "interim/df6_bal_tree_train.pkl")

df_tree_validation = pd.read_pickle(home_path + "interim/df6_bal_tree_val.pkl")

## 0.2 Separate train and validation data

In [4]:
# Work on copies so the frames loaded from disk stay untouched by later steps.
df7_train = df_tree_train.copy()

df7_validation = df_tree_validation.copy()

In [5]:
# Count missing values per column in the training frame.
df7_train.isna().sum()

vintage                 0
annual_premium          0
age                     0
region_code             0
policy_sales_channel    0
vehicle_hist            0
vehicle_damage          0
previously_insured      0
weight_ages             0
id                      0
response                0
dtype: int64

In [6]:
# Count missing values per column in the validation frame.
df7_validation.isna().sum()

vintage                 0
annual_premium          0
age                     0
region_code             0
policy_sales_channel    3
vehicle_hist            0
vehicle_damage          0
previously_insured      0
weight_ages             0
id                      0
response                0
dtype: int64

In [7]:
# Shape of the validation frame before rows with missing values are dropped.
df7_validation.shape

(76222, 11)

I will drop the rows containing NAs from df7_validation. Only 3 of its 76,222 rows are affected, so this should not make a meaningful difference.

In [8]:
# Keep only the rows without any missing value, then re-check the per-column
# NaN counts. Reassigning (instead of inplace=True) keeps the cell chainable.
df7_validation = df7_validation.dropna(axis=0)
df7_validation.isna().sum()

vintage                 0
annual_premium          0
age                     0
region_code             0
policy_sales_channel    0
vehicle_hist            0
vehicle_damage          0
previously_insured      0
weight_ages             0
id                      0
response                0
dtype: int64

In [9]:
# Shape of the validation frame after the drop, to confirm how many rows were removed.
df7_validation.shape

(76219, 11)

## 0.3 Helper Functions

In [10]:
# precision_at_k
def precision_at_k(data, k):
    """Return the precision over the first ``k`` rows of a ranked frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``response`` column (presumably 0/1 labels). The rows
        must already be ordered from best to worst score; no sorting is
        done here.
    k : int
        Number of top rows to evaluate, ``1 <= k <= len(data)``.

    Returns
    -------
    float
        Sum of ``response`` over the first ``k`` rows divided by ``k``.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1..len(data)``.
    """
    if not 1 <= k <= len(data):
        raise ValueError(
            "k must be between 1 and len(data) ({}), got {}".format(len(data), k)
        )

    # Positional slice: exactly the first k rows, whatever the index labels
    # are. (Looking up label k after a reset_index would address row k + 1.)
    # The caller's frame is never modified.
    top_k = data['response'].iloc[:k]
    return top_k.sum() / k

# recall_at_k
def recall_at_k(data, k):
    """Return the recall reached within the first ``k`` rows of a ranked frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``response`` column (presumably 0/1 labels). The rows
        must already be ordered from best to worst score; no sorting is
        done here.
    k : int
        Number of top rows to evaluate, ``1 <= k <= len(data)``.

    Returns
    -------
    float
        Sum of ``response`` over the first ``k`` rows divided by the sum of
        ``response`` over the whole frame. NaN when the frame contains no
        positive response at all.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1..len(data)``.
    """
    if not 1 <= k <= len(data):
        raise ValueError(
            "k must be between 1 and len(data) ({}), got {}".format(len(data), k)
        )

    responses = data['response']
    total_positives = responses.sum()
    if total_positives == 0:
        # Recall is undefined without positives; report NaN explicitly
        # instead of relying on a 0/0 division.
        return float('nan')

    # Positional slice: exactly the first k rows, whatever the index labels
    # are. (Looking up label k after a reset_index would address row k + 1.)
    return responses.iloc[:k].sum() / total_positives

# model predict
def model_evaluate(model, model_name, data_train, data_val, k):
    """Fit ``model`` on the training frame and rank-evaluate it on validation.

    The ``id`` and ``response`` columns are removed to build the feature
    matrices and ``response`` is used as the target. ``model`` is fitted in
    place. The second column of ``predict_proba`` (presumably the positive
    class) is stored as ``score`` on a copy of ``data_val``, which is sorted
    by descending score and handed to ``precision_at_k`` / ``recall_at_k``.
    A cumulative gain curve is drawn as a side effect.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    model_name : str
        Label written to the ``Model name`` column of the result.
    data_train, data_val : pandas.DataFrame
        Frames containing the features plus ``id`` and ``response``.
    k : int
        Cut-off forwarded to the ranking metrics.

    Returns
    -------
    pandas.DataFrame
        One row with ``Model name``, ``precision_at_k`` and ``recall_at_k``.
    """
    non_feature_cols = ['id', 'response']

    # features / target for both splits
    features_train = data_train.drop(non_feature_cols, axis=1)
    target_train = data_train.response
    features_val = data_val.drop(non_feature_cols, axis=1)
    target_val = data_val.response

    # train, then score the validation rows
    model.fit(features_train, target_train)
    probabilities = model.predict_proba(features_val)

    # attach the score to a copy of the validation frame and rank by it
    ranked_val = data_val.copy()
    ranked_val['score'] = probabilities[:, 1].tolist()
    ranked_val = ranked_val.sort_values('score', ascending=False)

    # cumulative gain curve needs the full two-column probability matrix
    skplt.metrics.plot_cumulative_gain(target_val, probabilities)

    summary = {
        'Model name': model_name,
        'precision_at_k': precision_at_k(ranked_val, k),
        'recall_at_k': recall_at_k(ranked_val, k),
    }
    return pd.DataFrame(summary, index=[0])

# model fit
def model_fit(model, data):
    """Fit ``model`` on ``data`` and return the result of ``model.fit``.

    Every column except ``id`` and ``response`` is used as a feature;
    ``response`` is the target. ``data`` itself is not modified.
    """
    features = data.drop(columns=['id', 'response'])
    return model.fit(features, data['response'])



def cross_validation(model, model_name, training_data, k_top, kfolds, verbose=False, random_state=None):
    """Estimate precision@k and recall@k over repeated stratified splits.

    The splitter is ``StratifiedShuffleSplit`` with its default test size,
    so the ``kfolds`` validation sets are independent random stratified
    samples rather than a partition of the data. For each split ``model`` is
    re-fitted in place on the training part, the validation part is scored
    with the second column of ``predict_proba`` (presumably the positive
    class), sorted by descending score and evaluated with
    ``precision_at_k`` / ``recall_at_k``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    model_name : str
        Label written to the ``Model name`` column of the result.
    training_data : pandas.DataFrame
        Frame containing the features plus ``id`` and ``response``.
    k_top : int
        Cut-off forwarded to the ranking metrics. It must be smaller than
        the number of rows in each validation split, otherwise the metric
        helpers raise.
    kfolds : int
        Number of shuffle splits.
    verbose : bool, default False
        When True, print the metrics of every split as it completes.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the splitter. Pass an int to make the splits (and
        therefore the reported numbers) reproducible; ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row with ``Model name``, ``precision_at_k`` and ``recall_at_k``,
        the metrics formatted as ``"<mean>+/-<std>"`` rounded to 4 decimals.
    """
    # separate X and Y data (id stays in X for now so it survives into the
    # reconstructed validation frame):
    xtraining = training_data.drop(['response'], axis=1)
    ytraining = training_data.response

    # cross-validation:
    cv = sss(n_splits=kfolds, random_state=random_state)
    prec_k_list = []
    rec_k_list = []
    for fold, (train_index, prim_val_index) in enumerate(cv.split(xtraining, ytraining), start=1):
        X_training, X_prim_val = xtraining.iloc[train_index], xtraining.iloc[prim_val_index]
        y_training, y_prim_val = ytraining.iloc[train_index], ytraining.iloc[prim_val_index]

        # id is an identifier, not a feature: remove it before fit/predict
        X_training = X_training.drop(['id'], axis=1)
        X_prim_val_no_id = X_prim_val.drop(['id'], axis=1)

        # fit and predict_proba
        model.fit(X_training, y_training)
        yhat_proba = model.predict_proba(X_prim_val_no_id)

        # reconstruct the validation frame (features + id + response) and
        # rank it by the second predict_proba column
        prim_val = pd.concat([X_prim_val, y_prim_val], axis=1)
        prim_val['score'] = yhat_proba[:, 1].tolist()
        prim_val = prim_val.sort_values('score', ascending=False)

        # evaluate the ranking metrics and store them
        fold_precision = precision_at_k(prim_val, k_top)
        fold_recall = recall_at_k(prim_val, k_top)
        prec_k_list.append(fold_precision)
        rec_k_list.append(fold_recall)

        if verbose:
            print("Split {}/{}: precision_at_k={:.4f}, recall_at_k={:.4f}".format(
                fold, kfolds, fold_precision, fold_recall))

    def _mean_std_label(values):
        # "<mean>+/-<std>", both rounded to 4 decimals
        return np.round(np.mean(values), 4).astype(str) + '+/-' + np.round(np.std(values), 4).astype(str)

    return pd.DataFrame({'Model name': model_name,
                         'precision_at_k': _mean_std_label(prec_k_list),
                         'recall_at_k': _mean_std_label(rec_k_list)}, index=[0])

# 7.0 Machine Learning Model

## 7.1 Cross-validation

### 7.1.1 Balanced Bagging

In [11]:
# Balanced bagging ensemble with a fixed estimator seed.
bal_bag = BalancedBaggingClassifier(
    n_estimators=150,
    bootstrap=True,
    sampling_strategy=0.15,
    replacement=True,
    random_state=30,
    n_jobs=-1,
)

# 5 stratified shuffle splits; ranking metrics are taken at the top 20,000
# rows of each validation split.
bal_bag_cv_metrics = cross_validation(
    model=bal_bag,
    model_name="Balanced Bagging Classifier",
    training_data=df7_train,
    k_top=20000,
    kfolds=5,
)

bal_bag_cv_metrics

Unnamed: 0,Model name,precision_at_k,recall_at_k
0,Balanced Bagging Classifier,0.1863+/-0.0001,0.9972+/-0.0006
