# 0.0 Imports

In [None]:
import pickle
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scikitplot as skplt
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedShuffleSplit as sss
from sklearn.tree import DecisionTreeClassifier

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier,\
                              RUSBoostClassifier, EasyEnsembleClassifier

## 0.1 Load data

In [None]:
# Project root; every data and model path below is built by appending to it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
home_path = "/home/marcos/Documentos/comunidade_DS/pa004_health_insurance_cross_sell/"

In [None]:
# Load the pickled training frame from the project's interim/ folder.
# NOTE(review): file names suggest a balanced, tree-oriented split produced
# by an earlier step — confirm against the notebook that wrote them.
df_tree_train = pd.read_pickle(home_path + "interim/df6_bal_tree_train.pkl")

# Load the matching validation frame.
df_tree_validation = pd.read_pickle(home_path + "interim/df6_bal_tree_val.pkl")

## 0.2 Separate train and validation data

In [None]:
# Work on copies so the frames loaded above are not modified by later cells.
df7_train = df_tree_train.copy()

df7_validation = df_tree_validation.copy()

In [None]:
# Number of missing values per column in the training data.
df7_train.isna().sum()

In [None]:
# Number of missing values per column in the validation data.
df7_validation.isna().sum()

In [None]:
# (rows, columns) of the validation data before any rows are dropped.
df7_validation.shape

I will drop the rows containing NAs from df7_validation. Since it contains over 76k rows, this should not make a big difference.

In [None]:
# Keep only the validation rows without any missing value, then re-count
# NAs per column to confirm that none remain.
df7_validation = df7_validation.dropna(axis=0)
df7_validation.isna().sum()

In [None]:
# (rows, columns) of the validation data after dropping rows with NAs.
df7_validation.shape

## 0.3 Helper Functions

In [None]:
# precision_at_k
def precision_at_k(data, k):
    """Return the precision within the first ``k`` rows of ``data``.

    ``data`` is expected to be already ordered from the highest to the lowest
    score; the function itself does not sort. Precision at k is the share of
    positive ``response`` values among the first ``k`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``response`` column holding 0/1 labels. It is not modified.
    k : int
        Number of top rows to consider, ``1 <= k <= len(data)``.

    Returns
    -------
    float
        ``sum(response[:k]) / k``.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1..len(data)``.
    """
    if not 1 <= k <= len(data):
        raise ValueError(
            "k must be between 1 and the number of rows ({}), got {}".format(len(data), k)
        )

    # Positional slicing: the k-th ranked row sits at position k - 1, so the
    # first k rows are exactly [:k] regardless of the frame's index labels.
    top_k_responses = data['response'].iloc[:k]
    return top_k_responses.sum() / k

# recall_at_k
def recall_at_k(data, k):
    """Return the recall reached within the first ``k`` rows of ``data``.

    ``data`` is expected to be already ordered from the highest to the lowest
    score; the function itself does not sort. Recall at k is the number of
    positive ``response`` values among the first ``k`` rows divided by the
    number of positives in the whole frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``response`` column holding 0/1 labels. It is not modified.
    k : int
        Number of top rows to consider, ``1 <= k <= len(data)``.

    Returns
    -------
    float
        ``sum(response[:k]) / sum(response)``; not a finite number when the
        frame contains no positive response.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1..len(data)``.
    """
    if not 1 <= k <= len(data):
        raise ValueError(
            "k must be between 1 and the number of rows ({}), got {}".format(len(data), k)
        )

    responses = data['response']
    # Positional slicing: the k-th ranked row sits at position k - 1.
    return responses.iloc[:k].sum() / responses.sum()

# model predict
def model_evaluate(model, model_name, data_train, data_val, k):
    """Fit ``model`` on the training data and rank the validation data.

    The ``id`` and ``response`` columns are removed from both frames to build
    the feature matrices. After fitting, the validation rows are scored with
    the second column of ``predict_proba``, sorted from highest to lowest
    score, and evaluated with ``precision_at_k`` and ``recall_at_k``. A
    cumulative gain chart is drawn as a side effect.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``; fitted in place.
    model_name : label written to the 'Model name' column of the result.
    data_train, data_val : DataFrames containing ``id``, ``response`` and the
        feature columns. Neither frame is modified.
    k : ranking cut-off forwarded to the two metric helpers.

    Returns
    -------
    pandas.DataFrame
        One row with columns 'Model name', 'precision_at_k', 'recall_at_k'.
    """
    non_feature_cols = ['id', 'response']
    features_train = data_train.drop(columns=non_feature_cols)
    target_train = data_train['response']
    features_val = data_val.drop(columns=non_feature_cols)
    target_val = data_val['response']

    model.fit(features_train, target_train)
    class_probas = model.predict_proba(features_val)

    # Attach the positive-class probability as the score and order the
    # validation rows from most to least likely.
    ranked_val = (
        data_val.copy()
        .assign(score=class_probas[:, 1].tolist())
        .sort_values('score', ascending=False)
    )

    skplt.metrics.plot_cumulative_gain(target_val, class_probas);

    metrics_row = {'Model name': model_name,
                   'precision_at_k': precision_at_k(ranked_val, k),
                   'recall_at_k': recall_at_k(ranked_val, k)}
    return pd.DataFrame(metrics_row, index=[0])

# model fit
def model_fit(model, data):
    """Fit ``model`` on ``data`` and return whatever ``model.fit`` returns.

    The features are every column of ``data`` except ``id`` and ``response``;
    the target is the ``response`` column. ``data`` itself is not modified.
    """
    features = data.drop(columns=['id', 'response'])
    target = data['response']
    return model.fit(features, target)



def cross_validation(model, model_name, training_data, k_top, kfolds, verbose=False, random_state=None):
    """Estimate precision@k and recall@k of ``model`` over stratified shuffle splits.

    For each of the ``kfolds`` splits the model is fitted on the training part
    (without the ``id`` column), the held-out part is scored with the second
    column of ``predict_proba`` and sorted from highest to lowest score, and
    both ranking metrics are computed at ``k_top``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``; refitted in
        place on every split.
    model_name : label written to the 'Model name' column of the result.
    training_data : pandas.DataFrame with ``id``, ``response`` and the feature
        columns. It is not modified.
    k_top : ranking cut-off forwarded to ``precision_at_k``/``recall_at_k``;
        each held-out part must be large enough to be evaluated at this value.
    kfolds : number of shuffle splits.
    verbose : when True, print the metrics of every split as it finishes.
    random_state : forwarded to ``StratifiedShuffleSplit``; pass an int to get
        the same splits on every call. The default ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row with columns 'Model name', 'precision_at_k', 'recall_at_k';
        the two metrics are strings of the form 'mean+/-std' rounded to four
        decimals.
    """
    # separate X and Y data:
    xtraining = training_data.drop(['response'], axis=1)
    ytraining = training_data.response

    # cross-validation:
    cv = sss(n_splits=kfolds, random_state=random_state)
    prec_k_list = []
    rec_k_list = []
    for fold_nr, (train_index, prim_val_index) in enumerate(cv.split(xtraining, ytraining), start=1):
        X_training, X_prim_val = xtraining.iloc[train_index], xtraining.iloc[prim_val_index]
        y_training, y_prim_val = ytraining.iloc[train_index], ytraining.iloc[prim_val_index]

        # the id column is not a feature: drop it for fitting and scoring,
        # but keep X_prim_val intact to rebuild the held-out frame below
        X_training = X_training.drop(['id'], axis=1)
        X_prim_val_no_id = X_prim_val.drop(['id'], axis=1)

        # fit and predict_proba
        model.fit(X_training, y_training)
        yhat_proba = model.predict_proba(X_prim_val_no_id)

        # positive-class probability as a flat list
        yhat_proba_1d = yhat_proba[:, 1].tolist()

        # reconstruct the held-out dataframe and rank it by score
        prim_val = pd.concat([X_prim_val, y_prim_val], axis=1)
        prim_val['score'] = yhat_proba_1d
        prim_val = prim_val.sort_values('score', ascending=False)

        # evaluate the ranking metrics and store them
        fold_precision = precision_at_k(prim_val, k_top)
        fold_recall = recall_at_k(prim_val, k_top)
        prec_k_list.append(fold_precision)
        rec_k_list.append(fold_recall)

        if verbose:
            print('{} - split {}/{}: precision_at_k={:.4f}, recall_at_k={:.4f}'.format(
                model_name, fold_nr, kfolds, fold_precision, fold_recall))

    # summarise each metric as 'mean+/-std'
    prec_k_pred = np.round(np.mean(prec_k_list), 4).astype(str) + '+/-' + np.round(np.std(prec_k_list), 4).astype(str)
    rec_k_pred = np.round(np.mean(rec_k_list), 4).astype(str) + '+/-' + np.round(np.std(rec_k_list), 4).astype(str)

    return pd.DataFrame({'Model name':model_name,
                         'precision_at_k':prec_k_pred,
                         'recall_at_k':rec_k_pred}, index=[0])

# 7.0 Machine Learning Model

## 7.1 Models

### 7.1.1 Balanced Bagging

In [None]:
# Balanced bagging ensemble built on decision trees.
bal_bag = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=150,
    bootstrap=True,
    sampling_strategy=0.15,
    replacement=True,
    random_state=30,
    n_jobs=-1,
)

# Fit on the training data and rank the validation data at k = 20000.
b_bag_rank = model_evaluate(
    model=bal_bag,
    model_name="Balanced Bagging Classifier",
    data_train=df7_train,
    data_val=df7_validation,
    k=20000,
)

b_bag_rank

### 7.1.2 Balanced Random Forest Classifier

In [None]:
# Balanced random forest with depth-limited trees.
bal_rf = BalancedRandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    bootstrap=True,
    sampling_strategy=0.15,
    replacement=True,
    random_state=30,
    n_jobs=-1,
)

# Fit on the training data and rank the validation data at k = 20000.
b_rf_rank = model_evaluate(
    model=bal_rf,
    model_name="Balanced Random Forest Classifier",
    data_train=df7_train,
    data_val=df7_validation,
    k=20000,
)

b_rf_rank

### 7.1.3 Random Under Sampling Boost Classifier

In [None]:
# RUSBoost ensemble using depth-1 decision trees as base learners.
rus = RUSBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=150,
    sampling_strategy=0.15,
    replacement=True,
    random_state=50,
)

# Fit on the training data and rank the validation data at k = 20000.
rus_rank = model_evaluate(
    model=rus,
    model_name="Random Under Sampling Boost Classifier",
    data_train=df7_train,
    data_val=df7_validation,
    k=20000,
)

rus_rank

### 7.1.4 Easy Ensemble Classifier

In [None]:
# Easy Ensemble with AdaBoost base learners.
easy_ens = EasyEnsembleClassifier(
    n_estimators=150,
    base_estimator=AdaBoostClassifier(),
    sampling_strategy=0.15,
    replacement=True,
    n_jobs=-1,
    random_state=50,
)

# Fit on the training data and rank the validation data at k = 20000.
e_ens_rank = model_evaluate(
    model=easy_ens,
    model_name="Easy Ensemble Classifier",
    data_train=df7_train,
    data_val=df7_validation,
    k=20000,
)

e_ens_rank

## 7.2 Cross-validation

### 7.2.1 Balanced Bagging

In [None]:
# Fresh, unfitted balanced bagging ensemble for cross-validation.
bal_bag = BalancedBaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators=150,
                                    bootstrap=True, sampling_strategy=0.15, replacement=True,
                                    random_state=30, n_jobs=-1)

# 5 stratified shuffle splits of the training data, metrics at k = 20000.
b_bag_cv_metrics = cross_validation(bal_bag, "Balanced Bagging Classifier", df7_train, 20000, 5)

# Display the variable assigned above; the cell previously referenced the
# undefined name `bal_bag_cv_metrics`, which raises NameError on a fresh kernel.
b_bag_cv_metrics

### 7.2.2 Balanced Random Forest

In [None]:
# Fresh, unfitted balanced random forest for cross-validation.
bal_rf = BalancedRandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    bootstrap=True,
    sampling_strategy=0.15,
    replacement=True,
    random_state=30,
    n_jobs=-1,
)

# 5 stratified shuffle splits of the training data, metrics at k = 20000.
bal_rf_cv_metrics = cross_validation(
    model=bal_rf,
    model_name="Balanced Random Forest Classifier",
    training_data=df7_train,
    k_top=20000,
    kfolds=5,
)

bal_rf_cv_metrics

### 7.2.3 RUS Boost

In [None]:
# Fresh, unfitted RUSBoost ensemble for cross-validation.
rus = RUSBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=150,
    sampling_strategy=0.15,
    replacement=True,
    random_state=50,
)

# 5 stratified shuffle splits of the training data, metrics at k = 20000.
rus_cv_metrics = cross_validation(
    model=rus,
    model_name="Random Under Sampling Classifier",
    training_data=df7_train,
    k_top=20000,
    kfolds=5,
)

rus_cv_metrics

### 7.2.4 Easy Ensemble

In [None]:
# Fresh, unfitted Easy Ensemble for cross-validation.
easy_ens = EasyEnsembleClassifier(
    n_estimators=150,
    base_estimator=AdaBoostClassifier(),
    sampling_strategy=0.15,
    replacement=True,
    n_jobs=-1,
    random_state=50,
)

# 5 stratified shuffle splits of the training data, metrics at k = 20000.
e_ens_cv_metrics = cross_validation(
    model=easy_ens,
    model_name="Easy Ensemble Classifier",
    training_data=df7_train,
    k_top=20000,
    kfolds=5,
)

e_ens_cv_metrics

# 8.0 Hyperparameter Fine Tuning

## 8.1 XGBoost Classifier

In [None]:
# Random-search space: each key is an XGBClassifier keyword argument and each
# list holds the candidate values one of which is drawn per iteration.
param = {'n_estimators':[2000, 2500, 3000, 3500],
         'eta':[0.01, 0.03],
         'gamma':[3, 10, 20],
         'max_depth':[3, 5, 9],
         'subsample':[0.1, 0.5, 0.7],
         'colsample_bytree':[0.3, 0.7, 0.9],
         'min_child_weight':[3, 8, 15],
        }

# Number of random hyperparameter combinations to evaluate.
MAX_EVAL = 5

In [None]:
# Seed the sampler so the same hyperparameter combinations are drawn on every
# run; without it the rows of final_result change between executions.
random.seed(42)

hyperparam_lst = []   # hyperparam_lst[i] holds the combination drawn in iteration i
cv_results = []       # one single-row result frame per iteration

for i in range(MAX_EVAL):
    # randomly select one candidate value per hyperparameter
    hp = {k: random.choice(v) for k, v in param.items()}
    # print the selected ones
    print(hp)
    # store in list
    hyperparam_lst.append(hp.copy())
    # model: fixed settings plus the sampled hyperparameters (the keys of
    # `param` are exactly the XGBClassifier keyword names)
    model_xgb = xgb.XGBClassifier(objective='binary:logistic',
                                  use_label_encoder=False,
                                  n_jobs=-1,
                                  verbosity=0,
                                  **hp)
    result = cross_validation(model_xgb, 'XGBoost Classifier', df7_train, 20000, 5)
    # tag the row with its iteration number so it can be mapped back to hyperparam_lst
    cv_results.append(result.assign(Iter_nr=i))

# build the result frame once instead of growing it inside the loop
final_result = pd.concat(cv_results, ignore_index=True)

# precision_at_k is a 'mean+/-std' string; rank on the numeric mean rather
# than on the lexicographic order of the strings
mean_precision = final_result['precision_at_k'].map(lambda s: float(s.split('+/-')[0]))
ranking = mean_precision.sort_values(ascending=False).index
final_result = final_result.loc[ranking].reset_index(drop=True)
final_result

In [None]:
# Persist the ranked search results under the project's interim/ folder.
final_result.to_pickle(home_path + 'interim/xgb_tunning_new_tree_cv.pkl')

In [None]:
# select the hyperparameters that resulted in best accuracy
best_param_nr = final_result.loc[0, 'Iter_nr']

param_tuned = hyperparam_lst[best_param_nr]

In [None]:
#model
# model: fixed settings plus the top-ranked hyperparameters. param_tuned
# carries exactly the seven tuned keyword arguments defined in `param`.
model_xgb_tuned = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    n_jobs=-1,
    verbosity=0,
    **param_tuned,
)

In [None]:
# Fit the tuned model on the training data and rank the validation data at k = 20000.
xgb_tuned_ranking = model_evaluate(
    model=model_xgb_tuned,
    model_name="XGBoost Classifier",
    data_train=df7_train,
    data_val=df7_validation,
    k=20000,
)

xgb_tuned_ranking

In [None]:
# Fit the tuned model on the full training frame and keep the fitted estimator.
# NOTE(review): model_evaluate already called fit on this same data, so this
# looks like a second, identical fit — confirm it is intended.
xgb_tuned = model_fit(model_xgb_tuned, df7_train)

In [None]:
# Persist the fitted tuned model. The context manager closes the file handle
# even if pickling fails; the bare open() call previously never closed it.
with open(home_path + "ml_models_comp/xgb_tuned_new_tree.pkl", "wb") as model_file:
    pickle.dump(xgb_tuned, model_file)

### 8.1.1 Validation

In [None]:
# Evaluate every sampled combination on the validation data, visiting them in
# the order in which they are ranked in final_result.
for j in final_result['Iter_nr']:
    param_dict = hyperparam_lst[j]

    # model: fixed settings plus this iteration's hyperparameters
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        n_jobs=-1,
        verbosity=0,
        **param_dict,
    )

    # print results
    print(param_dict)
    xgb_ranking = model_evaluate(
        model=model,
        model_name="XGBoost Classifier",
        data_train=df7_train,
        data_val=df7_validation,
        k=20000,
    )
    display(xgb_ranking)

All models perform very similarly. I will take the fourth one: {'n_estimators': 2000, 'eta': 0.01, 'gamma': 10, 'max_depth': 9, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 15}. This is the 4th element (index=3) of final_result, which corresponds to Iter_nr=2.

In [None]:
# Combination chosen after the validation run: the row at index 3 of final_result.
j = final_result.loc[3, 'Iter_nr']
param_dict = hyperparam_lst[j]

# model: fixed settings plus the chosen hyperparameters
model = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    n_jobs=-1,
    verbosity=0,
    **param_dict,
)

# Fit on the training data and rank the validation data at k = 20000.
xgb_ranking = model_evaluate(
    model=model,
    model_name="XGBoost Classifier",
    data_train=df7_train,
    data_val=df7_validation,
    k=20000,
)
xgb_ranking

In [None]:
# Fit the chosen model on the full training frame and keep the fitted estimator.
xgb_validated = model_fit(model, df7_train)

In [None]:
# Persist the fitted, validated model. The context manager closes the file
# handle even if pickling fails; the bare open() call previously never closed it.
with open(home_path + "ml_models_comp/xgb_validated_new_tree.pkl", "wb") as model_file:
    pickle.dump(xgb_validated, model_file)