In [10]:
import pandas as pd 
import numpy as np
from pprint import pprint
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

## Import data

In [11]:
# Read the feature-engineered training table and show its (rows, columns) shape.
train_path = '../data/featured_data/train.csv'
train_origin = pd.read_csv(train_path)
train_origin.shape

(260753, 297)

In [12]:
# Separate the label column from the remaining columns, which serve as features.
target_col = 'QuoteConversion_Flag'
y_train_origin = train_origin[target_col]
X_train_origin = train_origin.drop(columns=target_col)

In [13]:
# Sanity check: features and labels must have the same number of rows.
(X_train_origin.shape, y_train_origin.shape)

((260753, 296), (260753,))

## Final iteration: under-sampling + LightGBM (LGBC) tuning

In [15]:
# Randomly under-sample the training data; the fixed seed keeps the drawn
# subset identical between runs.
rus = RUS(random_state=1)
resampled = rus.fit_resample(X_train_origin, y_train_origin)
X_train_res, y_train_res = resampled

In [16]:
# Shapes after under-sampling (features, labels).
(X_train_res.shape, y_train_res.shape)

((97788, 296), (97788,))

In [17]:
# Discrete search space for the LightGBM classifier. Every entry is a list, so
# the randomized search draws candidates from the 43,740 possible combinations.
# NOTE(review): with max_depth=4 a tree can hold at most 2**4 = 16 leaves, so
# the num_leaves values (20-40) presumably have no effect for that depth -
# confirm against the LightGBM parameter docs.
param_LGBC = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40],
    'max_depth': [4, 6, 8],
    'num_leaves': [20, 30, 40],
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight': [0.001, 0.002],
    'feature_fraction': [0.6, 0.8, 1],
    'bagging_fraction': [0.8, 0.9, 1],
    'bagging_freq': [2, 3, 4],
    'cat_smooth': [0, 10, 20]
}
lgbc = LGBC()

# Evaluate 40 sampled configurations with 3-fold cross-validation on the
# under-sampled data, using all available cores.
# random_state pins WHICH 40 configurations are drawn; without it every
# Restart & Run All samples a different subset and the reported "best"
# estimator cannot be reproduced. The seed matches the one used for the
# under-sampler.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy for classifiers). `roc_auc_score` is
# imported at the top of the notebook - confirm accuracy is the intended
# selection metric.
clf_lgbc = RandomizedSearchCV(
    lgbc,
    param_distributions=param_LGBC,
    n_iter=40,
    cv=3,
    n_jobs=-1,
    random_state=1,
)
clf_lgbc.fit(X_train_res, y_train_res)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=None, reg_alpha=0.0,
                                            reg_lambda=0.0, sile...
                   param_distributions={'bagging_fraction': [0.8, 0.9, 1],
                                        'bagging_f

In [19]:
# Display the estimator selected by the randomized search (the configuration
# with the best mean cross-validation score).
clf_lgbc.best_estimator_

LGBMClassifier(bagging_fraction=1, bagging_freq=2, boosting_type='gbdt',
               cat_smooth=20, class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.6, importance_type='split', learning_rate=1,
               max_depth=4, min_child_samples=18, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=20, n_jobs=-1, num_leaves=30,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [21]:
# Show the raw per-candidate cross-validation results (timings, scores, ranks)
# as a dict of arrays.
# NOTE(review): this prints every array in full; a tabular view such as
# pd.DataFrame(clf_lgbc.cv_results_).sort_values('rank_test_score').head()
# would be easier to read.
clf_lgbc.cv_results_

{'mean_fit_time': array([11.80660804, 10.18054303,  8.08536235,  8.11963701,  7.40771206,
         6.32577697,  9.34397205,  9.35843301,  9.93049296,  6.44843547,
         5.84738175,  7.08948302,  8.73938306,  5.36283739,  7.48543827,
         5.1350522 ,  5.16575813,  7.57434082,  5.57659896,  6.12486267,
         5.42443458,  5.95842894,  6.6205767 ,  5.54972188,  6.25267506,
         5.17270716,  4.87337049,  5.63703553,  7.97144365,  8.04095928,
         6.12503926,  5.35108217,  5.03112364,  7.51097902,  7.95025071,
         5.12743823,  7.93787193,  5.72347562,  4.80335069,  5.67546097]),
 'std_fit_time': array([0.22311171, 0.67725563, 1.1759908 , 0.17417809, 0.11414198,
        0.23503971, 0.3219782 , 0.14692747, 0.27511704, 0.40773481,
        0.03133447, 0.15923109, 0.25446308, 0.14614858, 0.29310689,
        0.09994276, 0.19524765, 0.06986526, 0.16217849, 0.14298995,
        0.14596904, 0.11180176, 0.18328945, 0.18743012, 0.09864331,
        0.13337366, 0.04869075, 0.0343773

## Train the best model

In [22]:
# Rebuild the winning configuration by hand so the final model can be trained
# without re-running the search. The values are copied from the
# `clf_lgbc.best_estimator_` output recorded in this notebook.
# NOTE(review): because the values are pinned, they will not follow the search
# if it is re-run and picks a different configuration.

# Hyper-parameters that were part of the search space.
searched_params = dict(
    learning_rate=1,
    n_estimators=20,
    max_depth=4,
    num_leaves=30,
    min_child_samples=18,
    min_child_weight=0.001,
    feature_fraction=0.6,
    bagging_fraction=1,
    bagging_freq=2,
    cat_smooth=20,
)

# Arguments that were not searched; kept exactly as they appear in the repr.
# NOTE(review): `silent` is believed to have been removed from the
# scikit-learn wrapper in newer LightGBM releases - confirm against the
# installed version before upgrading.
untouched_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    min_split_gain=0.0,
    n_jobs=-1,
    objective=None,
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    silent=True,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

best_lgbc = LGBC(**searched_params, **untouched_params)

In [23]:
# Train the pinned configuration on the full under-sampled training set
# (no hold-out split is made in this cell).
best_lgbc.fit(X_train_res,y_train_res)

LGBMClassifier(bagging_fraction=1, bagging_freq=2, boosting_type='gbdt',
               cat_smooth=20, class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.6, importance_type='split', learning_rate=1,
               max_depth=4, min_child_samples=18, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=20, n_jobs=-1, num_leaves=30,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [24]:
# Persist the fitted classifier to disk; joblib.dump returns the list of
# written file names, which is shown as the cell output.
model_path = '../models/lgbc.m'
joblib.dump(best_lgbc, model_path)

['../models/lgbc.m']