In [2]:
import pandas as pd 
import numpy as np
from pprint import pprint
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

## Import data

In [5]:
train_origin = pd.read_csv('../data/featured_data/train.csv')
train_origin.shape

(260753, 297)

In [6]:
X_train_origin = train_origin.drop('QuoteConversion_Flag',axis=1)
y_train_origin = train_origin['QuoteConversion_Flag']

In [7]:
X_train_origin.shape,  y_train_origin.shape

((260753, 296), (260753,))

## Final iteration: under sampling + LGBC tuning

In [8]:
rus = RUS(random_state=1) 
X_train_res,y_train_res=rus.fit_resample(X_train_origin,y_train_origin)

In [9]:
X_train_res.shape, y_train_res.shape

((97788, 296), (97788,))

In [10]:
def hyperopt_train_test(params):
    clf = LGBC(**params)
    return cross_val_score(clf, X_train_res, y_train_res).mean()


space4LGBC = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 1),
    'n_estimators': hp.choice('n_estimators', range(20, 40)),
    'max_depth': hp.choice('max_depth', range(4, 8)),
    'num_leaves': hp.choice('num_leaves', range(20, 40)),
    'min_child_samples': hp.choice('min_child_samples', range(18, 22)),
    'min_child_weight': hp.uniform('min_child_weight', 0.001, 0.002),
    'feature_fraction': hp.uniform('feature_fraction', 0.6, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.8, 1),
    'bagging_freq': hp.choice('bagging_freq', range(2, 4)),
    'cat_smooth': hp.choice('cat_smooth', range(18, 22))
}

def f(params):
    acc = hyperopt_train_test(params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4LGBC, algo=tpe.suggest, max_evals=100, trials=trials)
print('best:')
print(best)

100%|█████████████████████████████████████████████| 100/100 [12:36<00:00,  7.56s/trial, best loss: -0.8891888310011954]
best:
{'bagging_fraction': 0.9765317624503994, 'bagging_freq': 1, 'cat_smooth': 0, 'feature_fraction': 0.6185567199763784, 'learning_rate': 0.26388294296050524, 'max_depth': 3, 'min_child_samples': 3, 'min_child_weight': 0.0015244539361763455, 'n_estimators': 19, 'num_leaves': 7}


## Train the best model

In [11]:
best_lgbc = LGBC(bagging_fraction=0.9765317624503994, bagging_freq=1, boosting_type='gbdt',
               cat_smooth=0, class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.6185567199763784, importance_type='split', learning_rate=0.26388294296050524,
               max_depth=3, min_child_samples=3, min_child_weight=0.0015244539361763455,
               min_split_gain=0.0, n_estimators=19, n_jobs=-1, num_leaves=7,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [12]:
best_lgbc.fit(X_train_res,y_train_res)

LGBMClassifier(bagging_fraction=0.9765317624503994, bagging_freq=1,
               boosting_type='gbdt', cat_smooth=0, class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.6185567199763784,
               importance_type='split', learning_rate=0.26388294296050524,
               max_depth=3, min_child_samples=3,
               min_child_weight=0.0015244539361763455, min_split_gain=0.0,
               n_estimators=19, n_jobs=-1, num_leaves=7, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [14]:
joblib.dump(best_lgbc,'../models/lgbc.m')

['../models/lgbc.m']