In [1]:
import pandas as pd 
import numpy as np
from pprint import pprint
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


## Import data

In [2]:
train_origin = pd.read_csv('../data/featured_data/train.csv')
train_origin.shape

(260753, 18)

In [3]:
X_train_origin = train_origin.drop('QuoteConversion_Flag',axis=1)
y_train_origin = train_origin['QuoteConversion_Flag']

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [5]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((208602, 17), (52151, 17), (208602,), (52151,))

## 1st iteration: under sampling

In [6]:
rus = RUS(random_state=1) 
X_train_res,y_train_res=rus.fit_resample(X_train,y_train)

In [7]:
X_train_res.shape, X_val.shape, y_train_res.shape, y_val.shape

((78268, 17), (52151, 17), (78268,), (52151,))

In [8]:
def modeling2(model_name):
    model=model_name()
    model.fit(X_train_res,y_train_res)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))
    importance_list = sorted(zip(model.feature_importances_, X_train_origin.columns.to_list()), reverse=True)
    print("Feature importance: ")
    pprint(importance_list[:10])

In [9]:
modeling2(LGBC) # a bit worse than previous best iter: 0.891767591694185
# conclusion: use 1st iter wiht LGBC and undersampling!

Confusion_Matrix:
[[36983  5408]
 [ 1070  8690]]
              precision    recall  f1-score   support

           0       0.97      0.87      0.92     42391
           1       0.62      0.89      0.73      9760

    accuracy                           0.88     52151
   macro avg       0.79      0.88      0.82     52151
weighted avg       0.91      0.88      0.88     52151

Accuracy:0.8757837817108013
AUC_ROC:0.881397301603998
Feature importance: 
[(479, 'Field7'),
 (403, 'PersonalField10B'),
 (345, 'SalesField1B'),
 (284, 'SalesField6'),
 (225, 'SalesField1A'),
 (182, 'SalesField5'),
 (160, 'PersonalField82'),
 (145, 'PersonalField10A'),
 (141, 'SalesField4'),
 (129, 'PersonalField12')]
