In [1]:
import pandas as pd 
import numpy as np
from pprint import pprint
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


## Import data

In [2]:
train_origin = pd.read_csv('../data/featured_data/train.csv')
train_origin.shape

(260753, 297)

In [3]:
X_train_origin = train_origin.drop('QuoteConversion_Flag',axis=1)
y_train_origin = train_origin['QuoteConversion_Flag']

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [5]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((208602, 296), (52151, 296), (208602,), (52151,))

## 1st iteration: under sampling

In [6]:
rus = RUS(random_state=1) 
X_train_res,y_train_res=rus.fit_resample(X_train,y_train)

In [7]:
X_train_res.shape, X_val.shape, y_train_res.shape, y_val.shape

((78268, 296), (52151, 296), (78268,), (52151,))

In [8]:
def modeling2(model_name):
    model=model_name()
    model.fit(X_train_res,y_train_res)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))
    importance_list = sorted(zip(model.feature_importances_, X_train_origin.columns.to_list()), reverse=True)
    print("Feature importance: ")
    pprint(importance_list[:10])

In [9]:
modeling2(RF) # 3rd # a bit better than baseline: 0.8768362837804653
# (0.03616844947292094, 'PropertyField29'), not dropping it is correct! 

Confusion_Matrix:
[[35785  6606]
 [  867  8893]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.91     42391
           1       0.57      0.91      0.70      9760

    accuracy                           0.86     52151
   macro avg       0.78      0.88      0.80     52151
weighted avg       0.90      0.86      0.87     52151

Accuracy:0.8567045694234051
AUC_ROC:0.8776665339089529
Feature importance: 
[(0.07093536290027179, 'SalesField5'),
 (0.048535117714417125, 'PropertyField37'),
 (0.04636423344252919, 'PersonalField10A'),
 (0.04015843486192391, 'PersonalField10B'),
 (0.03616844947292094, 'PropertyField29'),
 (0.035042073376632565, 'PersonalField12'),
 (0.033452365760272613, 'SalesField4'),
 (0.030120368162084424, 'PersonalField9'),
 (0.015283127586114798, 'SalesField1B'),
 (0.01486888669148291, 'SalesField1A')]


In [10]:
modeling2(LGBC) # 1st # a bit better than baseline: 0.8896402504436643
# (85, 'PropertyField29'), not dropping it is correct! 

Confusion_Matrix:
[[37480  4911]
 [  982  8778]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.93     42391
           1       0.64      0.90      0.75      9760

    accuracy                           0.89     52151
   macro avg       0.81      0.89      0.84     52151
weighted avg       0.91      0.89      0.89     52151

Accuracy:0.8870012080305267
AUC_ROC:0.891767591694185
Feature importance: 
[(166, 'PersonalField9'),
 (121, 'SalesField1B'),
 (85, 'PropertyField29'),
 (82, 'SalesField5'),
 (80, 'SalesField6'),
 (77, 'PropertyField37'),
 (71, 'SalesField1A'),
 (58, 'PersonalField84'),
 (53, 'PersonalField10B'),
 (53, 'PersonalField10A')]


In [11]:
modeling2(XGBC) # 2nd # a bit better than baseline AUC_ROC:0.8809186656539763
# (0.08754908, 'PropertyField29'), not dropping it is correct! 

Confusion_Matrix:
[[36649  5742]
 [  996  8764]]
              precision    recall  f1-score   support

           0       0.97      0.86      0.92     42391
           1       0.60      0.90      0.72      9760

    accuracy                           0.87     52151
   macro avg       0.79      0.88      0.82     52151
weighted avg       0.90      0.87      0.88     52151

Accuracy:0.8707982589020344
AUC_ROC:0.8812487697473675
Feature importance: 
[(0.16280703, 'SalesField5'),
 (0.10573273, 'PersonalField9'),
 (0.08754908, 'PropertyField29'),
 (0.07197702, 'PropertyField37'),
 (0.053697627, 'PersonalField1'),
 (0.046497922, 'SalesField1B'),
 (0.044303063, 'PersonalField2'),
 (0.026811315, 'PropertyField34'),
 (0.026312828, 'PersonalField10B'),
 (0.025681589, 'PropertyField35')]
