In [1]:
import pandas as pd 
import numpy as np
from pprint import pprint
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


## Import data

In [2]:
train_origin = pd.read_csv('../data/featured_data/train.csv')
train_origin.shape

(260753, 18)

In [3]:
X_train_origin = train_origin.drop('QuoteConversion_Flag',axis=1)
y_train_origin = train_origin['QuoteConversion_Flag']

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [5]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((208602, 17), (52151, 17), (208602,), (52151,))

## 1st iteration: under sampling

In [6]:
rus = RUS(random_state=1) 
X_train_res,y_train_res=rus.fit_resample(X_train,y_train)

In [7]:
X_train_res.shape, X_val.shape, y_train_res.shape, y_val.shape

((78268, 17), (52151, 17), (78268,), (52151,))

In [8]:
def modeling2(model_name):
    model=model_name()
    model.fit(X_train_res,y_train_res)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))
    importance_list = sorted(zip(model.feature_importances_, X_train_origin.columns.to_list()), reverse=True)
    print("Feature importance: ")
    pprint(importance_list[:10])

In [9]:
modeling2(RF) # 3rd # a bit worse than 2nd iter: 0.8765639822248072

Confusion_Matrix:
[[36295  6096]
 [ 1306  8454]]
              precision    recall  f1-score   support

           0       0.97      0.86      0.91     42391
           1       0.58      0.87      0.70      9760

    accuracy                           0.86     52151
   macro avg       0.77      0.86      0.80     52151
weighted avg       0.89      0.86      0.87     52151

Accuracy:0.8580660006519529
AUC_ROC:0.8611922076136638
Feature importance: 
[(0.16170486406259307, 'Field7'),
 (0.14525574568320862, 'SalesField5'),
 (0.09556599104988064, 'PropertyField37'),
 (0.08791906145486568, 'SalesField1B'),
 (0.06675894501979951, 'PersonalField10A'),
 (0.06079400186967433, 'PersonalField10B'),
 (0.06055120560411137, 'PersonalField10B.1'),
 (0.057630070346702096, 'PropertyField29'),
 (0.052508615661978154, 'SalesField6'),
 (0.04401829851122269, 'SalesField1A')]


In [10]:
modeling2(LGBC) # 1st # a bit worse than 2nd iter: 0.8901793222521329

Confusion_Matrix:
[[36929  5462]
 [ 1050  8710]]
              precision    recall  f1-score   support

           0       0.97      0.87      0.92     42391
           1       0.61      0.89      0.73      9760

    accuracy                           0.88     52151
   macro avg       0.79      0.88      0.82     52151
weighted avg       0.91      0.88      0.88     52151

Accuracy:0.8751318287281165
AUC_ROC:0.8817849641181955
Feature importance: 
[(485, 'Field7'),
 (384, 'PersonalField10B'),
 (325, 'SalesField1B'),
 (262, 'SalesField6'),
 (225, 'SalesField1A'),
 (190, 'PersonalField10A'),
 (184, 'PersonalField82'),
 (168, 'SalesField5'),
 (142, 'PersonalField84'),
 (126, 'PersonalField9')]


In [11]:
modeling2(XGBC) # 2nd # a bit worse than 2nd iter AUC_ROC:0.8816185077949193

Confusion_Matrix:
[[36430  5961]
 [ 1031  8729]]
              precision    recall  f1-score   support

           0       0.97      0.86      0.91     42391
           1       0.59      0.89      0.71      9760

    accuracy                           0.87     52151
   macro avg       0.78      0.88      0.81     52151
weighted avg       0.90      0.87      0.88     52151

Accuracy:0.8659277866196238
AUC_ROC:0.8768726414921044
Feature importance: 
[(0.2569554, 'SalesField5'),
 (0.13814019, 'PersonalField10A'),
 (0.12736575, 'PropertyField37'),
 (0.12518091, 'PersonalField10B'),
 (0.123644345, 'Field7'),
 (0.05150102, 'PersonalField1'),
 (0.050158743, 'PersonalField2'),
 (0.03511158, 'SalesField1B'),
 (0.021583091, 'PersonalField12'),
 (0.020189526, 'PersonalField9')]
