In [1]:
import pandas as pd 
import numpy as np
from pprint import pprint
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


## Import data

In [5]:
train_origin = pd.read_csv('../data/featured_data/train.csv')
train_origin.shape

(260753, 297)

In [6]:
X_train_origin = train_origin.drop('QuoteConversion_Flag',axis=1)
y_train_origin = train_origin['QuoteConversion_Flag']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [8]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((208602, 296), (52151, 296), (208602,), (52151,))

## 1st iteration: under sampling

In [9]:
rus = RUS(random_state=1) 
X_train_res,y_train_res=rus.fit_resample(X_train,y_train)

In [10]:
X_train_res.shape, X_val.shape, y_train_res.shape, y_val.shape

((78268, 296), (52151, 296), (78268,), (52151,))

In [11]:
def modeling2(model_name):
    model=model_name()
    model.fit(X_train_res,y_train_res)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))
    importance_list = sorted(zip(model.feature_importances_, X_train_origin.columns.to_list()), reverse=True)
    print("Feature importance: ")
    pprint(importance_list[:10])

In [12]:
modeling2(RF) # 3rd # a bit better than 1st iter: 0.8776665339089529

Confusion_Matrix:
[[35609  6782]
 [  848  8912]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.90     42391
           1       0.57      0.91      0.70      9760

    accuracy                           0.85     52151
   macro avg       0.77      0.88      0.80     52151
weighted avg       0.90      0.85      0.87     52151

Accuracy:0.853694080650419
AUC_ROC:0.8765639822248072
Feature importance: 
[(0.07305035461095132, 'SalesField5'),
 (0.055861380698376886, 'Field7'),
 (0.04817283808716051, 'PropertyField37'),
 (0.042357318845531615, 'PersonalField10B'),
 (0.0393665725244751, 'PersonalField10A'),
 (0.037833057452471744, 'PersonalField12'),
 (0.036115825555784006, 'PersonalField9'),
 (0.02861176679130415, 'PropertyField29'),
 (0.021758868654239493, 'SalesField4'),
 (0.016155568243477757, 'SalesField1B')]


In [13]:
modeling2(LGBC) # 1st # a bit worse than 1st iter: 0.891767591694185

Confusion_Matrix:
[[37341  5050]
 [  981  8779]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.93     42391
           1       0.63      0.90      0.74      9760

    accuracy                           0.88     52151
   macro avg       0.80      0.89      0.83     52151
weighted avg       0.91      0.88      0.89     52151

Accuracy:0.8843550459243351
AUC_ROC:0.8901793222521329
Feature importance: 
[(158, 'PersonalField9'),
 (102, 'SalesField1B'),
 (99, 'SalesField6'),
 (93, 'PropertyField37'),
 (79, 'Field7'),
 (76, 'PersonalField10B'),
 (71, 'SalesField5'),
 (60, 'SalesField1A'),
 (51, 'PersonalField84'),
 (50, 'PersonalField1')]


In [14]:
modeling2(XGBC) # 2nd # a bit better than 1st iter AUC_ROC:0.8812487697473675

Confusion_Matrix:
[[36385  6006]
 [  928  8832]]
              precision    recall  f1-score   support

           0       0.98      0.86      0.91     42391
           1       0.60      0.90      0.72      9760

    accuracy                           0.87     52151
   macro avg       0.79      0.88      0.82     52151
weighted avg       0.90      0.87      0.88     52151

Accuracy:0.8670399417077334
AUC_ROC:0.8816185077949193
Feature importance: 
[(0.19082816, 'SalesField5'),
 (0.11275378, 'Field7'),
 (0.09116209, 'PropertyField37'),
 (0.09032211, 'PersonalField10A'),
 (0.068193525, 'PersonalField9'),
 (0.05321063, 'PersonalField2'),
 (0.038401186, 'PersonalField10B'),
 (0.037471008, 'PersonalField1'),
 (0.031053618, 'SalesField1B'),
 (0.018290102, 'PersonalField82')]
