In [19]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler as RUS

import joblib

## Import data

In [4]:
train_origin = pd.read_csv('../data/featured_data/train.csv')
train_origin.shape

(260753, 295)

In [7]:
X_train_origin = train_origin.drop('QuoteConversion_Flag',axis=1)
y_train_origin = train_origin['QuoteConversion_Flag']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [9]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((208602, 294), (52151, 294), (208602,), (52151,))

## Baseline: class weight modification (rejected)

In [11]:
imbalance_ratio = 211859/48894

def modeling(model_name):
    model=model_name(class_weight={0: 1, 1: imbalance_ratio})
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

In [12]:
modeling(RF) # 3rd in AUC_ROC

Confusion_Matrix:
[[41773   618]
 [ 3957  5803]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     42391
           1       0.90      0.59      0.72      9760

    accuracy                           0.91     52151
   macro avg       0.91      0.79      0.83     52151
weighted avg       0.91      0.91      0.90     52151

Accuracy:0.9122739736534294
AUC_ROC:0.7899955529630284


In [14]:
modeling(LGBC) # 1st in AUC_ROC

Confusion_Matrix:
[[37584  4807]
 [ 1012  8748]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     42391
           1       0.65      0.90      0.75      9760

    accuracy                           0.89     52151
   macro avg       0.81      0.89      0.84     52151
weighted avg       0.91      0.89      0.89     52151

Accuracy:0.8884201645222527
AUC_ROC:0.8914573819218509


In [15]:
modeling(XGBC) # 2nd in AUC_ROC

Confusion_Matrix:
[[42010   381]
 [ 3682  6078]]
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     42391
           1       0.94      0.62      0.75      9760

    accuracy                           0.92     52151
   macro avg       0.93      0.81      0.85     52151
weighted avg       0.92      0.92      0.92     52151

Accuracy:0.922091618569155
AUC_ROC:0.8068790724020835


## Baseline: under sampling

In [20]:
rus = RUS(random_state=1) 
X_train_res,y_train_res=rus.fit_resample(X_train,y_train)

In [21]:
X_train_res.shape, X_val.shape, y_train_res.shape, y_val.shape

((78268, 294), (52151, 294), (78268,), (52151,))

In [22]:
def modeling2(model_name):
    model=model_name()
    model.fit(X_train_res,y_train_res)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

In [23]:
modeling2(RF) # 3rd # fat more better than previous baseline: 0.7899955529630284

Confusion_Matrix:
[[35793  6598]
 [  905  8855]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.91     42391
           1       0.57      0.91      0.70      9760

    accuracy                           0.86     52151
   macro avg       0.77      0.88      0.80     52151
weighted avg       0.90      0.86      0.87     52151

Accuracy:0.8561293167916243
AUC_ROC:0.8758141722492905


In [24]:
modeling2(LGBC) # 1st # a bit worse than previous baseline: 0.8914573819218509

Confusion_Matrix:
[[37456  4935]
 [ 1018  8742]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.93     42391
           1       0.64      0.90      0.75      9760

    accuracy                           0.89     52151
   macro avg       0.81      0.89      0.84     52151
weighted avg       0.91      0.89      0.89     52151

Accuracy:0.8858507027669652
AUC_ROC:0.8896402504436643


In [25]:
modeling2(XGBC) # 2nd # far more better than previous baseline AUC_ROC:0.8068790724020835

Confusion_Matrix:
[[36760  5631]
 [ 1028  8732]]
              precision    recall  f1-score   support

           0       0.97      0.87      0.92     42391
           1       0.61      0.89      0.72      9760

    accuracy                           0.87     52151
   macro avg       0.79      0.88      0.82     52151
weighted avg       0.90      0.87      0.88     52151

Accuracy:0.8723130908323905
AUC_ROC:0.8809186656539763
