# Final Report - Group 10

### Libaray

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Resampling
from imblearn.over_sampling import SMOTE

# Feature selection
from sklearn.feature_selection import SelectFromModel

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier

# Tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, auc,
                             roc_curve, roc_auc_score, classification_report, f1_score)
from sklearn.model_selection import GridSearchCV


pd.options.display.max_columns = 999
%matplotlib inline
color = sns.color_palette()
sns.set_style('white')

## Bookmark

<a href=#p1>1. Modeling - Baseline</a>

<a href=#p2>2. Modeling - Tuning</a>

<a href=#p3>3. Modeling - Stacking</a>

## Read data

In [2]:
train = pd.read_csv('train_set.csv',index_col = 0)
test = pd.read_csv('test_set.csv',index_col = 0)

In [3]:
X_train = train.drop(columns = ['y'])
y_train = train['y']

X_test = test.drop(columns = ['y'])
y_test = test['y']

<a name='p1' /></a>
## 1. Modeling - Baseline

### SMOTE

In [4]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [5]:
def Clf_train(classifier):    
    # classifier
    clf = classifier
    # fit data
    clf.fit(X_train_smote, y_train_smote)
    # get predictions
    y_pred = clf.predict(X_test) # get predicted class
    y_proba = clf.predict_proba(X_test) # get predicted prob.
        
    # generate confusion matrix
    cm = pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred), \
                      index=['Actual: 0', 'Actual: 1'], \
                      columns=['Pred: 0', 'Pred: 1'])
    
    # auc
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_proba[:,1])
    auc_score = auc(fpr, tpr)
    
    # f1-score
    f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
    
    return cm, auc_score, f1

### Baseline - Naive Bayesian

In [6]:
nb = GaussianNB()

# build model
cm_nb, auc_nb, f1_nb = Clf_train(nb)

print("Confusion Matrix: \n\n", cm_nb)
print("\n")
print(f"AUC Score: {auc_nb}")
print(f"f1 Score: {f1_nb}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0     9742     1223
Actual: 1      382     1010


AUC Score: 0.9008529949001787
f1 Score: 0.7405676762358543


###  Logistic Regression

In [7]:
lr = LogisticRegression()
# solver = 'saga', max_iter=1000
# build model
cm_lr, auc_lr, f1_lr = Clf_train(lr)

print("Confusion Matrix: \n\n", cm_lr)
print("\n")
print(f"AUC Score: {auc_lr}")
print(f"f1 Score: {f1_lr}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0     9278     1687
Actual: 1      355     1037


AUC Score: 0.8856130530266101
f1 Score: 0.702375715381238


###  Random Forest

In [8]:
rf = RandomForestClassifier()

# build model
cm_rf, auc_rf, f1_rf = Clf_train(rf)

print("Confusion Matrix: \n\n", cm_rf)
print("\n")
print(f"AUC Score: {auc_rf}")
print(f"f1 Score: {f1_rf}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10555      410
Actual: 1      680      712


AUC Score: 0.9373173066339607
f1 Score: 0.7586644520415403


### XGBoost

In [9]:
xgb = XGBClassifier()

# build model
cm_xgb, auc_xgb, f1_xgb = Clf_train(xgb)

print("Confusion Matrix: \n\n", cm_xgb)
print("\n")
print(f"AUC Score: {auc_xgb}")
print(f"f1 Score: {f1_xgb}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10515      450
Actual: 1      624      768


AUC Score: 0.9440809249388074
f1 Score: 0.7699586281777677


### LightGBM

In [10]:
lgb = LGBMClassifier()

# build model
cm_lgb, auc_lgb,f1_lgb = Clf_train(lgb)

print("Confusion Matrix: \n\n", cm_lgb)
print("\n")
print(f"AUC Score: {auc_lgb}")
print(f"f1 Score: {f1_lgb}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10512      453
Actual: 1      586      806


AUC Score: 0.9476117846229644
f1 Score: 0.7804900041653977


<a name='p2' /></a>
## 2. Modeling - Tuning


#### NB

In [11]:
nb_classifier = GaussianNB()

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_NB = GridSearchCV(estimator=nb_classifier, 
                 param_grid=params_NB, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_NB.fit(X_train_smote, y_train_smote)

gs_NB.best_params_

{'var_smoothing': 5.336699231206302e-08}

In [12]:
nb_2 = GaussianNB(var_smoothing = 5.336699231206302e-08)

# build model
cm_nb_2, auc_nb_2, f1_nb_2 = Clf_train(nb_2)

print("Confusion Matrix: \n\n", cm_nb_2)
print("\n")
print(f"AUC Score: {auc_nb_2}")
print(f"f1 Score: {f1_nb_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10051      914
Actual: 1      497      895


AUC Score: 0.9051659603964547
f1 Score: 0.7468060004780788


#### LR

In [17]:
lr_classifier = LogisticRegression()
#solver = 'saga', max_iter=1000
params_LR = {
            'solver':['sag', 'saga','liblinear'],
            'penalty': ['l2','l1'],
            #'C': [100, 10, 1.0, 0.1, 0.01]
            'C': [0.05, 0.1, 0.3]
}

gs_LR = GridSearchCV(estimator=lr_classifier, 
                 param_grid=params_LR, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_LR.fit(X_train_smote, y_train_smote)

params_1 = gs_LR.best_params_
params_1

 0.80692572 0.78133901 0.88997814        nan 0.78137415 0.98563664
 0.80694608 0.78133784 0.8895246         nan 0.78131807 0.9831459 ]


{'C': 0.05, 'penalty': 'l1', 'solver': 'liblinear'}

In [19]:
lr_2 = LogisticRegression(
    solver = params_1['solver'],
    #max_iter = 1000,
    penalty = params_1['penalty'],
    C = params_1['C']     
)

# build model
cm_lr_2, auc_lr_2, f1_lr_2 = Clf_train(lr_2)

print("Confusion Matrix: \n\n", cm_lr_2)
print("\n")
print(f"AUC Score: {auc_lr_2}")
print(f"f1 Score: {f1_lr_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10615      350
Actual: 1      733      659


AUC Score: 0.9338902909466379
f1 Score: 0.7502006075278281


####    RF

In [21]:
rf_classifier = RandomForestClassifier()

params_RF = {
     'n_estimators': [220, 280],
     'max_depth':[5, None], 
     'min_samples_split':[2, 3],
     'min_samples_leaf':[1, 3],
    
}

gs_RF = GridSearchCV(estimator=rf_classifier, 
                 param_grid=params_RF, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_RF.fit(X_train_smote, y_train_smote)

params_2 = gs_RF.best_params_
params_2

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 280}

In [22]:
rf_2 = RandomForestClassifier(
    n_estimators = params_2['n_estimators'],
    max_depth = params_2['max_depth'],
    min_samples_split = params_2['min_samples_split'],
    min_samples_leaf = params_2['min_samples_leaf']
)

# build model
cm_rf_2, auc_rf_2, f1_rf_2 = Clf_train(rf_2)

print("Confusion Matrix: \n\n", cm_rf_2)
print("\n")
print(f"AUC Score: {auc_rf_2}")
print(f"f1 Score: {f1_rf_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10567      398
Actual: 1      657      735


AUC Score: 0.9403449324129545
f1 Score: 0.7673160682150508


#### XGB

In [23]:
xgb_classifier = XGBClassifier(eta = 0.1)

params_XGB = {
    'eta':[0.1,0.3],
    'subsample':[0.5,1],
    'colsample_bytree':[0.5, 1],
    'gamma':[0, 2],
    'max_depth':[6, 8],
    'min_child_weight':[1,3]
}

gs_XGB = GridSearchCV(estimator=xgb_classifier, 
                 param_grid=params_XGB, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_XGB.fit(X_train_smote, y_train_smote)

params_3 = gs_XGB.best_params_
params_3

{'colsample_bytree': 0.5,
 'eta': 0.1,
 'gamma': 0,
 'max_depth': 8,
 'min_child_weight': 1,
 'subsample': 0.5}

In [24]:
xgb_2 = XGBClassifier(
    eta = params_3['eta'],
    min_child_weight = params_3['min_child_weight'],
    gamma = params_3['gamma'],
    subsample = params_3['subsample'],
    colsample_bytree = params_3['colsample_bytree'],
    max_depth = params_3['max_depth']
)

# build model
cm_xgb_2,auc_xgb_2, f1_xgb_2 = Clf_train(xgb_2)

print("Confusion Matrix: \n\n", cm_xgb_2)
print("\n")
print(f"AUC Score: {auc_xgb_2}")
print(f"f1 Score: {f1_xgb_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10516      449
Actual: 1      603      789


AUC Score: 0.9450584998768286
f1 Score: 0.7761818511139287


#### LGBM

In [43]:
lgbm_classifier = LGBMClassifier()

params_lgbm = {
    'learning_rate' : [0.05,0.1],
    #'max_depth':[6,10],
    #'min_child_samples': [23, 26],
    #'feature_fraction': [0.3, 0.4],
    #'num_leaves': [100,150],
    #'subsample': [0.4, 0.8], 
    #'colsample_bytree': [0.4, 0.6],
    'reg_alpha':[0.4,0.7],
    'reg_lambda':[0.4,0.7]

}

gs_lgbm = GridSearchCV(estimator=lgbm_classifier, 
                 param_grid=params_lgbm, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_lgbm.fit(X_train_smote, y_train_smote)

params_4 = gs_lgbm.best_params_
params_4

{'learning_rate': 0.1, 'reg_alpha': 0.7, 'reg_lambda': 0.7}

In [44]:
lgbm_2 = LGBMClassifier(
    learning_rate = params_4['learning_rate'],
    #colsample_bytree = params_4['colsample_bytree'],
    #feature_fraction = params_4['feature_fraction'],
    #num_leaves = params_4['num_leaves'],
    #subsample = params_4['subsample'],
    #max_depth = params_4['max_depth']
    #min_child_samples = params_4['min_child_samples'],
    reg_alpha = params_4['reg_alpha'],
    reg_lambda = params_4['reg_lambda']
)

cm_lgbm_2, auc_lgbm_2,f1_lgbm_2 = Clf_train(lgbm_2)

print("Confusion Matrix: \n\n", cm_lgbm_2)
print("\n")
print(f"AUC Score: {auc_lgbm_2}")
print(f"f1 Score: {f1_lgbm_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10514      451
Actual: 1      580      812


AUC Score: 0.9483368581327212
f1 Score: 0.7824688950510995


<a name='p3' /></a>
## 3. Modeling - Stacking

In [46]:
# tuned models
nb_2 = GaussianNB(var_smoothing = 5.336699231206302e-08)
lr_2 = LogisticRegression(
    solver = params_1['solver'],
    #max_iter = 1000,
    penalty = params_1['penalty'],
    C = params_1['C']     
)
rf_2 = RandomForestClassifier(
     n_estimators = params_2['n_estimators'],
    max_depth = params_2['max_depth'],
    min_samples_split = params_2['min_samples_split'],
    min_samples_leaf = params_2['min_samples_leaf']
)
xgb_2 = XGBClassifier(
    eta = params_3['eta'],
    min_child_weight = params_3['min_child_weight'],
    gamma = params_3['gamma'],
    subsample = params_3['subsample'],
    colsample_bytree = params_3['colsample_bytree'],
    max_depth = params_3['max_depth']
)
lgbm_2 = LGBMClassifier(
    learning_rate = params_4['learning_rate'],
    #colsample_bytree = params_4['colsample_bytree'],
    #feature_fraction = params_4['feature_fraction'],
    #num_leaves = params_4['num_leaves'],
    #subsample = params_4['subsample'],
    #max_depth = params_4['max_depth']
    #min_child_samples = params_4['min_child_samples'],
    reg_alpha = params_4['reg_alpha'],
    reg_lambda = params_4['reg_lambda']
)

In [47]:
# get a stacking ensemble of models
def get_model():
    # define the level-0 base models with tuned paramters
    level0 = []
    level0.append(('lr', lr_2))
    level0.append(('nb', nb_2))
    level0.append(('rf', rf_2))
    level0.append(('xgb', xgb_2))
    level0.append(('lgbm', lgbm_2))
    # define level-1 meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1)
    return model

In [48]:
stacking_model = get_model()

cm_stk, auc_stk, f1_stk= Clf_train(stacking_model)

print("Confusion Matrix: \n\n", cm_stk)
print("\n")
print(f"AUC Score: {auc_stk}")
print(f"f1 Score: {f1_stk}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10448      517
Actual: 1      674      718


AUC Score: 0.9183287602664696
f1 Score: 0.7463540079909278
