# Final Report - Group 10

### Libaray

In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Resampling
from imblearn.over_sampling import SMOTE

# Feature selection
from sklearn.feature_selection import SelectFromModel

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier

# Tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, auc,
                             roc_curve, roc_auc_score, classification_report, f1_score)
from sklearn.model_selection import GridSearchCV


pd.options.display.max_columns = 999
%matplotlib inline
color = sns.color_palette()
sns.set_style('white')

## Bookmark

<a href=#p0>0. Raw Data Preparation</a>

<a href=#p1>1. Modeling - Baseline</a>

<a href=#p2>2. Modeling - Tuning</a>

<a href=#p3>3. Modeling - Stacking</a>

## Read data

In [20]:
df = pd.read_csv('bank-additional-full.csv',sep =";")
train = pd.read_csv('train_set.csv',index_col = 0)
test = pd.read_csv('test_set.csv',index_col = 0)

<a name='p0' /></a>
## 0. Raw Data Preparation

In [21]:
df['y'].replace("no", 0, inplace = True)
df['y'].replace("yes", 1, inplace = True)

In [22]:
#get dummies for raw data
dummy_features = ['job', 'marital', 'education', 'default', 'housing' ,          
                  'loan','contact', 'month', 'day_of_week', 'poutcome']
df = pd.get_dummies(data=df, columns=dummy_features,drop_first=True)

In [23]:
# To get the index of train and test set to split the raw data
train_index = train.index.values.tolist()
test_index = test.index.values.tolist()
train_index1 = pd.DataFrame(train_index,columns=['index'])
train_index1=train_index1.sort_values(by='index')

In [24]:
# Generate train_raw/test_raw, make sure that the raw data train/test set is the same to the processed one
df['ID']=df.index.values
train_raw = df.loc[df['ID'].isin(train_index)]
test_raw = df.loc[df['ID'].isin(test_index)]

In [25]:
train_raw = train_raw.drop(columns = 'ID')
test_raw = test_raw.drop(columns = 'ID')

In [26]:
X_train_raw = train_raw.drop(columns = ['y'])
y_train_raw = train_raw['y']

X_test_raw = test_raw.drop(columns = ['y'])
y_test_raw = test_raw['y']

<a name='p1' /></a>
## 1. Modeling - Baseline

### SMOTE

In [27]:
smote = SMOTE()
X_train_raw_smote, y_train_raw_smote = smote.fit_resample(X_train_raw, y_train_raw)

In [28]:
def Clf_train(classifier):    
    # classifier
    clf = classifier
    # fit data
    clf.fit(X_train_raw_smote, y_train_raw_smote)
    # get predictions
    y_pred = clf.predict(X_test_raw) # get predicted class
    y_proba = clf.predict_proba(X_test_raw) # get predicted prob.
        
    # generate confusion matrix
    cm = pd.DataFrame(confusion_matrix(y_true=y_test_raw, y_pred=y_pred), \
                      index=['Actual: 0', 'Actual: 1'], \
                      columns=['Pred: 0', 'Pred: 1'])
    
    # auc
    fpr, tpr, thresholds = roc_curve(y_true=y_test_raw, y_score=y_proba[:,1])
    auc_score = auc(fpr, tpr)
    
    # f1-score
    f1 = f1_score(y_true=y_test_raw, y_pred=y_pred, average='macro')
    
    return cm, auc_score, f1

### Baseline - Naive Bayesian

In [29]:
nb = GaussianNB()

# build model
cm_nb, auc_nb, f1_nb = Clf_train(nb)

print("Confusion Matrix: \n\n", cm_nb)
print("\n")
print(f"AUC Score: {auc_nb}")
print(f"f1 Score: {f1_nb}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0     9199     1766
Actual: 1      659      733


AUC Score: 0.7138473185317966
f1 Score: 0.6301545674596003


###  Logistic Regression

In [43]:
lr = LogisticRegression(solver = 'saga', max_iter=1000)

# build model
cm_lr, auc_lr, f1_lr = Clf_train(lr)

print("Confusion Matrix: \n\n", cm_lr)
print("\n")
print(f"AUC Score: {auc_lr}")
print(f"f1 Score: {f1_lr}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0     9482     1483
Actual: 1      233     1159


AUC Score: 0.9258351415947292
f1 Score: 0.7458185212924187




###  Random Forest

In [35]:
rf = RandomForestClassifier()

# build model
cm_rf, auc_rf, f1_rf = Clf_train(rf)

print("Confusion Matrix: \n\n", cm_rf)
print("\n")
print(f"AUC Score: {auc_rf}")
print(f"f1 Score: {f1_rf}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10461      504
Actual: 1      604      788


AUC Score: 0.9393888469581898
f1 Score: 0.7684441281466202


### XGBoost

In [36]:
xgb = XGBClassifier()

# build model
cm_xgb, auc_xgb, f1_xgb = Clf_train(xgb)

print("Confusion Matrix: \n\n", cm_xgb)
print("\n")
print(f"AUC Score: {auc_xgb}")
print(f"f1 Score: {f1_xgb}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10409      556
Actual: 1      564      828


AUC Score: 0.9405492462956848
f1 Score: 0.7727444096448912


### LightGBM

In [37]:
lgb = LGBMClassifier()

# build model
cm_lgb, auc_lgb,f1_lgb = Clf_train(lgb)

print("Confusion Matrix: \n\n", cm_lgb)
print("\n")
print(f"AUC Score: {auc_lgb}")
print(f"f1 Score: {f1_lgb}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10379      586
Actual: 1      499      893


AUC Score: 0.9452623223841795
f1 Score: 0.7862051169668065


<a name='p2' /></a>
## 2. Modeling - Tuning


#### NB

In [39]:
nb_classifier = GaussianNB()

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_NB = GridSearchCV(estimator=nb_classifier, 
                 param_grid=params_NB, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_NB.fit(X_train_raw_smote, y_train_raw_smote)

gs_NB.best_params_

{'var_smoothing': 0.0657933224657568}

In [40]:
nb_2 = GaussianNB(var_smoothing = 0.0657933224657568)

# build model
cm_nb_2, auc_nb_2, f1_nb_2 = Clf_train(nb_2)

print("Confusion Matrix: \n\n", cm_nb_2)
print("\n")
print(f"AUC Score: {auc_nb_2}")
print(f"f1 Score: {f1_nb_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10436      529
Actual: 1      683      709


AUC Score: 0.9186466473785452
f1 Score: 0.7421410680133855


#### LR

In [44]:
lr_classifier = LogisticRegression(solver = 'saga', max_iter=1000)

params_LR = {
            'penalty': ['l2','l1'],
            'C': [100, 10, 1.0, 0.1, 0.01]
}

gs_LR = GridSearchCV(estimator=lr_classifier, 
                 param_grid=params_LR, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_LR.fit(X_train_raw_smote, y_train_raw_smote)

params_1 = gs_LR.best_params_
params_1



{'C': 100, 'penalty': 'l1'}

In [45]:
lr_2 = LogisticRegression(
    solver = 'saga',
    max_iter = 1000,
    penalty = params_1['penalty'],
    C = params_1['C']     
)

# build model
cm_lr_2, auc_lr_2, f1_lr_2 = Clf_train(lr_2)

print("Confusion Matrix: \n\n", cm_lr_2)
print("\n")
print(f"AUC Score: {auc_lr_2}")
print(f"f1 Score: {f1_lr_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0     9482     1483
Actual: 1      233     1159


AUC Score: 0.9258355346950328
f1 Score: 0.7458185212924187




####    RF

In [53]:
rf_classifier = RandomForestClassifier()

params_RF = {
     'n_estimators': [220, 280],
     #'max_depth':[5, None], 
     #'min_samples_split':[2, 3],
     #'min_samples_leaf':[1, 3],
    
}

gs_RF = GridSearchCV(estimator=rf_classifier, 
                 param_grid=params_RF, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_RF.fit(X_train_raw_smote, y_train_raw_smote)

params_2 = gs_RF.best_params_
params_2

{'n_estimators': 220}

In [54]:
rf_2 = RandomForestClassifier(
    n_estimators = params_2['n_estimators'],
    #max_depth = params_2['max_depth'],
    #min_samples_split = params_2['min_samples_split'],
    #min_samples_leaf = params_2['min_samples_leaf'],
)

# build model
cm_rf_2, auc_rf_2, f1_rf_2 = Clf_train(rf_2)

print("Confusion Matrix: \n\n", cm_rf_2)
print("\n")
print(f"AUC Score: {auc_rf_2}")
print(f"f1 Score: {f1_rf_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10466      499
Actual: 1      590      802


AUC Score: 0.9414249099800306
f1 Score: 0.7730827372642793


#### XGB

In [124]:
xgb_classifier = XGBClassifier(eta = 0.1)

params_XGB = {
    #'n_estimators':[100,150]
    #'eta':[0.2,0.3],
    #'subsample':[0.5,1],
    #'max_depth':[28, 30],
    'gamma':[0,2],
    'reg_alpha':[0.6, 0.8],
    'reg_lambda':[0.4, 0.8]
    #'min_child_weight':[1,3]
    #'booster': ['gbtree', 'gblinear', 'dart']
}

gs_XGB = GridSearchCV(estimator=xgb_classifier, 
                 param_grid=params_XGB, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_XGB.fit(X_train_raw_smote, y_train_raw_smote)

params_3 = gs_XGB.best_params_
params_3

{'gamma': 2, 'reg_alpha': 0.8, 'reg_lambda': 0.8}

In [125]:
xgb_2 = XGBClassifier(
    #n_estimators = params_3['n_estimators']
    #eta = params_3['eta'],
    #min_child_weight = params_3['min_child_weight'],
    gamma = params_3['gamma'],
    #subsample = params_3['subsample'],
    #colsample_bytree = params_3['colsample_bytree'],
    #max_depth = params_3['max_depth']
    #booster = params_3['booster']
    reg_alpha = params_3['reg_alpha'],
    reg_lambda = params_3['reg_lambda']

)

# build model
cm_xgb_2,auc_xgb_2, f1_xgb_2 = Clf_train(xgb_2)

print("Confusion Matrix: \n\n", cm_xgb_2)
print("\n")
print(f"AUC Score: {auc_xgb_2}")
print(f"f1 Score: {f1_xgb_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10417      548
Actual: 1      578      814


AUC Score: 0.9412711094862964
f1 Score: 0.7699325571151351


#### LGBM

In [116]:
lgbm_classifier = LGBMClassifier()

params_lgbm = {
    'learning_rate' : [0.05,0.1],
    #'max_depth':[6,10],
    #'min_child_samples': [23, 26],
    #'feature_fraction': [0.3, 0.4],
    #'num_leaves': [100,150],
    #'subsample': [0.4, 0.6], 
    #'colsample_bytree': [0.4, 0.6],
    'reg_alpha':[0.4,0.7]

}

gs_lgbm = GridSearchCV(estimator=lgbm_classifier, 
                 param_grid=params_lgbm, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='roc_auc') 

gs_lgbm.fit(X_train_raw_smote, y_train_raw_smote)

params_4 = gs_lgbm.best_params_
params_4

{'learning_rate': 0.1, 'reg_alpha': 0.7}

In [115]:
lgbm_2 = LGBMClassifier(
    learning_rate = params_4['learning_rate'],
    #colsample_bytree = params_5['colsample_bytree'],
    #feature_fraction = params_5['feature_fraction'],
    #num_leaves = params_5['num_leaves'],
    #subsample = params_5['subsample'],
    #max_depth = params_5['max_depth']
    #min_child_samples = params_5['min_child_samples'],
    reg_alpha = params_4['reg_alpha']


)

cm_lgbm_2, auc_lgbm_2,f1_lgbm_2 = Clf_train(lgbm_2)

print("Confusion Matrix: \n\n", cm_lgbm_2)
print("\n")
print(f"AUC Score: {auc_lgbm_2}")
print(f"f1 Score: {f1_lgbm_2}")

Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10395      570
Actual: 1      493      899


AUC Score: 0.9455669096026541
f1 Score: 0.7899041917353365


<a name='p3' /></a>
## 3. Modeling - Stacking

In [126]:
# tuned models
nb_2 = GaussianNB(var_smoothing = 0.0657933224657568)
lr_2 = LogisticRegression(
    solver = 'saga',
    max_iter = 1000,
    penalty = params_1['penalty'],
    C = params_1['C']     
)
rf_2 = RandomForestClassifier(
    n_estimators = params_2['n_estimators'],
    #max_depth = params_2['max_depth'],
    #min_samples_split = params_2['min_samples_split'],
    #min_samples_leaf = params_2['min_samples_leaf'],
    #max_features = params_2['max_features']
)
xgb_2 = XGBClassifier(
    #n_estimators = params_3['n_estimators']
    #eta = params_3['eta'],
    #min_child_weight = params_3['min_child_weight'],
    gamma = params_3['gamma'],
    #subsample = params_3['subsample'],
    #colsample_bytree = params_3['colsample_bytree'],
    #max_depth = params_3['max_depth']
    #booster = params_3['booster']
    reg_alpha = params_3['reg_alpha'],
    reg_lambda = params_3['reg_lambda']
)
lgbm_2 = LGBMClassifier(
    learning_rate = params_5['learning_rate'],
    #colsample_bytree = params_5['colsample_bytree'],
    #feature_fraction = params_5['feature_fraction'],
    #num_leaves = params_5['num_leaves'],
    #subsample = params_5['subsample'],
    #max_depth = params_5['max_depth']
    #min_child_samples = params_5['min_child_samples'],
    reg_alpha = params_5['reg_alpha']
)

In [127]:
# get a stacking ensemble of models
def get_model():
    # define the level-0 base models with tuned paramters
    level0 = []
    level0.append(('lr', lr_2))
    level0.append(('nb', nb_2))
    level0.append(('rf', rf_2))
    level0.append(('xgb', xgb_2))
    level0.append(('lgbm', lgbm_2))
    # define level-1 meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1)
    return model

In [130]:
stacking_model = get_model()

cm_stk, auc_stk, f1_stk= Clf_train(stacking_model)

print("Confusion Matrix: \n\n", cm_stk)
print("\n")
print(f"AUC Score: {auc_stk}")
print(f"f1 Score: {f1_stk}")



Confusion Matrix: 

            Pred: 0  Pred: 1
Actual: 0    10247      718
Actual: 1      484      908


AUC Score: 0.932913567725941
f1 Score: 0.7731605389785183
