In [4]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC
from sklearn.ensemble import VotingClassifier

import joblib

In [2]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [3]:
X_train_origin = X_train_origin.to_numpy()
y_train_origin = y_train_origin.to_numpy()
y_train_origin = y_train_origin.ravel()

In [4]:
X_train_origin.shape,y_train_origin.shape

((95512, 48), (95512,))

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1) 

In [6]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((76409, 48), (19103, 48), (76409,), (19103,))

In [7]:
def modeling(model_name):
    model=model_name()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

### Baseline

In [8]:
modeling(KNN)  # reject

Confusion_Matrix:
[[10709  1312]
 [ 1618  5464]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88     12021
           1       0.81      0.77      0.79      7082

    accuracy                           0.85     19103
   macro avg       0.84      0.83      0.83     19103
weighted avg       0.85      0.85      0.85     19103

Accuracy:0.8466209495890697
AUC_ROC:0.8311955654372242


In [9]:
modeling(LR)  # reject

Confusion_Matrix:
[[10933  1088]
 [ 2129  4953]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.87     12021
           1       0.82      0.70      0.75      7082

    accuracy                           0.83     19103
   macro avg       0.83      0.80      0.81     19103
weighted avg       0.83      0.83      0.83     19103

Accuracy:0.8315971313406271
AUC_ROC:0.8044352146992318


In [10]:
modeling(SVC)  # accept

Confusion_Matrix:
[[11227   794]
 [ 1447  5635]]
              precision    recall  f1-score   support

           0       0.89      0.93      0.91     12021
           1       0.88      0.80      0.83      7082

    accuracy                           0.88     19103
   macro avg       0.88      0.86      0.87     19103
weighted avg       0.88      0.88      0.88     19103

Accuracy:0.8826885829450871
AUC_ROC:0.8648140546945039


In [11]:
modeling(NB) # reject

Confusion_Matrix:
[[10087  1934]
 [ 2782  4300]]
              precision    recall  f1-score   support

           0       0.78      0.84      0.81     12021
           1       0.69      0.61      0.65      7082

    accuracy                           0.75     19103
   macro avg       0.74      0.72      0.73     19103
weighted avg       0.75      0.75      0.75     19103

Accuracy:0.7531277809768099
AUC_ROC:0.7231439986143049


In [12]:
modeling(RF)  # accept

Confusion_Matrix:
[[11399   622]
 [ 1592  5490]]
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     12021
           1       0.90      0.78      0.83      7082

    accuracy                           0.88     19103
   macro avg       0.89      0.86      0.87     19103
weighted avg       0.89      0.88      0.88     19103

Accuracy:0.8841019735120138
AUC_ROC:0.8617309804801027


In [13]:
# use Out-of-bag data as validation set
model = RF(oob_score=True)
model.fit(X_train_origin, y_train_origin)
print(model.oob_score_) # almost the same

0.8868519138956361


In [14]:
model.feature_importances_  # useless because of PCA

array([0.03828724, 0.05382331, 0.05689759, 0.02558169, 0.06356312,
       0.02899642, 0.01292555, 0.01670632, 0.01122736, 0.01406549,
       0.01511886, 0.01362671, 0.02731224, 0.01176071, 0.01863789,
       0.0140053 , 0.01884813, 0.05702997, 0.02240864, 0.01174492,
       0.01303253, 0.01678525, 0.0120306 , 0.01070322, 0.01232616,
       0.01525836, 0.01295941, 0.01210727, 0.0143155 , 0.01044347,
       0.01054064, 0.01266792, 0.0099565 , 0.01359441, 0.01307525,
       0.01037713, 0.01340016, 0.01707277, 0.01360764, 0.01569073,
       0.01176651, 0.02304132, 0.01627026, 0.01757061, 0.0152463 ,
       0.01452748, 0.06390604, 0.0451591 ])

In [15]:
modeling(ABC)  # reject

Confusion_Matrix:
[[10663  1358]
 [ 2205  4877]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86     12021
           1       0.78      0.69      0.73      7082

    accuracy                           0.81     19103
   macro avg       0.81      0.79      0.79     19103
weighted avg       0.81      0.81      0.81     19103

Accuracy:0.8134847929644559
AUC_ROC:0.7878391519068308


In [16]:
modeling(GBC)  # reject

Confusion_Matrix:
[[11196   825]
 [ 2174  4908]]
              precision    recall  f1-score   support

           0       0.84      0.93      0.88     12021
           1       0.86      0.69      0.77      7082

    accuracy                           0.84     19103
   macro avg       0.85      0.81      0.82     19103
weighted avg       0.84      0.84      0.84     19103

Accuracy:0.8430089514735906
AUC_ROC:0.812197335825818


In [17]:
modeling(XGBC)  # reject

Confusion_Matrix:
[[11183   838]
 [ 2259  4823]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.88     12021
           1       0.85      0.68      0.76      7082

    accuracy                           0.84     19103
   macro avg       0.84      0.81      0.82     19103
weighted avg       0.84      0.84      0.83     19103

Accuracy:0.8378788671936345
AUC_ROC:0.8056554857954618


In [18]:
modeling(LGBC) #accept

Confusion_Matrix:
[[11190   831]
 [ 1576  5506]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     12021
           1       0.87      0.78      0.82      7082

    accuracy                           0.87     19103
   macro avg       0.87      0.85      0.86     19103
weighted avg       0.87      0.87      0.87     19103

Accuracy:0.8739988483484269
AUC_ROC:0.8541674845073085


### Final tuning

In [19]:
params_SVC = [{'kernel': ['rbf'],       # Gaussian kernel
               # punishment, inverse of regularization, bigger C -> overfitting
               'C': [0.1, 1, 10, 100],
               'gamma': [1e-2, 1e-3, 1e-4]  # bigger gamma -> overfitting
               }
              ]
svm = SVC()
clf_SVC = GridSearchCV(svm, param_grid=params_SVC, cv=3,n_jobs=-1)
clf_SVC.fit(X_train_origin, y_train_origin)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 10, 100],
                          'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [20]:
clf_SVC.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [22]:
param_RF = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 40, 80],
    'criterion': ['gini']
}
rf = RF()
clf_rf = GridSearchCV(rf, param_grid=param_RF, cv=3,n_jobs=-1)
clf_rf.fit(X_train_origin, y_train_origin)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [23]:
clf_rf.best_params_

{'criterion': 'gini',
 'max_depth': 80,
 'max_features': 'sqrt',
 'n_estimators': 200}

In [30]:
param_LGBC = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40],
    'max_depth': [4, 6, 8],
    'num_leaves': [20, 30, 40],
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight': [0.001, 0.002],
    'feature_fraction': [0.6, 0.8, 1],
    'bagging_fraction': [0.8, 0.9, 1],
    'bagging_freq': [2, 3, 4],
    'cat_smooth': [0, 10, 20]
}
lgbc = LGBC()
clf_lgbc = RandomizedSearchCV(lgbc, param_distributions = param_LGBC, cv=3,n_iter=40, n_jobs=-1)
clf_lgbc.fit(X_train_origin, y_train_origin)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=None, reg_alpha=0.0,
                                            reg_lambda=0.0, sile...
                   param_distributions={'bagging_fraction': [0.8, 0.9, 1],
                                        'bagging_f

In [31]:
clf_lgbc.best_params_

{'num_leaves': 20,
 'n_estimators': 40,
 'min_child_weight': 0.002,
 'min_child_samples': 18,
 'max_depth': 6,
 'learning_rate': 1,
 'feature_fraction': 1,
 'cat_smooth': 10,
 'bagging_freq': 2,
 'bagging_fraction': 0.8}

###  Train best models

In [8]:
best_svc = SVC(C=100,gamma=0.01,kernel='rbf')
best_svc.fit(X_train_origin,y_train_origin)
joblib.dump(best_svc, '../models/svc.m') 

['../models/svc.m']

In [9]:
best_rf = RF(criterion='gini',max_depth=80,max_features='sqrt',n_estimators=200)
best_rf.fit(X_train_origin,y_train_origin)
joblib.dump(best_rf, '../models/rf.m') 

['../models/rf.m']

In [10]:
best_lgbc = LGBC(num_leaves=20, n_estimators=40, min_child_weight=0.002,
                 min_child_samples=18, max_depth=6, learning_rate=1, feature_fraction=1, cat_smooth=10, bagging_freq=2, bagging_fraction=0.8)
best_lgbc.fit(X_train_origin, y_train_origin)
joblib.dump(best_lgbc, '../models/lgbc.m')

['../models/lgbc.m']

### Combine 3 best models into a voter

In [12]:
voting_clf = VotingClassifier(
    estimators=[ ('svc',best_svc), ('rf', best_rf), ('lgbc', best_lgbc)], voting='hard')
voting_clf.fit(X_train_origin,y_train_origin)
joblib.dump(voting_clf, '../models/voter.m')

['../models/voter.m']