In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC
from sklearn.ensemble import VotingClassifier

import joblib

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [3]:
X_train_origin = X_train_origin.to_numpy()
y_train_origin = y_train_origin.to_numpy()
y_train_origin = y_train_origin.ravel()

In [4]:
X_train_origin.shape,y_train_origin.shape

((5634, 25), (5634,))

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [6]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((4507, 25), (1127, 25), (4507,), (1127,))

### Baseline (class weight modification)

In [7]:
data=pd.read_csv('../data/raw_data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
ratio = data[data.Churn=='No'].shape[0]/data[data.Churn=='Yes'].shape[0]
ratio

2.7683253076511503

In [8]:
def modeling(model_name):
    model=model_name(class_weight={0: 1, 1: ratio})
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

In [9]:
modeling(LR)  # accept

Confusion_Matrix:
[[596 218]
 [ 67 246]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       814
           1       0.53      0.79      0.63       313

    accuracy                           0.75      1127
   macro avg       0.71      0.76      0.72      1127
weighted avg       0.80      0.75      0.76      1127

Accuracy:0.7471162377994676
AUC_ROC:0.7590646120997558


In [10]:
modeling(SVC) # accept

Confusion_Matrix:
[[601 213]
 [ 67 246]]
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       814
           1       0.54      0.79      0.64       313

    accuracy                           0.75      1127
   macro avg       0.72      0.76      0.72      1127
weighted avg       0.80      0.75      0.76      1127

Accuracy:0.7515527950310559
AUC_ROC:0.762135865171009


In [11]:
modeling(RF) # reject

Confusion_Matrix:
[[741  73]
 [168 145]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       814
           1       0.67      0.46      0.55       313

    accuracy                           0.79      1127
   macro avg       0.74      0.69      0.70      1127
weighted avg       0.77      0.79      0.77      1127

Accuracy:0.7861579414374446
AUC_ROC:0.6867890981309512


In [12]:
# use Out-of-bag data as validation set
model = RF(oob_score=True,class_weight={0: 1, 1: ratio})
model.fit(X_train_origin, y_train_origin)
print(model.oob_score_)  # almost the same

0.7800851970181044


In [9]:
modeling(LGBC) # reject

Confusion_Matrix:
[[666 148]
 [115 198]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       814
           1       0.57      0.63      0.60       313

    accuracy                           0.77      1127
   macro avg       0.71      0.73      0.72      1127
weighted avg       0.77      0.77      0.77      1127

Accuracy:0.7666370896184561
AUC_ROC:0.7253848388033691


In [10]:
modeling(XGBC) #accept

Confusion_Matrix:
[[737  77]
 [150 163]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       814
           1       0.68      0.52      0.59       313

    accuracy                           0.80      1127
   macro avg       0.76      0.71      0.73      1127
weighted avg       0.79      0.80      0.79      1127

Accuracy:0.7985803016858918
AUC_ROC:0.7130860892841723


In [26]:
# for accepted 3 models: XGBC,SVC,LR
acc_avg = (0.7985803016858918+0.7515527950310559+0.7471162377994676)/3
auc_avg = (0.7130860892841723+0.762135865171009+0.7590646120997558)/3
acc_avg,auc_avg

(0.7657497781721384, 0.7447621888516457)

### Baseline (Upsampling by SMOTE)

In [15]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=2) 
X_train_res,y_train_res=smote.fit_resample(X_train,y_train)

In [17]:
X_train_res.shape,y_train_res.shape

((6598, 25), (6598,))

In [18]:
def modeling2(model_name):
    model=model_name()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

In [19]:
modeling2(LR)  # acc up, auc down  # accept

Confusion_Matrix:
[[747  67]
 [156 157]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       814
           1       0.70      0.50      0.58       313

    accuracy                           0.80      1127
   macro avg       0.76      0.71      0.73      1127
weighted avg       0.79      0.80      0.79      1127

Accuracy:0.8021295474711624
AUC_ROC:0.7096439308899373


In [20]:
modeling2(SVC) # acc up, auc down # accept

Confusion_Matrix:
[[748  66]
 [153 160]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       814
           1       0.71      0.51      0.59       313

    accuracy                           0.81      1127
   macro avg       0.77      0.72      0.73      1127
weighted avg       0.80      0.81      0.79      1127

Accuracy:0.805678793256433
AUC_ROC:0.7150505137725585


In [21]:
modeling2(RF) # all up  # reject

Confusion_Matrix:
[[742  72]
 [161 152]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       814
           1       0.68      0.49      0.57       313

    accuracy                           0.79      1127
   macro avg       0.75      0.70      0.72      1127
weighted avg       0.78      0.79      0.78      1127

Accuracy:0.7932564330079858
AUC_ROC:0.6985854573713999


In [22]:
modeling2(LGBC) # acc up, auc down  # reject

Confusion_Matrix:
[[723  91]
 [155 158]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       814
           1       0.63      0.50      0.56       313

    accuracy                           0.78      1127
   macro avg       0.73      0.70      0.71      1127
weighted avg       0.77      0.78      0.77      1127

Accuracy:0.7817213842058562
AUC_ROC:0.6964993602373794


In [23]:
modeling2(XGBC) # no difference # accept

Confusion_Matrix:
[[737  77]
 [150 163]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       814
           1       0.68      0.52      0.59       313

    accuracy                           0.80      1127
   macro avg       0.76      0.71      0.73      1127
weighted avg       0.79      0.80      0.79      1127

Accuracy:0.7985803016858918
AUC_ROC:0.7130860892841723


In [25]:
# for accepted 3 models: XGBC,SVC,LR
acc_avg = (0.7985803016858918+0.805678793256433+0.8021295474711624)/3
auc_avg = (0.7130860892841723+0.7150505137725585+0.7096439308899373)/3
acc_avg,auc_avg

(0.8021295474711624, 0.712593511315556)

### Final tuning

In [27]:
# For imblance dataset, auc_roc ignore the imbalance, which is a better metric than accuracy
# So the class_weight modification solution is better

In [32]:
params_SVC = [{'kernel': ['rbf'],       # Gaussian kernel
               # punishment, inverse of regularization, bigger C -> overfitting
               'C': [0.1, 1, 10, 100],
               'class_weight': [{0: 1, 1: ratio}],
               'gamma': [1e-2, 1e-3, 1e-4]  # bigger gamma -> overfitting
               }
              ]
svm = SVC()
clf_SVC = GridSearchCV(svm, param_grid=params_SVC, cv=3,n_jobs=-1)
clf_SVC.fit(X_train_origin, y_train_origin)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 10, 100],
                          'class_weight': [{0: 1, 1: 2.7683253076511503}],
                          'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [33]:
clf_SVC.best_params_

{'C': 100,
 'class_weight': {0: 1, 1: 2.7683253076511503},
 'gamma': 0.01,
 'kernel': 'rbf'}

In [39]:
params_LR = {'C': [0.001, 0.01, 0.1, 1, 10],  # C is the inverse of lambda
             'class_weight': [{0: 1, 1: ratio}],
             }
lr = LR()
clf_LR = GridSearchCV(lr, param_grid=params_LR, cv=5)
clf_LR.fit(X_train_origin, y_train_origin)
clf_LR.best_params_

{'C': 0.01, 'class_weight': {0: 1, 1: 2.7683253076511503}}

In [43]:
params_XGB = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1],
    'class_weight': [{0: 1, 1: ratio}]
}
xgbc = XGBC()
clf_XGBC = RandomizedSearchCV(xgbc, n_iter=20, param_distributions=params_XGB, cv=3,n_jobs=-1)
clf_XGBC.fit(X_train_origin, y_train_origin)
clf_XGBC.best_params_

{'subsample': 0.85,
 'scale_pos_weight': 1,
 'reg_lambda': 0.8,
 'reg_alpha': 0.25,
 'n_estimators': 500,
 'min_child_weight': 5,
 'max_depth': 10,
 'max_delta_step': 1,
 'learning_rate': 0.01,
 'colsample_bytree': 0.9,
 'class_weight': {0: 1, 1: 2.7683253076511503}}

### Train best models

In [50]:
best_svc = SVC(C=100,gamma=0.01,kernel='rbf',class_weight={0: 1, 1: ratio})
best_svc.fit(X_train_origin,y_train_origin)
joblib.dump(best_svc, '../models/svc.m')

['../models/svc.m']

In [51]:
best_lr = LR(C=0.01,class_weight={0: 1, 1: ratio})
best_lr.fit(X_train_origin,y_train_origin)
joblib.dump(best_lr, '../models/lr.m')

['../models/lr.m']

In [64]:
best_xgbc = XGBC(class_weight={0: 1, 1: ratio}, subsample=0.85, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=0.25,
                 n_estimators=500, min_child_weight=5, max_depth=10, max_delta_step=1,
                 learning_rate=0.01, colsample_bytree=0.9)
best_xgbc.fit(X_train_origin, y_train_origin)
joblib.dump(best_xgbc, '../models/xgbc.m')

['../models/xgbc.m']

### Combine 3 best models into a voter

In [63]:
voting_clf = VotingClassifier(
    estimators=[('svc', best_svc), ('lr', best_lr), ('xgbc', best_xgbc)], voting='hard')
voting_clf.fit(X_train_origin, y_train_origin)
joblib.dump(voting_clf, '../models/voter.m')

['../models/voter.m']