In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC
from sklearn.ensemble import VotingClassifier

import joblib

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [3]:
X_train_origin = X_train_origin.to_numpy()
y_train_origin = y_train_origin.to_numpy()
y_train_origin = y_train_origin.ravel()

In [4]:
X_train_origin.shape,y_train_origin.shape

((5634, 25), (5634,))

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1)

In [6]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((4507, 25), (1127, 25), (4507,), (1127,))

In [8]:
data=pd.read_csv('../data/raw_data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
ratio = data[data.Churn=='No'].shape[0]/data[data.Churn=='Yes'].shape[0]
ratio

2.7683253076511503

In [19]:
def modeling(model_name):
    model=model_name(class_weight={0: 1, 1: ratio})
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

### Baseline

In [20]:
modeling(LR)  # reject

Confusion_Matrix:
[[596 218]
 [ 67 246]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       814
           1       0.53      0.79      0.63       313

    accuracy                           0.75      1127
   macro avg       0.71      0.76      0.72      1127
weighted avg       0.80      0.75      0.76      1127

Accuracy:0.7471162377994676
AUC_ROC:0.7590646120997558


In [21]:
modeling(SVC) # accept

Confusion_Matrix:
[[601 213]
 [ 67 246]]
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       814
           1       0.54      0.79      0.64       313

    accuracy                           0.75      1127
   macro avg       0.72      0.76      0.72      1127
weighted avg       0.80      0.75      0.76      1127

Accuracy:0.7515527950310559
AUC_ROC:0.762135865171009


In [22]:
modeling(RF) # reject

Confusion_Matrix:
[[745  69]
 [165 148]]
              precision    recall  f1-score   support

           0       0.82      0.92      0.86       814
           1       0.68      0.47      0.56       313

    accuracy                           0.79      1127
   macro avg       0.75      0.69      0.71      1127
weighted avg       0.78      0.79      0.78      1127

Accuracy:0.7923691215616682
AUC_ROC:0.6940384328563242


In [23]:
# use Out-of-bag data as validation set
model = RF(oob_score=True,class_weight={0: 1, 1: ratio})
model.fit(X_train_origin, y_train_origin)
print(model.oob_score_)  # almost the same

0.7795527156549521


In [24]:
modeling(LGBC) # accept

Confusion_Matrix:
[[666 148]
 [115 198]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       814
           1       0.57      0.63      0.60       313

    accuracy                           0.77      1127
   macro avg       0.71      0.73      0.72      1127
weighted avg       0.77      0.77      0.77      1127

Accuracy:0.7666370896184561
AUC_ROC:0.7253848388033691


In [25]:
modeling(XGBC) #accept

Confusion_Matrix:
[[737  77]
 [150 163]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       814
           1       0.68      0.52      0.59       313

    accuracy                           0.80      1127
   macro avg       0.76      0.71      0.73      1127
weighted avg       0.79      0.80      0.79      1127

Accuracy:0.7985803016858918
AUC_ROC:0.7130860892841723
