In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from warnings import simplefilter
simplefilter("ignore", category=RuntimeWarning)

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
# load models
import xgboost
from xgboost import XGBClassifier

import sklearn.svm
from sklearn.svm import SVC

from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier

In [3]:
# Set working directory
os.chdir("C:\\Users\Mumtaz\Desktop\Thesis data")

In [4]:
# load data
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv') 

In [5]:
# Convert to right types
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['SeniorCitizen'] = df['SeniorCitizen'].map({1: 'Yes', 0: 'No'})

In [6]:
# replace missing values with median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [7]:
#drop column customerID, this is not relevant
df.drop(columns=['customerID'], inplace = True)

In [8]:
# transform to binary
df['gender'].replace({'Female':1,'Male':0},inplace=True)

In [9]:
# simplify value
df.replace('No internet service', 'No', inplace=True)
df.replace('No phone service', 'No', inplace=True)

In [10]:
# get dummies for features with multiple categories
multiple_categories = ['InternetService' ,'Contract' ,'PaymentMethod']
df = pd.get_dummies(data=df, columns= multiple_categories)

In [11]:
# encode binary values
categories = df[['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup','DeviceProtection','TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']]
for i in categories:
    df[i].replace({"No":0, "Yes":1}, inplace=True)

In [12]:
# split data into features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

In [13]:
# split the data with stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2, random_state=42)

In [14]:
# scale numeric values
from sklearn.preprocessing import StandardScaler
X_train[['tenure', 'MonthlyCharges', 'TotalCharges']] = StandardScaler().fit_transform(X_train[['tenure', 'MonthlyCharges', 'TotalCharges']])
X_test[['tenure', 'MonthlyCharges', 'TotalCharges']] = StandardScaler().fit_transform(X_test[['tenure', 'MonthlyCharges', 'TotalCharges']])

In [15]:
# Smote for training set
from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
for label, count in zip(*np.unique(y_train, return_counts=True)):
    print('Class {} has {} samples'.format(label, count))

kmeans_smote = KMeansSMOTE(
    sampling_strategy = 'not majority',
    random_state = 42,
    k_neighbors = 10,
    cluster_balance_threshold = 0.1,
    kmeans_estimator = MiniBatchKMeans(n_clusters=100, random_state=42)
)
X_train, y_train = kmeans_smote.fit_resample(X_train, y_train)

for label, count in zip(*np.unique(y_train, return_counts=True)):
    print('Class {} has {} samples after oversampling'.format(label, count))

Class 0 has 4139 samples
Class 1 has 1495 samples




Class 0 has 4139 samples after oversampling
Class 1 has 4162 samples after oversampling


In [16]:
X_train = X_train.rename(columns={'InternetService_Fiber optic': 'InternetService_Fiber_optic', 'Contract_One year' : 'Contract_One_year', 'Contract_Two year': 'Contract_Two_year',
                      'PaymentMethod_Bank transfer (automatic)':'PaymentMethod_Bank_transfer_automatic', 'PaymentMethod_Credit card (automatic)':'PaymentMethod_Creditcard_automatic)',
                      'PaymentMethod_Electronic check': 'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed check': 'PaymentMethod_Mailed_check'})

In [17]:
X_test = X_test.rename(columns={'InternetService_Fiber optic': 'InternetService_Fiber_optic', 'Contract_One year' : 'Contract_One_year', 'Contract_Two year': 'Contract_Two_year',
                      'PaymentMethod_Bank transfer (automatic)':'PaymentMethod_Bank_transfer_automatic', 'PaymentMethod_Credit card (automatic)':'PaymentMethod_Creditcard_automatic)',
                      'PaymentMethod_Electronic check': 'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed check': 'PaymentMethod_Mailed_check'})

In [24]:
X_train_part = X_train.sample(frac = 0.3)
y_train_part = y_train[X_train_part.index]

In [25]:
# convert to numpy array
X_train_part_np = X_train_part.to_numpy()
y_train_part_np = y_train_part.to_numpy().squeeze()

In [26]:
# convert test data to numpy array
X_test_np  = X_test.to_numpy()
y_test_np = y_test.to_numpy().squeeze()

# XGBoost Classifier

In [27]:
XGBoost_params = ({'n_estimators': 1986, 'max_depth': 13, 'learning_rate': 0.044627773598530054, 'gamma': 0.280206721005955, 'subsample': 0.58545974420027, 'min_child_weight': 2.3451283944922916, 'reg_lambda': 1.9688536003000663, 'reg_alpha': 1.6563747476472204, 'colsample_bytree': 0.7486193155195076})

In [28]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
XGBOOST_model = XGBClassifier(**XGBoost_params)
kfold_val = KFold(10, shuffle=True, random_state=42)
cv_score = cross_val_score(XGBOOST_model, X_train_part, y_train_part, cv=kfold_val, verbose=1)
print(cv_score)
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_score.mean(), cv_score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.82329317 0.82730924 0.82730924 0.83935743 0.85140562 0.85542169
 0.84738956 0.85140562 0.7751004  0.78313253]
0.83 accuracy with a standard deviation of 0.03


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.9min finished


In [29]:
import time
final_xgb_model = XGBClassifier(**XGBoost_params)
xgb_train_start = time.perf_counter()
final_xgb_model.fit(X_train_part, y_train_part)
xgb_train_end = time.perf_counter()

xgb_pred_start = time.perf_counter()
final_xgb_preds = final_xgb_model.predict(X_test)
xgb_pred_end = time.perf_counter()

xgb_train_time = xgb_train_end-xgb_train_start
xgb_pred_time = xgb_pred_end-xgb_pred_start

print("Time consumed for training:" ,xgb_train_time, "seconds")
print("Time consumed for prediction:" ,xgb_pred_time, "seconds")

Time consumed for training: 13.438946499999929 seconds
Time consumed for prediction: 0.12413059999971665 seconds


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, final_xgb_preds))

              precision    recall  f1-score   support

           0       0.87      0.77      0.82      1035
           1       0.53      0.69      0.60       374

    accuracy                           0.75      1409
   macro avg       0.70      0.73      0.71      1409
weighted avg       0.78      0.75      0.76      1409



In [31]:
final_xgb_model = XGBClassifier(**XGBoost_params)
xgb_train_start = time.perf_counter()
final_xgb_model.fit(X_train_part, y_train_part)
xgb_train_end = time.perf_counter()

xgb_pred_start = time.perf_counter()
final_xgb_preds_proba = final_xgb_model.predict_proba(X_test)[:,1]
xgb_pred_end = time.perf_counter()

xgb_train_time = xgb_train_end-xgb_train_start
xgb_pred_time = xgb_pred_end-xgb_pred_start

print("Time consumed for training:" ,xgb_train_time, "seconds")
print("Time consumed for prediction:" ,xgb_pred_time, "seconds")

Time consumed for training: 12.503556099999969 seconds
Time consumed for prediction: 0.12098250000008193 seconds


In [32]:
xgb_score=roc_auc_score(y_test, final_xgb_preds_proba)
print("The ROC AUC score is" ,xgb_score)

The ROC AUC score is 0.8132191480017567


# Support Vector Classifier

In [33]:
SVC_params = ({'C': 1, 'gamma': 1, 'kernel': 'rbf'})

In [34]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
SVM_model = SVC(**SVC_params)
kfold_val = KFold(10, shuffle=True, random_state=42)
cv_score = cross_val_score(SVM_model, X_train_part, y_train_part, cv=kfold_val, verbose=1)
print(cv_score)
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_score.mean(), cv_score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.77911647 0.83935743 0.83534137 0.83935743 0.8313253  0.82730924
 0.82730924 0.81124498 0.79919679 0.7751004 ]
0.82 accuracy with a standard deviation of 0.02


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    8.7s finished


In [35]:
import time
final_svm_model = SVC(**SVC_params)
svm_train_start = time.perf_counter()
final_svm_model.fit(X_train_part, y_train_part)
svm_train_end = time.perf_counter()

svm_pred_start = time.perf_counter()
final_svm_preds = final_svm_model.predict(X_test)
svm_pred_end = time.perf_counter()

svm_train_time = svm_train_end-svm_train_start
svm_pred_time = svm_pred_end-svm_pred_start

print("Time consumed for training:" ,svm_train_time, "seconds")
print("Time consumed for prediction:" ,svm_pred_time, "seconds")

Time consumed for training: 1.1776635999999598 seconds
Time consumed for prediction: 1.1046696000003067 seconds


In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test, final_svm_preds))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1035
           1       0.56      0.60      0.58       374

    accuracy                           0.77      1409
   macro avg       0.71      0.72      0.71      1409
weighted avg       0.78      0.77      0.77      1409



In [37]:
final_svm_model = SVC(**SVC_params, probability=True)
svm_train_start = time.perf_counter()
final_svm_model.fit(X_train_part, y_train_part)
svm_train_end = time.perf_counter()

svm_pred_start = time.perf_counter()
final_svm_preds_proba = final_svm_model.predict_proba(X_test)[:,1]
svm_pred_end = time.perf_counter()

svm_train_time = svm_train_end-svm_train_start
svm_pred_time = svm_pred_end-svm_pred_start

print("Time consumed for training:" ,svm_train_time, "seconds")
print("Time consumed for prediction:" ,svm_pred_time, "seconds")

Time consumed for training: 5.226253499999984 seconds
Time consumed for prediction: 0.8278095000000576 seconds


In [38]:
svm_score=roc_auc_score(y_test, final_svm_preds_proba)
print("The ROC AUC score is" ,svm_score)

The ROC AUC score is 0.7864527629233511


# TabNet pre-trainer and classifier

In [40]:
TabNet_params = ({'mask_type': 'entmax', 'n_da': 64, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 1, 'lambda_sparse': 0.00031950274925103057, 'patienceScheduler': 5, 'patience': 18, 'epochs': 91})

In [41]:
final_TabNet_params = dict(n_d=TabNet_params['n_da'], n_a=TabNet_params['n_da'], n_steps=TabNet_params['n_steps'], gamma=TabNet_params['gamma'],
                     lambda_sparse=TabNet_params['lambda_sparse'], optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type=TabNet_params['mask_type'], n_shared=TabNet_params['n_shared'],
                     scheduler_params=dict(mode="min",
                                           patience=TabNet_params['patienceScheduler'],
                                           min_lr=1e-5,
                                           factor=0.5,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=0,
                     )
epochs = TabNet_params['epochs']

In [43]:
TabNetclf_model = TabNetClassifier(**final_TabNet_params)
kfold_val = KFold(10, shuffle=True, random_state=42)
cv_score = cross_val_score(TabNetclf_model, X_train_part_np, y_train_part_np, cv=kfold_val, verbose=1, scoring='accuracy')
print(cv_score)
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_score.mean(), cv_score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.72690763 0.77911647 0.78313253 0.77911647 0.82730924 0.77108434
 0.76706827 0.77108434 0.7751004  0.72289157]
0.77 accuracy with a standard deviation of 0.03


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 48.2min finished


In [44]:
import time
TabNet_clf = TabNetClassifier(**final_TabNet_params)
clf_train_start = time.perf_counter()
TabNet_clf.fit(X_train_part_np, y_train_part_np, patience=TabNet_params['patience'], max_epochs=epochs)
clf_train_end = time.perf_counter()

clf_pred_start = time.perf_counter()
clf_preds = TabNet_clf.predict(X_test_np)
clf_pred_end = time.perf_counter()

clf_train_time = clf_train_end-clf_train_start
clf_pred_time = clf_pred_end-clf_pred_start

print("Time consumed for training:" ,clf_train_time, "seconds")
print("Time consumed for prediction:" ,clf_pred_time, "seconds")

Time consumed for training: 266.74683060000007 seconds
Time consumed for prediction: 0.8186074999994162 seconds


In [45]:
print(classification_report(y_test_np, clf_preds))

              precision    recall  f1-score   support

           0       0.84      0.78      0.81      1035
           1       0.49      0.60      0.54       374

    accuracy                           0.73      1409
   macro avg       0.67      0.69      0.68      1409
weighted avg       0.75      0.73      0.74      1409



In [47]:
TabNet_clf = TabNetClassifier(**final_TabNet_params)
clf_train_start = time.perf_counter()
TabNet_clf.fit(X_train_part_np, y_train_part_np, patience=TabNet_params['patience'], max_epochs=epochs)
clf_train_end = time.perf_counter()

clf_pred_start = time.perf_counter()
clf_preds_proba = TabNet_clf.predict_proba(X_test_np)[:,1]
clf_pred_end = time.perf_counter()

clf_train_time = clf_train_end-clf_train_start
clf_pred_time = clf_pred_end-clf_pred_start

print("Time consumed for training:" ,clf_train_time, "seconds")
print("Time consumed for prediction:" ,clf_pred_time, "seconds")

Time consumed for training: 257.7512950999999 seconds
Time consumed for prediction: 0.7815465000003314 seconds


In [48]:
clf_score=roc_auc_score(y_test, clf_preds_proba)
print("The ROC AUC score is" ,clf_score)

The ROC AUC score is 0.7581596011263532


In [49]:
#transform data to numpy arrays for Pytorch (since it only deals with tensors)
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy().squeeze()

In [50]:
# Build self-supervised model and train on training data WITHOUT target
pre_trainer = TabNetPretrainer()

pre_trainer.fit(
    X_train=X_train_np,
    pretraining_ratio=0.8,
)

epoch 0  | loss: 9.72764 |  0:00:01s
epoch 1  | loss: 2.51224 |  0:00:03s
epoch 2  | loss: 1.2975  |  0:00:05s
epoch 3  | loss: 1.05941 |  0:00:08s
epoch 4  | loss: 1.01068 |  0:00:10s
epoch 5  | loss: 0.99454 |  0:00:12s
epoch 6  | loss: 0.98306 |  0:00:13s
epoch 7  | loss: 0.97809 |  0:00:15s
epoch 8  | loss: 0.96541 |  0:00:17s
epoch 9  | loss: 0.95513 |  0:00:19s
epoch 10 | loss: 0.94671 |  0:00:22s
epoch 11 | loss: 0.93974 |  0:00:25s
epoch 12 | loss: 0.93337 |  0:00:28s
epoch 13 | loss: 0.92183 |  0:00:30s
epoch 14 | loss: 0.9184  |  0:00:32s
epoch 15 | loss: 0.91302 |  0:00:34s
epoch 16 | loss: 0.90998 |  0:00:36s
epoch 17 | loss: 0.91092 |  0:00:39s
epoch 18 | loss: 0.90849 |  0:00:41s
epoch 19 | loss: 0.90018 |  0:00:43s
epoch 20 | loss: 0.8931  |  0:00:45s
epoch 21 | loss: 0.89594 |  0:00:47s
epoch 22 | loss: 0.89285 |  0:00:49s
epoch 23 | loss: 0.89122 |  0:00:51s
epoch 24 | loss: 0.88841 |  0:00:54s
epoch 25 | loss: 0.88206 |  0:00:56s
epoch 26 | loss: 0.88285 |  0:00:59s
e

In [51]:
# Build self-supervised learning model and train on 30 percent of training data
ssl_model = TabNetClassifier(**final_TabNet_params)
ssl_train_start = time.perf_counter()
ssl_model.fit(X_train_part_np, y_train_part_np, patience=TabNet_params['patience'], 
               max_epochs=epochs, from_unsupervised = pre_trainer)
ssl_train_end = time.perf_counter()

ssl_pred_start = time.perf_counter()
ssl_preds = ssl_model.predict(X_test_np)
ssl_pred_end = time.perf_counter()

ssl_train_time = ssl_train_end-ssl_train_start
ssl_pred_time = ssl_pred_end-ssl_pred_start

print("Time consumed for training:" ,ssl_train_time, "seconds")
print("Time consumed for prediction:" ,ssl_pred_time, "seconds")

Time consumed for training: 37.16862820000006 seconds
Time consumed for prediction: 0.12362909999956173 seconds


In [52]:
print(classification_report(y_test_np, ssl_preds))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1035
           1       0.54      0.54      0.54       374

    accuracy                           0.75      1409
   macro avg       0.69      0.69      0.69      1409
weighted avg       0.75      0.75      0.75      1409



In [54]:
ssl_model = TabNetClassifier(**final_TabNet_params)
ssl_train_start = time.perf_counter()
ssl_model.fit(X_train_part_np, y_train_part_np, patience=TabNet_params['patience'], 
               max_epochs=epochs, from_unsupervised = pre_trainer)
ssl_train_end = time.perf_counter()

ssl_pred_start = time.perf_counter()
ssl_preds_proba = ssl_model.predict_proba(X_test_np)[:,1]
ssl_pred_end = time.perf_counter()

ssl_train_time = ssl_train_end-ssl_train_start
ssl_pred_time = ssl_pred_end-ssl_pred_start

print("Time consumed for training:" ,ssl_train_time, "seconds")
print("Time consumed for prediction:" ,ssl_pred_time, "seconds")

Time consumed for training: 37.98411820000001 seconds
Time consumed for prediction: 0.1944127999995544 seconds


In [55]:
ssl_score=roc_auc_score(y_test, ssl_preds_proba)
print("The ROC AUC score is" ,ssl_score)

The ROC AUC score is 0.7992882792115529
