https://www.kaggle.com/code/captaingaga/water-quality-70-acc-optuna-svm-iterativeimputer/notebook
https://www.kaggle.com/code/muhammetgamal5/kfold-cross-validation-optuna-tuning/notebook
https://www.kaggle.com/code/neilgibbons/tuning-tabnet-with-optuna
    

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from warnings import simplefilter
simplefilter("ignore", category=RuntimeWarning)

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
# load models
import xgboost
from xgboost import XGBClassifier

import sklearn.svm
from sklearn.svm import SVC

from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier

In [3]:
# Set working directory
os.chdir("C:\\Users\Mumtaz\Desktop\Thesis data")

In [4]:
# load data
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv') 

In [5]:
# Check the target variable
df['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [6]:
# Convert to right types
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['SeniorCitizen'] = df['SeniorCitizen'].map({1: 'Yes', 0: 'No'})

In [7]:
# replace missing values with median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [8]:
#drop column customerID, this is not relevant
df.drop(columns=['customerID'], inplace = True)

In [9]:
# transform to binary
df['gender'].replace({'Female':1,'Male':0},inplace=True)

In [10]:
# simplify value
df.replace('No internet service', 'No', inplace=True)
df.replace('No phone service', 'No', inplace=True)

In [11]:
# get dummies for features with multiple categories
multiple_categories = ['InternetService' ,'Contract' ,'PaymentMethod']
df = pd.get_dummies(data=df, columns= multiple_categories)

In [12]:
# encode binary values
categories = df[['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup','DeviceProtection','TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']]
for i in categories:
    df[i].replace({"No":0, "Yes":1}, inplace=True)

In [13]:
# split data into features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

In [14]:
# split the data with stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2, random_state=42)

In [15]:
# scale numeric values
from sklearn.preprocessing import StandardScaler
X_train[['tenure', 'MonthlyCharges', 'TotalCharges']] = StandardScaler().fit_transform(X_train[['tenure', 'MonthlyCharges', 'TotalCharges']])
X_test[['tenure', 'MonthlyCharges', 'TotalCharges']] = StandardScaler().fit_transform(X_test[['tenure', 'MonthlyCharges', 'TotalCharges']])

In [16]:
# Smote for training set
from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
for label, count in zip(*np.unique(y_train, return_counts=True)):
    print('Class {} has {} samples'.format(label, count))

kmeans_smote = KMeansSMOTE(
    sampling_strategy = 'not majority',
    random_state = 42,
    k_neighbors = 10,
    cluster_balance_threshold = 0.1,
    kmeans_estimator = MiniBatchKMeans(n_clusters=100, random_state=42)
)
X_train, y_train = kmeans_smote.fit_resample(X_train, y_train)

for label, count in zip(*np.unique(y_train, return_counts=True)):
    print('Class {} has {} samples after oversampling'.format(label, count))

Class 0 has 4139 samples
Class 1 has 1495 samples




Class 0 has 4139 samples after oversampling
Class 1 has 4162 samples after oversampling


In [17]:
# Check the target variable
y_train.value_counts()

1    4162
0    4139
Name: Churn, dtype: int64

In [18]:
print(f"X_train shape :{X_train.shape}, y_train shape: {y_train.shape}")

X_train shape :(8301, 26), y_train shape: (8301,)


In [19]:
X_train = X_train.rename(columns={'InternetService_Fiber optic': 'InternetService_Fiber_optic', 'Contract_One year' : 'Contract_One_year', 'Contract_Two year': 'Contract_Two_year',
                      'PaymentMethod_Bank transfer (automatic)':'PaymentMethod_Bank_transfer_automatic', 'PaymentMethod_Credit card (automatic)':'PaymentMethod_Creditcard_automatic)',
                      'PaymentMethod_Electronic check': 'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed check': 'PaymentMethod_Mailed_check'})

In [20]:
X_test = X_test.rename(columns={'InternetService_Fiber optic': 'InternetService_Fiber_optic', 'Contract_One year' : 'Contract_One_year', 'Contract_Two year': 'Contract_Two_year',
                      'PaymentMethod_Bank transfer (automatic)':'PaymentMethod_Bank_transfer_automatic', 'PaymentMethod_Credit card (automatic)':'PaymentMethod_Creditcard_automatic)',
                      'PaymentMethod_Electronic check': 'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed check': 'PaymentMethod_Mailed_check'})

# XGBOOST

In [21]:
XGBoost_params = ({'n_estimators': 1986, 'max_depth': 13, 'learning_rate': 0.044627773598530054, 'gamma': 0.280206721005955, 'subsample': 0.58545974420027, 'min_child_weight': 2.3451283944922916, 'reg_lambda': 1.9688536003000663, 'reg_alpha': 1.6563747476472204, 'colsample_bytree': 0.7486193155195076})

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
XGBOOST_model = XGBClassifier(**XGBoost_params)
kfold_val = KFold(10, shuffle=True, random_state=42)
cv_score = cross_val_score(XGBOOST_model, X_train, y_train, cv=kfold_val, verbose=1)
print(cv_score)
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_score.mean(), cv_score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.85920578 0.82771084 0.85542169 0.84216867 0.86385542 0.84819277
 0.87710843 0.86024096 0.84337349 0.82650602]
0.85 accuracy with a standard deviation of 0.02


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.9min finished


In [23]:
import time
final_xgb_model = XGBClassifier(**XGBoost_params)
xgb_train_start = time.perf_counter()
final_xgb_model.fit(X_train, y_train)
xgb_train_end = time.perf_counter()

xgb_pred_start = time.perf_counter()
final_xgb_preds = final_xgb_model.predict(X_test)
xgb_pred_end = time.perf_counter()

xgb_train_time = xgb_train_end-xgb_train_start
xgb_pred_time = xgb_pred_end-xgb_pred_start

print("Time consumed for training:" ,xgb_train_time, "seconds")
print("Time consumed for prediction:" ,xgb_pred_time, "seconds")

Time consumed for training: 32.54776720000001 seconds
Time consumed for prediction: 0.11529039999999213 seconds


In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test, final_xgb_preds))

              precision    recall  f1-score   support

           0       0.89      0.76      0.82      1035
           1       0.52      0.74      0.61       374

    accuracy                           0.75      1409
   macro avg       0.71      0.75      0.72      1409
weighted avg       0.79      0.75      0.76      1409



In [25]:
final_xgb_model = XGBClassifier(**XGBoost_params)
xgb_train_start = time.perf_counter()
final_xgb_model.fit(X_train, y_train)
xgb_train_end = time.perf_counter()

xgb_pred_start = time.perf_counter()
final_xgb_preds_proba = final_xgb_model.predict_proba(X_test)[:,1]
xgb_pred_end = time.perf_counter()

xgb_train_time = xgb_train_end-xgb_train_start
xgb_pred_time = xgb_pred_end-xgb_pred_start

print("Time consumed for training:" ,xgb_train_time, "seconds")
print("Time consumed for prediction:" ,xgb_pred_time, "seconds")

Time consumed for training: 33.11593190000002 seconds
Time consumed for prediction: 0.11392019999999548 seconds


In [26]:
xgb_score=roc_auc_score(y_test, final_xgb_preds_proba)
print("The ROC AUC score is" ,xgb_score)

The ROC AUC score is 0.8232555736392054


# SVM CLASSIFIER

In [27]:
SVC_params = ({'C': 1, 'gamma': 1, 'kernel': 'rbf'})

In [28]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
SVM_model = SVC(**SVC_params)
kfold_val = KFold(10, shuffle=True, random_state=42)
cv_score = cross_val_score(SVM_model, X_train, y_train, cv=kfold_val, verbose=1)
print(cv_score)
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_score.mean(), cv_score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.86762936 0.83253012 0.85301205 0.8373494  0.8626506  0.84698795
 0.88192771 0.86626506 0.84457831 0.82891566]
0.85 accuracy with a standard deviation of 0.02


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   57.1s finished


In [29]:
import time
final_svm_model = SVC(**SVC_params)
svm_train_start = time.perf_counter()
final_svm_model.fit(X_train, y_train)
svm_train_end = time.perf_counter()

svm_pred_start = time.perf_counter()
final_svm_preds = final_svm_model.predict(X_test)
svm_pred_end = time.perf_counter()

svm_train_time = svm_train_end-svm_train_start
svm_pred_time = svm_pred_end-svm_pred_start

print("Time consumed for training:" ,svm_train_time, "seconds")
print("Time consumed for prediction:" ,svm_pred_time, "seconds")

Time consumed for training: 5.983721900000035 seconds
Time consumed for prediction: 1.376708399999984 seconds


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, final_svm_preds))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1035
           1       0.57      0.58      0.58       374

    accuracy                           0.77      1409
   macro avg       0.71      0.71      0.71      1409
weighted avg       0.77      0.77      0.77      1409



In [31]:
final_svm_model = SVC(**SVC_params, probability=True)
svm_train_start = time.perf_counter()
final_svm_model.fit(X_train, y_train)
svm_train_end = time.perf_counter()

svm_pred_start = time.perf_counter()
final_svm_preds_proba = final_svm_model.predict_proba(X_test)[:,1]
svm_pred_end = time.perf_counter()

svm_train_time = svm_train_end-svm_train_start
svm_pred_time = svm_pred_end-svm_pred_start

print("Time consumed for training:" ,svm_train_time, "seconds")
print("Time consumed for prediction:" ,svm_pred_time, "seconds")

Time consumed for training: 36.5313256 seconds
Time consumed for prediction: 1.4222895000000335 seconds


In [32]:
svm_score=roc_auc_score(y_test, final_svm_preds_proba)
print("The ROC AUC score is" ,svm_score)

The ROC AUC score is 0.783189955824227


# TabNet pre-trainer & classifier

In [33]:
#transform data to numpy arrays for Pytorch (since it only deals with tensors)
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy().squeeze()
X_test_np  = X_test.to_numpy()
y_test_np = y_test.to_numpy().squeeze()

In [34]:
TabNet_params = ({'mask_type': 'entmax', 'n_da': 64, 'n_steps': 2, 'gamma': 1.4, 'n_shared': 1, 'lambda_sparse': 0.00031950274925103057, 'patienceScheduler': 5, 'patience': 18, 'epochs': 91})

In [35]:
final_TabNet_params = dict(n_d=TabNet_params['n_da'], n_a=TabNet_params['n_da'], n_steps=TabNet_params['n_steps'], gamma=TabNet_params['gamma'],
                     lambda_sparse=TabNet_params['lambda_sparse'], optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type=TabNet_params['mask_type'], n_shared=TabNet_params['n_shared'],
                     scheduler_params=dict(mode="min",
                                           patience=TabNet_params['patienceScheduler'],
                                           min_lr=1e-5,
                                           factor=0.5,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=0,
                     )
epochs = TabNet_params['epochs']

In [36]:
TabNetclf_model = TabNetClassifier(**final_TabNet_params)
kfold_val = KFold(10, shuffle=True, random_state=42)
cv_score = cross_val_score(TabNetclf_model, X_train_np, y_train_np, cv=kfold_val, verbose=1, scoring='accuracy')
print(cv_score)
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_score.mean(), cv_score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.84115523 0.80722892 0.83493976 0.82650602 0.8626506  0.82650602
 0.83253012 0.8313253  0.82048193 0.80722892]
0.83 accuracy with a standard deviation of 0.02


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 117.2min finished


In [37]:
import time
TabNet_clf = TabNetClassifier(**final_TabNet_params)
clf_train_start = time.perf_counter()
TabNet_clf.fit(X_train_np, y_train_np, patience=TabNet_params['patience'], max_epochs=epochs)
clf_train_end = time.perf_counter()

clf_pred_start = time.perf_counter()
clf_preds = TabNet_clf.predict(X_test_np)
clf_pred_end = time.perf_counter()

clf_train_time = clf_train_end-clf_train_start
clf_pred_time = clf_pred_end-clf_pred_start

print("Time consumed for training:" ,clf_train_time, "seconds")
print("Time consumed for prediction:" ,clf_pred_time, "seconds")

Time consumed for training: 716.9731405999992 seconds
Time consumed for prediction: 0.5530815999991319 seconds


In [38]:
print(classification_report(y_test_np, clf_preds))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1035
           1       0.50      0.55      0.52       374

    accuracy                           0.73      1409
   macro avg       0.67      0.67      0.67      1409
weighted avg       0.74      0.73      0.74      1409



In [39]:
TabNet_clf = TabNetClassifier(**final_TabNet_params)
clf_train_start = time.perf_counter()
TabNet_clf.fit(X_train_np, y_train_np, patience=TabNet_params['patience'], max_epochs=epochs)
clf_train_end = time.perf_counter()

clf_pred_start = time.perf_counter()
clf_preds_proba = TabNet_clf.predict_proba(X_test_np)[:,1]
clf_pred_end = time.perf_counter()

clf_train_time = clf_train_end-clf_train_start
clf_pred_time = clf_pred_end-clf_pred_start

print("Time consumed for training:" ,clf_train_time, "seconds")
print("Time consumed for prediction:" ,clf_pred_time, "seconds")

Time consumed for training: 699.1366355999999 seconds
Time consumed for prediction: 0.712472400000479 seconds


In [40]:
clf_score=roc_auc_score(y_test, clf_preds_proba)
print("The ROC AUC score is" ,clf_score)

The ROC AUC score is 0.763734532020977
