In [81]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [82]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [83]:
df.drop('customerID', axis=1, inplace=True)

In [84]:
df.keys()


Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [85]:
df.columns = df.columns.str.replace(' ', '_')

In [86]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [87]:
df.dtypes

Unnamed: 0,0
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object
OnlineBackup,object


In [88]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.loc[(df['TotalCharges'] == ''), 'TotalCharges'] = 0

In [89]:
df.dtypes

Unnamed: 0,0
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object
OnlineBackup,object


In [90]:
df.replace(' ','_',regex=True, inplace=True)

In [91]:
X = df.drop('Churn', axis=1).copy()
y = df['Churn']

In [92]:
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No_phone_service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic_check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One_year,No,Mailed_check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed_check,53.85,108.15
3,Male,0,No,No,45,No,No_phone_service,DSL,Yes,No,Yes,Yes,No,No,One_year,No,Bank_transfer_(automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber_optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic_check,70.7,151.65


In [93]:
y.head()

Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes


In [95]:
X_encoded = pd.get_dummies(X, columns=['gender',
                                       'SeniorCitizen',
                                       'Dependents',
                                        'Partner',
                                        'PhoneService',
                                        'MultipleLines',
                                        'InternetService',
                                        'OnlineSecurity',
                                        'OnlineBackup',
                                        'DeviceProtection',
                                        'TechSupport',
                                        'StreamingTV',
                                        'StreamingMovies',
                                        'Contract',
                                        'PaperlessBilling',
                                        'PaymentMethod'])

In [96]:
X_encoded.dtypes

Unnamed: 0,0
tenure,int64
MonthlyCharges,float64
TotalCharges,float64
gender_Female,bool
gender_Male,bool
SeniorCitizen_0,bool
SeniorCitizen_1,bool
Dependents_No,bool
Dependents_Yes,bool
Partner_No,bool


In [97]:
bool_columns = X_encoded.select_dtypes(include='bool').columns.tolist()
bool_columns
X_encoded[bool_columns] = X_encoded[bool_columns].astype(int)
X_encoded.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Dependents_No,Dependents_Yes,Partner_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One_year,Contract_Two_year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank_transfer_(automatic),PaymentMethod_Credit_card_(automatic),PaymentMethod_Electronic_check,PaymentMethod_Mailed_check
0,1,29.85,29.85,1,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
1,34,56.95,1889.5,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,2,53.85,108.15,0,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,45,42.3,1840.75,0,1,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,2,70.7,151.65,1,0,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [98]:
value_mapping = {'Yes': 1, 'No': 0}

y = y.map(value_mapping)

In [99]:
y.unique()

array([0, 1])

In [100]:
sum(y)/len(y)

0.2653698707936959

In [101]:
# Based on the output of the above cell the dataset is imbalanced thus we use stratification based on the target variable `y`.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2,stratify=y, random_state=42)

In [102]:
sum(y_train)/len(y_train)

0.2653532126375577

In [103]:
sum(y_test)/len(y_test)

0.2654364797728886

In [104]:
# eval_metric="aucpr" because it is useful for imbalanced datasets.
clf_xgb = xgb.XGBClassifier(objective="binary:logistic",
                            random_state=42,
                            early_stopping_rounds=10,
                            eval_metric="aucpr"
                            )
clf_xgb.fit(X_train,y_train,verbose=True,eval_set=[(X_test, y_test)])

[0]	validation_0-aucpr:0.63847
[1]	validation_0-aucpr:0.64713
[2]	validation_0-aucpr:0.64264
[3]	validation_0-aucpr:0.63888
[4]	validation_0-aucpr:0.63198
[5]	validation_0-aucpr:0.63577
[6]	validation_0-aucpr:0.64166
[7]	validation_0-aucpr:0.64274
[8]	validation_0-aucpr:0.64074
[9]	validation_0-aucpr:0.64443
[10]	validation_0-aucpr:0.64720
[11]	validation_0-aucpr:0.64515
[12]	validation_0-aucpr:0.64519
[13]	validation_0-aucpr:0.64798
[14]	validation_0-aucpr:0.64782
[15]	validation_0-aucpr:0.64566
[16]	validation_0-aucpr:0.63974
[17]	validation_0-aucpr:0.63942
[18]	validation_0-aucpr:0.63505
[19]	validation_0-aucpr:0.63635
[20]	validation_0-aucpr:0.63604
[21]	validation_0-aucpr:0.63683
[22]	validation_0-aucpr:0.63682
[23]	validation_0-aucpr:0.63666


In [105]:
print(confusion_matrix(y_test, clf_xgb.predict(X_test)))

[[925 110]
 [177 197]]


In [106]:
print(classification_report(y_test, clf_xgb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.53      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



## Summary
Learned how to use the XGBoost classifier for predicting customer churn, especially how to handle imbalanced data by focusing on the customers who are more likely to leave using AUC-ROC Curve. I got a better understanding of setting up XGBoost for binary classifier.