In [1]:
%pip install catboost
from catboost import CatBoostClassifier, Pool
import re
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Read the Telco customer-churn CSV (looked up relative to the working directory)
# and preview the first rows.
csv_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [None]:
# Remove the customerID column before modelling.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns='customerID', errors='ignore')

In [None]:
# Display the column labels of the frame.
df.columns


In [None]:
# Normalise the column labels: every space becomes an underscore.
df.columns = df.columns.map(lambda label: label.replace(' ', '_'))

In [None]:
# Preview the first five rows after renaming the columns.
df.head(n=5)

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Convert TotalCharges to a numeric dtype. errors='coerce' turns every value that
# cannot be parsed (e.g. blank strings) into NaN, so a later comparison with ''
# can never match; the missing entries have to be filled on the coerced result.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [None]:
# Re-check the column dtypes after the TotalCharges conversion.
df.dtypes

In [None]:
# Replace every space inside the frame's string values with an underscore
# (regex substitution, applied in place).
df.replace(to_replace=' ', value='_', regex=True, inplace=True)

In [None]:
# Target: the Churn column.
y = df['Churn']
# Features: an independent copy of every other column.
X = df.drop(columns=['Churn']).copy()

In [None]:
# Preview the first five feature rows.
X.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

In [None]:
# Columns that get expanded into one indicator column per distinct value.
categorical_columns = [
    'gender',
    'SeniorCitizen',
    'Dependents',
    'Partner',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]
X_encoded = pd.get_dummies(X, columns=categorical_columns)

In [None]:
# Inspect the dtypes of the encoded feature frame.
X_encoded.dtypes

In [None]:
# Collect the boolean columns of the encoded frame and store them as integers.
bool_columns = list(X_encoded.select_dtypes(include='bool').columns)
X_encoded = X_encoded.astype({column: int for column in bool_columns})
X_encoded.head()

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The mapping is applied to df['Churn'] (the column y was taken from) rather than
# to y itself, so the cell stays idempotent: re-mapping an already encoded y would
# turn every label into NaN because 0 and 1 are not keys of value_mapping.
value_mapping = {'Yes': 1, 'No': 0}

y = df['Churn'].map(value_mapping)

In [None]:
# Distinct values present in the encoded target.
pd.unique(y)

In [None]:
# Share of positive (1) labels in the full target.
y.sum() / len(y)

In [None]:
# The class ratio computed in the previous cell shows the dataset is imbalanced,
# so the 80/20 split is stratified on the target `y` to keep that ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Share of positive labels in the training split.
y_train.sum() / len(y_train)

In [None]:
# Share of positive labels in the test split.
y_test.sum() / len(y_test)

In [None]:
# Early stopping picks the number of trees from the eval_set score. Using the test
# set for that would tune the model on the same rows the metrics are reported on,
# so a stratified slice of the training data is held out for early stopping instead.
X_fit, X_stop, y_fit, y_stop = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# eval_metric="aucpr" because it is useful for imbalanced datasets.
clf_xgb = xgb.XGBClassifier(objective="binary:logistic",
                            random_state=42,
                            early_stopping_rounds=10,
                            eval_metric="aucpr"
                            )
clf_xgb.fit(X_fit, y_fit, verbose=True, eval_set=[(X_stop, y_stop)])

In [None]:
# Confusion matrix of the XGBoost predictions on the test split.
xgb_test_pred = clf_xgb.predict(X_test)
print(confusion_matrix(y_test, xgb_test_pred))

In [None]:
# Per-class precision / recall / F1 of the XGBoost predictions on the test split.
xgb_test_pred = clf_xgb.predict(X_test)
print(classification_report(y_test, xgb_test_pred))

In [None]:
# The earlier split only produces X_train/X_test and y_train/y_test, so X_valid and
# y_valid did not exist and this cell raised NameError. Use the held-out split as
# the validation data, keeping the names that the following cells refer to.
X_valid, y_valid = X_test, y_test

train_data = lgb.Dataset(X_train, label=y_train)
# Per the LightGBM docs, a validation Dataset should use the training Dataset as
# its reference.
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

In [None]:
# LightGBM training configuration (binary objective, AUC metric).
parameters = dict(
    objective='binary',
    metric='auc',
    is_unbalance='true',
    boosting='gbdt',
    num_leaves=63,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=20,
    learning_rate=0.01,
    verbose=-1,
)

In [None]:
# Train the booster for a fixed 5000 rounds. valid_data is passed for evaluation
# only; no early-stopping callback is configured here.
model_lgbm = lgb.train(
    params=parameters,
    train_set=train_data,
    num_boost_round=5000,
    valid_sets=[valid_data],
)

In [None]:
# Score the LightGBM model on the training split and on the held-out split.
# The earlier train_test_split only defines X_test / y_test (X_valid / y_valid were
# undefined names), so the held-out split is referenced directly.
y_train_pred = model_lgbm.predict(X_train)
y_valid_pred = model_lgbm.predict(X_test)

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_test, y_valid_pred)))

In [None]:
# X_train is a slice of the one-hot encoded frame, not of X, so positional indices
# built from X.shape[1] would point at the wrong columns, including float-valued
# ones, which CatBoost does not accept as categorical features.
# Declare as categorical only the training columns that are not numeric/boolean;
# for a fully encoded frame this is an empty list.
cat_features = X_train.select_dtypes(exclude=['number', 'bool']).columns.tolist()
print(cat_features)

In [None]:
clf = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy']
)

# X_val / y_val were never defined, so fit() raised NameError. Hold out a stratified
# slice of the training data as the evaluation set, which keeps the test split
# untouched for the final assessment.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

clf.fit(
    X_fit, y_fit,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    verbose=False,
    plot=True
)

In [None]:
# Predict class labels for the held-out test split. The earlier train_test_split
# defines X_test; X_val was not produced by it.
print(clf.predict(data=X_test))

## Summary
I learned how to use the XGBoost classifier to predict customer churn, and in particular how to handle imbalanced data: the split is stratified on the target, and the model is evaluated with the area under the precision-recall curve (`aucpr`), which focuses on the customers who are more likely to leave. I also gained a better understanding of how to set up XGBoost for binary classification.