In [108]:
import pandas as pd
import numpy as np
from collections import Counter

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

from xgboost import XGBClassifier, plot_importance

import hyperopt as hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

## Load data

In [93]:
df = pd.read_csv("data/customer_churn_telecom_services.csv")
df["TotalCharges"] = df["TotalCharges"].fillna(df["tenure"]*df["MonthlyCharges"])

## Churn Prediction

* Churn prediction modelling can be an important tool for any company that aims to maximize customer retention, thus increase revenues and profitability.
* Predicting churn and successfully lowering it by a relatively small fraction, ensure increase in profits especially when it comes down to acquiring new customers. This can be
better modelled by using numbers such as Customer Lifetime Value (CLTV), Customer Acquisition Cost (CAC), Churn rate and Retention rate.

In addition, having historic data such as the purchase history (membership or upgrades), engagemnt data with the product/service sentiment metrics (complaints, feedback) etc can provide event better insights compared to using static data. 

## Use Case

* The dataset provides a plethora of data with regards to telecom customers. The data are static and it doesnt include any history data.
* The objective is to use the dataset and 'profile' the type of customer who is more likely to churn.

## Data Split

In [94]:
categ_feat = df.select_dtypes(include="object").columns.tolist()
categ_feat.remove("Churn")

df = pd.get_dummies(df, columns=categ_feat, drop_first=True)
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

#### Generate synthetic data

In [95]:
X = df.drop('Churn', axis=1)
y = df['Churn']

# sm = SMOTE(random_state=3)
# ada = ADASYN(random_state=3)
smtom = SMOTETomek(random_state=3)

# X, y = sm.fit_resample(X, y)
# X, y = ada.fit_resample(X, y)
X, y = smtom.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({0: 4765, 1: 4765})


In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

## Hyperparameter tuning - MLflow experiments

In [109]:
mlflow.set_experiment("Churn prediction")

2025/02/27 23:32:51 INFO mlflow.tracking.fluent: Experiment with name 'Churn prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/pj00/projects/Github/churn-prediction-deploy/mlruns/361393555524918426', creation_time=1740699171386, experiment_id='361393555524918426', last_update_time=1740699171386, lifecycle_stage='active', name='Churn prediction', tags={}>

In [104]:
# hyperparameter space
space={
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform ('gamma', 1,9),
    'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
    'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 50, 200, 10)
    }

In [105]:
def hyperparameter_tuning(space):
    model = XGBClassifier(n_estimators = int(space['n_estimators']), 
                          max_depth = int(space['max_depth']), 
                          gamma = space['gamma'],
                          reg_alpha = int(space['reg_alpha']), 
                          min_child_weight=space['min_child_weight'],
                          colsample_bytree=space['colsample_bytree'],
                          eval_metric=["rmse"],
                          early_stopping_rounds=10,)
    
    evaluation = [(X_train, y_train), (X_test, y_test)]

    model.fit(X_train, 
              y_train,
              eval_set=evaluation,
              verbose=False)

    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)

    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

In [106]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=30,
            trials=trials)

SCORE:                                                
0.7990556138509969                                    
SCORE:                                                                           
0.816894018887723                                                                
SCORE:                                                                           
0.8210912906610703                                                              
SCORE:                                                                           
0.8163693599160545                                                               
SCORE:                                                                           
0.8095487932843651                                                               
SCORE:                                                                           
0.8016789087093389                                                               
SCORE:                                                                 

In [107]:
best

{'colsample_bytree': 0.7875297076296592,
 'gamma': 2.5337780899710385,
 'max_depth': 14.0,
 'min_child_weight': 0.0,
 'n_estimators': 130.0,
 'reg_alpha': 53.0,
 'reg_lambda': 0.4747038661567912}