In [69]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [70]:
# Read the file

df = pd.read_csv('Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [71]:
df.Churn.value_counts()/len(df)*100

Churn
No     73.463013
Yes    26.536987
Name: count, dtype: float64

### **Churn Rate: 26.53%

- Which means, 26.53% of the customers churn out of this telecom company


In [72]:
# Divide data into X and y - X (Independent feature), y (Dependent variable)

# Define y (target variable)
y = df['Churn']

# Define X (features) by dropping 'customerID' and 'Churn'
X = df.drop(columns=['customerID', 'Churn'])

# Optional: Verify the shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (7043, 19)
Shape of y: (7043,)


# Train Test Split

In [73]:
X = pd.get_dummies(X, drop_first=True)
y = df['Churn'].map({'No':0, 'Yes':1})

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Model Building

In [75]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [76]:
y_pred_dt = model_dt.predict(X_test)

In [77]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1044
           1       0.56      0.46      0.51       365

    accuracy                           0.77      1409
   macro avg       0.69      0.67      0.68      1409
weighted avg       0.75      0.77      0.76      1409



In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# **Initial Insights**

- Base model has a accuracy of 76% which is not reliable because of the imbalance dataset
- TotalCharges needs to be a float/int type (Data Cleaning)
- Need to perform Feature Scaling

# Data Cleaning

In [79]:
telco_data = df.copy()

In [80]:
telco_data.TotalCharges = pd.to_numeric(telco_data.TotalCharges, errors='coerce')

In [81]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [82]:
telco_data.loc[telco_data['TotalCharges'].isnull() == True]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [83]:
telco_data.dropna(how='any', inplace=True)

In [84]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [85]:
# Define the bin edges (0 to 72, stepping by 12 months)
bins = [0, 12, 24, 36, 48, 60, 72]

# Define the labels for each bin
labels = ['1-12', '13-24', '15-36', '36-48', '49-60', '61-72']

# Create the new binned column
telco_data['tenure_bin'] = pd.cut(telco_data['tenure'], bins=bins, labels=labels, include_lowest=True)

# Optional: Check the result
print(telco_data[['tenure', 'tenure_bin']].head(10))
print(telco_data['tenure_bin'].value_counts().sort_index())

   tenure tenure_bin
0       1       1-12
1      34      15-36
2       2       1-12
3      45      36-48
4       2       1-12
5       8       1-12
6      22      13-24
7      10       1-12
8      28      15-36
9      62      61-72
tenure_bin
1-12     2175
13-24    1024
15-36     832
36-48     762
49-60     832
61-72    1407
Name: count, dtype: int64


In [86]:
# Divide data into X and y - X (Independent feature), y (Dependent variable)

# Define y (target variable)
y = telco_data['Churn']

# Define X (features) by dropping 'customerID' and 'Churn'
X = telco_data.drop(columns=['customerID', 'Churn', 'tenure'])

# Optional: Verify the shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (7032, 19)
Shape of y: (7032,)


In [87]:
X = pd.get_dummies(X, drop_first=True)
y = telco_data['Churn'].map({'No':0, 'Yes':1})

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Feature Scaling

In [89]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [90]:
model_dt2 = DecisionTreeClassifier()
model_dt2.fit(X_train, y_train)

y_pred_dt2 = model_dt2.predict(X_test)

In [91]:
print(classification_report(y_test, y_pred_dt2))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80      1018
           1       0.48      0.47      0.48       389

    accuracy                           0.71      1407
   macro avg       0.64      0.64      0.64      1407
weighted avg       0.71      0.71      0.71      1407



# Feature Scaling - MinmaxScaler

In [92]:
mms = MinMaxScaler()
X_train_mms = mms.fit_transform(X_train)
X_test_mms = mms.transform(X_test)

In [93]:
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_mms, y_train)

y_pred_dt3 = model_dt3.predict(X_test_mms)

In [94]:
print(classification_report(y_test, y_pred_dt3))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1018
           1       0.48      0.48      0.48       389

    accuracy                           0.71      1407
   macro avg       0.64      0.64      0.64      1407
weighted avg       0.71      0.71      0.71      1407



### SMOTEENN () [UpSampling + ENN]

In [95]:
from imblearn.combine import SMOTEENN

In [96]:
sm = SMOTEENN()
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

In [97]:
model_dt_smoteenn = DecisionTreeClassifier()
model_dt_smoteenn.fit(X_train_resampled, y_train_resampled)

y_pred_dt_smoteenn = model_dt_smoteenn.predict(X_test)

In [98]:
print(classification_report(y_test, y_pred_dt_smoteenn))

              precision    recall  f1-score   support

           0       0.89      0.71      0.79      1018
           1       0.51      0.78      0.61       389

    accuracy                           0.73      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.79      0.73      0.74      1407



In [99]:
model_rf_smoteenn = RandomForestClassifier()
model_rf_smoteenn.fit(X_train_resampled, y_train_resampled)

y_pred_rf_smoteenn = model_rf_smoteenn.predict(X_test)

In [100]:
print(classification_report(y_test, y_pred_rf_smoteenn))

              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1018
           1       0.52      0.81      0.64       389

    accuracy                           0.74      1407
   macro avg       0.72      0.76      0.72      1407
weighted avg       0.80      0.74      0.76      1407



### XGBoost without SMOTEENN

In [101]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
model_xgb = XGBClassifier(random_state=42) # random_state for reproducibility
model_xgb.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = model_xgb.predict(X_test)

In [102]:
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1018
           1       0.65      0.56      0.60       389

    accuracy                           0.79      1407
   macro avg       0.74      0.72      0.73      1407
weighted avg       0.79      0.79      0.79      1407



### XGBoost with SMOTEENN

In [103]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
model_xgb_smoteenn = XGBClassifier(random_state=42) # random_state for reproducibility
model_xgb_smoteenn.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_xgb_smoteenn = model_xgb_smoteenn.predict(X_test)

In [104]:
print(classification_report(y_test, y_pred_xgb_smoteenn))

              precision    recall  f1-score   support

           0       0.90      0.73      0.81      1018
           1       0.53      0.80      0.64       389

    accuracy                           0.75      1407
   macro avg       0.72      0.76      0.72      1407
weighted avg       0.80      0.75      0.76      1407



### XGBoost with SMOTE

In [105]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from collections import Counter

# Check class distribution before SMOTE
print("Before SMOTE:", Counter(y_train))

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("After SMOTE:", Counter(y_train_smote))

# Initialize and train the XGBoost model on resampled data
model_xgb_smote = XGBClassifier(random_state=42)
model_xgb_smote.fit(X_train_smote, y_train_smote)

# Make predictions on the original test set (unchanged)
y_pred_xgb_smote = model_xgb_smote.predict(X_test)


Before SMOTE: Counter({0: 4145, 1: 1480})
After SMOTE: Counter({0: 4145, 1: 4145})


In [106]:
print(classification_report(y_test, y_pred_xgb_smote))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1018
           1       0.61      0.58      0.59       389

    accuracy                           0.78      1407
   macro avg       0.73      0.72      0.72      1407
weighted avg       0.78      0.78      0.78      1407



### XGBoost with ADASYN

In [107]:
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
from collections import Counter

# Check original distribution
print("Before ADASYN:", Counter(y_train))

# Apply ADASYN to training data only
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# Check new distribution
print("After ADASYN:", Counter(y_train_adasyn))

# Train XGBoost on ADASYN-resampled data
model_xgb_adasyn = XGBClassifier(random_state=42)
model_xgb_adasyn.fit(X_train_adasyn, y_train_adasyn)

# Predict on the original test set
y_pred_xgb_adasyn =model_xgb_adasyn.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred_xgb_adasyn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb_adasyn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_adasyn))

Before ADASYN: Counter({0: 4145, 1: 1480})
After ADASYN: Counter({0: 4145, 1: 4130})
Accuracy: 0.7860696517412935

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      1018
           1       0.62      0.58      0.60       389

    accuracy                           0.79      1407
   macro avg       0.73      0.72      0.73      1407
weighted avg       0.78      0.79      0.78      1407


Confusion Matrix:
 [[879 139]
 [162 227]]


In [108]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate scale_weight (ratio of negative to possitive class)
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# Train XGBoost with scale_pos_weight
model_xgb_weighted = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42, 
    eval_metric='logloss' # avoids warnings
    )

model_xgb_weighted.fit(X_train, y_train)
y_pred_weighted = model_xgb_weighted.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_weighted))
print("\nclassifciation Report:\n", classification_report(y_test, y_pred_weighted))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_weighted))

scale_pos_weight: 2.80
Accuracy: 0.7633262260127932

classifciation Report:
               precision    recall  f1-score   support

           0       0.87      0.79      0.83      1018
           1       0.56      0.68      0.62       389

    accuracy                           0.76      1407
   macro avg       0.71      0.74      0.72      1407
weighted avg       0.78      0.76      0.77      1407


Confusion Matrix:
 [[808 210]
 [123 266]]


In [109]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Calculate the weight for the positive (minority) class
# Common approach: weight_positive = negative_count / positive_count
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
weight_positive = negative_count / positive_count if positive_count > 0 else 1
print(f"Negative class count: {negative_count}")
print(f"Positive class count: {positive_count}")
print(f"Weight for positive class: {weight_positive:.2f}")

# Create sample weights: higher weight for minority class
sample_weight = np.where(y_train == 1, weight_positive, 1.0)

# Train AdaBoost with sample weights
model_ada_weighted = AdaBoostClassifier(
    n_estimators=50,  # default, you can tune
    random_state=42
)

model_ada_weighted.fit(X_train, y_train, sample_weight=sample_weight)

# Predict and evaluate
y_pred_ada_weighted = model_ada_weighted.predict(X_test)

print("AdaBoost with sample weighting - Accuracy:", accuracy_score(y_test, y_pred_ada_weighted))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ada_weighted))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_ada_weighted))

Negative class count: 4145
Positive class count: 1480
Weight for positive class: 2.80
AdaBoost with sample weighting - Accuracy: 0.7540867093105899

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.73      0.81      1018
           1       0.54      0.83      0.65       389

    accuracy                           0.75      1407
   macro avg       0.73      0.78      0.73      1407
weighted avg       0.81      0.75      0.77      1407


Confusion Matrix:
 [[740 278]
 [ 68 321]]


### **Hyper Parameter Optimization**

In [110]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'scale_pos_weight': [1, scale_pos_weight] # try both balanced and weighted
}

xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Use 'f1' or 'recall' scoring since churn is imbalanced
search = RandomizedSearchCV(
    xgb,
    param_grid,
    n_iter=50,
    cv=5,
    scoring='f1',  # or 'recall' if catching churn is priority
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
print("Best CV F1 score:", search.best_score_)

# Use best model
best_model = search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("\nTest Classification Report:\n", classification_report(y_test, y_pred_best))

Best parameters: {'subsample': 0.8, 'scale_pos_weight': np.float64(2.800675675675676), 'n_estimators': 400, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best CV F1 score: 0.6256985293859109

Test Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.74      0.82      1018
           1       0.55      0.82      0.66       389

    accuracy                           0.76      1407
   macro avg       0.73      0.78      0.74      1407
weighted avg       0.81      0.76      0.77      1407



In [111]:
# Save this model

import joblib

# Save the best model to a .pkl file
joblib.dump(best_model, 'best_xgboost_churn_model.pkl')

print("Best model successfully saved as 'best_xgboost_churn_model.pkl'")

Best model successfully saved as 'best_xgboost_churn_model.pkl'


### Trying out few more techniques

In [112]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=300,
    max_depth=6
)

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.92      0.73      0.81      1018
           1       0.54      0.83      0.65       389

    accuracy                           0.75      1407
   macro avg       0.73      0.78      0.73      1407
weighted avg       0.81      0.75      0.77      1407



In [113]:
from catboost import CatBoostClassifier

model_cat = CatBoostClassifier(
    auto_class_weights='Balanced',  # or 'SqrtBalanced'
    verbose=0,
    random_state=42
)
model_cat.fit(X_train, y_train)
y_pred_cat = model_cat.predict(X_test)
print(classification_report(y_test, y_pred_cat))

              precision    recall  f1-score   support

           0       0.90      0.78      0.83      1018
           1       0.57      0.77      0.65       389

    accuracy                           0.78      1407
   macro avg       0.73      0.77      0.74      1407
weighted avg       0.81      0.78      0.78      1407



In [114]:
from lightgbm import LGBMClassifier

model_lgb = LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42
)
model_lgb.fit(X_train, y_train)
y_pred_lgb = model_lgb.predict(X_test)
print(classification_report(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 1480, number of negative: 4145
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000880 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 606
[LightGBM] [Info] Number of data points in the train set: 5625, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.263111 -> initscore=-1.029861
[LightGBM] [Info] Start training from score -1.029861
              precision    recall  f1-score   support

           0       0.89      0.76      0.82      1018
           1       0.55      0.76      0.64       389

    accuracy                           0.76      1407
   macro avg       0.72      0.76      0.73      1407
weighted avg       0.80      0.76      0.77      1407





### **Optuna Fine Tuning**

In [115]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),  # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2), # L2 regularization
        'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [1, scale_pos_weight]),
        'random_state': 42,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)

    # Use F1 for minority class as scorer
    f1_scorer = make_scorer(f1_score, average='binary', pos_label=1)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring=f1_scorer, n_jobs=-1).mean()

    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  # Increase trials for better results

print("Best parameters:", study.best_params)
print("Best CV F1 score:", study.best_value)

# Train final model with best params
best_model_optuna = XGBClassifier(**study.best_params)
best_model_optuna.fit(X_train, y_train)
y_pred_optuna = best_model_optuna.predict(X_test)
print(classification_report(y_test, y_pred_optuna))

[32m[I 2026-02-07 21:13:38,233][0m A new study created in memory with name: no-name-cfc0d1d8-31c8-4558-9d78-3ef65daeea6a[0m
[32m[I 2026-02-07 21:13:40,349][0m Trial 0 finished with value: 0.5631112721892338 and parameters: {'max_depth': 9, 'learning_rate': 0.10681983161989432, 'n_estimators': 840, 'subsample': 0.780614953531384, 'colsample_bytree': 0.667472038016988, 'min_child_weight': 8, 'gamma': 0.4755029158282375, 'reg_alpha': 0.3259847594949741, 'reg_lambda': 1.8738536787984807, 'scale_pos_weight': np.float64(2.800675675675676)}. Best is trial 0 with value: 0.5631112721892338.[0m
[32m[I 2026-02-07 21:13:41,557][0m Trial 1 finished with value: 0.5475574899202104 and parameters: {'max_depth': 3, 'learning_rate': 0.16013107504618393, 'n_estimators': 671, 'subsample': 0.6027910032327594, 'colsample_bytree': 0.8589800915727963, 'min_child_weight': 10, 'gamma': 0.1618927058309687, 'reg_alpha': 0.6507157346751621, 'reg_lambda': 0.5579807807259112, 'scale_pos_weight': 1}. Best is 

Best parameters: {'max_depth': 3, 'learning_rate': 0.014878616418793669, 'n_estimators': 352, 'subsample': 0.667275736143591, 'colsample_bytree': 0.9227396394027141, 'min_child_weight': 3, 'gamma': 0.041952592371565435, 'reg_alpha': 0.19438883640380047, 'reg_lambda': 1.427998949321776, 'scale_pos_weight': np.float64(2.800675675675676)}
Best CV F1 score: 0.6305019369172468
              precision    recall  f1-score   support

           0       0.92      0.74      0.82      1018
           1       0.55      0.82      0.66       389

    accuracy                           0.76      1407
   macro avg       0.73      0.78      0.74      1407
weighted avg       0.81      0.76      0.77      1407



In [116]:
# Save this model

import joblib

# Save the best model to a .pkl file
joblib.dump(best_model_optuna, 'best_optuna_churn_model.pkl')

print("Best model successfully saved as 'best_optuna_churn_model.pkl'")

Best model successfully saved as 'best_optuna_churn_model.pkl'


In [117]:
# Save this model

import joblib

# Save the best model to a .pkl file
joblib.dump(model_ada_weighted, 'ada_boost_churn_model.pkl')

print("Best model successfully saved as 'ada_boost_churn_model.pkl'")

Best model successfully saved as 'ada_boost_churn_model.pkl'
