In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## Data Preprocessing and Splitting

In [52]:
df = pd.read_csv("train_ready_churn_dataset.csv")

In [53]:
df.head(3)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,True,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.5,0,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,1,False,True,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False


In [54]:
# converts all the boolean values to 0 and 1
for col in df.select_dtypes(include=['bool']).columns:
    df[col] = df[col].astype(int)

In [55]:
df.head(3)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0


In [56]:
# identifying target variable and predictor variables
X = df.drop('Churn', axis=1)
y = df['Churn']

In [57]:
from sklearn.preprocessing import StandardScaler

In [58]:
# standardizing 'MonthlyCharges' and 'TotalCharges' column
convert_cols = ['MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
X[convert_cols] = scaler.fit_transform(X[convert_cols])

In [59]:
X.sample(3)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
6819,0,-1.482464,-0.705415,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
6423,0,-0.126254,-0.067436,0,1,0,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0
1634,1,-0.656439,-0.667384,0,1,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [60]:
# data splitting 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# stratify=y ensures the same proportion of Churn and Non-Churn in train set and test set

In [61]:
print("Data Preprocessing Complete.\n")
print(f"Training set shape of X : {X_train.shape}")
print(f"Training set shape of y : {y_train.shape}")
print(f"Testing set shape of X  : {X_test.shape}")
print(f"Testing set shape of y  : {y_test.shape}")

Data Preprocessing Complete.

Training set shape of X : (5625, 50)
Training set shape of y : (5625,)
Testing set shape of X  : (1407, 50)
Testing set shape of y  : (1407,)


## SMOTE ENN Initialization

In [62]:
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import json

## Decision Tree Model Training

In [63]:
# initialize smooteenn
smoteenn = SMOTEENN(random_state=42)

In [64]:
# smoteenn on training dataset
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Train the dt model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [74]:
# evaluation of the dt model
y_pred_dt = dt_model.predict(X_test)
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
report_dt = classification_report(y_test, y_pred_dt)
dt_model_acc = dt_model.score(X_test,y_test)


print("\n--- Decision Tree Model (No Tuning) ---")
print(f'Accuracy: {dt_model_acc:.4f}')
print("\nConfusion Matrix:\n", conf_matrix_dt)
print("\nClassification Report:\n", report_dt)


--- Decision Tree Model (No Tuning) ---
Accuracy: 0.7050

Confusion Matrix:
 [[699 334]
 [ 81 293]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.68      0.77      1033
           1       0.47      0.78      0.59       374

    accuracy                           0.71      1407
   macro avg       0.68      0.73      0.68      1407
weighted avg       0.78      0.71      0.72      1407



In [16]:
# writing classifiaction report of dt based model to json
with open('Smote_dt_C-Report.json','w') as f:
    json.dump(report_dt, f, indent=4)

In [17]:
# reading classifiaction report of dt based model from json
with open('Smote_dt_C-Report.json','r') as fr:
    dt_c_report = json.load(fr)

In [18]:
print(dt_c_report)

              precision    recall  f1-score   support

           0       0.90      0.68      0.77      1033
           1       0.47      0.78      0.59       374

    accuracy                           0.71      1407
   macro avg       0.68      0.73      0.68      1407
weighted avg       0.78      0.71      0.72      1407



### Decion Tree Model Training with Hyperparameter Tuning

In [19]:
# applying smoteenn on DTC
pipeline_dt = Pipeline([
    ('smoteenn', smoteenn),
    ('dt', DecisionTreeClassifier(random_state=42))
])

In [20]:
# define the parameter grid
param_grid_dt = {
    'dt__max_depth': [3, 5, 7, 10],
    'dt__min_samples_leaf': [5, 10],
    'dt__criterion': ['gini', 'entropy']
}

In [21]:
# initialize and train GridSearchCV
grid_search_dt = GridSearchCV(estimator=pipeline_dt,
                            param_grid=param_grid_dt,
                            scoring='f1',
                            cv=5,
                            n_jobs=4,
                            verbose=1)

In [22]:
# fit trainsets to GridsearchCV
grid_search_dt.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'dt__criterion': ['gini', 'entropy'], 'dt__max_depth': [3, 5, ...], 'dt__min_samples_leaf': [5, 10]}"
,scoring,'f1'
,n_jobs,4
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,sampling_strategy,'auto'
,random_state,42
,smote,
,enn,
,n_jobs,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,7
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [73]:
# evaluation of the tuned model
best_dt_tuned = grid_search_dt.best_estimator_
y_pred_tuned_dt = best_dt_tuned.predict(X_test)
conf_matrix_tuned_dt = confusion_matrix(y_test, y_pred_tuned_dt)
report_tuned_dt = classification_report(y_test, y_pred_tuned_dt)
tuned_dt_acc = best_dt_tuned.score(X_test,y_test)

print("--- Decision Tree Model (Tuning) ---")
print(f"Best Parameters: {grid_search_dt.best_params_}")
print(f'Accuracy: {tuned_dt_acc:.4f}')
print("\nConfusion Matrix:\n", conf_matrix_tuned_dt)
print("\nClassification Report:\n", report_tuned_dt)

--- Decision Tree Model (Tuning) ---
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 7, 'dt__min_samples_leaf': 10}
Accuracy: 0.6965

Confusion Matrix:
 [[676 357]
 [ 70 304]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.65      0.76      1033
           1       0.46      0.81      0.59       374

    accuracy                           0.70      1407
   macro avg       0.68      0.73      0.67      1407
weighted avg       0.79      0.70      0.71      1407



In [24]:
# writing classifiaction report of dt tuned model to json
with open('Smote_tuned_dt_C-Report.json','w') as f:
    json.dump(report_tuned_dt, f, indent=4)

In [25]:
# reading classifiaction report of dt tuned model from json
with open('Smote_tuned_dt_C-Report.json','r') as fr:
    dt_tuned_c_report = json.load(fr)

In [26]:
print(dt_tuned_c_report)

              precision    recall  f1-score   support

           0       0.91      0.65      0.76      1033
           1       0.46      0.81      0.59       374

    accuracy                           0.70      1407
   macro avg       0.68      0.73      0.67      1407
weighted avg       0.79      0.70      0.71      1407



## Neural Network-base Model Training

In [27]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

In [28]:
# normalize data for NN training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
import warnings
import os

warnings.filterwarnings("ignore")

In [30]:
# apply smoteenn to the scaled training data (can reuse from step 1)
X_resampled, y_resampled = smoteenn.fit_resample(X_train_scaled, y_train)

# 2b. Define a simple, fixed architecture model
nn_baseline = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

nn_baseline.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

In [31]:
# train the NN model
history_baseline = nn_baseline.fit(X_resampled, y_resampled,
                                   epochs=50,
                                   batch_size=32,
                                   verbose=0)

In [78]:
# evaluation the NN Model
loss_baseline, accuracy_baseline_nn = nn_baseline.evaluate(X_test_scaled, y_test, verbose=0)
y_pred_baseline_nn = (nn_baseline.predict(X_test_scaled) > 0.5).astype("int32")

print("\n--- Baseline Neural Network Model (with SMOTEENN, No Tuning) ---")
print(f"Accuracy: {accuracy_baseline_nn:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_baseline_nn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_baseline_nn))

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

--- Baseline Neural Network Model (with SMOTEENN, No Tuning) ---
Accuracy: 0.7136

Confusion Matrix:
 [[708 325]
 [ 78 296]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.69      0.78      1033
           1       0.48      0.79      0.59       374

    accuracy                           0.71      1407
   macro avg       0.69      0.74      0.69      1407
weighted avg       0.79      0.71      0.73      1407



In [33]:
# writing classifiaction report of base nn model to json
with open('Smote_nn_C-Report.json', 'w') as f:
    json.dump(classification_report(y_test, y_pred_baseline_nn), f, indent=4)

In [34]:
# reading classifiaction report of base nn model from json
with open('Smote_nn_C-Report.json', 'r') as fr:
    nn_c_report=json.load(fr)

In [35]:
print(nn_c_report)

              precision    recall  f1-score   support

           0       0.90      0.69      0.78      1033
           1       0.48      0.79      0.59       374

    accuracy                           0.71      1407
   macro avg       0.69      0.74      0.69      1407
weighted avg       0.79      0.71      0.73      1407



### Neural Network-base Model Training with Hyperparameter Tuning

In [36]:
# define model builder function for KerasTuner
def build_model(hp):
    
    # tuning 1 to 3 hidden layers
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(X_train_scaled.shape[1],)))

    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(keras.layers.Dense(
            # tuning each unit per layer
            units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32),
            activation='relu'
        ))
        # tune the dropout rate
        model.add(keras.layers.Dropout(hp.Float(f'dropout_{i}', 0.1, 0.5, step=0.1)))

    # output layer
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # tune the learning rate
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [37]:
# initializing the tuner (RandomSearch)
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10, # Number of models to test
    executions_per_trial=1,
    directory='nn_tuning',
    project_name='churn_prediction'
)

Reloading Tuner from nn_tuning\churn_prediction\tuner0.json


In [38]:
# hyperparameter search
tuner.search(X_resampled, y_resampled,
             epochs=20,
             validation_split=0.2,
             verbose=1)

In [39]:
# best model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_nn_tuned = tuner.hypermodel.build(best_hps)

In [42]:
# train the best model on the full resampled dataset
history_tuned = best_nn_tuned.fit(X_resampled, y_resampled,
                                  epochs=50, # Final training epoch
                                  batch_size=32,
                                  verbose=1)

Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9165 - loss: 0.2166
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9223 - loss: 0.2004
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9377 - loss: 0.1647
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9335 - loss: 0.1698
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9406 - loss: 0.1536
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9493 - loss: 0.1295
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9468 - loss: 0.1316
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9530 - loss: 0.1226
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━

In [76]:
# evaluation the Tuned Model
loss_tuned, accuracy_tuned_nn = best_nn_tuned.evaluate(X_test_scaled, y_test, verbose=1)
y_pred_tuned_nn = (best_nn_tuned.predict(X_test_scaled) > 0.5).astype("int32")

print("--- Tuned Neural Network Model (with SMOTEENN) ---")
print("Best Hyperparameters:", best_hps.values)
print(f"Test Accuracy: {accuracy_tuned_nn:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_tuned_nn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_tuned_nn))

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7008 - loss: 4.0639
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
--- Tuned Neural Network Model (with SMOTEENN) ---
Best Hyperparameters: {'num_layers': 1, 'units_0': 320, 'dropout_0': 0.1, 'learning_rate': 0.01, 'units_1': 512, 'dropout_1': 0.4, 'units_2': 160, 'dropout_2': 0.2}
Test Accuracy: 0.7008

Confusion Matrix:
 [[700 333]
 [ 88 286]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.68      0.77      1033
           1       0.46      0.76      0.58       374

    accuracy                           0.70      1407
   macro avg       0.68      0.72      0.67      1407
weighted avg       0.78      0.70      0.72      1407



In [44]:
# writing classifiaction report of nn tuned model to json
with open('Smote_tuned_nn_C-Report.json', 'w') as f:
    json.dump(classification_report(y_test, y_pred_tuned_nn), f, indent=4)

In [45]:
# reading classifiaction report of nn tuned model from json
with open('Smote_tuned_nn_C-Report.json', 'r') as fr:
    nn_tuned_c_report=json.load(fr)

In [46]:
print(nn_tuned_c_report)

              precision    recall  f1-score   support

           0       0.89      0.68      0.77      1033
           1       0.46      0.76      0.58       374

    accuracy                           0.70      1407
   macro avg       0.68      0.72      0.67      1407
weighted avg       0.78      0.70      0.72      1407



## Compare Accuracy of the Models

### Decsion Tree base model and Decision Tree tuned model

In [102]:
print('Accuracy Comparison between Decsion Tree base model and Decision Tree tuned model\n')
print(f'Decision Tree Base Model  : {dt_model_acc:.4f}')
print(f'Decision Tree Tuned Model : {tuned_dt_acc:.4f}')
print('--------------------------|--------')
difference_1 = abs(dt_model_acc - tuned_dt_acc)
print(f'Difference between Models : {difference_1:.4f}')

Accuracy Comparison between Decsion Tree base model and Decision Tree tuned model

Decision Tree Base Model  : 0.7050
Decision Tree Tuned Model : 0.6965
--------------------------|--------
Difference between Models : 0.0085


### Neural Network base model and Neural Network tuned model

In [101]:
print('Accuracy Comparison between Neural Network base model and Neural Network tuned model\n')
print(f'Neural Network Base Model  : {accuracy_baseline_nn:.4f}')
print(f'Neural Network Tuned Model : {accuracy_tuned_nn:.4f}')
print('---------------------------|--------')
difference_2 = abs(accuracy_baseline_nn - accuracy_tuned_nn)
print(f'Difference between Models  : {difference_2:.4f}')

Accuracy Comparison between Neural Network base model and Neural Network tuned model

Neural Network Base Model  : 0.7136
Neural Network Tuned Model : 0.7008
---------------------------|--------
Difference between Models  : 0.0128


###  Decsion Tree base model and Neural Network base model

In [98]:
print('Accuracy Comparison between Decsion Tree base model and Neural Network base model\n')
print(f'Decsion Tree Base Model   : {dt_model_acc:.4f}')
print(f'Neural Network Base Model : {accuracy_baseline_nn:.4f}')
print('--------------------------|--------')
difference_3 = abs(dt_model_acc - accuracy_baseline_nn)
print(f'Difference between Models : {difference_3:.4f}')

Accuracy Comparison between Decsion Tree base model and Neural Network base model

Decsion Tree Base Model   : 0.7050
Neural Network Base Model : 0.7136
--------------------------|--------
Difference between Models : 0.0085


### Decsion Tree tuned model and Neural Network tuned model

In [100]:
print('Accuracy Comparison between Decsion Tree tuned model and Neural Network tuned model\n')
print(f'Decsion Tree Tuned Model   : {tuned_dt_acc:.4f}')
print(f'Neural Network Tuned Model : {accuracy_tuned_nn:.4f}')
print('---------------------------|--------')
difference_4 = abs(tuned_dt_acc - accuracy_tuned_nn)
print(f'Difference between Models  : {difference_4:.4f}')

Accuracy Comparison between Decsion Tree tuned model and Neural Network tuned model

Decsion Tree Tuned Model   : 0.6965
Neural Network Tuned Model : 0.7008
---------------------------|--------
Difference between Models  : 0.0043


In [115]:
# differences dictionary
dif_dict = {difference_1: 'DT-Base | DT-Tuned', 
           difference_2: 'NN-Base | NN-Tuned',
           difference_3: 'DT-Base | NN-Base',
           difference_4: 'DT-Tuned | NN-Tuned'}

### Accuracy Difference Ascending Sort 

In [112]:
differences = [difference_1, difference_2, difference_3, difference_4]
differences.sort(reverse=True)
print([f'{x:.4f}' for x in differences])

['0.0128', '0.0085', '0.0085', '0.0043']
