In [13]:
# Importing necessary libraries and setup.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report)

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.callbacks import EarlyStopping

# Reproducibility.
RANDOM_STATE = 42

# Path to dataset
DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [14]:
# DATA PREPROCESSING

# Loading dataset.
df = pd.read_csv(DATA_PATH)


# Basic cleanup steps & target encoding.
# Mapping target 'Churn' to binary 1/0
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


# Handling TotalCharges blanks and converting to numeric.
# In this dataset TotalCharges may be empty strings for customers with tenure=0
# Converting to numeric coercing errors to NaN, then fill with median.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(' ', np.nan), errors='coerce')
total_median = df['TotalCharges'].median()  # choosing median to be robust
df['TotalCharges'].fillna(total_median, inplace=True)


# Identifying features.
target_col = 'Churn'
# Treat object dtype (excluding customerID) as categorical / nominal.
drop_cols = ['customerID'] if 'customerID' in df.columns else []
feature_cols = [c for c in df.columns if c not in drop_cols + [target_col]]

# Separate numeric vs categorical
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Confirm numeric cols exist; if not, detect numerics automatically
numeric_cols = [c for c in numeric_cols if c in df.columns]
# For categorical, use object dtype columns or explicitly exclude numeric
categorical_cols = [c for c in feature_cols if c not in numeric_cols]


# One-Hot Encoding for nominal categorical features.
# Use drop_first=True to reduce collinearity.
df_encoded = pd.get_dummies(df.drop(columns=drop_cols), columns=categorical_cols, drop_first=True)


# Feature matrix and target vector
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]

# Min-Max Scaling for numeric features (its very important to do this on the full data BEFORE the split so transforms are consistent.)
# we scaled across the whole dataset for simplicity.
scaler = MinMaxScaler()
# finding the scaled numeric columns names in X (they exist unchanged since get_dummies didn't touch them)
scale_cols = [c for c in numeric_cols if c in X.columns]
X[scale_cols] = scaler.fit_transform(X[scale_cols])


# Train/test split (70/30) with stratification on churn.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)

print("Original training class distribution:", Counter(y_train))
print("Original test class distribution:", Counter(y_test))


Original training class distribution: Counter({0: 3622, 1: 1308})
Original test class distribution: Counter({0: 1552, 1: 561})


In [15]:
#HANDLING CLASS IMBALANCE WITH SMOTE

# Applying SMOTE on training data only. Keep the test set untouched.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE training class distribution:", Counter(y_train_smote))


After SMOTE training class distribution: Counter({0: 3622, 1: 3622})


In [16]:
#FEATURE SELECTION USING MUTUAL INFORMATION FOR LOGISTIC REGRESSION MODELING

# Computing mutual information scores on the pre-SMOTE training set.
mi_scores = mutual_info_classif(X_train, y_train, random_state=RANDOM_STATE)
mi_series = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)

# Top 10 most informative features.
top_10_features = mi_series.head(10)
print("Top 10 features by Mutual Information (pre-SMOTE training set):")
print(top_10_features)

# For later steps we need the list (as Python list).
top10_feature_list = top_10_features.index.tolist()


Top 10 features by Mutual Information (pre-SMOTE training set):
tenure                                  0.076137
Contract_Two year                       0.060981
InternetService_Fiber optic             0.054962
PaymentMethod_Electronic check          0.051235
MonthlyCharges                          0.044990
TotalCharges                            0.038035
DeviceProtection_No internet service    0.034879
TechSupport_No internet service         0.034649
StreamingMovies_No internet service     0.034344
OnlineBackup_No internet service        0.030516
dtype: float64


In [17]:
#LOGISTIC REGRESSION IMPLEMENTATION AND PARAMETER HYPERPARAMETER TUNING

# Prepare training data for Logistic Regression using SMOTE-balanced training set
# Using only the top 10 features found with MI
X_train_lr = X_train_smote[top10_feature_list]
X_test_lr  = X_test[top10_feature_list]  # test set still from holdout (not SMOTE)

# Defining model and hyperparameter grid.
lr = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE, max_iter=1000)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

grid = GridSearchCV(lr, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid.fit(X_train_lr, y_train_smote)

print("Best LR hyperparameters:", grid.best_params_)
best_lr = grid.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best LR hyperparameters: {'C': 100, 'penalty': 'l1'}
Best LR hyperparameters: {'C': 100, 'penalty': 'l1'}


In [18]:
# DEEP NEURAL NETWORK IMPLEMENTATION

# Preparing full training set (SMOTE-balanced) and full test set (holdout) for DNN
X_train_dnn = X_train_smote.astype('float32').values  # convert to float32 and numpy array
y_train_dnn = y_train_smote.astype('float32').values
X_test_dnn  = X_test.astype('float32').values
y_test_dnn  = y_test.astype('float32').values

input_dim = X_train_dnn.shape[1]

# Building the Keras model as requested:
def build_dnn_model(input_dim):
    model = Sequential()
    model.add(InputLayer(input_shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # binary output
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

dnn_model = build_dnn_model(input_dim)

# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the DNN
history = dnn_model.fit(
    X_train_dnn, y_train_dnn,
    validation_split=0.10,  # 10% of training used for validation
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2
)

Epoch 1/100
102/102 - 3s - 30ms/step - AUC: 0.8107 - loss: 0.5351 - val_AUC: 0.0000e+00 - val_loss: 0.6251
Epoch 2/100
102/102 - 3s - 30ms/step - AUC: 0.8107 - loss: 0.5351 - val_AUC: 0.0000e+00 - val_loss: 0.6251
Epoch 2/100
102/102 - 0s - 4ms/step - AUC: 0.8534 - loss: 0.4724 - val_AUC: 0.0000e+00 - val_loss: 0.5791
Epoch 3/100
102/102 - 0s - 4ms/step - AUC: 0.8534 - loss: 0.4724 - val_AUC: 0.0000e+00 - val_loss: 0.5791
Epoch 3/100
102/102 - 0s - 4ms/step - AUC: 0.8602 - loss: 0.4613 - val_AUC: 0.0000e+00 - val_loss: 0.5185
Epoch 4/100
102/102 - 0s - 4ms/step - AUC: 0.8602 - loss: 0.4613 - val_AUC: 0.0000e+00 - val_loss: 0.5185
Epoch 4/100
102/102 - 0s - 4ms/step - AUC: 0.8646 - loss: 0.4544 - val_AUC: 0.0000e+00 - val_loss: 0.5232
Epoch 5/100
102/102 - 0s - 4ms/step - AUC: 0.8646 - loss: 0.4544 - val_AUC: 0.0000e+00 - val_loss: 0.5232
Epoch 5/100
102/102 - 0s - 4ms/step - AUC: 0.8679 - loss: 0.4493 - val_AUC: 0.0000e+00 - val_loss: 0.5400
Epoch 6/100
102/102 - 0s - 4ms/step - AUC: 0

In [19]:
#COMPARATIVE EVALUATION OF MODELS ON TEST SET

# Logistic Regression predictions
# For LR we used only top10 features
y_pred_lr = best_lr.predict(X_test_lr)
y_proba_lr = best_lr.predict_proba(X_test_lr)[:, 1]

# LR metrics
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_prec = precision_score(y_test, y_pred_lr)
lr_rec = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_proba_lr)

# DNN predictions
y_proba_dnn = dnn_model.predict(X_test_dnn).ravel()
y_pred_dnn = (y_proba_dnn >= 0.5).astype(int)

# DNN metrics
dnn_acc = accuracy_score(y_test_dnn, y_pred_dnn)
dnn_prec = precision_score(y_test_dnn, y_pred_dnn)
dnn_rec = recall_score(y_test_dnn, y_pred_dnn)
dnn_f1 = f1_score(y_test_dnn, y_pred_dnn)
dnn_auc = roc_auc_score(y_test_dnn, y_proba_dnn)

# Confusion matrix for DNN
dnn_cm = confusion_matrix(y_test_dnn, y_pred_dnn)

# Print the confusion matrix
print("DNN Confusion Matrix (rows: true class 0/1, cols: predicted 0/1):")
print(dnn_cm)


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
DNN Confusion Matrix (rows: true class 0/1, cols: predicted 0/1):
[[1202  350]
 [ 153  408]]
DNN Confusion Matrix (rows: true class 0/1, cols: predicted 0/1):
[[1202  350]
 [ 153  408]]


In [20]:
# Building a DataFrame for a clean table
results = pd.DataFrame({
    'Model': ['Logistic Regression (top 10 features)', 'Deep Neural Network (full features)'],
    'Accuracy': [lr_acc, dnn_acc],
    'Precision': [lr_prec, dnn_prec],
    'Recall': [lr_rec, dnn_rec],
    'F1-score': [lr_f1, dnn_f1],
    'ROC-AUC': [lr_auc, dnn_auc]
})

# Round metrics for nicer display.
results[['Accuracy','Precision','Recall','F1-score','ROC-AUC']] = results[['Accuracy','Precision','Recall','F1-score','ROC-AUC']].round(4)

# Show table.
print("\n### Comparative Evaluation Results (Test Set / Holdout 30%)\n")
print(results.to_string(index=False))

# Also print best LR hyperparameters and top-10 features found earlier
print("\nBest Logistic Regression hyperparameters found by GridSearchCV:", grid.best_params_)
print("\nTop 10 features selected by Mutual Information (pre-SMOTE training set):")
for i, feat in enumerate(top10_feature_list, 1):
    print(f"{i}. {feat}")


### Comparative Evaluation Results (Test Set / Holdout 30%)

                                Model  Accuracy  Precision  Recall  F1-score  ROC-AUC
Logistic Regression (top 10 features)    0.7378     0.5039  0.8093    0.6211   0.8354
  Deep Neural Network (full features)    0.7619     0.5383  0.7273    0.6187   0.8322

Best Logistic Regression hyperparameters found by GridSearchCV: {'C': 100, 'penalty': 'l1'}

Top 10 features selected by Mutual Information (pre-SMOTE training set):
1. tenure
2. Contract_Two year
3. InternetService_Fiber optic
4. PaymentMethod_Electronic check
5. MonthlyCharges
6. TotalCharges
7. DeviceProtection_No internet service
8. TechSupport_No internet service
9. StreamingMovies_No internet service
10. OnlineBackup_No internet service
