In [3]:
# Importing necessary libraries and setup.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report)

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.callbacks import EarlyStopping

# Reproducibility.
RANDOM_STATE = 42

# Path to dataset
DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [None]:
# DATA PREPROCESSING

# Loading dataset.
df = pd.read_csv(DATA_PATH)


# Basic cleanup steps & target encoding.
# Mapping target 'Churn' to binary 1/0
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


# Handling TotalCharges blanks and converting to numeric.
# In this dataset TotalCharges may be empty strings for customers with tenure=0
# Converting to numeric coercing errors to NaN, then fill with median.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(' ', np.nan), errors='coerce')
total_median = df['TotalCharges'].median()  # choosing median to be robust
df['TotalCharges'].fillna(total_median, inplace=True)


# Identifying features.
target_col = 'Churn'
# Treat object dtype (excluding customerID) as categorical / nominal.
drop_cols = ['customerID'] if 'customerID' in df.columns else []
feature_cols = [c for c in df.columns if c not in drop_cols + [target_col]]

# Separate numeric vs categorical
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Confirm numeric cols exist; if not, detect numerics automatically
numeric_cols = [c for c in numeric_cols if c in df.columns]
# For categorical, use object dtype columns or explicitly exclude numeric
categorical_cols = [c for c in feature_cols if c not in numeric_cols]


# One-Hot Encoding for nominal categorical features.
# Use drop_first=True to reduce collinearity.
df_encoded = pd.get_dummies(df.drop(columns=drop_cols), columns=categorical_cols, drop_first=True)


# Feature matrix and target vector
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]

# Min-Max Scaling for numeric features (its very important to do this on the full data BEFORE the split so transforms are consistent.)
# we scaled across the whole dataset for simplicity.
scaler = MinMaxScaler()
# finding the scaled numeric columns names in X (they exist unchanged since get_dummies didn't touch them)
scale_cols = [c for c in numeric_cols if c in X.columns]
X[scale_cols] = scaler.fit_transform(X[scale_cols])


# Train/test split (70/30) with stratification on churn.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)

print("Original training class distribution:", Counter(y_train))
print("Original test class distribution:", Counter(y_test))


Original training class distribution: Counter({0: 3622, 1: 1308})
Original test class distribution: Counter({0: 1552, 1: 561})


In [None]:
#HANDLING CLASS IMBALANCE WITH SMOTE

# Applying SMOTE on training data only. Keep the test set untouched.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE training class distribution:", Counter(y_train_smote))


After SMOTE training class distribution: Counter({0: 3622, 1: 3622})


In [6]:
#FEATURE SELECTION USING MUTUAL INFORMATION FOR LOGISTIC REGRESSION MODELING

# Computing mutual information scores on the pre-SMOTE training set.
mi_scores = mutual_info_classif(X_train, y_train, random_state=RANDOM_STATE)
mi_series = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)

# Top 10 most informative features.
top_10_features = mi_series.head(10)
print("Top 10 features by Mutual Information (pre-SMOTE training set):")
print(top_10_features)

# For later steps we need the list (as Python list).
top10_feature_list = top_10_features.index.tolist()


Top 10 features by Mutual Information (pre-SMOTE training set):
tenure                                  0.076137
Contract_Two year                       0.060981
InternetService_Fiber optic             0.054962
PaymentMethod_Electronic check          0.051235
MonthlyCharges                          0.044990
TotalCharges                            0.038035
DeviceProtection_No internet service    0.034879
TechSupport_No internet service         0.034649
StreamingMovies_No internet service     0.034344
OnlineBackup_No internet service        0.030516
dtype: float64


In [None]:
#LOGISTIC REGRESSION IMPLEMENTATION AND PARAMETER HYPERPARAMETER TUNING