In [None]:
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

!{sys.executable} -m pip install -q keras-tuner
import keras_tuner as kt

In [None]:
# Load the churn CSV directly from the ChurnNet GitHub repository.
# The URL is split across adjacent literals purely for readability.
df_raw = pd.read_csv(
    "https://raw.githubusercontent.com/mahidul5130/"
    "ChurnNet_Deep_Learning_Enhanced_Customer_Churn-Prediction_in_Telecommunication_Industry/"
    "refs/heads/main/Churn-data-UCI%20Dataset(5000).csv"
)

In [None]:
# Experiment configuration read by the cells below.

# Selects the encoding cell that runs: "Label Encoder" or "One-hot Encoder".
Encoder = 'Label Encoder'

# Resampling applied to training data: "SMOTE", "SMOTE-Tomek" or "SMOTE-Enn".
# Any other value leaves the training data un-resampled.
OverSamplingTecnique = 'SMOTE'

# Label for this experiment.
Model_Name = 'MLP-Attention'

**Label Encoding**

In [None]:
if Encoder == "Label Encoder":
    print("Applying Label Encoder")

    # Encode on a copy so df_raw is left untouched.
    df_final = df_raw.copy()
    text_data_features = ['internationalplan', 'voicemailplan']
    le = LabelEncoder()

    print('Label Encoder Transformation:')
    for i in text_data_features:
        # The single encoder is re-fitted on each column in turn.
        df_final[i] = le.fit_transform(df_final[i])
        print(i, '→', df_final[i].unique())

    # Target is the 'churn' column cast to int; features are all remaining columns.
    Y = df_final['churn'].copy().astype(int)
    X = df_final.drop(columns=['churn']).copy()

Applying Label Encoder
Label Encoder Transformation:
internationalplan → [0 1]
voicemailplan → [1 0]


**One-hot Encoding**

In [None]:
if Encoder == "One-hot Encoder":
    print("Applying One-hot Encoder")

    categorical_columns = ['internationalplan', 'voicemailplan']

    # The encoder output is densified with .toarray() so it can be stacked
    # next to the numeric columns.
    encoder = OneHotEncoder()
    encoded_features = encoder.fit_transform(df_raw[categorical_columns]).toarray()

    # Every column that is neither categorical nor the target passes through unchanged.
    numerical_features = df_raw.drop(columns=categorical_columns + ['churn'])

    # Feature matrix: one-hot columns first, then the numeric columns, as floats.
    X = np.hstack((encoded_features, numerical_features)).astype(float)

    # Target as an integer NumPy array.
    Y = df_raw['churn'].values.astype(int)

**MLP + Feature Attention Block**

In [None]:
def feature_attention(inputs, reduction_ratio=4):
    """Gate each input feature with a learned weight in (0, 1).

    A ReLU Dense bottleneck of ``max(1, n_features // reduction_ratio)`` units
    feeds a sigmoid Dense layer with one unit per input feature; the resulting
    gate is multiplied element-wise with ``inputs``.

    Args:
        inputs: Keras tensor whose last dimension is the feature axis.
        reduction_ratio: Divisor applied to the feature count to size the bottleneck.

    Returns:
        Tensor produced by ``layers.Multiply`` on ``inputs`` and the gate.
    """
    n_features = inputs.shape[-1]
    bottleneck_units = max(1, n_features // reduction_ratio)

    squeezed = layers.Dense(bottleneck_units, activation='relu')(inputs)
    gate = layers.Dense(n_features, activation='sigmoid')(squeezed)

    return layers.Multiply()([inputs, gate])


def build_mlp_att(hp, input_dim):
    """Assemble and compile the attention-gated MLP for one hyperparameter set.

    Args:
        hp: KerasTuner hyperparameter container the search space is declared on.
        input_dim: Number of input features.

    Returns:
        A compiled Keras model with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy and AUC (metric name 'auc').
    """
    # --- search space (declared in the same order as before) ---
    hidden_units = hp.Choice('hidden_units', values=[128, 256])
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.4, step=0.1)
    lr = hp.Choice('learning_rate', values=[1e-3, 5e-4, 1e-4])
    l1_reg = hp.Float('l1_regularization', min_value=1e-5, max_value=1e-3, sampling='log')
    l2_reg = hp.Float('l2_regularization', min_value=1e-5, max_value=1e-3, sampling='log')

    def make_regularizer():
        # A fresh regularizer instance per layer, all with the same strengths.
        return tf.keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg)

    inputs = layers.Input(shape=(input_dim,))

    # Feature gating comes first, then the dense stack.
    features = feature_attention(inputs, reduction_ratio=4)

    # Three blocks whose width halves each time: Dense -> BatchNorm -> ReLU -> Dropout.
    layer_widths = (hidden_units, hidden_units // 2, hidden_units // 4)
    for width in layer_widths:
        features = layers.Dense(width, kernel_regularizer=make_regularizer())(features)
        features = layers.BatchNormalization()(features)
        features = layers.ReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=make_regularizer(),
    )(features)

    model = models.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )
    return model

**Hyperparameter Tuning**

In [None]:
def tune_hyperparameters(X, Y, OverSamplingTecnique):
    """Run a one-off random hyperparameter search on a single train/validation split.

    The data is split 80/20 (stratified on ``Y``), standardised with statistics
    from the training part only, and the training part is optionally resampled.
    The validation part is never resampled, so ``val_auc`` is measured on real
    rows.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        Y: Binary target aligned with ``X``.
        OverSamplingTecnique: "SMOTE", "SMOTE-Tomek" or "SMOTE-Enn". Any other
            value leaves the training split un-resampled.

    Returns:
        The best ``HyperParameters`` object found by the tuner.
    """
    print("\n" + "="*50)
    print("PHASE 1: HYPERPARAMETER TUNING")
    print("="*50)

    # Create a single train/val split for tuning
    X_tune, X_val, Y_tune, Y_val = train_test_split(
        X, Y, test_size=0.2, random_state=42, stratify=Y
    )

    # Scale: fit on the training part only, then apply to validation.
    scaler = StandardScaler()
    X_tune = scaler.fit_transform(X_tune)
    X_val = scaler.transform(X_val)

    # Apply oversampling to training data only
    sampler_classes = {
        "SMOTE": SMOTE,
        "SMOTE-Tomek": SMOTETomek,
        "SMOTE-Enn": SMOTEENN,
    }
    sampler_cls = sampler_classes.get(OverSamplingTecnique)
    if sampler_cls is not None:
        print(f"Applying {OverSamplingTecnique} oversampling...")
        X_tune, Y_tune = sampler_cls(random_state=42).fit_resample(X_tune, Y_tune)

    input_dim = X_tune.shape[1]

    # Seeding the tuner makes the sampled trial configurations repeatable
    # across runs; the search was previously unseeded.
    tuner = kt.RandomSearch(
        hypermodel=lambda hp: build_mlp_att(hp, input_dim=input_dim),
        objective='val_auc',
        max_trials=8,
        executions_per_trial=1,
        seed=42,
        overwrite=True,
        directory='keras_tuner_dir',
        project_name='mlp_attention_global'
    )

    early_stopping = EarlyStopping(
        patience=5,
        restore_best_weights=True,
        monitor='val_loss'
    )

    print("Starting hyperparameter search")
    tuner.search(
        X_tune, Y_tune,
        epochs=30,  # upper bound per trial; early stopping may end a trial sooner
        batch_size=64,
        validation_data=(X_val, Y_val),
        callbacks=[early_stopping],
        verbose=1
    )

    # Get best hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print("\n" + "-"*50)
    print("BEST HYPERPARAMETERS FOUND:")
    print("-"*50)
    for param, value in best_hps.values.items():
        print(f"  {param}: {value}")
    print("-"*50)

    return best_hps

**K-Fold Cross-Validation With Best Parameters**

In [None]:
def kfold_with_best_params(X, Y, best_hps, OverSamplingTecnique, num_folds=10):
    """Evaluate the tuned model with stratified K-fold cross-validation.

    For every fold the training portion is split again into a fit part and a
    stratified 20% validation part *before* any resampling. The validation
    part drives early stopping and consists of real rows only. Previously
    ``validation_split=0.2`` was applied after oversampling; Keras takes the
    last rows for validation without shuffling, which after oversampling are
    mostly synthetic minority-class rows.

    Args:
        X: NumPy feature matrix, indexed positionally with the fold indices.
        Y: NumPy binary target aligned with ``X``.
        best_hps: Hyperparameters passed to ``build_mlp_att``.
        OverSamplingTecnique: "SMOTE", "SMOTE-Tomek" or "SMOTE-Enn". Any other
            value leaves the fit data un-resampled.
        num_folds: Number of stratified folds (default 10).

    Returns:
        Dict mapping "accuracy", "precision", "recall", "f1", "mcc" and "auc"
        to a list with one value per fold.
    """
    print("\n" + "="*50)
    print("PHASE 2: K-FOLD CROSS-VALIDATION")
    print("="*50)

    metrics_dict = {
        "accuracy": [], "precision": [], "recall": [],
        "f1": [], "mcc": [], "auc": []
    }

    sampler_classes = {
        "SMOTE": SMOTE,
        "SMOTE-Tomek": SMOTETomek,
        "SMOTE-Enn": SMOTEENN,
    }
    sampler_cls = sampler_classes.get(OverSamplingTecnique)

    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

    for fold_number, (train_index, test_index) in enumerate(skf.split(X, Y), start=1):
        print(f"\nFold {fold_number}/{num_folds}")

        # Release graph/layer state left over from the previous fold's model.
        tf.keras.backend.clear_session()

        # Split
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Hold out real rows for early stopping before any resampling.
        X_fit, X_val, Y_fit, Y_val = train_test_split(
            X_train, Y_train, test_size=0.2, random_state=42, stratify=Y_train
        )

        # Scale: statistics come from the fit part only.
        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X_fit)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        # Oversample the fit part only
        if sampler_cls is not None:
            X_fit, Y_fit = sampler_cls(random_state=42).fit_resample(X_fit, Y_fit)

        # Build model with best hyperparameters
        model = build_mlp_att(best_hps, input_dim=X_fit.shape[1])

        # Train with early stopping
        early_stopping = EarlyStopping(
            patience=15,
            restore_best_weights=True,
            monitor="val_loss"
        )

        model.fit(
            X_fit, Y_fit,
            epochs=100,
            batch_size=32,
            verbose=0,
            validation_data=(X_val, Y_val),
            callbacks=[early_stopping]
        )

        # Predict: flatten the (n, 1) sigmoid output to a 1-D probability vector.
        Y_prob = model.predict(X_test, verbose=0).ravel()
        # Same decision rule as np.round on [0, 1]: exactly 0.5 maps to class 0.
        Y_pred_binary = (Y_prob > 0.5).astype(int)

        # Calculate metrics
        metrics_dict["accuracy"].append(accuracy_score(Y_test, Y_pred_binary))
        metrics_dict["precision"].append(precision_score(Y_test, Y_pred_binary, zero_division=0))
        metrics_dict["recall"].append(recall_score(Y_test, Y_pred_binary, zero_division=0))
        metrics_dict["f1"].append(f1_score(Y_test, Y_pred_binary, zero_division=0))
        metrics_dict["mcc"].append(matthews_corrcoef(Y_test, Y_pred_binary))

        try:
            auc_val = roc_auc_score(Y_test, Y_prob)
        except ValueError as err:
            # Best-effort fallback kept from the original, but no longer silent
            # and no longer hiding unrelated errors.
            print(f"  AUC undefined for this fold ({err}); using 0.5")
            auc_val = 0.5
        metrics_dict["auc"].append(auc_val)

        print(f"  Acc: {metrics_dict['accuracy'][-1]:.4f} | "
              f"F1: {metrics_dict['f1'][-1]:.4f} | "
              f"AUC: {metrics_dict['auc'][-1]:.4f}")

    # Print final results
    summary_rows = [
        ("Average Test Accuracy:", "accuracy"),
        ("Average Precision:", "precision"),
        ("Average Recall:", "recall"),
        ("Average F1 Score:", "f1"),
        ("Average MCC:", "mcc"),
        ("Average AUC-ROC:", "auc"),
    ]
    print("\n" + "="*50)
    print(f"FINAL RESULTS ({num_folds}-FOLD CV)")
    print("="*50)
    for label, key in summary_rows:
        fold_values = metrics_dict[key]
        # Labels are left-padded to 23 characters so the numbers line up.
        print(f"{label:<23}{np.mean(fold_values):.4f} ± {np.std(fold_values):.4f}")
    print("="*50)

    return metrics_dict

In [None]:
# Convert features and target to NumPy arrays so the fold indices can be used
# for positional indexing. Only pandas objects are unwrapped explicitly:
# np.ndarray has no `.values`, so an array target (as produced by the one-hot
# cell) goes through np.asarray instead.
X_arr = X.to_numpy() if isinstance(X, pd.DataFrame) else np.asarray(X)
Y_arr = Y.to_numpy() if isinstance(Y, pd.Series) else np.asarray(Y)

# Phase 1: Tune hyperparameters once
best_hps = tune_hyperparameters(X_arr, Y_arr, OverSamplingTecnique)

# Phase 2: Run K-fold with best hyperparameters
final_metrics = kfold_with_best_params(X_arr, Y_arr, best_hps, OverSamplingTecnique)

Trial 8 Complete [00h 00m 25s]
val_auc: 0.9224357008934021

Best val_auc So Far: 0.932306170463562
Total elapsed time: 00h 04m 06s

--------------------------------------------------
BEST HYPERPARAMETERS FOUND:
--------------------------------------------------
  hidden_units: 256
  dropout_rate: 0.4
  learning_rate: 0.001
  l1_regularization: 8.639199562623076e-05
  l2_regularization: 1.7775537216490075e-05
--------------------------------------------------

PHASE 2: K-FOLD CROSS-VALIDATION

Fold 1/10
  Acc: 0.9460 | F1: 0.8029 | AUC: 0.9005

Fold 2/10
  Acc: 0.9540 | F1: 0.8414 | AUC: 0.9211

Fold 3/10
  Acc: 0.9660 | F1: 0.8794 | AUC: 0.9509

Fold 4/10
  Acc: 0.9340 | F1: 0.7898 | AUC: 0.9407

Fold 5/10
  Acc: 0.9260 | F1: 0.7730 | AUC: 0.9517

Fold 6/10
  Acc: 0.9260 | F1: 0.7613 | AUC: 0.9217

Fold 7/10
  Acc: 0.9560 | F1: 0.8472 | AUC: 0.9133

Fold 8/10
  Acc: 0.9240 | F1: 0.7246 | AUC: 0.8893

Fold 9/10
  Acc: 0.9480 | F1: 0.8194 | AUC: 0.9171

Fold 10/10
  Acc: 0.9500 | F1: 0.8