In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.subplots import make_subplots

In [None]:
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

Evaluation uses relevant metrics and applies (repeated/nested) cross validation appropriately. Hyperparameter tuning is done, and models are clearly compared and interpreted.

# Loading data

In [None]:
# Use the white Plotly theme for every figure created from here on.
pio.templates.default = "plotly_white"

# Read the Telco customer-churn CSV into the working DataFrame.
df = pd.read_csv('./dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Data description

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# Print dtypes and non-null counts, then display the (rows, columns) shape.
df.info()
df.shape

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Column names as a NumPy array.
df.columns.values

In [None]:
# customerID is dropped here; the remaining columns are kept unchanged.
df = df.drop(columns='customerID')
df.head()

# Missing value 

In [None]:
# Convert TotalCharges to a numeric dtype; values that cannot be parsed become NaN.
df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors="coerce"))

# Number of missing values per column after the conversion.
df.isna().sum()

In [None]:
# Correlation of missingness indicators (1 = missing, 0 = present).
# A column without any missing value has a constant indicator, so its correlation
# with everything else is undefined (NaN). Only columns that actually contain
# missing values are kept, which avoids a heatmap that is almost entirely NaN.
missing_flags = df.isna()
missing_flags = missing_flags.loc[:, missing_flags.any()]
missing_corr = missing_flags.astype(int).corr()

if missing_corr.empty:
    # Nothing to plot once the data has been cleaned (e.g. when this cell is re-run later).
    print('No missing values found.')
else:
    sns.heatmap(missing_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation of Missingness')
    plt.show()

In [None]:
# Show the rows whose TotalCharges is missing.
df[df['TotalCharges'].isna()]

The rows with a missing `TotalCharges` all have `tenure` equal to 0, so the rows with `tenure == 0` are dropped.

In [None]:
# Remove, in place, every row whose tenure is 0.
df.drop(index=df.loc[df['tenure'] == 0].index, inplace=True)

Impute any remaining missing `TotalCharges` values with the column mean.

In [None]:
# Impute only TotalCharges with its own mean. A frame-wide fillna would write the
# TotalCharges mean into missing cells of every other column as well.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

In [None]:
# Re-count the missing values per column after the cleaning steps above.
df.isna().sum()

In [None]:
# Turn the 0/1 SeniorCitizen flag into "No"/"Yes" labels.
# replace() leaves values that are already "No"/"Yes" untouched, so re-running this
# cell is harmless; map() would turn them into NaN on a second run.
df["SeniorCitizen"] = df["SeniorCitizen"].replace({0: "No", 1: "Yes"})

In [None]:
# Summary statistics of the InternetService column.
df["InternetService"].describe()

In [None]:
# Numeric columns; this list is reused when plotting standardized distributions below.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Summary statistics for the numeric columns.
df[numerical_cols].describe()

# Visualization

## Gender and churn distribution

In [None]:
# Side-by-side donut charts: gender split on the left, churn split on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'} for _ in range(2)]])

# Build each pie with plotly express and move its single trace into the grid.
fig_gender = px.pie(df, names='gender', title='Gender')
fig.add_trace(fig_gender.data[0], row=1, col=1)
fig_churn = px.pie(df, names='Churn', title='Churn')
fig.add_trace(fig_churn.data[0], row=1, col=2)

# The hole turns the pies into donuts; hover shows label, percentage and trace name.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name", textfont_size=16)

# Text annotations placed over each donut.
centre_labels = [
    {'text': 'Gender', 'x': 0.18, 'y': 0.5, 'font_size': 20, 'showarrow': False},
    {'text': 'Churn', 'x': 0.82, 'y': 0.5, 'font_size': 20, 'showarrow': False},
]
fig.update_layout(
    title_text="Gender and Churn Distributions",
    title_x=0.5,
    annotations=centre_labels,
    font={'size': 14},
)

fig.show()

## Churn rate by gender

In [None]:
# Gender counts among customers whose Churn value is "No".
df[df["Churn"] == "No"]["gender"].value_counts()

In [None]:
# Row-normalised crosstab: for each churn group (row), the percentage of each gender.
churn_gender = pd.crosstab(df['Churn'], df['gender'], normalize='index') * 100

# Stacked bars: one bar per churn group, split by gender.
ax = churn_gender.plot(kind='bar', stacked=True, color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set_ylabel('Percentage (%)')
# The bars show the gender mix inside each churn group (rows sum to 100%),
# not a churn rate per gender, so the title says exactly that.
ax.set_title('Gender Composition by Churn Status')
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', label_type='center')
plt.legend(title='Gender')
plt.show()

In [None]:
# Absolute customer counts for every (churn, gender) combination.
churn_gender_counts = pd.crosstab(df['Churn'], df['gender'])

# Grouped bars: one group per churn value, one bar per gender.
ax = churn_gender_counts.plot.bar(color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set_title('Number of Customers by Churn and Gender')
ax.set_ylabel('Count')
# Write the count on top of each bar.
for bars in ax.containers:
    ax.bar_label(bars)
ax.legend(title='Gender')
plt.show()

## Contract distribution

In [None]:
# Grouped histogram of contract type for churned vs. retained customers.
# The title's bold tag is now properly closed (was "<b>...<b>").
fig = px.histogram(df, x="Churn", color="Contract", barmode="group", title="<b>Customer contract distribution</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## Payment Method Distribution

In [None]:
# Take labels and values from the same value_counts() result so every slice is
# labelled with its own count. unique() returns values in order of first appearance,
# while value_counts() is sorted by frequency, so pairing the two mislabels slices.
payment_counts = df['PaymentMethod'].value_counts()
labels = payment_counts.index
values = payment_counts.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

## Payment for Churn

In [None]:
# Payment-method counts for each churn value.
fig = px.histogram(
    df,
    x="Churn",
    color="PaymentMethod",
    title="Customer Payment Method distribution w.r.t. Churn",
    width=700,
    height=500,
)
fig.update_layout(bargap=0.1)
fig.show()

## Internet Service and Gender of Churn

In [None]:
# Distinct InternetService values.
df["InternetService"].unique()

In [None]:
# Counts of each (InternetService, Churn) combination among male customers.
df[df["gender"] == "Male"][["InternetService", "Churn"]].value_counts()

In [None]:
fig = go.Figure()

# Define categories
churn_labels = ['Churn:No', 'Churn:Yes']
genders = ['Female', 'Male']

# InternetService values in the data -> trace names shown in the legend.
service_names = {'DSL': 'DSL', 'Fiber optic': 'Fiber optic', 'No': 'No Internet'}

# Count customers per (InternetService, Churn, gender) directly from the data instead of
# hard-coding the numbers, so the chart stays correct when the data changes.
# NOTE: this expects Churn to still hold the "No"/"Yes" strings (i.e. before encoding).
group_counts = df.groupby(['InternetService', 'Churn', 'gender']).size()

# data[service][churn label] = [female count, male count]
data = {
    display_name: {
        f'Churn:{churn}': [int(group_counts.get((service, churn, gender), 0)) for gender in genders]
        for churn in ('No', 'Yes')
    }
    for service, display_name in service_names.items()
}

# Build x-axis labels: "Churn:No-Female", "Churn:No-Male", "Churn:Yes-Female", "Churn:Yes-Male".
x_labels = [f"{churn}-{gender}" for churn in churn_labels for gender in genders]

# Add one bar trace per InternetService (no barmode is set, so bars are drawn side by side)
for service, churn_data in data.items():
    y_values = []
    for churn in churn_labels:
        y_values.extend(churn_data[churn])  # [F, M] for this churn group
    fig.add_trace(go.Bar(
        x=x_labels,
        y=y_values,
        name=service,
        text=y_values,
        textposition='auto'
    ))

fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

fig.show()

## Dependents churn distribution

In [None]:
# Fixed colours for the two Dependents values.
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}

# Grouped histogram of Dependents for each churn value, with counts written on the bars.
fig = px.histogram(
    df,
    x="Churn",
    color="Dependents",
    barmode="group",
    title="<b>Dependents distribution</b>",
    color_discrete_map=color_map,
    text_auto=True,
    width=700,
    height=500,
)
fig.update_layout(bargap=0.1)
fig.show()

## Partner Churn

In [None]:
# Grouped histogram of Partner for each churn value ("Chrun" typo in the title fixed).
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="Partner", barmode="group", title="<b>Churn distribution w.r.t. Partners</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## SeniorCitizen distribution

In [None]:
# Histogram of SeniorCitizen for each churn value ("Chrun" typo in the title fixed).
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="SeniorCitizen", title="<b>Churn distribution w.r.t. Senior Citizen</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## Online security churn

In [None]:
# Colours for the "Yes"/"No" categories; any other category keeps Plotly's default colour.
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}

# Grouped histogram of OnlineSecurity for each churn value.
fig = px.histogram(
    df,
    x="Churn",
    color="OnlineSecurity",
    barmode="group",
    title="<b>Churn w.r.t Online Security</b>",
    color_discrete_map=color_map,
    text_auto=True,
)
fig.update_layout(dict(width=700, height=500, bargap=0.1))
fig.show()

## Paperless billing distribution

In [None]:
# Histogram of PaperlessBilling for each churn value ("Chrun" typo in the title fixed).
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="PaperlessBilling", title="<b>Churn distribution w.r.t. Paperless Billing</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## TechSupport distribution

In [None]:
# Grouped histogram of TechSupport for each churn value ("Chrun" typo in the title fixed).
fig = px.histogram(df, x="Churn", color="TechSupport", barmode="group",
                   title="<b>Churn distribution w.r.t. TechSupport</b>", text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Histogram of PhoneService for each churn value ("Chrun" typo in the title fixed).
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="PhoneService", title="<b>Churn distribution w.r.t. Phone Service</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
sns.set_context("paper", font_scale=1.1)

# Monthly charges split by churn value.
monthly_retained = df.MonthlyCharges[df["Churn"] == 'No']
monthly_churned = df.MonthlyCharges[df["Churn"] == 'Yes']

# Draw both density estimates on the same axes: retained in red, churned in blue.
ax = sns.kdeplot(monthly_retained, color="Red", fill=True)
ax = sns.kdeplot(monthly_churned, ax=ax, color="Blue", fill=True)

ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_xlabel('Monthly Charges')
ax.set_ylabel('Density')
ax.set_title('Distribution of monthly charges by churn')

In [None]:
# This cell was an exact copy of the monthly-charges plot above, so it added no
# information. It now shows the density of TotalCharges by churn instead.
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'No')],
                 color="Red", fill=True)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'Yes')],
                 ax=ax, color="Blue", fill=True)
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn')

In [None]:
# Box plot of tenure for each churn value.
fig = px.box(df, x='Churn', y='tenure')

# Axis titles for the single subplot.
fig.update_xaxes(title_text='Churn', row=1, col=1)
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)

# Figure size and title styling.
fig.update_layout(
    autosize=True,
    width=750,
    height=600,
    title={'text': 'Tenure vs Churn', 'font': {'size': 25, 'family': 'Courier'}},
)

fig.show()

In [None]:
# Wide figure so that all column labels and annotations fit.
plt.figure(figsize=(25, 10))

# Replace every column (numeric ones included) by the integer codes returned by
# pd.factorize, then compute the pairwise correlation of those codes.
# NOTE(review): for numeric columns the codes do not preserve the numeric ordering,
# so these correlations are only a rough indication of association.
corr = df.apply(lambda x: pd.factorize(x)[0]).corr()

# Boolean mask that hides the upper triangle (including the diagonal).
mask = np.triu(np.ones_like(corr, dtype=bool))

ax = sns.heatmap(corr, mask=mask, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, linewidths=.2,
                 cmap='coolwarm', vmin=-1, vmax=1)

# Data preprocessing

In [None]:
# Number of distinct values in every column.
df.nunique()

In [None]:
# List the distinct values of every column that has fewer than five of them.
for col in df.columns:
    distinct_values = df[col].dropna().unique()
    if len(distinct_values) >= 5:
        continue
    print(f"{col}: {list(distinct_values)}")

## Encoding

In [None]:
# Yes/No columns, encoded as 1/0.
binary_cols = ['Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn', 'SeniorCitizen']

# One value -> integer mapping per column; gender gets its own mapping.
value_maps = {col: {'No': 0, 'Yes': 1} for col in binary_cols}
value_maps['gender'] = {'Female': 0, 'Male': 1}

for col, mapping in value_maps.items():
    df[col] = df[col].map(mapping)

# Multi-valued categorical columns, one-hot encoded below.
categorical_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'
]

# drop_first=False keeps every category (no information is lost);
# set it to True if collinearity between the dummy columns is a concern.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

In [None]:
# Correlation of every feature with Churn, strongest positive first.
churn_corr = df.corr()['Churn'].sort_values(ascending=False)

# Churn's correlation with itself is always 1.0 and carries no information.
churn_corr = churn_corr.drop('Churn')

# Create figure
plt.figure(figsize=(16, 9))

# Diverging colormap: map correlations from [-1, 1] onto [0, 1] for RdYlBu_r.
colors = plt.cm.RdYlBu_r((churn_corr + 1) / 2)

# Plot
bars = plt.bar(churn_corr.index, churn_corr.values, color=colors, edgecolor='black', linewidth=0.5)

# Rotate x-axis labels for readability
plt.xticks(rotation=60, ha='right', fontsize=11)
plt.yticks(fontsize=11)

# Labels and title
plt.ylabel('Correlation with Churn', fontsize=13)
plt.title('Feature Correlation with Churn (Higher = More Likely to Churn)', fontsize=16, weight='bold')

# Add horizontal line at 0
plt.axhline(0, color='gray', linewidth=0.8)

# Label only the stronger correlations (|r| > 0.1) to avoid clutter.
# The loop variable is named `corr_value` so it does not overwrite the `corr`
# DataFrame created by the correlation-heatmap cell.
for bar, corr_value in zip(bars, churn_corr.values):
    if abs(corr_value) > 0.1:
        bar_height = bar.get_height()
        is_positive = bar_height >= 0
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            # Place the label just above positive bars and just below negative ones.
            bar_height + (0.01 if is_positive else -0.02),
            f'{corr_value:.2f}',
            ha='center', va='bottom' if is_positive else 'top',
            fontsize=9, fontweight='bold'
        )

plt.tight_layout()
plt.show()

## Split train/test set

In [None]:
# Feature matrix: every column except the target.
X = df.drop('Churn', axis=1)
# Target vector as a NumPy array.
y = df['Churn'].to_numpy()

In [None]:
# Hold out 30% of the rows as a test set; stratify so both parts keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=40,
)

In [None]:
def distplot(feature, frame, color='r'):
    """Draw a histogram of one column on a new 8x3-inch figure.

    Parameters
    ----------
    feature : column label to plot; also used in the figure title.
    frame : object indexed as ``frame[feature]`` to obtain the values.
    color : colour passed to ``sns.histplot`` (default ``'r'``).
    """
    _, axis = plt.subplots(figsize=(8, 3))
    axis.set_title(f"Distribution for {feature}")
    sns.histplot(frame[feature], color=color, ax=axis)

In [None]:
# Numeric columns that are scaled later; plot the raw distribution of each one.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols:
    distplot(feat, df)

In [None]:
# Standardize the numeric columns (for visualisation only) and plot them.
# The same `num_cols` list is used for both scaling and plotting, so the two steps
# cannot drift apart (the loop previously iterated over `numerical_cols`).
scaled_values = StandardScaler().fit_transform(df[num_cols].astype('float64'))
df_std = pd.DataFrame(scaled_values, columns=num_cols)
for feat in num_cols:
    distplot(feat, df_std, color='c')

In [None]:
scaler = StandardScaler()

# Work on explicit copies: assigning columns into the frames returned by
# train_test_split can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Model

In [None]:
# Inner loop (hyperparameter tuning): a single 3-fold stratified split.
inner_cv = RepeatedStratifiedKFold(n_repeats=1, n_splits=3, random_state=1)

# Outer loop (performance estimation): 5-fold stratified CV repeated 3 times.
outer_cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=42)

## KNN

In [None]:
np.random.seed(325)

# Search space: odd k from 3 to 29, both weighting schemes, two distance metrics.
param_dist_knn = dict(
    n_neighbors=np.arange(3, 31, 2),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn_model = KNeighborsClassifier()

# Randomized search scored by ROC AUC on the inner CV splits (30 sampled combinations).
knn_random = RandomizedSearchCV(
    knn_model,
    param_dist_knn,
    n_iter=30,
    scoring='roc_auc',
    cv=inner_cv,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

# Tune on the training split only.
knn_random.fit(X_train, y_train)

print("Best KNN parameters:", knn_random.best_params_)
print("Best CV AUC:", knn_random.best_score_)

In [None]:
# Metrics reported for every model; the later nested-CV cells reuse this list.
scoring = ['roc_auc', 'f1', 'recall', 'precision']

# Nested CV: the randomized search (inner loop) is re-run inside every outer fold.
# The shared `outer_cv` splitter is used so that every model is evaluated on the
# same folds, instead of redefining an identical `cv_outer` here.
nested_scores_knn = cross_validate(
    knn_random,
    X_train, y_train,
    cv=outer_cv,
    scoring=scoring,
    n_jobs=-1
)

# Mean and standard deviation of each metric across the outer folds.
for metric in scoring:
    scores = nested_scores_knn[f'test_{metric}']
    print(f"{metric.upper()}: {scores.mean():.4f} ± {scores.std():.4f}")

In [None]:
# Estimator refit by the search with its best parameters.
best_knn = knn_random.best_estimator_

# Hard labels and positive-class probabilities for the held-out test set.
y_pred_prob = best_knn.predict_proba(X_test)[:, 1]
predicted_knn_y = best_knn.predict(X_test)

# score() of a classifier is its accuracy.
accuracy_knn = best_knn.score(X_test, y_test)

print("\nFinal Test Results (KNN):")
print("Accuracy:", accuracy_knn)
print(classification_report(y_test, predicted_knn_y))

In [None]:
# Confusion matrix on the test set
plt.figure(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, predicted_knn_y),
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
    cmap="Blues"
)
plt.title("KNN CONFUSION MATRIX", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ROC curve on the test set.
# The curve is drawn once and labelled with the real AUC from roc_auc_score.
# Previously it was drawn twice, and the first legend entry showed the accuracy
# (best_knn.score) under the name "AUC".
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
auc_test = roc_auc_score(y_test, y_pred_prob)
plt.figure(figsize=(6, 5))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.plot(fpr, tpr, label=f'KNN (AUC = {auc_test:.3f})', color="r", linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN ROC CURVE', fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## SVM

In [None]:
np.random.seed(325)
# probability=True is required for predict_proba, which the test-set evaluation uses.
svc_model = SVC(probability=True, random_state=1)

# Hyperparameter search space
param_dist_svm = {
    'C': np.logspace(-2, 2, 20),  # Regularization: [0.01, 100]
    'gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 20)),  # Kernel coefficient
    'kernel': ['rbf', 'linear', 'poly'],
    'class_weight': [None, 'balanced']
}

# Randomized search scored by ROC AUC on the inner CV splits.
svm_search = RandomizedSearchCV(
    svc_model,
    param_distributions=param_dist_svm,
    n_iter=30,  # balance quality vs. time
    scoring='roc_auc',
    cv=inner_cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit once on the training data. The search was previously fitted twice in a row,
# which doubled the run time of this expensive cell without changing the result.
print("Tuning SVM hyperparameters...")
svm_search.fit(X_train, y_train)
print("\nBest SVM Parameters:")
print(svm_search.best_params_)
print("Best CV AUC:", round(svm_search.best_score_, 4))

In [None]:
# Nested CV: the SVM search is re-run inside every outer fold.
print("\nRunning nested cross-validation for unbiased evaluation...")
nested_scores_svm = cross_validate(
    svm_search, X_train, y_train, scoring=scoring, cv=outer_cv, n_jobs=-1
)

# Mean and standard deviation of each metric across the outer folds.
print("\nNested CV Results (SVM):")
for metric in scoring:
    fold_scores = nested_scores_svm[f'test_{metric}']
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric.upper()}: {mean_score:.4f} ± {std_score:.4f}")

In [None]:
# Estimator refit by the search with its best parameters.
best_svm = svm_search.best_estimator_

# Positive-class probabilities and hard labels for the held-out test set.
y_pred_prob = best_svm.predict_proba(X_test)[:, 1]
predicted_svm_y = best_svm.predict(X_test)

# Test-set AUC and accuracy (score() of a classifier is its accuracy).
auc_svm = roc_auc_score(y_test, y_pred_prob)
accuracy_svm = best_svm.score(X_test, y_test)

print(
    "\nFinal Test Performance (SVM):",
    f"Accuracy: {accuracy_svm:.4f}",
    f"AUC: {auc_svm:.4f}",
    "\nClassification Report:",
    sep="\n",
)
print(classification_report(y_test, predicted_svm_y))

In [None]:
# Confusion matrix on the test set.
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, predicted_svm_y),
    annot=True, fmt="d", linecolor="k", linewidths=3, cmap="Blues",
    ax=cm_ax,
)
cm_ax.set_title("SVM CONFUSION MATRIX", fontsize=14)
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
plt.show()

# ROC curve on the test set, built from the predicted positive-class probabilities.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.plot(fpr, tpr, label=f'SVM (AUC = {auc_svm:.3f})', color="r", linewidth=2)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('SVM ROC CURVE', fontsize=16)
roc_ax.legend()
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

## Random forest

In [None]:
np.random.seed(325)

# Search space over the main size and regularization parameters of the forest.
param_dist_rt = dict(
    n_estimators=[100, 200, 500, 800, 1000],
    max_depth=[3, 5, 8, 10, 15, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', 0.5],
    bootstrap=[True, False],
)

# Base random forest; the searched parameters are set by the randomized search.
rt_model = RandomForestClassifier(random_state=50, n_jobs=-1)

# Randomized search scored by ROC AUC on the inner CV splits (40 sampled combinations).
rt_search = RandomizedSearchCV(
    rt_model,
    param_dist_rt,
    n_iter=40,
    scoring='roc_auc',
    cv=inner_cv,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

# Tune on the training split only.
print("Tuning Random Forest hyperparameters...")
rt_search.fit(X_train, y_train)

print("\nBest Random Forest Parameters:")
print(rt_search.best_params_)
print("Best CV AUC:", round(rt_search.best_score_, 4))

In [None]:
# Nested CV: the random-forest search is re-run inside every outer fold.
print("\nRunning nested cross-validation for unbiased evaluation...")
nested_scores_rt = cross_validate(
    rt_search, X_train, y_train, scoring=scoring, cv=outer_cv, n_jobs=-1
)

# Mean and standard deviation of each metric across the outer folds.
print("\nNested CV Results (Random Forest):")
for metric in scoring:
    fold_scores = nested_scores_rt[f'test_{metric}']
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric.upper()}: {mean_score:.4f} ± {std_score:.4f}")

In [None]:
# Estimator refit by the search with its best parameters.
best_rf = rt_search.best_estimator_

# Positive-class probabilities and hard labels for the held-out test set.
y_pred_prob = best_rf.predict_proba(X_test)[:, 1]
predicted_rf_y = best_rf.predict(X_test)

# Test-set AUC and accuracy (score() of a classifier is its accuracy).
auc_rf = roc_auc_score(y_test, y_pred_prob)
accuracy_rf = best_rf.score(X_test, y_test)

print(
    "\nFinal Test Performance (Random Forest):",
    f"Accuracy: {accuracy_rf:.4f}",
    f"AUC: {auc_rf:.4f}",
    "\nClassification Report:",
    sep="\n",
)
print(classification_report(y_test, predicted_rf_y))

In [None]:
# Confusion matrix on the test set.
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, predicted_rf_y),
    annot=True, fmt="d", linecolor="k", linewidths=3, cmap="Blues",
    ax=cm_ax,
)
cm_ax.set_title("RANDOM FOREST CONFUSION MATRIX", fontsize=14)
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
plt.show()

# ROC curve on the test set, built from the predicted positive-class probabilities.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.plot(fpr, tpr, label=f'Random Forest (AUC = {auc_rf:.3f})', color="r", linewidth=2)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('RANDOM FOREST ROC CURVE', fontsize=16)
roc_ax.legend()
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

## Logistic regression

In [None]:
np.random.seed(325)

# Search space: regularization strength, penalty type and optional class re-weighting.
param_dist_lr = dict(
    C=np.logspace(-4, 4, 50),  # inverse of the regularization strength
    penalty=['l1', 'l2'],
    class_weight=[None, 'balanced'],
)

# liblinear supports both the L1 and the L2 penalty searched above.
lr_model = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)

# Randomized search scored by ROC AUC on the inner CV splits (40 sampled combinations).
lr_search = RandomizedSearchCV(
    lr_model,
    param_dist_lr,
    n_iter=40,
    scoring='roc_auc',
    cv=inner_cv,
    n_jobs=-1,
    random_state=42,
)

# Tune on the training split only.
print("Tuning Logistic Regression hyperparameters...")
lr_search.fit(X_train, y_train)

print("\nBest Logistic Regression Parameters:")
print(lr_search.best_params_)
print("Best CV AUC:", round(lr_search.best_score_, 4))

In [None]:
# Nested CV: the logistic-regression search is re-run inside every outer fold.
print("\nRunning nested cross-validation...")
nested_scores_lr = cross_validate(
    lr_search, X_train, y_train, scoring=scoring, cv=outer_cv, n_jobs=-1
)

# Mean and standard deviation of each metric across the outer folds.
print("\nNested CV Results (Logistic Regression):")
for metric in scoring:
    fold_scores = nested_scores_lr[f'test_{metric}']
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric.upper()}: {mean_score:.4f} ± {std_score:.4f}")

In [None]:
# Estimator refit by the search with its best parameters.
best_lr = lr_search.best_estimator_

# Positive-class probabilities and hard labels for the held-out test set.
y_pred_prob = best_lr.predict_proba(X_test)[:, 1]
predicted_lr_y = best_lr.predict(X_test)

# Test-set AUC and accuracy (score() of a classifier is its accuracy).
auc_lr = roc_auc_score(y_test, y_pred_prob)
accuracy_lr = best_lr.score(X_test, y_test)

print(
    "\nFinal Test Performance (Logistic Regression):",
    f"Accuracy: {accuracy_lr:.4f}",
    f"AUC: {auc_lr:.4f}",
    "\nClassification Report:",
    sep="\n",
)
print(classification_report(y_test, predicted_lr_y))

In [None]:
# Confusion matrix on the test set.
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, predicted_lr_y),
    annot=True, fmt="d", linecolor="k", linewidths=3, cmap="Blues",
    ax=cm_ax,
)
cm_ax.set_title("LOGISTIC REGRESSION CONFUSION MATRIX", fontsize=14)
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
plt.show()

# ROC curve on the test set, built from the predicted positive-class probabilities.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc_lr:.3f})', color="r", linewidth=2)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('LOGISTIC REGRESSION ROC CURVE', fontsize=16)
roc_ax.legend()
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

## Decision tree

In [None]:
np.random.seed(325)

# Search space: depth and split/leaf size limits, feature subsampling and split criterion.
# class_weight is fixed to 'balanced' for every candidate.
param_dist_dt = dict(
    max_depth=[3, 5, 7, 10, 15],
    min_samples_split=[2, 5, 10, 20, 50],
    min_samples_leaf=[1, 2, 5, 10, 20],
    max_features=['sqrt', 'log2'],
    criterion=['gini', 'entropy'],
    class_weight=['balanced'],
)

# Base decision tree with a fixed seed for reproducibility.
dt_model = DecisionTreeClassifier(random_state=42)

# Randomized search scored by ROC AUC on the inner CV splits (50 sampled combinations).
dt_search = RandomizedSearchCV(
    dt_model,
    param_dist_dt,
    n_iter=50,
    scoring='roc_auc',
    cv=inner_cv,
    n_jobs=-1,
    random_state=42,
)

# Tune on the training split only.
print("Tuning Decision Tree hyperparameters...")
dt_search.fit(X_train, y_train)

print("\nBest Decision Tree Parameters:")
print(dt_search.best_params_)
print("Best CV AUC:", round(dt_search.best_score_, 4))

In [None]:
# Nested CV: the decision-tree search is re-run inside every outer fold.
print("\nRunning nested cross-validation...")
nested_scores_dt = cross_validate(
    dt_search, X_train, y_train, scoring=scoring, cv=outer_cv, n_jobs=-1
)

# Mean and standard deviation of each metric across the outer folds.
print("\nNested CV Results (Decision Tree):")
for metric in scoring:
    fold_scores = nested_scores_dt[f'test_{metric}']
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric.upper()}: {mean_score:.4f} ± {std_score:.4f}")
In [None]:
# Estimator refit by the search with its best parameters.
best_dt = dt_search.best_estimator_

# Positive-class probabilities and hard labels for the held-out test set.
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]
predicted_dt_y = best_dt.predict(X_test)

# Test-set AUC and accuracy (score() of a classifier is its accuracy).
auc_dt = roc_auc_score(y_test, y_pred_prob)
accuracy_dt = best_dt.score(X_test, y_test)

print(
    "\nFinal Test Performance (Decision Tree):",
    f"Accuracy: {accuracy_dt:.4f}",
    f"AUC: {auc_dt:.4f}",
    "\nClassification Report:",
    sep="\n",
)
print(classification_report(y_test, predicted_dt_y))

In [None]:
# Confusion matrix on the test set.
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, predicted_dt_y),
    annot=True, fmt="d", linecolor="k", linewidths=3, cmap="Blues",
    ax=cm_ax,
)
cm_ax.set_title("DECISION TREE CONFUSION MATRIX", fontsize=14)
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
plt.show()

# ROC curve on the test set, built from the predicted positive-class probabilities.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.plot(fpr, tpr, label=f'Decision Tree (AUC = {auc_dt:.3f})', color="r", linewidth=2)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('DECISION TREE ROC CURVE', fontsize=16)
roc_ax.legend()
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

# Compare models

In [None]:
def summarize_nested_cv(cv_results, model_name, metrics=('roc_auc', 'f1', 'recall', 'precision')):
    """Summarize per-fold scores as one row of mean/std values.

    Parameters
    ----------
    cv_results : mapping that holds, for every metric ``m``, the per-fold scores
        under the key ``'test_<m>'`` (the layout returned by ``cross_validate``).
    model_name : label stored under the ``'Model'`` key of the summary.
    metrics : metric names to summarize. Defaults to the four metrics used in this
        notebook, so existing two-argument calls behave exactly as before.

    Returns
    -------
    dict
        ``{'Model': model_name, '<METRIC> (mean)': ..., '<METRIC> (std)': ...}``
        with one mean/std pair per metric, in the order given by ``metrics``.

    Raises
    ------
    KeyError
        If ``cv_results`` has no ``'test_<metric>'`` entry for a requested metric.
    """
    summary = {'Model': model_name}
    for metric in metrics:
        fold_scores = cv_results[f'test_{metric}']
        label = metric.upper()
        summary[f'{label} (mean)'] = np.mean(fold_scores)
        summary[f'{label} (std)'] = np.std(fold_scores)
    return summary


# Nested-CV results of every model, keyed by the display name used in the table.
nested_results = {
    'KNN': nested_scores_knn,
    'SVM': nested_scores_svm,
    'Random Forest': nested_scores_rt,
    'Logistic Regression': nested_scores_lr,
    'Decision Tree': nested_scores_dt,
}

# One summary row (mean/std per metric) per model.
comparison_data = [
    summarize_nested_cv(model_scores, model_name)
    for model_name, model_scores in nested_results.items()
]

# Round to four decimals and index the table by model name.
results_df = pd.DataFrame(comparison_data).round(4).set_index('Model')
print(results_df)

In [None]:
# Reshape to long format: one row per (model, metric) pair.
melted = results_df.reset_index().melt(
    id_vars='Model',
    value_vars=['ROC_AUC (mean)', 'F1 (mean)', 'RECALL (mean)'],
    var_name='Metric',
    value_name='Score'
)

# Strip the " (mean)" suffix from the metric names.
# regex=False makes this a literal replacement on every pandas version. With the old
# default (regex=True, pandas < 2.0) the parentheses form a regex group, so the
# pattern never matches and the suffix stays in place.
melted['Metric'] = melted['Metric'].str.replace(' (mean)', '', regex=False)

# Grouped bars: one group per model, one bar per metric.
fig = px.bar(
    melted,
    x='Model',
    y='Score',
    color='Metric',
    barmode='group',
    title='Model Comparison (Nested CV Mean Performance)',
    height=500,
    text_auto=True
)
fig.update_layout(yaxis_range=[0, 1], font=dict(size=12))
fig.show()

In [None]:
# Tuned estimators, keyed by the name shown in the legend.
models_final = {
    'KNN': best_knn,
    'SVM': best_svm,
    'Random Forest': best_rf,
    'Logistic Regression': best_lr,
    'Decision Tree': best_dt
}

plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

# Every model is scored on the same scaled test set, so X_test is used directly.
for name, model in models_final.items():
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})')

plt.title('ROC Curves Comparison (Final Test Set)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(alpha=0.3)
plt.show()