In [1]:
import warnings
warnings.filterwarnings("ignore")

import re
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
)

# Load data
# Prefer the real CSV; when it is missing, fall back to a synthetic frame with
# the same column layout so the rest of the script can still run end to end.
try:
    df = pd.read_csv("Churn_Modelling.csv")
except FileNotFoundError:
    print("Churn_Modelling.csv not found. Using a small dummy dataset.")
    # Seeded generator: the fallback data is identical on every run.
    rng = np.random.default_rng(42)
    # Must stay even so the two-value categorical patterns tile exactly.
    n_rows = 40
    df = pd.DataFrame({
        "RowNumber": range(n_rows),
        "CustomerId": range(n_rows),
        "Surname": ["A"] * n_rows,
        "CreditScore": rng.integers(400, 850, n_rows),
        "Geography": ["France", "Spain"] * (n_rows // 2),
        "Gender": ["Female", "Male"] * (n_rows // 2),
        "Age": rng.integers(20, 60, n_rows),
        "Tenure": rng.integers(0, 10, n_rows),
        "Balance": rng.random(n_rows) * 150000,
        "NumOfProducts": rng.integers(1, 4, n_rows),
        "HasCrCard": rng.integers(0, 2, n_rows),
        "IsActiveMember": rng.integers(0, 2, n_rows),
        "EstimatedSalary": rng.random(n_rows) * 200000,
        # Deterministic target: every 4th row is a churner. A random draw
        # could leave a class with fewer than two members (or none), which
        # breaks the stratified split and ROC AUC computed later on.
        "Exited": (np.arange(n_rows) % 4 == 0).astype(int),
    })

# Basic prep
# Identifier-like columns are dropped when present; "Exited" is the target.
identifier_columns = ["RowNumber", "CustomerId", "Surname"]
df = df.drop(columns=identifier_columns, errors="ignore")
y = df["Exited"]
X = df.drop(columns="Exited")

# Every feature column that is not listed as categorical is treated as numeric.
categorical_features = ["Geography", "Gender"]
numerical_features = X.columns[~X.columns.isin(categorical_features)].tolist()

# Transformers
# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first level dropped. Columns in neither list pass through untouched.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", drop="first"),
    categorical_features,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="passthrough",
)

# Split
# 80/20 hold-out, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

def evaluate_model(name, pipeline):
    """Fit ``pipeline`` on the training split and report hold-out metrics.

    The module-level ``X_train``/``y_train`` are used for fitting and
    ``X_test``/``y_test`` for scoring. Accuracy, ROC AUC, the text
    classification report and the confusion matrix are printed.

    Args:
        name: Label printed with the results and stored under ``"Model"``.
        pipeline: Estimator exposing ``fit``, ``predict`` and
            ``predict_proba``; it is fitted in place.

    Returns:
        A dict with the model name, accuracy, ROC AUC, and the precision and
        recall reported for label ``"1"`` (``np.nan`` when that label is
        missing from the report).
    """
    print(f"\nModel: {name}")

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    # Second probability column is the score used for ROC AUC.
    positive_scores = pipeline.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, positive_scores)
    churn_metrics = classification_report(
        y_test, predictions, output_dict=True, zero_division=0
    ).get("1", {})

    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC : {roc_auc:.4f}")

    print("\nClassification report:")
    print(classification_report(y_test, predictions, zero_division=0))

    print("\nConfusion matrix:")
    print(confusion_matrix(y_test, predictions))

    summary = {"Model": name, "Accuracy": accuracy, "ROC_AUC": roc_auc}
    summary["Precision (Churn)"] = churn_metrics.get("precision", np.nan)
    summary["Recall (Churn)"] = churn_metrics.get("recall", np.nan)
    return summary

# Pipelines
def _build_pipeline(classifier):
    """Return a preprocessing + classifier pipeline for ``classifier``.

    Each pipeline receives its own unfitted copy of the module-level
    ``preprocessor``. ``Pipeline.fit`` fits its steps in place rather than
    cloning them, so sharing one transformer instance would let fitting one
    pipeline silently refit the preprocessing used by the others.

    Args:
        classifier: Unfitted estimator used as the final ``"clf"`` step.

    Returns:
        An unfitted ``Pipeline`` with steps ``"preprocessor"`` and ``"clf"``.
    """
    return Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("clf", classifier),
    ])


log_reg_pipeline = _build_pipeline(
    LogisticRegression(random_state=42, solver="liblinear")
)

rf_pipeline = _build_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

gb_pipeline = _build_pipeline(
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
)

# Run
# Evaluate each candidate in a fixed order; one metrics dict per model.
candidates = (
    ("Logistic Regression", log_reg_pipeline),
    ("Random Forest", rf_pipeline),
    ("Gradient Boosting", gb_pipeline),
)
results = [evaluate_model(label, candidate) for label, candidate in candidates]

# Summary
# One row per model, indexed by name, floats shown with four decimals.
banner = "=" * 50
print("\n" + banner)
print("MODEL PERFORMANCE SUMMARY")
print(banner)
summary_df = pd.DataFrame(results).set_index("Model")
print(summary_df.to_string(float_format="{:.4f}".format))

print("\nNotes:")
for note in (
    "1) Gradient Boosting often gives strong ROC AUC.",
    "2) High accuracy can hide low recall for churners on imbalanced data.",
    "3) Precision shows how many predicted churners actually churned.",
    "4) ROC AUC reflects overall ranking quality.",
):
    print(note)

# Example prediction with the full-data Gradient Boosting pipeline
# Fit an unfitted copy on all rows instead of refitting ``gb_pipeline`` in
# place: the evaluated pipeline (and any transformer it shares with the other
# pipelines) keeps the state that produced the hold-out metrics above.
gb_model = clone(gb_pipeline).fit(X, y)

# Single hypothetical customer using the same feature columns as ``X``.
new_customer = pd.DataFrame({
    "CreditScore": [650], "Geography": ["France"], "Gender": ["Female"],
    "Age": [40], "Tenure": [3], "Balance": [50000],
    "NumOfProducts": [1], "HasCrCard": [1], "IsActiveMember": [1],
    "EstimatedSalary": [100000],
})
pred = gb_model.predict(new_customer)[0]
# Second probability column is the probability reported as churn.
proba = gb_model.predict_proba(new_customer)[0, 1]

print("\n" + "=" * 50)
print("Example Prediction (Gradient Boosting)")
print(f"Predicted Churn (1=yes, 0=no): {pred}")
print(f"Churn Probability: {proba:.2%}")
print("=" * 50)



Model: Logistic Regression
Accuracy: 0.8085
ROC AUC : 0.7749

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1593
           1       0.59      0.19      0.28       407

    accuracy                           0.81      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000


Confusion matrix:
[[1541   52]
 [ 331   76]]

Model: Random Forest
Accuracy: 0.8640
ROC AUC : 0.8522

Classification report:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.78      0.46      0.58       407

    accuracy                           0.86      2000
   macro avg       0.83      0.71      0.75      2000
weighted avg       0.86      0.86      0.85      2000


Confusion matrix:
[[1541   52]
 [ 220  187]]

Model: Gradient Boosting
Accuracy: 0.8700
ROC AUC : 0.8708

Classification report:
    