In [1]:
# ============================================================
# End-to-end implementation for the interview task
# - Loads train features + labels (50k rows)
# - Aligns/merges safely
# - Cleans columns (drops 100% missing + constant cols)
# - Builds preprocessing for numeric + categorical
# - Trains model (tries LightGBM -> XGBoost -> HistGB fallback)
# - Evaluates with PR-AUC, ROC-AUC, LogLoss
# - Finds best threshold on validation (max F1 by default)
# - Retrains on full data and saves artifacts
# ============================================================

import os
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    log_loss,
    precision_recall_curve,
    f1_score,
    classification_report,
    confusion_matrix
)

RANDOM_STATE = 42

# -----------------------------
# Paths (edit if needed)
# -----------------------------
FEATURES_PATH = "train (6).csv"
LABELS_PATH   = "train_churn_labels.csv"

OUT_DIR = "model_out"
os.makedirs(OUT_DIR, exist_ok=True)

In [4]:
# -----------------------------
# Helper classes
# -----------------------------
class ToDense(BaseEstimator, TransformerMixin):
    """Convert sparse matrix to dense (required for HistGradientBoostingClassifier)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.toarray() if hasattr(X, "toarray") else np.asarray(X)


# -----------------------------
# Helper functions
# -----------------------------
def load_and_align(features_path: str, labels_path: str):
    X = pd.read_csv(features_path)
    y_df = pd.read_csv(labels_path)

    possible_label_cols = ["label", "Label", "target", "Target", "y", "Y"]
    label_col = None
    for c in possible_label_cols:
        if c in y_df.columns:
            label_col = c
            break
    if label_col is None:
        label_col = y_df.columns[0]

    y_raw = y_df[label_col].copy()

    if len(X) != len(y_raw):
        raise ValueError(f"Row mismatch: X={len(X)} but y={len(y_raw)}. Need ID-based join.")

    y = y_raw.replace({-1: 0, 1: 1}).astype(int)
    return X, y


def drop_bad_columns(df: pd.DataFrame):
    df2 = df.copy()
    all_missing = [c for c in df2.columns if df2[c].isna().all()]
    df2.drop(columns=all_missing, inplace=True)

    constant = [c for c in df2.columns if df2[c].nunique(dropna=True) <= 1]
    df2.drop(columns=constant, inplace=True)

    dropped = {"all_missing": all_missing, "constant": constant}
    return df2, dropped


def add_missingness_features(df: pd.DataFrame):
    df2 = df.copy()
    miss_count = df2.isna().sum(axis=1)
    miss_ratio = miss_count / max(df2.shape[1], 1)
    df2["__missing_count__"] = miss_count
    df2["__missing_ratio__"] = miss_ratio
    return df2


def get_feature_types(df: pd.DataFrame):
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    num_cols = [c for c in df.columns if c not in cat_cols]
    return num_cols, cat_cols


def choose_model():
    # --- LightGBM ---
    try:
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(
            objective="binary",
            n_estimators=2000,
            learning_rate=0.03,
            num_leaves=63,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
        return "lightgbm", model
    except Exception:
        pass

    # --- XGBoost ---
    try:
        from xgboost import XGBClassifier
        model = XGBClassifier(
            n_estimators=1500,
            learning_rate=0.03,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
        return "xgboost", model
    except Exception:
        pass

    # --- Fallback: HistGradientBoosting ---
    from sklearn.ensemble import HistGradientBoostingClassifier
    model = HistGradientBoostingClassifier(
        learning_rate=0.06,
        max_depth=6,
        max_iter=600,
        random_state=RANDOM_STATE
    )
    return "hist_gb", model


def build_pipeline(X: pd.DataFrame, base_model, model_name: str):
    """
    Key fix:
    - HistGradientBoostingClassifier requires dense input.
    - So we add ToDense() right after preprocessing only for hist_gb.
    """
    num_cols, cat_cols = get_feature_types(X)

    numeric_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median"))
    ])

    categorical_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        remainder="drop"
    )

    steps = [("preprocess", preprocessor)]

    if model_name == "hist_gb":
        steps.append(("todense", ToDense()))  # ✅ critical fix

    steps.append(("model", base_model))

    return Pipeline(steps=steps)


def find_best_threshold(y_true, y_proba, method="max_f1"):
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    if len(thresholds) == 0:
        return 0.5

    if method == "max_f1":
        f1s = (2 * precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-12)
        best_idx = int(np.argmax(f1s))
        return float(thresholds[best_idx])

    return 0.5


# -----------------------------
# Main training flow
# -----------------------------
def main():
    print("1) Loading data...")
    X_raw, y = load_and_align(FEATURES_PATH, LABELS_PATH)

    print("2) Cleaning columns...")
    X_clean, dropped = drop_bad_columns(X_raw)

    print("3) Adding missingness features...")
    X_feat = add_missingness_features(X_clean)

    print("4) Train/validation split (stratified)...")
    X_train, X_val, y_train, y_val = train_test_split(
        X_feat, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    pos_rate = y_train.mean()
    print(f"   Positive rate (train): {pos_rate:.4f} (imbalance expected)")

    print("5) Model selection...")
    model_name, base_model = choose_model()
    print(f"   Using model: {model_name}")

    neg = int((y_train == 0).sum())
    pos = int((y_train == 1).sum())
    if pos == 0:
        raise ValueError("No positive samples in training split.")
    scale_pos_weight = neg / pos
    print(f"   scale_pos_weight ~ {scale_pos_weight:.2f}")

    pipe = build_pipeline(X_train, base_model, model_name)

    print("6) Hyperparameter search (lightweight)...")
    if model_name == "lightgbm":
        param_dist = {
            "model__num_leaves": [31, 63, 127],
            "model__learning_rate": [0.01, 0.03, 0.06],
            "model__n_estimators": [800, 1500, 2500],
            "model__subsample": [0.7, 0.8, 0.9],
            "model__colsample_bytree": [0.7, 0.8, 0.9],
            "model__reg_lambda": [0.0, 1.0, 5.0],
        }
    elif model_name == "xgboost":
        param_dist = {
            "model__max_depth": [4, 6, 8],
            "model__learning_rate": [0.01, 0.03, 0.06],
            "model__n_estimators": [800, 1500, 2500],
            "model__subsample": [0.7, 0.8, 0.9],
            "model__colsample_bytree": [0.7, 0.8, 0.9],
            "model__reg_lambda": [0.0, 1.0, 5.0],
        }
    else:
        param_dist = {
            "model__learning_rate": [0.03, 0.06, 0.1],
            "model__max_depth": [4, 6, 8],
            "model__max_iter": [300, 600, 900],
        }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=min(15, sum(len(v) for v in param_dist.values())),
        scoring="average_precision",
        cv=cv,
        verbose=1,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        error_score="raise"   # ✅ so you see the REAL root error immediately
    )

    # sample weights (try to pass them; if estimator doesn't support, fall back)
    sample_weight = np.where(y_train.values == 1, scale_pos_weight, 1.0)

    try:
        search.fit(X_train, y_train, model__sample_weight=sample_weight)
    except TypeError:
        search.fit(X_train, y_train)

    best_pipe = search.best_estimator_
    print("Best CV PR-AUC:", search.best_score_)
    print("Best params:", search.best_params_)

    print("7) Validation evaluation...")
    if hasattr(best_pipe, "predict_proba"):
        y_val_proba = best_pipe.predict_proba(X_val)[:, 1]
    else:
        # fallback; most should have predict_proba
        y_val_proba = best_pipe.predict(X_val)

    pr_auc = average_precision_score(y_val, y_val_proba)
    roc_auc = roc_auc_score(y_val, y_val_proba)
    ll = log_loss(y_val, y_val_proba)

    print(f"   PR-AUC:  {pr_auc:.6f}")
    print(f"   ROC-AUC: {roc_auc:.6f}")
    print(f"   LogLoss: {ll:.6f}")

    thr = find_best_threshold(y_val, y_val_proba, method="max_f1")
    print(f"   Best threshold (max F1): {thr:.4f}")

    y_val_pred = (y_val_proba >= thr).astype(int)

    print("\nClassification report (val):")
    print(classification_report(y_val, y_val_pred, digits=4))
    print("Confusion matrix (val):")
    print(confusion_matrix(y_val, y_val_pred))

    print("8) Retrain best model on full data and save artifacts...")
    X_full = X_feat
    y_full = y

    neg_full = int((y_full == 0).sum())
    pos_full = int((y_full == 1).sum())
    spw_full = neg_full / max(pos_full, 1)
    sample_weight_full = np.where(y_full.values == 1, spw_full, 1.0)

    try:
        best_pipe.fit(X_full, y_full, model__sample_weight=sample_weight_full)
    except TypeError:
        best_pipe.fit(X_full, y_full)

    model_path = os.path.join(OUT_DIR, f"final_model_{model_name}.joblib")
    joblib.dump(best_pipe, model_path)

    meta = {
        "model_name": model_name,
        "dropped_columns": dropped,
        "best_params": search.best_params_,
        "val_metrics": {
            "pr_auc": float(pr_auc),
            "roc_auc": float(roc_auc),
            "log_loss": float(ll),
            "threshold_max_f1": float(thr)
        }
    }

    meta_path = os.path.join(OUT_DIR, "training_metadata.json")
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print("\nDONE ✅")
    print("Saved model:", model_path)
    print("Saved metadata:", meta_path)


if __name__ == "__main__":
    main()

1) Loading data...
2) Cleaning columns...
3) Adding missingness features...
4) Train/validation split (stratified)...
   Positive rate (train): 0.0735 (imbalance expected)
5) Model selection...
   Using model: lightgbm
   scale_pos_weight ~ 12.61
6) Hyperparameter search (lightweight)...
Fitting 5 folds for each of 15 candidates, totalling 75 fits
[LightGBM] [Info] Number of positive: 2938, number of negative: 37062
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23169
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 3751
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best CV PR-AUC: 0.2000481371590344
Best params: {'model__subsample': 0.9, 'model__reg_lambda': 0.0, 'model__num_leaves': 31, 'model__n_estimators': 1500, 'm

ValueError: X has 71675 features, but LGBMClassifier is expecting 63722 features as input.

In [5]:
"""
=============================================================
CRISP-DM Machine Learning Pipeline — Customer Churn Prediction
=============================================================
HOW TO RUN:
  pip install pandas numpy matplotlib seaborn scikit-learn
  python crisp_dm_pipeline.py

INPUT FILES (same folder as script):
  - train__6_.csv
  - train_churn_labels.csv

OUTPUT FILES:
  - crisp_dm_results.png
  - best_model.pkl
  - model_metadata.json
  - crisp_dm_report.txt
  - validation_predictions.csv
=============================================================
"""

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from pathlib import Path
import json, time, pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    f1_score, precision_score, recall_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier

REPORT_LINES = []

def log(msg=""):
    print(msg)
    REPORT_LINES.append(str(msg))

# =============================================================
# PHASE 1 — BUSINESS UNDERSTANDING
# =============================================================
log("=" * 65)
log("PHASE 1 — BUSINESS UNDERSTANDING")
log("=" * 65)
log("""
Objective : Predict customer churn (binary classification).
           Label  1 -> churned customer
           Label -1 -> retained customer
Business Metric : Maximize recall on churners (avoid missed
                  churners) while keeping precision reasonable.
                  Primary metric: ROC-AUC + F1-churn.
Success Criteria: ROC-AUC > 0.75 | F1 (churn class) > 0.40
""")

# =============================================================
# PHASE 2 — DATA UNDERSTANDING
# =============================================================
log("=" * 65)
log("PHASE 2 — DATA UNDERSTANDING")
log("=" * 65)


df  = pd.read_csv("train (6).csv", low_memory=False)
labels = pd.read_csv("train_churn_labels.csv")

log(f"Dataset shape     : {df.shape}")
log(f"Labels shape      : {labels.shape}")

y = labels["Label"].map({-1: 0, 1: 1}).values
log(f"Class distribution: Retained(0)={int((y==0).sum())}  Churned(1)={int((y==1).sum())}")
log(f"Churn rate        : {y.mean()*100:.2f}%")



PHASE 1 — BUSINESS UNDERSTANDING

Objective : Predict customer churn (binary classification).
           Label  1 -> churned customer
           Label -1 -> retained customer
Business Metric : Maximize recall on churners (avoid missed
                  churners) while keeping precision reasonable.
                  Primary metric: ROC-AUC + F1-churn.
Success Criteria: ROC-AUC > 0.75 | F1 (churn class) > 0.40

PHASE 2 — DATA UNDERSTANDING
Dataset shape     : (50000, 230)
Labels shape      : (50000, 1)
Class distribution: Retained(0)=46328  Churned(1)=3672
Churn rate        : 7.34%


In [6]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols     = df.select_dtypes(include=["object"]).columns.tolist()
log(f"Numeric features  : {len(numeric_cols)}")
log(f"Categorical feats : {len(cat_cols)}")
log(f"Overall missing % : {df.isnull().mean().mean()*100:.1f}%")

miss_pct  = df.isnull().mean() * 100
high_miss = miss_pct[miss_pct > 80]
log(f"Cols > 80% missing: {len(high_miss)}")



Numeric features  : 192
Categorical feats : 38
Overall missing % : 69.8%
Cols > 80% missing: 154


In [7]:
# =============================================================
# PHASE 3 — DATA PREPARATION
# =============================================================
log("\n" + "=" * 65)
log("PHASE 3 — DATA PREPARATION")
log("=" * 65)

orig_df  = df.copy()
df_clean = df.copy()

drop_cols = high_miss.index.tolist()
df_clean.drop(columns=drop_cols, inplace=True)
log(f"Dropped {len(drop_cols)} high-missing columns. Remaining: {df_clean.shape[1]}")

numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
cat_cols     = df_clean.select_dtypes(include=["object"]).columns.tolist()

useful_cat = [c for c in cat_cols if df_clean[c].nunique() < 50]
drop_cat   = [c for c in cat_cols if c not in useful_cat]
df_clean.drop(columns=drop_cat, inplace=True)
log(f"Dropped {len(drop_cat)} high-cardinality cat cols. Remaining: {df_clean.shape[1]}")

numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
cat_cols     = df_clean.select_dtypes(include=["object"]).columns.tolist()
log(f"Final numeric: {len(numeric_cols)}, categorical: {len(cat_cols)}")

le = LabelEncoder()
for c in cat_cols:
    df_clean[c] = df_clean[c].fillna("__missing__")
    df_clean[c] = le.fit_transform(df_clean[c].astype(str))

all_feat_cols = numeric_cols + cat_cols

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])

preprocessor = ColumnTransformer([
    ("num", num_pipe,                               numeric_cols),
    ("cat", SimpleImputer(strategy="most_frequent"), cat_cols)
])

X = df_clean[all_feat_cols]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
log(f"Train size: {len(X_train)} | Val size: {len(X_val)}")




PHASE 3 — DATA PREPARATION
Dropped 154 high-missing columns. Remaining: 76
Dropped 14 high-cardinality cat cols. Remaining: 62
Final numeric: 42, categorical: 20
Train size: 40000 | Val size: 10000


In [8]:
# =============================================================
# PHASE 4 — MODELING
# =============================================================
log("\n" + "=" * 65)
log("PHASE 4 — MODELING")
log("=" * 65)

def make_pipeline(clf):
    return Pipeline([("prep", preprocessor), ("clf", clf)])

models = {
    "Baseline (Majority)"   : DummyClassifier(strategy="most_frequent"),
    "Logistic Regression"   : make_pipeline(
        LogisticRegression(max_iter=1000, C=0.1, class_weight="balanced", random_state=42)
    ),
    "Random Forest"         : make_pipeline(
        RandomForestClassifier(n_estimators=200, max_depth=8,
                               class_weight="balanced", random_state=42, n_jobs=-1)
    ),
    "Gradient Boosting"     : make_pipeline(
        GradientBoostingClassifier(n_estimators=150, max_depth=4,
                                   learning_rate=0.05, random_state=42)
    ),
    "RF Balanced Subsample" : make_pipeline(
        RandomForestClassifier(n_estimators=200, max_depth=10,
                               class_weight="balanced_subsample", random_state=1, n_jobs=-1)
    ),
}

results = {}
for name, model in models.items():
    t0 = time.time()
    log(f"\n  Training: {name} ...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    y_prob = np.zeros(len(y_val)) if name == "Baseline (Majority)" else model.predict_proba(X_val)[:, 1]
    auc     = roc_auc_score(y_val, y_prob) if y_prob.sum() > 0 else 0.5
    ap      = average_precision_score(y_val, y_prob) if y_prob.sum() > 0 else 0
    f1c     = f1_score(y_val, y_pred, pos_label=1, zero_division=0)
    elapsed = time.time() - t0
    results[name] = {"model": model, "y_pred": y_pred, "y_prob": y_prob,
                     "auc": auc, "ap": ap, "f1_churn": f1c, "time": elapsed}
    log(f"    ROC-AUC={auc:.4f}  AP={ap:.4f}  F1-Churn={f1c:.4f}  [{elapsed:.1f}s]")

best_name = max([k for k in results if k != "Baseline (Majority)"],
                key=lambda k: results[k]["auc"])
log(f"\n  Best model: {best_name}  (ROC-AUC={results[best_name]['auc']:.4f})")




PHASE 4 — MODELING

  Training: Baseline (Majority) ...
    ROC-AUC=0.5000  AP=0.0000  F1-Churn=0.0000  [0.0s]

  Training: Logistic Regression ...
    ROC-AUC=0.6718  AP=0.1379  F1-Churn=0.1968  [2.9s]

  Training: Random Forest ...
    ROC-AUC=0.7003  AP=0.1578  F1-Churn=0.2291  [3.7s]

  Training: Gradient Boosting ...
    ROC-AUC=0.7158  AP=0.1936  F1-Churn=0.0266  [81.6s]

  Training: RF Balanced Subsample ...
    ROC-AUC=0.6977  AP=0.1524  F1-Churn=0.2294  [5.5s]

  Best model: Gradient Boosting  (ROC-AUC=0.7158)


In [9]:
# =============================================================
# PHASE 5 — EVALUATION
# =============================================================
log("\n" + "=" * 65)
log("PHASE 5 — EVALUATION")
log("=" * 65)

best = results[best_name]
log("\nClassification Report — " + best_name)
log(classification_report(y_val, best["y_pred"], target_names=["Retained", "Churned"]))

probs      = best["y_prob"]
thresholds = np.linspace(0.1, 0.9, 80)
f1s_thresh, recalls_t, precisions_t = [], [], []
for t in thresholds:
    yp = (probs >= t).astype(int)
    f1s_thresh.append(f1_score(y_val, yp, pos_label=1, zero_division=0))
    recalls_t.append(recall_score(y_val, yp, pos_label=1, zero_division=0))
    precisions_t.append(precision_score(y_val, yp, pos_label=1, zero_division=0))
best_t_idx     = int(np.argmax(f1s_thresh))
best_threshold = float(thresholds[best_t_idx])
log(f"\nOptimal decision threshold (max F1): {best_threshold:.2f}")




PHASE 5 — EVALUATION

Classification Report — Gradient Boosting
              precision    recall  f1-score   support

    Retained       0.93      1.00      0.96      9266
     Churned       0.59      0.01      0.03       734

    accuracy                           0.93     10000
   macro avg       0.76      0.51      0.49     10000
weighted avg       0.90      0.93      0.89     10000


Optimal decision threshold (max F1): 0.12


In [12]:
# =============================================================
# VISUALISATIONS — 10 panels
# =============================================================
PALETTE = ["#2196F3", "#4CAF50", "#FF9800", "#E91E63", "#9C27B0"]
sns.set_style("whitegrid")

fig = plt.figure(figsize=(20, 24))
fig.suptitle("CRISP-DM Churn Prediction Pipeline — Results",
             fontsize=16, fontweight="bold", y=0.98)
gs = gridspec.GridSpec(4, 3, figure=fig, hspace=0.45, wspace=0.35)

# Panel 1: Class Distribution
ax1 = fig.add_subplot(gs[0, 0])
class_counts = pd.Series(y).value_counts().sort_index()
bars = ax1.bar(["Retained (0)", "Churned (1)"], class_counts.values,
               color=["#4CAF50", "#E91E63"], edgecolor="white", linewidth=1.5)
for bar, v in zip(bars, class_counts.values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height()+300,
             f"{v:,}\n({v/len(y)*100:.1f}%)", ha="center", fontsize=10, fontweight="bold")
ax1.set_title("Class Distribution", fontweight="bold")
ax1.set_ylabel("Count")
ax1.set_ylim(0, class_counts.max() * 1.2)

# Panel 2: Missingness Distribution
ax2 = fig.add_subplot(gs[0, 1])
miss_bins   = pd.cut(orig_df.isnull().mean()*100, bins=[0,20,40,60,80,100],
                     labels=["0-20%","20-40%","40-60%","60-80%","80-100%"])
miss_counts = miss_bins.value_counts().sort_index()
ax2.bar(miss_counts.index, miss_counts.values, color=PALETTE, edgecolor="white")
ax2.set_title("Feature Missingness Distribution", fontweight="bold")
ax2.set_xlabel("Missing %"); ax2.set_ylabel("# Features")

# Panel 3: Model Comparison
ax3 = fig.add_subplot(gs[0, 2])
mnames = list(results.keys())
aucs = [results[m]["auc"] for m in mnames]
f1s  = [results[m]["f1_churn"] for m in mnames]
x, w = np.arange(len(mnames)), 0.35
ax3.bar(x-w/2, aucs, w, label="ROC-AUC",  color="#2196F3", alpha=0.85)
ax3.bar(x+w/2, f1s,  w, label="F1-Churn", color="#E91E63", alpha=0.85)
ax3.set_xticks(x)
ax3.set_xticklabels([m.replace(" ","\n") for m in mnames], fontsize=7)
ax3.axhline(0.5, ls="--", color="gray", alpha=0.5)
ax3.set_ylim(0, 1.05); ax3.set_title("Model Comparison", fontweight="bold")
ax3.legend(fontsize=8); ax3.set_ylabel("Score")

# Panel 4: ROC Curves
ax4 = fig.add_subplot(gs[1, 0:2])
for i, (name, res) in enumerate(results.items()):
    if name == "Baseline (Majority)": continue
    fpr, tpr, _ = roc_curve(y_val, res["y_prob"])
    ax4.plot(fpr, tpr, lw=2, color=PALETTE[i%len(PALETTE)],
             label=f"{name} (AUC={res['auc']:.3f})")
ax4.plot([0,1],[0,1],"k--",alpha=0.4)
fpr_b, tpr_b, _ = roc_curve(y_val, best["y_prob"])
ax4.fill_between(fpr_b, tpr_b, alpha=0.08, color=PALETTE[0])
ax4.set_xlabel("False Positive Rate"); ax4.set_ylabel("True Positive Rate")
ax4.set_title("ROC Curves — All Models", fontweight="bold")
ax4.legend(fontsize=8, loc="lower right")
ax4.set_xlim([0,1]); ax4.set_ylim([0,1])

# Panel 5: Precision-Recall
ax5 = fig.add_subplot(gs[1, 2])
for i, (name, res) in enumerate(results.items()):
    if name == "Baseline (Majority)": continue
    prec, rec, _ = precision_recall_curve(y_val, res["y_prob"])
    ax5.plot(rec, prec, lw=1.8, color=PALETTE[i%len(PALETTE)],
             label=f"{name[:12]} (AP={res['ap']:.3f})")
ax5.axhline(y.mean(), ls="--", color="gray", alpha=0.5, label="Baseline")
ax5.set_xlabel("Recall"); ax5.set_ylabel("Precision")
ax5.set_title("Precision-Recall Curves", fontweight="bold")
ax5.legend(fontsize=6.5, loc="upper right")

# Panel 6: Confusion Matrix
ax6 = fig.add_subplot(gs[2, 0])
cm = confusion_matrix(y_val, best["y_pred"])
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Retained","Churned"],
            yticklabels=["Retained","Churned"], ax=ax6,
            annot_kws={"size":13,"weight":"bold"})
ax6.set_title(f"Confusion Matrix\n{best_name}", fontweight="bold")
ax6.set_ylabel("Actual"); ax6.set_xlabel("Predicted")

# Panel 7: Feature Importances
ax7 = fig.add_subplot(gs[2, 1:])
try:
    clf_step = best["model"].named_steps["clf"]
    if hasattr(clf_step, "feature_importances_"):
        imp = clf_step.feature_importances_
        feat_names = ([f"num_{c}" for c in numeric_cols] +
                      [f"cat_{c}" for c in cat_cols])
        if len(imp) == len(feat_names):
            fi_df = pd.DataFrame({"feature": feat_names, "importance": imp})
            fi_df = fi_df.nlargest(20, "importance")
            ax7.barh(fi_df["feature"], fi_df["importance"], color="#2196F3", alpha=0.8)
            ax7.set_xlabel("Importance")
            ax7.set_title(f"Top-20 Feature Importances\n{best_name}", fontweight="bold")
            ax7.invert_yaxis()
        else:
            ax7.text(0.5, 0.5, "Feature count mismatch", ha="center", transform=ax7.transAxes)
    else:
        ax7.text(0.5, 0.5, "Feature importance not available", ha="center",
                 va="center", transform=ax7.transAxes, fontsize=11)
        ax7.set_title("Feature Importances", fontweight="bold")
except Exception as e:
    ax7.text(0.5, 0.5, f"Could not compute importances:\n{e}",
             ha="center", va="center", transform=ax7.transAxes, fontsize=9)

# Panel 8: Score Distribution
ax8 = fig.add_subplot(gs[3, 0])
ax8.hist(probs[y_val==0], bins=40, alpha=0.6, color="#4CAF50", label="Retained", density=True)
ax8.hist(probs[y_val==1], bins=40, alpha=0.6, color="#E91E63", label="Churned",  density=True)
ax8.axvline(0.5, ls="--", color="black", alpha=0.6, label="Default threshold")
ax8.set_xlabel("Predicted Churn Probability"); ax8.set_ylabel("Density")
ax8.set_title("Score Distribution by Class", fontweight="bold")
ax8.legend()

# Panel 9: Threshold Tuning
ax9 = fig.add_subplot(gs[3, 1])
ax9.plot(thresholds, f1s_thresh,   color="#2196F3", lw=2, label="F1")
ax9.plot(thresholds, recalls_t,    color="#4CAF50", lw=2, label="Recall")
ax9.plot(thresholds, precisions_t, color="#E91E63", lw=2, label="Precision")
ax9.axvline(best_threshold, ls="--", color="black",
            label=f"Best thresh={best_threshold:.2f}")
ax9.set_xlabel("Decision Threshold"); ax9.set_ylabel("Score")
ax9.set_title("Threshold Tuning", fontweight="bold")
ax9.legend(fontsize=8)

# Panel 10: Leaderboard Table
ax10 = fig.add_subplot(gs[3, 2])
ax10.axis("off")
table_data = [
    [name[:22], f"{res['auc']:.4f}", f"{res['ap']:.4f}",
     f"{res['f1_churn']:.4f}", f"{res['time']:.1f}s"]
    for name, res in results.items()
]
tbl = ax10.table(cellText=table_data,
                 colLabels=["Model","ROC-AUC","Avg Prec","F1-Churn","Time"],
                 loc="center", cellLoc="center")
tbl.auto_set_font_size(False); tbl.set_fontsize(7.5); tbl.scale(1.1, 1.6)
for (r, c), cell in tbl.get_celld().items():
    if r == 0:
        cell.set_facecolor("#2196F3")
        cell.set_text_props(color="white", fontweight="bold")
    elif r > 0 and table_data[r-1][0].strip() == best_name[:22].strip():
        cell.set_facecolor("#E8F5E9")
ax10.set_title("Model Leaderboard", fontweight="bold", pad=12)

plt.savefig("crisp_dm_results.png", dpi=150, bbox_inches="tight", facecolor="white")
plt.close()
log("  Saved: crisp_dm_results.png")



  Saved: crisp_dm_results.png


In [13]:
# =============================================================
# PHASE 6 — DEPLOYMENT ARTIFACTS
# =============================================================
log("\n" + "=" * 65)
log("PHASE 6 — DEPLOYMENT")
log("=" * 65)

with open("best_model.pkl", "wb") as f:
    pickle.dump(best["model"], f)
log("  Saved: best_model.pkl")

metadata = {
    "best_model"        : best_name,
    "roc_auc"           : round(float(best["auc"]), 4),
    "avg_precision"     : round(float(best["ap"]), 4),
    "f1_churn"          : round(float(best["f1_churn"]), 4),
    "optimal_threshold" : round(best_threshold, 4),
    "features_used"     : all_feat_cols,
    "n_features"        : len(all_feat_cols),
    "train_size"        : int(len(X_train)),
    "val_size"          : int(len(X_val)),
    "class_distribution": {"retained": int((y==0).sum()), "churned": int((y==1).sum())},
    "churn_rate_pct"    : round(float(y.mean()*100), 2)
}
with open("model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
log("  Saved: model_metadata.json")

with open("crisp_dm_report.txt", "w") as f:
    f.write("\n".join(REPORT_LINES))
log("  Saved: crisp_dm_report.txt")

pred_df = pd.DataFrame({
    "churn_probability": best["y_prob"],
    "predicted_label"  : (best["y_prob"] >= best_threshold).astype(int),
    "actual_label"     : y_val
})
pred_df.to_csv("validation_predictions.csv", index=False)
log("  Saved: validation_predictions.csv")

log("\n" + "=" * 65)
log("PIPELINE COMPLETE")
log(f"  Best Model    : {best_name}")
log(f"  ROC-AUC       : {best['auc']:.4f}")
log(f"  F1-Churn      : {best['f1_churn']:.4f}")
log(f"  Opt Threshold : {best_threshold:.2f}")
log("=" * 65)


PHASE 6 — DEPLOYMENT
  Saved: best_model.pkl
  Saved: model_metadata.json
  Saved: crisp_dm_report.txt
  Saved: validation_predictions.csv

PIPELINE COMPLETE
  Best Model    : Gradient Boosting
  ROC-AUC       : 0.7158
  F1-Churn      : 0.0266
  Opt Threshold : 0.12
