<a href="https://colab.research.google.com/github/M-M-Sobhy/Analyzing-Customer-Churn/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %% ONE-CELL: Telco Churn Stacking (XGB + LGBM + CatBoost + RF + HGB + MLP) -> Meta LogisticRegression
# Works out-of-the-box on Google Colab with a single cell.

# 0) Install deps (quiet)
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost matplotlib joblib

# 1) Imports
import os, json, sys, math, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Tuple
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_curve, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 2) Config
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"   # ← لو اسم الملف مختلف عدّله هنا
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42
OUT_DIR = Path("outputs"); OUT_DIR.mkdir(exist_ok=True)

# 3) Locate dataset anywhere in the left pane (recursively)
def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Make sure it is visible on the left panel.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# 4) Load & basic clean
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in the CSV.")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# 5) Build a preprocessor (OHE for all non-numerics)
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# 6) Split train/test once (temporal info not present in Telco)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# 7) Define base models (diverse)
def get_base_models() -> Dict[str, object]:
    return {
        "xgb": XGBClassifier(
            n_estimators=400, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist"
        ),
        "lgb": LGBMClassifier(
            n_estimators=500, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE
        ),
        "cat": CatBoostClassifier(
            iterations=500, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        ),
        "rf": RandomForestClassifier(n_estimators=350, n_jobs=-1, random_state=RANDOM_STATE),
        "hgb": HistGradientBoostingClassifier(max_iter=350, learning_rate=0.08, random_state=RANDOM_STATE),
        "mlp": MLPClassifier(hidden_layer_sizes=(128,64), activation="relu",
                             alpha=1e-3, max_iter=250, random_state=RANDOM_STATE),
    }

base_models = get_base_models()
base_names = list(base_models.keys())
M = len(base_names)
print(f"Base learners: {base_names}")

# 8) OOF stacking function: builds level-1 OOF matrix for meta-learner
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray, Dict[str, Dict[str, float]], List[Pipeline]]:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores: Dict[str, Dict[str, float]] = {}
    fitted_full_pipes: List[Pipeline] = []

    # For consistent indexing
    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline(steps=[
                ("pre", preprocessor),
                ("clf", clf)
            ])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = pipe.predict_proba(X_va_f)[:,1]
            preds_oof[va_idx] = p_va
            # fold metrics (on OOF)
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        # OOF scores aggregated
        aucs = [a for a, p in fold_scores]
        prs  = [p for a, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        # Fit FULL train for test predictions
        full_pipe = Pipeline(steps=[
            ("pre", preprocessor),
            ("clf", clf.__class__(**clf.get_params()))  # fresh clone with same params
        ])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        test_meta[:, j] = full_pipe.predict_proba(X_te_idx)[:,1]
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | "
              f"PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Building OOF & test meta matrices...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# 9) Meta-learner (Logistic Regression) trained on OOF
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

# 10) Evaluate on test (using level-1 test_matrix)
y_prob = meta.predict_proba(test_matrix)[:,1]
y_hat  = (y_prob >= 0.5).astype(int)

meta_scores = {
    "roc_auc": float(roc_auc_score(y_test, y_prob)),
    "pr_auc":  float(average_precision_score(y_test, y_prob)),
    "f1":      float(f1_score(y_test, y_hat))
}

# 11) Plots
def plot_curves(y_true, y_score, out_dir: Path):
    # ROC
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label="Meta ROC")
    plt.plot([0,1],[0,1], linestyle="--")
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title("ROC Curve (Meta-Learner)"); plt.legend(); plt.tight_layout()
    plt.savefig(out_dir/"roc_curve_meta.png", dpi=160); plt.close()

    # PR
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    plt.figure()
    plt.plot(recall, precision, label="Meta PR")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.title("Precision-Recall Curve (Meta-Learner)"); plt.legend(); plt.tight_layout()
    plt.savefig(out_dir/"pr_curve_meta.png", dpi=160); plt.close()

plot_curves(y_test, y_prob, OUT_DIR)

# 12) Business-aware threshold (optional, quick)
def optimal_profit_threshold(y_true, y_score, reward_tp=50.0, cost_fp=10.0, cost_fn=20.0):
    best_t, best_p = 0.5, -1e18
    for t in np.linspace(0.05, 0.95, 181):
        y_pred = (y_score >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        profit = tp*reward_tp - fp*cost_fp - fn*cost_fn
        if profit > best_p:
            best_p, best_t = profit, t
    return float(best_t), float(best_p)

best_t, best_profit = optimal_profit_threshold(y_test, y_prob)

# 13) Save artifacts (model = meta + fitted base pipes)
artifacts = {
    "base_oof_scores": base_oof_scores,
    "meta_scores": meta_scores,
    "best_threshold": best_t,
    "best_threshold_profit": best_profit,
    "base_models_order": base_names
}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))

# Save a single pickle with everything needed for later inference
bundle = {
    "base_pipes": fitted_base_pipes,   # list of (preprocessor+model) pipelines fitted on full train
    "meta": meta,                      # fitted meta-learner
    "base_order": base_names
}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")

print(json.dumps({
    "oof_base_scores": base_oof_scores,
    "meta_scores": meta_scores,
    "best_threshold": best_t,
    "outputs": {
        "metrics_json": str(OUT_DIR/"metrics.json"),
        "roc_curve": str(OUT_DIR/"roc_curve_meta.png"),
        "pr_curve": str(OUT_DIR/"pr_curve_meta.png"),
        "model_bundle": str(OUT_DIR/"stacking_bundle.pkl")
    }
}, indent=2))


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb', 'mlp']
→ Building OOF & test meta matrices...
[xgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6321
[LightGBM] [Info] Number of positive: 1121, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 4225, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265325 -> initscore=-1.018470
[LightGBM] [Info] Start training from score -1.018470




[LightGBM] [Info] Number of positive: 1121, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 4225, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265325 -> initscore=-1.018470
[LightGBM] [Info] Start training from score -1.018470




[LightGBM] [Info] Number of positive: 1122, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 4226, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265499 -> initscore=-1.017579
[LightGBM] [Info] Start training from score -1.017579




[LightGBM] [Info] Number of positive: 1122, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 4226, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265499 -> initscore=-1.017579
[LightGBM] [Info] Start training from score -1.017579




[LightGBM] [Info] Number of positive: 1122, number of negative: 3104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 4226, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265499 -> initscore=-1.017579
[LightGBM] [Info] Start training from score -1.017579




[LightGBM] [Info] Number of positive: 1402, number of negative: 3880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000777 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 5282, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265430 -> initscore=-1.017935
[LightGBM] [Info] Start training from score -1.017935




[lgb] OOF ROC-AUC=0.8204 | PR-AUC=0.6139
[cat] OOF ROC-AUC=0.8341 | PR-AUC=0.6457
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6159
[hgb] OOF ROC-AUC=0.8238 | PR-AUC=0.6229
[mlp] OOF ROC-AUC=0.7603 | PR-AUC=0.5848
{
  "oof_base_scores": {
    "xgb": {
      "oof_roc_auc_mean": 0.8254507754233513,
      "oof_pr_auc_mean": 0.6321342362150559
    },
    "lgb": {
      "oof_roc_auc_mean": 0.8204364442423702,
      "oof_pr_auc_mean": 0.6138892325593518
    },
    "cat": {
      "oof_roc_auc_mean": 0.8341403633535815,
      "oof_pr_auc_mean": 0.6456745455349153
    },
    "rf": {
      "oof_roc_auc_mean": 0.8214415530217665,
      "oof_pr_auc_mean": 0.6158901692851213
    },
    "hgb": {
      "oof_roc_auc_mean": 0.8238124462785444,
      "oof_pr_auc_mean": 0.6228601950922066
    },
    "mlp": {
      "oof_roc_auc_mean": 0.7602676802551376,
      "oof_pr_auc_mean": 0.5848197350110097
    }
  },
  "meta_scores": {
    "roc_auc": 0.8396834012358143,
    "pr_auc": 0.6437354414400763,
    "f1": 0.62910798

In [3]:
# %% Compare CatBoost (single model) vs Stacking on the SAME test split
# This cell now includes the necessary steps to generate the 'outputs/stacking_bundle.pkl' file
# by incorporating relevant code from the previous cell.

# 0) Install deps (quiet) - Included from previous cell
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost matplotlib joblib

# 1) Imports - Included from previous cell
import os, json, sys, math, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Tuple
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_curve, roc_curve, confusion_matrix, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tqdm.auto import trange
import warnings

warnings.filterwarnings("ignore")

# 2) Config - Included from previous cell
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42
OUT_DIR = Path("outputs"); OUT_DIR.mkdir(exist_ok=True)

# 3) Locate dataset anywhere in the left pane (recursively) - Included from previous cell
def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Make sure it is visible on the left panel.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# 4) Load & basic clean - Included from previous cell
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in the CSV.")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# 5) Build a preprocessor (OHE for all non-numerics) - Included from previous cell
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# 6) Split train/test once (temporal info not present in Telco) - Included from previous cell
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# 7) Define base models (diverse) - Included from previous cell
def get_base_models() -> Dict[str, object]:
    return {
        "xgb": XGBClassifier(
            n_estimators=400, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist"
        ),
        "lgb": LGBMClassifier(
            n_estimators=500, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE
        ),
        "cat": CatBoostClassifier(
            iterations=500, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        ),
        "rf": RandomForestClassifier(n_estimators=350, n_jobs=-1, random_state=RANDOM_STATE),
        "hgb": HistGradientBoostingClassifier(max_iter=350, learning_rate=0.08, random_state=RANDOM_STATE),
        "mlp": MLPClassifier(hidden_layer_sizes=(128,64), activation="relu",
                             alpha=1e-3, max_iter=250, random_state=RANDOM_STATE),
    }

base_models = get_base_models()
base_names = list(base_models.keys())
M = len(base_names)
print(f"Base learners: {base_names}")

# 8) OOF stacking function: builds level-1 OOF matrix for meta-learner - Included from previous cell
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray, Dict[str, Dict[str, float]], List[Pipeline]]:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores: Dict[str, Dict[str, float]] = {}
    fitted_full_pipes: List[Pipeline] = []

    # For consistent indexing
    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline(steps=[
                ("pre", preprocessor),
                ("clf", clf)
            ])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = pipe.predict_proba(X_va_f)[:,1]
            preds_oof[va_idx] = p_va
            # fold metrics (on OOF)
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        # OOF scores aggregated
        aucs = [a for a, p in fold_scores]
        prs  = [p for a, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        # Fit FULL train for test predictions
        full_pipe = Pipeline(steps=[
            ("pre", preprocessor),
            ("clf", clf.__class__(**clf.get_params()))  # fresh clone with same params
        ])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        test_meta[:, j] = full_pipe.predict_proba(X_te_idx)[:,1]
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | "
              f"PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Building OOF & test meta matrices...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# 9) Meta-learner (Logistic Regression) trained on OOF - Included from previous cell
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

# 10) Evaluate on test (using level-1 test_matrix) - Included from previous cell
y_prob = meta.predict_proba(test_matrix)[:,1]
y_hat  = (y_prob >= 0.5).astype(int)

meta_scores = {
    "roc_auc": float(roc_auc_score(y_test, y_prob)),
    "pr_auc":  float(average_precision_score(y_test, y_prob)),
    "f1":      float(f1_score(y_test, y_hat))
}

# 11) Plots - Included from previous cell
def plot_curves(y_true, y_score, out_dir: Path):
    # ROC
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label="Meta ROC")
    plt.plot([0,1],[0,1], linestyle="--")
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title("ROC Curve (Meta-Learner)"); plt.legend(); plt.tight_layout()
    plt.savefig(out_dir/"roc_curve_meta.png", dpi=160); plt.close()

    # PR
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    plt.figure()
    plt.plot(recall, precision, label="Meta PR")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.title("Precision-Recall Curve (Meta-Learner)"); plt.legend(); plt.tight_layout()
    plt.savefig(out_dir/"pr_curve_meta.png", dpi=160); plt.close()

plot_curves(y_test, y_prob, OUT_DIR)

# 12) Business-aware threshold (optional, quick) - Included from previous cell
def optimal_profit_threshold(y_true, y_score, reward_tp=50.0, cost_fp=10.0, cost_fn=20.0):
    best_t, best_p = 0.5, -1e18
    for t in np.linspace(0.05, 0.95, 181):
        y_pred = (y_score >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        profit = tp*reward_tp - fp*cost_fp - fn*cost_fn
        if profit > best_p:
            best_p, best_t = profit, t
    return float(best_t), float(best_p)

best_t, best_profit = optimal_profit_threshold(y_test, y_prob)

# 13) Save artifacts (model = meta + fitted base pipes) - Included from previous cell
artifacts = {
    "base_oof_scores": base_oof_scores,
    "meta_scores": meta_scores,
    "best_threshold": best_t,
    "best_threshold_profit": best_profit,
    "base_models_order": base_names
}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))

# Save a single pickle with everything needed for later inference - Included from previous cell
bundle = {
    "base_pipes": fitted_base_pipes,   # list of (preprocessor+model) pipelines fitted on full train
    "meta": meta,                      # fitted meta-learner
    "base_order": base_names
}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")

print(json.dumps({
    "oof_base_scores": base_oof_scores,
    "meta_scores": meta_scores,
    "best_threshold": best_t,
    "outputs": {
        "metrics_json": str(OUT_DIR/"metrics.json"),
        "roc_curve": str(OUT_DIR/"roc_curve_meta.png"),
        "pr_curve": str(OUT_DIR/"pr_curve_meta.png"),
        "model_bundle": str(OUT_DIR/"stacking_bundle.pkl")
    }
}, indent=2))

# -------------------
# 14) Load your trained bundle (stacking) and extract CatBoost pipe
# -------------------
# Removed redundant data loading and splitting from original cell
# bundle = joblib.load("outputs/stacking_bundle.pkl") # Already loaded above
# base_pipes = bundle["base_pipes"]          # fitted on FULL train # Already loaded above
# base_order = bundle["base_order"]          # e.g., ['xgb','lgb','cat','rf','hgb','mlp'] # Already loaded above
# meta = bundle["meta"] # Already loaded above

# Find CatBoost pipe by name
try:
    cat_idx = base_order.index("cat")
except ValueError:
    raise RuntimeError("Couldn't find 'cat' in base_order; check your base model names.")

pipe_cat = base_pipes[cat_idx]

# -------------------
# 15) Predict probabilities
# -------------------
# CatBoost alone:
p_cat = pipe_cat.predict_proba(X_test_raw)[:,1] # Using X_test_raw from the combined code

# Stacking:
test_meta = np.column_stack([p.predict_proba(X_test_raw)[:,1] for p in base_pipes]) # Using X_test_raw from the combined code
p_stack = meta.predict_proba(test_meta)[:,1]

# Load profit-aware threshold you computed سابقاً (لو موجود)
t_star = None
mjson = Path("outputs/metrics.json")
if mjson.exists():
    cfg = json.loads(mjson.read_text())
    t_star = float(cfg.get("best_threshold", 0.5))
else:
    t_star = 0.5

# -------------------
# 16) Metrics @probabilities (AUCs) + threshold sweep
# -------------------
def summarize_probs(y_true, y_score, name, thr=0.5):
    roc = roc_auc_score(y_true, y_score)
    pr  = average_precision_score(y_true, y_score)
    y_pred = (y_score >= thr).astype(int)
    f1 = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        "model": name, "ROC_AUC": roc, "PR_AUC": pr, "F1@thr": f1,
        "Precision@thr": prec, "Recall@thr": rec, "thr": thr,
        "TP": tp, "FP": fp, "FN": fn, "TN": tn
    }

# Baseline at business threshold
sum_cat   = summarize_probs(y_test, p_cat,   "CatBoost",  t_star) # Using y_test from the combined code
sum_stack = summarize_probs(y_test, p_stack, "Stacking",  t_star) # Using y_test from the combined code
summary_df = pd.DataFrame([sum_cat, sum_stack])
print("== Summary @ business threshold ==")
display(summary_df)

# Threshold sweep table
thr_grid = np.linspace(0.05, 0.95, 37)
def sweep_table(y_true, y_score, name):
    rows = []
    for t in thr_grid:
        rows.append(summarize_probs(y_true, y_score, name, t))
    return pd.DataFrame(rows)

sweep_cat   = sweep_table(y_test, p_cat,   "CatBoost") # Using y_test from the combined code
sweep_stack = sweep_table(y_test, p_stack, "Stacking") # Using y_test from the combined code

# -------------------
# 17) Profit curve (you can tweak costs/rewards)
# -------------------
REWARD_TP = 50.0
COST_FP   = 10.0
COST_FN   = 20.0

def profit_at(y_true, y_score, thr):
    y_pred = (y_score >= thr).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tp*REWARD_TP - fp*COST_FP - fn*COST_FN

def profit_curve(y_true, y_score):
    return pd.DataFrame({
        "threshold": thr_grid,
        "profit": [profit_at(y_true, y_score, t) for t in thr_grid]
    })

profit_cat   = profit_curve(y_test, p_cat).assign(model="CatBoost") # Using y_test from the combined code
profit_stack = profit_curve(y_test, p_stack).assign(model="Stacking") # Using y_test from the combined code
profit_all = pd.concat([profit_cat, profit_stack], ignore_index=True)

# Best profit per model
best_cat   = profit_cat.loc[profit_cat["profit"].idxmax()]
best_stack = profit_stack.loc[profit_stack["profit"].idxmax()]
print(f"Best profit CatBoost: thr={best_cat.threshold:.3f}, profit={best_cat.profit:.1f}")
print(f"Best profit Stacking: thr={best_stack.threshold:.3f}, profit={best_stack.profit:.1f}")

# -------------------
# 18) Bootstrap CIs for ROC-AUC and PR-AUC
# -------------------
def bootstrap_ci_auc(y_true, y_score, n=1000, alpha=0.05, seed=42):
    rng = np.random.default_rng(seed)
    n_ = len(y_true)
    rocs, prs = [], []
    for _ in trange(n, leave=False):
        idx = rng.integers(0, n_, n_)
        yt = np.asarray(y_true)[idx]; ps = np.asarray(y_score)[idx]
        # Guard for degenerate samples with single class
        if len(np.unique(yt)) < 2:
            continue
        rocs.append(roc_auc_score(yt, ps))
        prs.append(average_precision_score(yt, ps))
    rocs = np.array(rocs); prs = np.array(prs)
    lo = int((alpha/2)*len(rocs)); hi = int((1-alpha/2)*len(rocs))
    rocs_ci = np.sort(rocs)[[lo, hi-1]]
    prs_ci  = np.sort(prs)[[lo, hi-1]]
    return {
        "ROC_AUC": (float(np.mean(rocs)), float(rocs_ci[0]), float(rocs_ci[1])),
        "PR_AUC":  float(np.mean(prs)),  float(prs_ci[0]),  float(prs_ci[1])),
        "N": int(len(rocs))
    }

ci_cat   = bootstrap_ci_auc(y_test, p_cat,   n=1000, alpha=0.05, seed=123) # Using y_test from the combined code
ci_stack = bootstrap_ci_auc(y_test, p_stack, n=1000, alpha=0.05, seed=456) # Using y_test from the combined code

def ci_to_df(ci, name):
    return pd.DataFrame([
        {"model": name, "metric":"ROC_AUC", "mean": ci["ROC_AUC"][0], "lo": ci["ROC_AUC"][1], "hi": ci["ROC_AUC"][2], "N": ci["N"]},
        {"model": name, "metric":"PR_AUC",  "mean": ci["PR_AUC"][0],  "lo": ci["PR_AUC"][1], "hi": ci["PR_AUC"][2], "N": ci["N"]},
    ])

ci_df = pd.concat([ci_to_df(ci_cat,"CatBoost"), ci_to_df(ci_stack,"Stacking")], ignore_index=True)
print("== 95% Bootstrap CIs for AUCs ==")
display(ci_df)

# -------------------
# 19) Save artifacts (tables + CSV of predictions)
# -------------------
OUT = Path("outputs"); OUT.mkdir(exist_ok=True)
pd.DataFrame({
    "y_true": y_test, # Using y_test from the combined code
    "p_cat": p_cat,
    "p_stack": p_stack
}).to_csv(OUT/"test_preds_cat_vs_stack.csv", index=False)

summary_df.to_csv(OUT/"summary_at_business_threshold.csv", index=False)
sweep_cat.to_csv(OUT/"sweep_cat.csv", index=False)
sweep_stack.to_csv(OUT/"sweep_stack.csv", index=False)
profit_all.to_csv(OUT/"profit_curves.csv", index=False)

print("Saved:")
print(" - outputs/test_preds_cat_vs_stack.csv")
print(" - outputs/summary_at_business_threshold.csv")
print(" - outputs/sweep_cat.csv")
print(" - outputs/sweep_stack.csv")
print(" - outputs/profit_curves.csv")

# -------------------
# 20) Optional: quick plots (ROC/PR + Profit)
# -------------------
import matplotlib.pyplot as plt

def plot_roc_pr(y_true, p1, p2, name1="CatBoost", name2="Stacking"):
    fpr1, tpr1, _ = roc_curve(y_true, p1)
    fpr2, tpr2, _ = roc_curve(y_true, p2)
    plt.figure(); plt.plot(fpr1, tpr1, label=f"{name1}")
    plt.plot(fpr2, tpr2, label=f"{name2}")
    plt.plot([0,1],[0,1],"--"); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curve"); plt.legend(); plt.tight_layout()
    plt.savefig(OUT/"roc_cat_vs_stack.png", dpi=160); plt.close()

    pr1, rc1, _ = precision_recall_curve(y_true, p1)
    pr2, rc2, _ = precision_recall_curve(y_true, p2)
    plt.figure(); plt.plot(rc1, pr1, label=f"{name1}")
    plt.plot(rc2, pr2, label=f"{name2}")
    plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("PR Curve"); plt.legend(); plt.tight_layout()
    plt.savefig(OUT/"pr_cat_vs_stack.png", dpi=160); plt.close()

def plot_profit(df_profit):
    plt.figure()
    for name, g in df_profit.groupby("model"):
        g = g.sort_values("threshold")
        plt.plot(g["threshold"], g["profit"], label=name)
    plt.xlabel("Threshold"); plt.ylabel("Profit"); plt.title("Profit Curve"); plt.legend(); plt.tight_layout()
    plt.savefig(OUT/"profit_curves_cat_vs_stack.png", dpi=160); plt.close()

plot_roc_pr(y_test, p_cat, p_stack, "CatBoost", "Stacking") # Using y_test from the combined code
plot_profit(profit_all)
print("Saved plots:")
print(" - outputs/roc_cat_vs_stack.png")
print(" - outputs/pr_cat_vs_stack.png")
print(" - outputs/profit_curves_cat_vs_stack.png")

SyntaxError: closing parenthesis ')' does not match opening parenthesis '{' on line 370 (ipython-input-2177251830.py, line 372)

In [4]:
# %% Robust compare: CatBoost (single) vs Stacking
import numpy as np, pandas as pd, json, joblib, warnings
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    precision_recall_curve, roc_curve, confusion_matrix, precision_score, recall_score
)

warnings.filterwarnings("ignore")

# ===== Safety checks =====
CSV = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
BUNDLE = Path("outputs/stacking_bundle.pkl")
OUT = Path("outputs"); OUT.mkdir(exist_ok=True)

# Try to locate CSV if not in cwd
if not Path(CSV).exists():
    import os
    def find_file_recursively(filename, roots=["/content", "."]):
        for root in roots:
            for r, _, files in os.walk(root):
                if filename in files:
                    return os.path.join(r, filename)
        return None
    alt = find_file_recursively(CSV, ["/content", "."])
    if alt:
        print(f"[INFO] Using CSV at: {alt}")
        CSV = alt
    else:
        raise FileNotFoundError(
            f"Could not find {CSV}. Upload it or set CSV path correctly."
        )

if not BUNDLE.exists():
    raise FileNotFoundError(
        "outputs/stacking_bundle.pkl not found. "
        "Run the training cell first (the one that saves the bundle)."
    )

# ===== Load data & split =====
RANDOM_STATE = 42
TEST_SIZE = 0.25

df = pd.read_csv(CSV)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)

if "Churn" not in df.columns:
    raise ValueError("Column 'Churn' not found in CSV.")

y_all = df["Churn"].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int).values
X_all = df.drop(columns=["Churn","customerID"], errors="ignore")

X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(
    X_all, y_all, test_size=TEST_SIZE, stratify=y_all, random_state=RANDOM_STATE
)

# ===== Load bundle (stacking) =====
bundle = joblib.load(BUNDLE)
base_pipes = bundle.get("base_pipes", None)
base_order = bundle.get("base_order", None)
meta = bundle.get("meta", None)

if base_pipes is None or base_order is None or meta is None:
    raise RuntimeError("Bundle is missing keys. Re-run training cell to regenerate the bundle.")

# Get CatBoost pipe
if "cat" not in base_order:
    raise RuntimeError(f"'cat' not found in base_order {base_order}. "
                       "Ensure the base models dict uses key 'cat'.")

cat_idx = base_order.index("cat")
pipe_cat = base_pipes[cat_idx]

# ===== Predict probs =====
p_cat = pipe_cat.predict_proba(X_te_raw)[:,1]
test_meta = np.column_stack([p.predict_proba(X_te_raw)[:,1] for p in base_pipes])
p_stack = meta.predict_proba(test_meta)[:,1]

# ===== Threshold (profit-aware) =====
t_star = 0.5
mjson = Path("outputs/metrics.json")
if mjson.exists():
    try:
        cfg = json.loads(mjson.read_text())
        t_star = float(cfg.get("best_threshold", 0.5))
    except Exception:
        pass
print(f"[INFO] Using threshold = {t_star:.3f}")

# ===== Helpers =====
def summarize_probs(y_true, y_score, name, thr=0.5):
    roc = roc_auc_score(y_true, y_score)
    pr  = average_precision_score(y_true, y_score)
    y_pred = (y_score >= thr).astype(int)
    f1 = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        "model": name, "ROC_AUC": roc, "PR_AUC": pr, "F1@thr": f1,
        "Precision@thr": prec, "Recall@thr": rec, "thr": thr,
        "TP": tp, "FP": fp, "FN": fn, "TN": tn
    }

# ===== Summary @ business threshold =====
sum_cat   = summarize_probs(y_te, p_cat,   "CatBoost",  t_star)
sum_stack = summarize_probs(y_te, p_stack, "Stacking",  t_star)
summary_df = pd.DataFrame([sum_cat, sum_stack])
print("== Summary @ business threshold ==")
print(summary_df.to_string(index=False))

# ===== Profit curve =====
REWARD_TP = 50.0; COST_FP = 10.0; COST_FN = 20.0
thr_grid = np.linspace(0.05, 0.95, 37)

def profit_at(y_true, y_score, thr):
    y_pred = (y_score >= thr).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tp*REWARD_TP - fp*COST_FP - fn*COST_FN

profit_cat   = pd.DataFrame({"threshold": thr_grid, "profit": [profit_at(y_te, p_cat, t) for t in thr_grid]}).assign(model="CatBoost")
profit_stack = pd.DataFrame({"threshold": thr_grid, "profit": [profit_at(y_te, p_stack, t) for t in thr_grid]}).assign(model="Stacking")
profit_all = pd.concat([profit_cat, profit_stack], ignore_index=True)

best_cat   = profit_cat.loc[profit_cat["profit"].idxmax()]
best_stack = profit_stack.loc[profit_stack["profit"].idxmax()]
print(f"Best profit CatBoost: thr={best_cat.threshold:.3f}, profit={best_cat.profit:.1f}")
print(f"Best profit Stacking: thr={best_stack.threshold:.3f}, profit={best_stack.profit:.1f}")

# ===== Save outputs =====
pd.DataFrame({"y_true": y_te, "p_cat": p_cat, "p_stack": p_stack}).to_csv(OUT/"test_preds_cat_vs_stack.csv", index=False)
summary_df.to_csv(OUT/"summary_at_business_threshold.csv", index=False)
profit_all.to_csv(OUT/"profit_curves.csv", index=False)

print("\nSaved:")
for f in ["test_preds_cat_vs_stack.csv", "summary_at_business_threshold.csv", "profit_curves.csv"]:
    print(" -", OUT/f)


FileNotFoundError: outputs/stacking_bundle.pkl not found. Run the training cell first (the one that saves the bundle).

In [6]:
# %% Rebuild stacking_bundle.pkl quickly (same split & pipeline)
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost joblib

import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# ---- Config (match your earlier run) ----
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42
OUT_DIR = Path("outputs"); OUT_DIR.mkdir(exist_ok=True)

# ---- Locate CSV (Colab or CWD) ----
def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Upload it or set CSV_FILE_NAME.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# ---- Load & basic clean ----
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found.")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# ---- Preprocessor ----
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# ---- Split (same as before) ----
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# ---- Base models (slightly lighter to be fast) ----
def get_base_models() -> Dict[str, object]:
    return {
        "xgb": XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        ),
        "lgb": LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        ),
        "cat": CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        ),
        "rf": RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE),
        "hgb": HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE),
        # نقدر نسيب الـ MLP مؤقتًا لتسريع الإنقاذ، لكن هنبقيه زي ما هو:
        # "mlp": MLPClassifier(hidden_layer_sizes=(128,64), activation="relu", alpha=1e-3, max_iter=250, random_state=RANDOM_STATE),
    }

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# ---- Build OOF & test meta ----
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores = {}
    fitted_full_pipes = []

    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline([("pre", preprocessor), ("clf", clf)])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = pipe.predict_proba(X_va_f)[:,1]
            preds_oof[va_idx] = p_va
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        aucs = [a for a, _ in fold_scores]
        prs  = [p for _, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        full_pipe = Pipeline([("pre", preprocessor), ("clf", clf.__class__(**clf.get_params()))])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        test_meta[:, j] = full_pipe.predict_proba(X_te_idx)[:,1]
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# ---- Meta-learner ----
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

# ---- Save bundle ----
bundle = {
    "base_pipes": fitted_base_pipes,
    "meta": meta,
    "base_order": base_names
}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")

# (اختياري) نحفظ شوية مقاييس
artifacts = {
    "base_oof_scores": base_oof_scores,
    "meta_placeholder": True
}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))
print("✓ Saved:", OUT_DIR/"metrics.json")



✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420
[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved: outputs/metrics.json


In [3]:
# =========================================
# ONE-CELL COLAB PIPELINE: STACKING + SHAP
# =========================================

# -- Install deps (quiet) --
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

# -- Python code --
import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")  # for headless savefig
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Optional boosters (will be used if available)
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    XGBClassifier = None
    HAVE_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAVE_LGB = True
except Exception:
    LGBMClassifier = None
    HAVE_LGB = False

try:
    from catboost import CatBoostClassifier
    HAVE_CAT = True
except Exception:
    CatBoostClassifier = None
    HAVE_CAT = False

import shap

# -------------------------
# Config & I/O
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42

OUT_DIR = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(parents=True, exist_ok=True)

def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        if not os.path.exists(root):
            continue
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Upload it or set CSV_FILE_NAME.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# -------------------------
# Load & basic clean
# -------------------------
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Columns: {list(df.columns)}")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# -------------------------
# Preprocessor
# -------------------------
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# -------------------------
# Train / Test split
# -------------------------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# -------------------------
# Base models
# -------------------------
def get_base_models() -> Dict[str, object]:
    models = {}
    if HAVE_XGB and XGBClassifier is not None:
        models["xgb"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        )
    if HAVE_LGB and LGBMClassifier is not None:
        models["lgb"] = LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        )
    if HAVE_CAT and CatBoostClassifier is not None:
        models["cat"] = CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        )
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# Build OOF & Test meta
# -------------------------
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores = {}
    fitted_full_pipes = []

    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline([("pre", preprocessor), ("clf", clf)])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = (pipe.predict_proba(X_va_f)[:,1]
                    if hasattr(pipe.named_steps["clf"], "predict_proba")
                    else pipe.decision_function(X_va_f))
            # map scores to [0,1] if needed
            if p_va.ndim == 1 and (p_va.min() < 0 or p_va.max() > 1):
                from scipy.special import expit
                p_va = expit(p_va)
            preds_oof[va_idx] = p_va
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        aucs = [a for a, _ in fold_scores]
        prs  = [p for _, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        # Fit on full train for test predictions
        full_est = clf.__class__(**clf.get_params())
        full_pipe = Pipeline([("pre", preprocessor), ("clf", full_est)])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        p_test = (full_pipe.predict_proba(X_te_idx)[:,1]
                  if hasattr(full_pipe.named_steps["clf"], "predict_proba")
                  else full_pipe.decision_function(X_te_idx))
        if p_test.ndim == 1 and (p_test.min() < 0 or p_test.max() > 1):
            from scipy.special import expit
            p_test = expit(p_test)
        test_meta[:, j] = p_test
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# -------------------------
# Meta-learner
# -------------------------
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

# quick holdout sanity
meta_val_auc = roc_auc_score(y_test, meta.predict_proba(test_matrix)[:,1])
meta_val_pr  = average_precision_score(y_test, meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

# -------------------------
# Save bundle & metrics
# -------------------------
bundle = {
    "base_pipes": fitted_base_pipes,  # full pipelines (pre + clf)
    "meta": meta,
    "base_order": base_names
}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")

artifacts = {
    "base_oof_scores": base_oof_scores,
    "meta_holdout": {"roc_auc": float(meta_val_auc), "pr_auc": float(meta_val_pr)},
    "used_models": base_names
}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))
print("✓ Saved:", OUT_DIR/"metrics.json")

# =======================================================
# EXPLAINABILITY LAYER (SHAP)
# 1) Meta-level (which base model drove the decision?)
# 2) Feature-level per base model + aggregated Top-10
# 3) Single-customer waterfall example
# =======================================================

# ---- 1) SHAP: meta-learner contributions (base learners as features) ----
X_meta_train = oof_matrix
X_meta_test  = test_matrix

try:
    explainer_meta = shap.LinearExplainer(meta, X_meta_train, feature_names=base_names)
except Exception:
    explainer_meta = shap.Explainer(meta, X_meta_train, feature_names=base_names)

shap_values_meta = explainer_meta(X_meta_test)

plt.figure()
shap.summary_plot(shap_values_meta, feature_names=base_names, show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_base_contributions.png", dpi=200)
plt.close()

plt.figure()
shap.summary_plot(shap_values_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_bar_base_contributions.png", dpi=200)
plt.close()

meta_importance = np.mean(np.abs(shap_values_meta.values), axis=0)
meta_importance = (pd.Series(meta_importance, index=base_names)
                   .sort_values(ascending=False))
meta_importance.to_csv(EXPLAIN_DIR / "meta_base_importance.csv", header=["mean_abs_shap"])

print("✓ Saved meta SHAP summaries & importances.")

# ---- 2) SHAP: original feature-level per base model + aggregated ----
def get_feature_names_from_preprocessor(pre):
    return pre.get_feature_names_out()

def shap_for_pipeline(pipe, X_raw_sample, max_background=512, tag="model"):
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]
    X_enc = pre.transform(X_raw_sample)
    feat_names = get_feature_names_from_preprocessor(pre)

    # Prefer Tree/Linear explainers; fallback to Kernel for anything else
    explainer = None
    # Try a fast path
    try:
        explainer = shap.Explainer(clf, X_enc, feature_names=feat_names)
    except Exception:
        # KernelExplainer (slower) with a small background sample
        bg = shap.sample(X_enc, min(max_background, X_enc.shape[0]))
        try:
            explainer = shap.KernelExplainer(
                lambda data: clf.predict_proba(data)[:,1] if hasattr(clf, "predict_proba") else clf.decision_function(data),
                bg
            )
        except Exception:
            # final fallback: use shap.Explainer with bg
            explainer = shap.Explainer(
                lambda data: clf.predict_proba(data)[:,1] if hasattr(clf, "predict_proba") else clf.decision_function(data),
                bg
            )

    sv = explainer(X_enc)

    # Visual summaries
    plt.figure()
    shap.summary_plot(sv, feature_names=feat_names, show=False)
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / f"{tag}_summary.png", dpi=200)
    plt.close()

    plt.figure()
    shap.summary_plot(sv, feature_names=feat_names, plot_type="bar", show=False)
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / f"{tag}_summary_bar.png", dpi=200)
    plt.close()

    return sv, feat_names

# compact sample for speed
X_for_explain = X_train_raw.sample(n=min(2000, len(X_train_raw)), random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        sv, fn = shap_for_pipeline(pipe, X_for_explain, tag=f"base_{name}")
        vals = sv.values
        if isinstance(vals, list):  # sometimes SV returns list per class
            vals = np.array(vals)
        # collapse multiclass if exists
        if vals.ndim == 3:
            vals = np.mean(np.abs(vals), axis=2)
        mean_abs = np.mean(np.abs(vals), axis=0)
        imp = pd.Series(mean_abs, index=fn).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)
        for f, v in imp.items():
            global_feat_importance[f].append(v)
        print(f"✓ SHAP computed for base model: {name}")
    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# Aggregate across models (mean of |SHAP|)
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)
        .to_csv(EXPLAIN_DIR / "per_model_top10.csv"))
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])
    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated (all failed?).")

# ---- 3) Single-customer waterfall example (use strongest base by meta importance if possible) ----
try:
    # pick the base with highest meta contribution
    best_base = meta_importance.index[0]
    base_idx = base_names.index(best_base)
except Exception:
    best_base = base_names[0]
    base_idx = 0

row0 = X_test_raw.iloc[[0]]
strong_pipe = fitted_base_pipes[base_idx]

try:
    pre = strong_pipe.named_steps["pre"]
    clf = strong_pipe.named_steps["clf"]
    X_enc = pre.transform(row0)
    feat_names = pre.get_feature_names_out()
    expl = shap.Explainer(clf, X_enc, feature_names=feat_names)
    sv = expl(X_enc)
    plt.figure()
    shap.plots.waterfall(sv[0], show=False)
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "single_customer_waterfall.png", dpi=200)
    plt.close()
    print(f"✓ Saved single customer waterfall using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer waterfall failed: {e}")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR} (PNG + CSVs)")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved: outputs/metrics.json




✓ Saved meta SHAP summaries & importances.


  0%|          | 0/2000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [2]:
# =========================
# FAST SHAP PATCH (drop-in)
# =========================
import shap, numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict
from pathlib import Path

EXPLAIN_DIR = (OUT_DIR / "explain"); EXPLAIN_DIR.mkdir(parents=True, exist_ok=True)

def _to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

def _is_tree_model(clf):
    name = clf.__class__.__name__.lower()
    return any(k in name for k in [
        "xgb", "lgbm", "catboost", "forest", "gradientboosting", "histgradient"
    ])

def _feature_names(pre):
    # works with sklearn >=1.0
    return pre.get_feature_names_out()

# --- 2) SHAP سريع على مستوى الخصائص الأصلية لكل base model ---
# عيّنة أصغر للتفسير (سرعة أعلى مع تمثيل كويس)
N_SAMPLE = min(800, len(X_train_raw))
X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

# نجهّز تراكيب التجميع
global_feat_importance = defaultdict(list)
per_model_top10 = {}

# عدد نقاط خلفية (kmeans) للخلفية
K_BG = 64

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre = pipe.named_steps["pre"]
        clf = pipe.named_steps["clf"]

        X_enc = pre.transform(X_for_explain)
        X_enc = _to_dense(X_enc)  # مهم جدًا لو الـOneHot بيطلع sparse
        feat_names = _feature_names(pre)

        if _is_tree_model(clf):
            # المسار السريع: TreeExplainer
            bg = shap.kmeans(X_enc, K_BG)  # خلفية مُلخّصة
            explainer = shap.TreeExplainer(
                clf, data=bg, model_output="probability",
                feature_perturbation="interventional"
            )
            # إيقاف additivity check يسرّع جدًا
            sv_values = explainer.shap_values(X_enc, check_additivity=False)
            # اتّساق مع الإخراج الموحّد (array -> object شبيه بـ sv)
            if isinstance(sv_values, list):  # multiclass
                vals = np.mean([np.abs(v) for v in sv_values], axis=0)
            else:
                vals = np.abs(sv_values)
        else:
            # سريع قدر الإمكان للـlinear؛ وإلا fallback بسيط
            try:
                bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
                explainer = shap.LinearExplainer(clf, bg, feature_names=feat_names)
                sv = explainer(X_enc)
                vals = np.abs(sv.values)
            except Exception:
                bg = shap.kmeans(X_enc, min(32, X_enc.shape[0]))
                predict_fn = (lambda data: clf.predict_proba(data)[:,1]
                              if hasattr(clf, "predict_proba")
                              else clf.decision_function(data))
                explainer = shap.KernelExplainer(predict_fn, bg)
                # نقيّد بعدد محدود من التفسيرات لو لزم
                X_batch = shap.sample(X_enc, min(400, X_enc.shape[0]))
                sv = explainer(X_batch)
                vals = np.abs(sv.values)

        # حساب أهمية متوسط |SHAP|
        mean_abs = np.mean(vals, axis=0)
        imp = pd.Series(mean_abs, index=feat_names).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        # حفظ ملخصات مختصرة (bar فقط أسرع من scatter)
        (imp.head(30)
         .sort_values(ascending=True)
         .plot(kind="barh", figsize=(6, 8)))
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / f"base_{name}_top30_bar.png", dpi=200)
        plt.close()

        # للتجميع العالمي
        for f, v in imp.items():
            global_feat_importance[f].append(v)

        print(f"✓ Fast SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed/slow for base model {name}: {e}")

# تجميع Across models (متوسط الأهميات)
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)
        .to_csv(EXPLAIN_DIR / "per_model_top10.csv"))
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5, 4)))
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "global_top10_bar.png", dpi=200)
    plt.close()

    print("✓ Saved per-model & aggregated feature importances.")

# --- 3) مثال Waterfall لعميل واحد لكن بخطوات سريعة ---
try:
    # اختَر أقوى base بحسب مساهمته لدى الـmeta (محسوبة قبلًا)
    meta_imp = pd.read_csv(EXPLAIN_DIR / "meta_base_importance.csv", index_col=0).iloc[:,0]
    best_base = meta_imp.idxmax()
    base_idx = base_names.index(best_base)
except Exception:
    best_base = base_names[0]
    base_idx = 0

row0 = X_test_raw.iloc[[0]]
pipe = fitted_base_pipes[base_idx]
try:
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]
    X0 = pre.transform(row0)
    X0 = _to_dense(X0)
    fn = _feature_names(pre)

    if _is_tree_model(clf):
        bg = shap.kmeans(_to_dense(pre.transform(X_train_raw.sample(400, random_state=RANDOM_STATE))), 32)
        expl = shap.TreeExplainer(clf, data=bg, model_output="probability", feature_perturbation="interventional")
        sv_vals = expl.shap_values(X0, check_additivity=False)
        vals0 = sv_vals if not isinstance(sv_vals, list) else np.mean([v for v in sv_vals], axis=0)
        # ارسم waterfall يدويًا (bar sorted) أسرع من waterfall الأصلي
        s = pd.Series(vals0[0], index=fn).abs().sort_values(ascending=False).head(15)
        s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
        plt.title(f"Top local contributions for one customer ({best_base})")
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / "single_customer_top15_bar.png", dpi=200)
        plt.close()
    else:
        # Linear/Kernel fallback
        bg = shap.sample(_to_dense(pre.transform(X_train_raw.sample(400, random_state=RANDOM_STATE))), 64)
        expl = shap.LinearExplainer(clf, bg, feature_names=fn)
        sv = expl(X0)
        s = pd.Series(np.abs(sv.values[0]), index=fn).sort_values(ascending=False).head(15)
        s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
        plt.title(f"Top local contributions for one customer ({best_base})")
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / "single_customer_top15_bar.png", dpi=200)
        plt.close()

    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")


! SHAP failed/slow for base model xgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model lgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model cat: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model rf: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model hgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! Single-customer quick chart failed: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!


In [4]:
# =========================================
# ONE-CELL COLAB: STACKING + FAST SHAP
# =========================================

# -- Install dependencies --
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

# -- Python code (headless plotting) --
import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")  # for savefig without GUI
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Dict, List
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Optional boosters (used if available)
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    XGBClassifier = None
    HAVE_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAVE_LGB = True
except Exception:
    LGBMClassifier = None
    HAVE_LGB = False

try:
    from catboost import CatBoostClassifier
    HAVE_CAT = True
except Exception:
    CatBoostClassifier = None
    HAVE_CAT = False

import shap

# -------------------------
# Config & I/O
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"   # عدّل لو الاسم مختلف
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42

# SHAP speed knobs
N_SAMPLE_SHAP = 800     # عدد صفوف التفسير (كان 2000) لتسريع كبير
K_BG = 64               # kmeans background size
EXPLAIN_TOP_BAR = 30    # عدد الخصائص في الرسم الأفقي
SINGLE_LOCAL_TOP = 15   # عدد الخصائص في الشارت المحلي لعميل واحد

OUT_DIR = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(parents=True, exist_ok=True)

def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        if not os.path.exists(root):
            continue
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Upload it to Colab Files or set CSV_FILE_NAME.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# -------------------------
# Load & basic clean
# -------------------------
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Columns: {list(df.columns)}")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# -------------------------
# Preprocessor
# -------------------------
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# -------------------------
# Train / Test split
# -------------------------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# -------------------------
# Base models
# -------------------------
def get_base_models() -> Dict[str, object]:
    models = {}
    if HAVE_XGB and XGBClassifier is not None:
        models["xgb"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        )
    if HAVE_LGB and LGBMClassifier is not None:
        models["lgb"] = LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        )
    if HAVE_CAT and CatBoostClassifier is not None:
        models["cat"] = CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        )
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# Build OOF & Test meta
# -------------------------
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores = {}
    fitted_full_pipes = []

    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline([("pre", preprocessor), ("clf", clf)])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = (pipe.predict_proba(X_va_f)[:,1]
                    if hasattr(pipe.named_steps["clf"], "predict_proba")
                    else pipe.decision_function(X_va_f))
            # Map scores to [0,1] if needed
            if p_va.ndim == 1 and (p_va.min() < 0 or p_va.max() > 1):
                from scipy.special import expit
                p_va = expit(p_va)
            preds_oof[va_idx] = p_va
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        aucs = [a for a, _ in fold_scores]
        prs  = [p for _, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        # Fit on full train for test predictions
        full_est = clf.__class__(**clf.get_params())
        full_pipe = Pipeline([("pre", preprocessor), ("clf", full_est)])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        p_test = (full_pipe.predict_proba(X_te_idx)[:,1]
                  if hasattr(full_pipe.named_steps["clf"], "predict_proba")
                  else full_pipe.decision_function(X_te_idx))
        if p_test.ndim == 1 and (p_test.min() < 0 or p_test.max() > 1):
            from scipy.special import expit
            p_test = expit(p_test)
        test_meta[:, j] = p_test
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# -------------------------
# Meta-learner
# -------------------------
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

# quick holdout sanity
meta_val_auc = roc_auc_score(y_test, meta.predict_proba(test_matrix)[:,1])
meta_val_pr  = average_precision_score(y_test, meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

# -------------------------
# Save bundle & metrics
# -------------------------
bundle = {
    "base_pipes": fitted_base_pipes,  # full pipelines (pre + clf)
    "meta": meta,
    "base_order": base_names
}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")

artifacts = {
    "base_oof_scores": base_oof_scores,
    "meta_holdout": {"roc_auc": float(meta_val_auc), "pr_auc": float(meta_val_pr)},
    "used_models": base_names
}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))
print("✓ Saved:", OUT_DIR/"metrics.json")

# =======================================================
# EXPLAINABILITY (FAST SHAP)
# 1) Meta-level (base learners contributions)
# 2) Feature-level per base model + aggregated Top-10
# 3) Single-customer quick local chart
# =======================================================

# ---- 1) SHAP: meta-learner contributions (base learners as features) ----
X_meta_train = oof_matrix
X_meta_test  = test_matrix

try:
    explainer_meta = shap.LinearExplainer(meta, X_meta_train, feature_names=base_names)
except Exception:
    explainer_meta = shap.Explainer(meta, X_meta_train, feature_names=base_names)

shap_values_meta = explainer_meta(X_meta_test)

plt.figure()
shap.summary_plot(shap_values_meta, feature_names=base_names, show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_base_contributions.png", dpi=200)
plt.close()

plt.figure()
shap.summary_plot(shap_values_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_bar_base_contributions.png", dpi=200)
plt.close()

meta_importance = np.mean(np.abs(shap_values_meta.values), axis=0)
meta_importance = (pd.Series(meta_importance, index=base_names)
                   .sort_values(ascending=False))
meta_importance.to_csv(EXPLAIN_DIR / "meta_base_importance.csv", header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# ---- Helpers for fast SHAP over pipelines ----
def _to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

def _is_tree_model(clf):
    name = clf.__class__.__name__.lower()
    return any(k in name for k in [
        "xgb", "lgbm", "catboost", "forest", "gradientboosting", "histgradient"
    ])

def _feature_names(pre):
    return pre.get_feature_names_out()

# ---- 2) Fast SHAP on original features per base model + aggregation ----
N_SAMPLE = min(N_SAMPLE_SHAP, len(X_train_raw))
X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre = pipe.named_steps["pre"]
        clf = pipe.named_steps["clf"]

        X_enc = pre.transform(X_for_explain)
        X_enc = _to_dense(X_enc)
        feat_names = _feature_names(pre)

        if _is_tree_model(clf):
            # Fast path: TreeExplainer with kmeans background
            bg = shap.kmeans(X_enc, K_BG)
            explainer = shap.TreeExplainer(
                clf, data=bg, model_output="probability",
                feature_perturbation="interventional"
            )
            sv_values = explainer.shap_values(X_enc, check_additivity=False)
            if isinstance(sv_values, list):  # multiclass -> average abs across classes
                vals = np.mean([np.abs(v) for v in sv_values], axis=0)
            else:
                vals = np.abs(sv_values)
        else:
            # Linear/fallback
            try:
                bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
                explainer = shap.LinearExplainer(clf, bg, feature_names=feat_names)
                sv = explainer(X_enc)
                vals = np.abs(sv.values)
            except Exception:
                bg = shap.kmeans(X_enc, min(32, X_enc.shape[0]))
                predict_fn = (lambda data: clf.predict_proba(data)[:,1]
                              if hasattr(clf, "predict_proba")
                              else clf.decision_function(data))
                explainer = shap.KernelExplainer(predict_fn, bg)
                X_batch = shap.sample(X_enc, min(400, X_enc.shape[0]))
                sv = explainer(X_batch)
                vals = np.abs(sv.values)

        mean_abs = np.mean(vals, axis=0)
        imp = pd.Series(mean_abs, index=feat_names).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        # Save compact bar chart (faster than scatter summary)
        (imp.head(EXPLAIN_TOP_BAR)
         .sort_values(ascending=True)
         .plot(kind="barh", figsize=(6, 8)))
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / f"base_{name}_top{EXPLAIN_TOP_BAR}_bar.png", dpi=200)
        plt.close()

        for f, v in imp.items():
            global_feat_importance[f].append(v)

        print(f"✓ Fast SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed/slow for base model {name}: {e}")

# Aggregate across models (mean of |SHAP|)
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)
        .to_csv(EXPLAIN_DIR / "per_model_top10.csv"))
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5, 4)))
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "global_top10_bar.png", dpi=200)
    plt.close()

    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated.")

# ---- 3) Single-customer quick local chart ----
try:
    # pick base with highest meta contribution
    try:
        meta_imp = pd.read_csv(EXPLAIN_DIR / "meta_base_importance.csv", index_col=0).iloc[:,0]
        best_base = meta_imp.idxmax()
        base_idx = base_names.index(best_base)
    except Exception:
        best_base = base_names[0]
        base_idx = 0

    row0 = X_test_raw.iloc[[0]]
    pipe = fitted_base_pipes[base_idx]
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]
    X0 = pre.transform(row0)
    X0 = _to_dense(X0)
    fn = _feature_names(pre)

    if _is_tree_model(clf):
        bg = shap.kmeans(_to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE))), 32)
        expl = shap.TreeExplainer(clf, data=bg, model_output="probability", feature_perturbation="interventional")
        sv_vals = explainer.shap_values(X0, check_additivity=False) if 'explainer' in locals() else expl.shap_values(X0, check_additivity=False)
        vals0 = sv_vals if not isinstance(sv_vals, list) else np.mean([v for v in sv_vals], axis=0)
        s = pd.Series(np.abs(vals0[0]), index=fn).sort_values(ascending=False).head(SINGLE_LOCAL_TOP)
    else:
        bg = shap.sample(_to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE))), 64)
        expl = shap.LinearExplainer(clf, bg, feature_names=fn)
        sv = expl(X0)
        s = pd.Series(np.abs(sv.values[0]), index=fn).sort_values(ascending=False).head(SINGLE_LOCAL_TOP)

    s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
    plt.title(f"Top local contributions for one customer ({best_base})")
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "single_customer_top_bar.png", dpi=200)
    plt.close()
    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR} (PNGs + CSVs)")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved: outputs/metrics.json
✓ Saved meta SHAP summaries & importances.
! SHAP failed/slow for base model xgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model lgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model cat: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model rf: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed/slow for base model hgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! No per-model SHAP importances aggregated.
! Single-customer quick chart failed: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!

=== OUTPUTS ===


In [5]:
# =========================================
# ONE-CELL COLAB: STACKING + FAST SHAP (maskers)
# Compatible with newer SHAP (>=0.45)
# =========================================

# -- Install dependencies --
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

# -- Python code (headless plotting) --
import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")  # for savefig without GUI
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Dict, List
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Optional boosters (used if available)
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    XGBClassifier = None
    HAVE_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAVE_LGB = True
except Exception:
    LGBMClassifier = None
    HAVE_LGB = False

try:
    from catboost import CatBoostClassifier
    HAVE_CAT = True
except Exception:
    CatBoostClassifier = None
    HAVE_CAT = False

import shap

# -------------------------
# Config & I/O
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"   # عدّل لو الاسم مختلف
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42

# SHAP speed knobs
N_SAMPLE_SHAP = 800     # عدد صفوف التفسير (كان 2000) لتسريع كبير
K_BG = 64               # kmeans/Independent background size
EXPLAIN_TOP_BAR = 30    # عدد الخصائص في الرسم الأفقي
SINGLE_LOCAL_TOP = 15   # عدد الخصائص في الشارت المحلي لعميل واحد

OUT_DIR = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(parents=True, exist_ok=True)

def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        if not os.path.exists(root):
            continue
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Upload it to Colab Files or set CSV_FILE_NAME.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# -------------------------
# Load & basic clean
# -------------------------
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Columns: {list(df.columns)}")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# -------------------------
# Preprocessor
# -------------------------
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# -------------------------
# Train / Test split
# -------------------------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# -------------------------
# Base models
# -------------------------
def get_base_models() -> Dict[str, object]:
    models = {}
    if HAVE_XGB and XGBClassifier is not None:
        models["xgb"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        )
    if HAVE_LGB and LGBMClassifier is not None:
        models["lgb"] = LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        )
    if HAVE_CAT and CatBoostClassifier is not None:
        models["cat"] = CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        )
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# Build OOF & Test meta
# -------------------------
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores = {}
    fitted_full_pipes = []

    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline([("pre", preprocessor), ("clf", clf)])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = (pipe.predict_proba(X_va_f)[:,1]
                    if hasattr(pipe.named_steps["clf"], "predict_proba")
                    else pipe.decision_function(X_va_f))
            # Map scores to [0,1] if needed
            if p_va.ndim == 1 and (p_va.min() < 0 or p_va.max() > 1):
                from scipy.special import expit
                p_va = expit(p_va)
            preds_oof[va_idx] = p_va
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        aucs = [a for a, _ in fold_scores]
        prs  = [p for _, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        # Fit on full train for test predictions
        full_est = clf.__class__(**clf.get_params())
        full_pipe = Pipeline([("pre", preprocessor), ("clf", full_est)])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        p_test = (full_pipe.predict_proba(X_te_idx)[:,1]
                  if hasattr(full_pipe.named_steps["clf"], "predict_proba")
                  else full_pipe.decision_function(X_te_idx))
        if p_test.ndim == 1 and (p_test.min() < 0 or p_test.max() > 1):
            from scipy.special import expit
            p_test = expit(p_test)
        test_meta[:, j] = p_test
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# -------------------------
# Meta-learner
# -------------------------
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

# quick holdout sanity
meta_val_auc = roc_auc_score(y_test, meta.predict_proba(test_matrix)[:,1])
meta_val_pr  = average_precision_score(y_test, meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

# -------------------------
# Save bundle & metrics
# -------------------------
bundle = {
    "base_pipes": fitted_base_pipes,  # full pipelines (pre + clf)
    "meta": meta,
    "base_order": base_names
}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")

artifacts = {
    "base_oof_scores": base_oof_scores,
    "meta_holdout": {"roc_auc": float(meta_val_auc), "pr_auc": float(meta_val_pr)},
    "used_models": base_names
}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))
print("✓ Saved:", OUT_DIR/"metrics.json")

# =======================================================
# EXPLAINABILITY (FAST SHAP with maskers)
# 1) Meta-level (base learners contributions)
# 2) Feature-level per base model + aggregated Top-10
# 3) Single-customer quick local chart
# =======================================================

# ---- Helpers ----
def _to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

def _is_tree_model(clf):
    name = clf.__class__.__name__.lower()
    return any(k in name for k in [
        "xgb", "lgbm", "catboost", "forest", "gradientboosting", "histgradient"
    ])

def _feature_names(pre):
    return pre.get_feature_names_out()

# ---- 1) SHAP: meta-learner contributions (base learners as features) ----
X_meta_train = oof_matrix
X_meta_test  = test_matrix

try:
    masker_meta = shap.maskers.Independent(X_meta_train)
    explainer_meta = shap.LinearExplainer(meta, masker_meta, feature_names=base_names)
except Exception:
    explainer_meta = shap.Explainer(meta, shap.maskers.Independent(X_meta_train), feature_names=base_names)

shap_values_meta = explainer_meta(X_meta_test)

plt.figure()
shap.summary_plot(shap_values_meta, feature_names=base_names, show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_base_contributions.png", dpi=200)
plt.close()

plt.figure()
shap.summary_plot(shap_values_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_bar_base_contributions.png", dpi=200)
plt.close()

meta_importance = np.mean(np.abs(shap_values_meta.values), axis=0)
meta_importance = (pd.Series(meta_importance, index=base_names)
                   .sort_values(ascending=False))
meta_importance.to_csv(EXPLAIN_DIR / "meta_base_importance.csv", header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# ---- 2) Fast SHAP on original features per base model + aggregation (with maskers) ----
N_SAMPLE = min(N_SAMPLE_SHAP, len(X_train_raw))
X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre = pipe.named_steps["pre"]
        clf = pipe.named_steps["clf"]

        X_enc = pre.transform(X_for_explain)
        X_enc = _to_dense(X_enc)
        feat_names = _feature_names(pre)

        if _is_tree_model(clf):
            # Fast path: TreeExplainer with Independent masker (bg from kmeans/sample)
            # استخدم kmeans لو تحب؛ لكن Independent + sample سريع ومقبول
            bg = shap.sample(X_enc, min(K_BG*10, X_enc.shape[0]))
            masker = shap.maskers.Independent(bg)
            explainer = shap.TreeExplainer(
                clf, masker=masker, model_output="probability",
                feature_perturbation="interventional"
            )
            sv = explainer(X_enc, check_additivity=False)
            vals = np.abs(sv.values)  # (n_samples, n_features) أو (n_samples, n_classes, n_features)
            if vals.ndim == 3:        # multiclass → متوسط |SHAP| عبر الكلاسات
                vals = np.mean(vals, axis=1)
        else:
            # Linear/fallback with Independent masker
            bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
            masker = shap.maskers.Independent(bg)
            try:
                explainer = shap.LinearExplainer(clf, masker, feature_names=feat_names)
                sv = explainer(X_enc)
                vals = np.abs(sv.values)
            except Exception:
                # Kernel fallback (أبطأ؛ نقيد الحجم)
                predict_fn = (lambda data: clf.predict_proba(data)[:,1]
                              if hasattr(clf, "predict_proba")
                              else clf.decision_function(data))
                explainer = shap.KernelExplainer(predict_fn, bg)
                X_batch = shap.sample(X_enc, min(400, X_enc.shape[0]))
                sv = explainer(X_batch)
                vals = np.abs(sv.values)

        mean_abs = np.mean(vals, axis=0)
        imp = pd.Series(mean_abs, index=feat_names).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        # Save compact bar chart (faster than scatter summary)
        (imp.head(EXPLAIN_TOP_BAR)
         .sort_values(ascending=True)
         .plot(kind="barh", figsize=(6, 8)))
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / f"base_{name}_top{EXPLAIN_TOP_BAR}_bar.png", dpi=200)
        plt.close()

        for f, v in imp.items():
            global_feat_importance[f].append(v)

        print(f"✓ Fast SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# Aggregate across models (mean of |SHAP|)
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)
        .to_csv(EXPLAIN_DIR / "per_model_top10.csv"))
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5, 4)))
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "global_top10_bar.png", dpi=200)
    plt.close()

    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated.")

# ---- 3) Single-customer quick local chart (with masker) ----
try:
    # pick base with highest meta contribution
    try:
        meta_imp = pd.read_csv(EXPLAIN_DIR / "meta_base_importance.csv", index_col=0).iloc[:,0]
        best_base = meta_imp.idxmax()
        base_idx = base_names.index(best_base)
    except Exception:
        best_base = base_names[0]
        base_idx = 0

    row0 = X_test_raw.iloc[[0]]
    pipe = fitted_base_pipes[base_idx]
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]
    X0 = _to_dense(pre.transform(row0))
    fn = _feature_names(pre)

    if _is_tree_model(clf):
        bg = _to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)))
        masker = shap.maskers.Independent(bg)
        expl = shap.TreeExplainer(clf, masker=masker, model_output="probability", feature_perturbation="interventional")
        sv0 = expl(X0, check_additivity=False)
        vals0 = np.abs(sv0.values[0])
        if vals0.ndim == 2:  # multiclass (n_classes, n_features)
            vals0 = np.mean(vals0, axis=0)
    else:
        bg = _to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)))
        masker = shap.maskers.Independent(bg)
        expl = shap.LinearExplainer(clf, masker, feature_names=fn)
        sv0 = expl(X0)
        vals0 = np.abs(sv0.values[0])

    s = pd.Series(vals0, index=fn).sort_values(ascending=False).head(SINGLE_LOCAL_TOP)
    s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
    plt.title(f"Top local contributions for one customer ({best_base})")
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "single_customer_top_bar.png", dpi=200)
    plt.close()
    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR} (PNGs + CSVs)")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved: outputs/metrics.json
✓ Saved meta SHAP summaries & importances.
! SHAP failed for base model xgb: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model lgb: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model cat: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model rf: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model hgb: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! No per-model SHAP importances aggregated.
! Single-customer quick chart failed: TreeExplainer.__init__() got an unexpected keyword argument 'masker'

=== OUTPUTS ===
-

In [6]:
# =========================================
# ONE-CELL COLAB: STACKING + FAST SHAP
# Version-robust for SHAP (old/new APIs)
# =========================================

# -- Install deps --
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib packaging

# -- Python code --
import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Dict, List
from collections import defaultdict
from packaging.version import Version

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Optional boosters
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    XGBClassifier = None
    HAVE_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAVE_LGB = True
except Exception:
    LGBMClassifier = None
    HAVE_LGB = False

try:
    from catboost import CatBoostClassifier
    HAVE_CAT = True
except Exception:
    CatBoostClassifier = None
    HAVE_CAT = False

import shap
SHAP_VER = Version(shap.__version__)
print("SHAP version:", SHAP_VER)

# -------------------------
# Config
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42

# SHAP knobs
N_SAMPLE_SHAP = 800   # speed-friendly sample size
K_BG = 64             # background size (kmeans/sample)
EXPLAIN_TOP_BAR = 30
SINGLE_LOCAL_TOP = 15

OUT_DIR = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(parents=True, exist_ok=True)

def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        if not os.path.exists(root):
            continue
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Upload it or set CSV_FILE_NAME.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# -------------------------
# Load & clean
# -------------------------
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Columns: {list(df.columns)}")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# -------------------------
# Preprocessor
# -------------------------
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# -------------------------
# Split
# -------------------------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# -------------------------
# Base models
# -------------------------
def get_base_models() -> Dict[str, object]:
    models = {}
    if HAVE_XGB and XGBClassifier is not None:
        models["xgb"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        )
    if HAVE_LGB and LGBMClassifier is not None:
        models["lgb"] = LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        )
    if HAVE_CAT and CatBoostClassifier is not None:
        models["cat"] = CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        )
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# OOF & Test matrix
# -------------------------
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores = {}
    fitted_full_pipes = []

    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline([("pre", preprocessor), ("clf", clf)])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = (pipe.predict_proba(X_va_f)[:,1]
                    if hasattr(pipe.named_steps["clf"], "predict_proba")
                    else pipe.decision_function(X_va_f))
            if p_va.ndim == 1 and (p_va.min() < 0 or p_va.max() > 1):
                from scipy.special import expit
                p_va = expit(p_va)
            preds_oof[va_idx] = p_va
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        aucs = [a for a, _ in fold_scores]
        prs  = [p for _, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        full_est = clf.__class__(**clf.get_params())
        full_pipe = Pipeline([("pre", preprocessor), ("clf", full_est)])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        p_test = (full_pipe.predict_proba(X_te_idx)[:,1]
                  if hasattr(full_pipe.named_steps["clf"], "predict_proba")
                  else full_pipe.decision_function(X_te_idx))
        if p_test.ndim == 1 and (p_test.min() < 0 or p_test.max() > 1):
            from scipy.special import expit
            p_test = expit(p_test)
        test_meta[:, j] = p_test
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# -------------------------
# Meta-learner
# -------------------------
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)

meta_val_auc = roc_auc_score(y_test, meta.predict_proba(test_matrix)[:,1])
meta_val_pr  = average_precision_score(y_test, meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

# Save bundle & metrics
bundle = {"base_pipes": fitted_base_pipes, "meta": meta, "base_order": base_names}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")
artifacts = {"base_oof_scores": base_oof_scores, "meta_holdout": {"roc_auc": float(meta_val_auc), "pr_auc": float(meta_val_pr)}, "used_models": base_names}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))
print("✓ Saved:", OUT_DIR/"metrics.json")

# =======================================================
# EXPLAINABILITY (FAST SHAP) — version-robust
# =======================================================
def _to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

def _is_tree_model(clf):
    n = clf.__class__.__name__.lower()
    return any(k in n for k in ["xgb", "lgbm", "catboost", "forest", "gradientboosting", "histgradient"])

def _feat_names(pre):
    return pre.get_feature_names_out()

# ---- (1) Meta-level SHAP (base learners as features) ----
X_meta_train, X_meta_test = oof_matrix, test_matrix
try:
    if SHAP_VER >= Version("0.45.0"):
        masker_meta = shap.maskers.Independent(X_meta_train)
        explainer_meta = shap.LinearExplainer(meta, masker_meta, feature_names=base_names)
        sv_meta = explainer_meta(X_meta_test)
    else:
        explainer_meta = shap.LinearExplainer(meta, X_meta_train)
        sv_meta = explainer_meta.shap_values(X_meta_test)  # old API returns np.ndarray
        # Wrap to unified object-like structure
        class _SV:
            def __init__(self, values): self.values = values
        sv_meta = _SV(sv_meta)
except Exception:
    explainer_meta = shap.Explainer(meta, X_meta_train, feature_names=base_names)
    sv_meta = explainer_meta(X_meta_test)

plt.figure(); shap.summary_plot(sv_meta, feature_names=base_names, show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"meta_summary_base_contributions.png", dpi=200); plt.close()
plt.figure(); shap.summary_plot(sv_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"meta_summary_bar_base_contributions.png", dpi=200); plt.close()

meta_imp = np.mean(np.abs(getattr(sv_meta, "values", sv_meta)), axis=0)
meta_imp = pd.Series(meta_imp, index=base_names).sort_values(ascending=False)
meta_imp.to_csv(EXPLAIN_DIR/"meta_base_importance.csv", header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# ---- (2) Per-base model SHAP on original features + aggregation ----
N_SAMPLE = min(N_SAMPLE_SHAP, len(X_train_raw))
X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
        X_enc = _to_dense(pre.transform(X_for_explain))
        fn = _feat_names(pre)

        if _is_tree_model(clf):
            if SHAP_VER >= Version("0.45.0"):
                bg = shap.sample(X_enc, min(K_BG*10, X_enc.shape[0]))
                masker = shap.maskers.Independent(bg)
                expl = shap.TreeExplainer(clf, masker=masker, model_output="probability", feature_perturbation="interventional")
                sv = expl(X_enc, check_additivity=False)
                vals = np.abs(sv.values)
                if vals.ndim == 3:  # (n_samples, n_classes, n_features)
                    vals = np.mean(vals, axis=1)
            else:
                # old API
                try:
                    bg = shap.kmeans(X_enc, K_BG)
                except Exception:
                    bg = shap.sample(X_enc, min(K_BG*10, X_enc.shape[0]))
                expl = shap.TreeExplainer(clf, data=bg, model_output="probability", feature_perturbation="interventional")
                sv_vals = expl.shap_values(X_enc, check_additivity=False)
                if isinstance(sv_vals, list):  # multiclass
                    vals = np.mean([np.abs(v) for v in sv_vals], axis=0)
                else:
                    vals = np.abs(sv_vals)
        else:
            # Linear / others
            if SHAP_VER >= Version("0.45.0"):
                bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
                masker = shap.maskers.Independent(bg)
                expl = shap.LinearExplainer(clf, masker, feature_names=fn)
                sv = expl(X_enc)
                vals = np.abs(sv.values)
            else:
                try:
                    bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
                    expl = shap.LinearExplainer(clf, bg)
                    sv_vals = expl.shap_values(X_enc)
                    vals = np.abs(sv_vals)
                except Exception:
                    # Kernel fallback (restrict size)
                    predict_fn = (lambda data: clf.predict_proba(data)[:,1]
                                  if hasattr(clf, "predict_proba")
                                  else clf.decision_function(data))
                    try:
                        bg = shap.kmeans(X_enc, min(32, X_enc.shape[0]))
                    except Exception:
                        bg = shap.sample(X_enc, min(256, X_enc.shape[0]))
                    expl = shap.KernelExplainer(predict_fn, bg)
                    X_batch = shap.sample(X_enc, min(400, X_enc.shape[0]))
                    sv_vals = expl.shap_values(X_batch)
                    vals = np.abs(sv_vals)

        mean_abs = np.mean(vals, axis=0)
        imp = pd.Series(mean_abs, index=fn).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        (imp.head(EXPLAIN_TOP_BAR).sort_values(ascending=True)
            .plot(kind="barh", figsize=(6,8)))
        plt.tight_layout(); plt.savefig(EXPLAIN_DIR/f"base_{name}_top{EXPLAIN_TOP_BAR}_bar.png", dpi=200); plt.close()

        for f, v in imp.items():
            global_feat_importance[f].append(v)

        print(f"✓ Fast SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# Aggregate
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)).to_csv(EXPLAIN_DIR/"per_model_top10.csv")
    agg.to_csv(EXPLAIN_DIR/"global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR/"global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5,4)))
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"global_top10_bar.png", dpi=200); plt.close()

    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated.")

# ---- (3) Single-customer quick local chart (version-robust) ----
try:
    try:
        meta_imp = pd.read_csv(EXPLAIN_DIR/"meta_base_importance.csv", index_col=0).iloc[:,0]
        best_base = meta_imp.idxmax()
        base_idx = base_names.index(best_base)
    except Exception:
        best_base, base_idx = base_names[0], 0

    row0 = X_test_raw.iloc[[0]]
    pipe = fitted_base_pipes[base_idx]
    pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
    X0 = _to_dense(pre.transform(row0)); fn = _feat_names(pre)

    if _is_tree_model(clf):
        if SHAP_VER >= Version("0.45.0"):
            bg = _to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)))
            masker = shap.maskers.Independent(bg)
            expl = shap.TreeExplainer(clf, masker=masker, model_output="probability", feature_perturbation="interventional")
            sv0 = expl(X0, check_additivity=False)
            vals0 = np.abs(sv0.values[0])
            if vals0.ndim == 2:  # (n_classes, n_features)
                vals0 = np.mean(vals0, axis=0)
        else:
            try:
                bg = shap.kmeans(_to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE))), 32)
            except Exception:
                bg = _to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)))
            expl = shap.TreeExplainer(clf, data=bg, model_output="probability", feature_perturbation="interventional")
            sv_vals = expl.shap_values(X0, check_additivity=False)
            if isinstance(sv_vals, list):
                vals0 = np.mean([np.array(v)[0] for v in sv_vals], axis=0)
            else:
                vals0 = np.abs(np.array(sv_vals)[0])
    else:
        if SHAP_VER >= Version("0.45.0"):
            bg = _to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)))
            masker = shap.maskers.Independent(bg)
            expl = shap.LinearExplainer(clf, masker, feature_names=fn)
            sv = expl(X0)
            vals0 = np.abs(sv.values[0])
        else:
            bg = _to_dense(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)))
            expl = shap.LinearExplainer(clf, bg)
            sv_vals = expl.shap_values(X0)
            vals0 = np.abs(np.array(sv_vals)[0])

    s = pd.Series(vals0, index=fn).sort_values(ascending=False).head(SINGLE_LOCAL_TOP)
    s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
    plt.title(f"Top local contributions for one customer ({best_base})")
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"single_customer_top_bar.png", dpi=200); plt.close()
    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR} (PNGs + CSVs)")


SHAP version: 0.49.1
✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved: outputs/metrics.json
✓ Saved meta SHAP summaries & importances.
! SHAP failed for base model xgb: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model lgb: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model cat: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model rf: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! SHAP failed for base model hgb: TreeExplainer.__init__() got an unexpected keyword argument 'masker'
! No per-model SHAP importances aggregated.
! Single-customer quick chart failed: TreeExplainer.__init__() got an unexpected keyword argument 'masker'

=== OUTPUTS ===
-

In [7]:
# =======================================================
# EXPLAINABILITY (FAST SHAP) — TreeExplainer with data=
# Compatible with SHAP 0.49.1 (no `masker` kw)
# =======================================================

import shap, numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict

def _to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

def _is_tree_model(clf):
    n = clf.__class__.__name__.lower()
    return any(k in n for k in ["xgb", "lgbm", "catboost", "forest", "gradientboosting", "histgradient"])

def _feat_names(pre):
    return pre.get_feature_names_out()

# ---- (A) Per-base model SHAP على الخصائص الأصلية + تجميع ----
N_SAMPLE = min(800, len(X_train_raw))   # تقدر تزودها لو حابب الدقة أعلى
K_BG = 64                               # حجم الخلفية لـ kmeans
EXPLAIN_TOP_BAR = 30

X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
        X_enc = _to_dense(pre.transform(X_for_explain))
        fn = _feat_names(pre)

        if _is_tree_model(clf):
            # المسار السريع للموديلات الشجرية: TreeExplainer بـ data=
            try:
                bg = shap.kmeans(X_enc, K_BG)
            except Exception:
                bg = shap.sample(X_enc, min(K_BG*10, X_enc.shape[0]))

            expl = shap.TreeExplainer(
                clf,
                data=bg,                            # << المفتاح هنا
                model_output="probability",
                feature_perturbation="interventional"
            )
            sv_vals = expl.shap_values(X_enc, check_additivity=False)

            # توحيد الشكل: قد تكون list في حالات multiclass
            if isinstance(sv_vals, list):
                vals = np.mean([np.abs(v) for v in sv_vals], axis=0)  # متوسط |SHAP| عبر الكلاسات
            else:
                vals = np.abs(sv_vals)
        else:
            # Linear/fallback: LinearExplainer بـ data= أو KernelExplainer
            try:
                bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
                expl = shap.LinearExplainer(clf, bg)
                vals = np.abs(expl.shap_values(X_enc))
            except Exception:
                # Kernel fallback (أبطأ؛ نقيد الحجم)
                try:
                    bg = shap.kmeans(X_enc, min(32, X_enc.shape[0]))
                except Exception:
                    bg = shap.sample(X_enc, min(256, X_enc.shape[0]))
                predict_fn = (lambda data: clf.predict_proba(data)[:,1]
                              if hasattr(clf, "predict_proba")
                              else clf.decision_function(data))
                expl = shap.KernelExplainer(predict_fn, bg)
                X_batch = shap.sample(X_enc, min(400, X_enc.shape[0]))
                vals = np.abs(expl.shap_values(X_batch))

        mean_abs = np.mean(vals, axis=0)
        imp = pd.Series(mean_abs, index=fn).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        # رسم سريع (bar) بدلاً من scatter
        (imp.head(EXPLAIN_TOP_BAR).sort_values(ascending=True)
            .plot(kind="barh", figsize=(6, 8)))
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / f"base_{name}_top{EXPLAIN_TOP_BAR}_bar.png", dpi=200)
        plt.close()

        for f, v in imp.items():
            global_feat_importance[f].append(v)

        print(f"✓ Fast SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# تجميع عبر الموديلات (متوسط |SHAP|)
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)
        .to_csv(EXPLAIN_DIR / "per_model_top10.csv"))
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5, 4)))
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "global_top10_bar.png", dpi=200)
    plt.close()

    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated.")

# ---- (B) Single-customer quick local chart (باستخدام data=) ----
try:
    try:
        meta_imp = pd.read_csv(EXPLAIN_DIR / "meta_base_importance.csv", index_col=0).iloc[:,0]
        best_base = meta_imp.idxmax()
        base_idx = base_names.index(best_base)
    except Exception:
        best_base, base_idx = base_names[0], 0

    row0 = X_test_raw.iloc[[0]]
    pipe = fitted_base_pipes[base_idx]
    pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
    X0 = _to_dense(pre.transform(row0)); fn = _feat_names(pre)

    if _is_tree_model(clf):
        try:
            bg0 = shap.kmeans(_to_dense(pre.transform(
                X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)
            )), 32)
        except Exception:
            bg0 = _to_dense(pre.transform(
                X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)
            ))

        expl0 = shap.TreeExplainer(
            clf,
            data=bg0,                          # << المهم
            model_output="probability",
            feature_perturbation="interventional"
        )
        sv0 = expl0.shap_values(X0, check_additivity=False)
        if isinstance(sv0, list):
            vals0 = np.mean([np.array(v)[0] for v in sv0], axis=0)
        else:
            vals0 = np.abs(np.array(sv0)[0])
    else:
        bg0 = _to_dense(pre.transform(
            X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)
        ))
        expl0 = shap.LinearExplainer(clf, bg0)
        vals0 = np.abs(np.array(expl0.shap_values(X0))[0])

    s = pd.Series(vals0, index=fn).sort_values(ascending=False).head(15)
    s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
    plt.title(f"Top local contributions for one customer ({best_base})")
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "single_customer_top_bar.png", dpi=200)
    plt.close()
    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")


! SHAP failed for base model xgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed for base model lgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed for base model cat: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed for base model rf: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! SHAP failed for base model hgb: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!
! No per-model SHAP importances aggregated.
! Single-customer quick chart failed: Unsupported masker type: <class 'shap.utils._legacy.DenseData'>!


In [8]:
# =========================================
# ONE-CELL COLAB: STACKING + SHAP (robust)
# Works with SHAP 0.49.x (no `masker` kw)
# =========================================

# -- Install deps --
!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

# -- Python code --
import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")  # headless plotting
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Dict, List
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Optional boosters
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    XGBClassifier = None
    HAVE_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAVE_LGB = True
except Exception:
    LGBMClassifier = None
    HAVE_LGB = False

try:
    from catboost import CatBoostClassifier
    HAVE_CAT = True
except Exception:
    CatBoostClassifier = None
    HAVE_CAT = False

import shap

# -------------------------
# Config
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"   # غيّر لو الاسم مختلف
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42

# SHAP speed knobs
N_SAMPLE_SHAP = 800     # حجم العينة للتفسير (سرعة مقابل دقة)
K_BG = 64               # حجم الخلفية (kmeans/sample)
EXPLAIN_TOP_BAR = 30    # عدد الخصائص في الـbar chart
SINGLE_LOCAL_TOP = 15   # عدد الخصائص المحلية (عميل واحد)

OUT_DIR = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(parents=True, exist_ok=True)

def find_file_recursively(filename: str, roots: List[str] = ["/content", "."]) -> str:
    for root in roots:
        if not os.path.exists(root):
            continue
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"Could not find {filename}. Upload it to Colab Files or set CSV_FILE_NAME.")

csv_path = find_file_recursively(CSV_FILE_NAME, ["/content", "."])
print(f"✓ Found dataset at: {csv_path}")

# -------------------------
# Load & clean
# -------------------------
df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Columns: {list(df.columns)}")

y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

# -------------------------
# Preprocessor
# -------------------------
cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# -------------------------
# Split
# -------------------------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# -------------------------
# Base models
# -------------------------
def get_base_models() -> Dict[str, object]:
    models = {}
    if HAVE_XGB and XGBClassifier is not None:
        models["xgb"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        )
    if HAVE_LGB and LGBMClassifier is not None:
        models["lgb"] = LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        )
    if HAVE_CAT and CatBoostClassifier is not None:
        models["cat"] = CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        )
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# OOF & Test matrix
# -------------------------
def build_oof_and_test_matrix(
    models: Dict[str, object],
    X_tr_raw: pd.DataFrame, y_tr: np.ndarray,
    X_te_raw: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train = X_tr_raw.shape[0]
    n_test  = X_te_raw.shape[0]
    oof = np.zeros((n_train, len(models)), dtype=float)
    test_meta = np.zeros((n_test, len(models)), dtype=float)
    per_model_scores = {}
    fitted_full_pipes = []

    X_tr_idx = X_tr_raw.reset_index(drop=True)
    X_te_idx = X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j, (name, clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train, dtype=float)
        fold_scores = []

        for tr_idx, va_idx in skf.split(X_tr_idx, y_tr_idx):
            X_tr_f, X_va_f = X_tr_idx.iloc[tr_idx], X_tr_idx.iloc[va_idx]
            y_tr_f, y_va_f = y_tr_idx[tr_idx], y_tr_idx[va_idx]

            pipe = Pipeline([("pre", preprocessor), ("clf", clf)])
            pipe.fit(X_tr_f, y_tr_f)
            p_va = (pipe.predict_proba(X_va_f)[:,1]
                    if hasattr(pipe.named_steps["clf"], "predict_proba")
                    else pipe.decision_function(X_va_f))
            if p_va.ndim == 1 and (p_va.min() < 0 or p_va.max() > 1):
                from scipy.special import expit
                p_va = expit(p_va)
            preds_oof[va_idx] = p_va
            fold_scores.append((
                roc_auc_score(y_va_f, p_va),
                average_precision_score(y_va_f, p_va)
            ))

        aucs = [a for a, _ in fold_scores]
        prs  = [p for _, p in fold_scores]
        per_model_scores[name] = {
            "oof_roc_auc_mean": float(np.mean(aucs)),
            "oof_pr_auc_mean":  float(np.mean(prs))
        }
        oof[:, j] = preds_oof

        full_est = clf.__class__(**clf.get_params())
        full_pipe = Pipeline([("pre", preprocessor), ("clf", full_est)])
        full_pipe.fit(X_tr_idx, y_tr_idx)
        p_test = (full_pipe.predict_proba(X_te_idx)[:,1]
                  if hasattr(full_pipe.named_steps["clf"], "predict_proba")
                  else full_pipe.decision_function(X_te_idx))
        if p_test.ndim == 1 and (p_test.min() < 0 or p_test.max() > 1):
            from scipy.special import expit
            p_test = expit(p_test)
        test_meta[:, j] = p_test
        fitted_full_pipes.append(full_pipe)

        print(f"[{name}] OOF ROC-AUC={per_model_scores[name]['oof_roc_auc_mean']:.4f} | PR-AUC={per_model_scores[name]['oof_pr_auc_mean']:.4f}")

    return oof, test_meta, per_model_scores, fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix, test_matrix, base_oof_scores, fitted_base_pipes = build_oof_and_test_matrix(
    base_models, X_train_raw, y_train, X_test_raw, n_splits=N_FOLDS, random_state=RANDOM_STATE
)

# -------------------------
# Meta-learner
# -------------------------
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix, y_train)
meta_val_auc = roc_auc_score(y_test, meta.predict_proba(test_matrix)[:,1])
meta_val_pr  = average_precision_score(y_test, meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

# Save bundle & metrics
bundle = {"base_pipes": fitted_base_pipes, "meta": meta, "base_order": base_names}
joblib.dump(bundle, OUT_DIR/"stacking_bundle.pkl")
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")
artifacts = {"base_oof_scores": base_oof_scores, "meta_holdout": {"roc_auc": float(meta_val_auc), "pr_auc": float(meta_val_pr)}, "used_models": base_names}
(OUT_DIR/"metrics.json").write_text(json.dumps(artifacts, indent=2))
print("✓ Saved:", OUT_DIR/"metrics.json")

# =======================================================
# EXPLAINABILITY (SHAP) — robust for 0.49.x
# =======================================================
def _to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

def _to_ndarray(bg):
    """Accept numpy/pandas or SHAP DenseData -> np.ndarray"""
    if hasattr(bg, "data"):  # DenseData legacy
        return np.asarray(bg.data)
    return np.asarray(bg)

def _is_tree_model(clf):
    n = clf.__class__.__name__.lower()
    return any(k in n for k in ["xgb", "lgbm", "catboost", "forest", "gradientboosting", "histgradient"])

def _feat_names(pre):
    return pre.get_feature_names_out()

# ---- (1) Meta-level SHAP (base learners as features) ----
# Use old-style LinearExplainer with data= (compatible & fast)
explainer_meta = shap.LinearExplainer(meta, oof_matrix)
sv_meta = explainer_meta.shap_values(test_matrix)  # ndarray shape (n_test, n_models)

# Save meta-level charts
plt.figure()
shap.summary_plot(sv_meta, feature_names=base_names, show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_base_contributions.png", dpi=200)
plt.close()

plt.figure()
shap.summary_plot(sv_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(EXPLAIN_DIR / "meta_summary_bar_base_contributions.png", dpi=200)
plt.close()

meta_importance = np.mean(np.abs(sv_meta), axis=0)
pd.Series(meta_importance, index=base_names).sort_values(ascending=False)\
  .to_csv(EXPLAIN_DIR / "meta_base_importance.csv", header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# ---- (2) Per-base model SHAP on original features + aggregation ----
N_SAMPLE = min(N_SAMPLE_SHAP, len(X_train_raw))
X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
        X_enc = _to_dense(pre.transform(X_for_explain))
        fn = _feat_names(pre)

        if _is_tree_model(clf):
            # TreeExplainer with data= and ndarray background
            try:
                bg = shap.kmeans(X_enc, K_BG)
            except Exception:
                bg = shap.sample(X_enc, min(K_BG*10, X_enc.shape[0]))
            bg_arr = _to_ndarray(bg)

            expl = shap.TreeExplainer(
                clf,
                data=bg_arr,
                model_output="probability",
                feature_perturbation="interventional"
            )
            sv_vals = expl.shap_values(X_enc, check_additivity=False)

            if isinstance(sv_vals, list):   # multiclass -> mean |SHAP| across classes
                vals = np.mean([np.abs(v) for v in sv_vals], axis=0)
            else:
                vals = np.abs(sv_vals)
        else:
            # LinearExplainer with data=; fallback to Kernel if needed
            try:
                bg = shap.sample(X_enc, min(128, X_enc.shape[0]))
                bg_arr = _to_ndarray(bg)
                expl = shap.LinearExplainer(clf, bg_arr)
                vals = np.abs(expl.shap_values(X_enc))
            except Exception:
                try:
                    bg = shap.kmeans(X_enc, min(32, X_enc.shape[0]))
                except Exception:
                    bg = shap.sample(X_enc, min(256, X_enc.shape[0]))
                bg_arr = _to_ndarray(bg)
                predict_fn = (lambda data: clf.predict_proba(data)[:,1]
                              if hasattr(clf, "predict_proba")
                              else clf.decision_function(data))
                expl = shap.KernelExplainer(predict_fn, bg_arr)
                X_batch = shap.sample(X_enc, min(400, X_enc.shape[0]))
                vals = np.abs(expl.shap_values(X_batch))

        mean_abs = np.mean(vals, axis=0)
        imp = pd.Series(mean_abs, index=fn).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        (imp.head(EXPLAIN_TOP_BAR).sort_values(ascending=True)
            .plot(kind="barh", figsize=(6, 8)))
        plt.tight_layout()
        plt.savefig(EXPLAIN_DIR / f"base_{name}_top{EXPLAIN_TOP_BAR}_bar.png", dpi=200)
        plt.close()

        for f, v in imp.items():
            global_feat_importance[f].append(v)

        print(f"✓ Fast SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# Aggregate across models (mean |SHAP|)
if len(global_feat_importance) > 0:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    (pd.DataFrame(per_model_top10).fillna(0.0)
        .to_csv(EXPLAIN_DIR / "per_model_top10.csv"))
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5, 4)))
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "global_top10_bar.png", dpi=200)
    plt.close()

    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated.")

# ---- (3) Single-customer quick local chart ----
try:
    try:
        meta_imp = pd.read_csv(EXPLAIN_DIR / "meta_base_importance.csv", index_col=0).iloc[:,0]
        best_base = meta_imp.idxmax(); base_idx = base_names.index(best_base)
    except Exception:
        best_base, base_idx = base_names[0], 0

    row0 = X_test_raw.iloc[[0]]
    pipe = fitted_base_pipes[base_idx]
    pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
    X0 = _to_dense(pre.transform(row0)); fn = _feat_names(pre)

    if _is_tree_model(clf):
        try:
            bg0 = shap.kmeans(_to_dense(pre.transform(
                X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)
            )), 32)
        except Exception:
            bg0 = _to_dense(pre.transform(
                X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)
            ))
        bg0_arr = _to_ndarray(bg0)

        expl0 = shap.TreeExplainer(
            clf,
            data=bg0_arr,
            model_output="probability",
            feature_perturbation="interventional"
        )
        sv0 = expl0.shap_values(X0, check_additivity=False)
        if isinstance(sv0, list):
            vals0 = np.mean([np.array(v)[0] for v in sv0], axis=0)
        else:
            vals0 = np.abs(np.array(sv0)[0])
    else:
        bg0 = _to_dense(pre.transform(
            X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE)
        ))
        bg0_arr = _to_ndarray(bg0)
        expl0 = shap.LinearExplainer(clf, bg0_arr)
        vals0 = np.abs(np.array(expl0.shap_values(X0))[0])

    s = pd.Series(vals0, index=fn).sort_values(ascending=False).head(SINGLE_LOCAL_TOP)
    s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
    plt.title(f"Top local contributions for one customer ({best_base})")
    plt.tight_layout()
    plt.savefig(EXPLAIN_DIR / "single_customer_top_bar.png", dpi=200)
    plt.close()
    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR} (PNGs + CSVs)")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved: outputs/metrics.json
✓ Saved meta SHAP summaries & importances.
! SHAP failed for base model xgb: could not convert string to float: '[2.6542976E-1]'




✓ Fast SHAP computed for base model: lgb




✓ Fast SHAP computed for base model: cat




! SHAP failed for base model rf: Data must be 1-dimensional, got ndarray of shape (45, 2) instead
✓ Fast SHAP computed for base model: hgb
✓ Saved per-model & aggregated feature importances.
✓ Saved single-customer quick chart using base 'cat'.

=== OUTPUTS ===
- Bundle: outputs/stacking_bundle.pkl
- Metrics: outputs/metrics.json
- Explain dir: outputs/explain (PNGs + CSVs)


In [9]:
# =========================================
# ONE-CELL COLAB: STACKING + SHAP (Final v2)
# Robust to SHAP 0.49.x & sklearn tree shapes
# =========================================

!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
from typing import Dict, List

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

try:
    from xgboost import XGBClassifier; HAVE_XGB = True
except: HAVE_XGB = False
try:
    from lightgbm import LGBMClassifier; HAVE_LGB = True
except: HAVE_LGB = False
try:
    from catboost import CatBoostClassifier; HAVE_CAT = True
except: HAVE_CAT = False
import shap

# -------------------------
# Config
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42
OUT_DIR = Path("outputs"); OUT_DIR.mkdir(exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(exist_ok=True)

# -------------------------
# Load dataset
# -------------------------
def find_file_recursively(filename, roots=["/content", "."]):
    for root in roots:
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"{filename} not found")
csv_path = find_file_recursively(CSV_FILE_NAME)
print(f"✓ Found dataset at: {csv_path}")

df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns: df.drop(columns=["customerID"], inplace=True)
y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", SimpleImputer(strategy="median"), num_cols)
], verbose_feature_names_out=False)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -------------------------
# Models
# -------------------------
def get_base_models():
    models = {}
    if HAVE_XGB:
        models["xgb"] = XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1)
    if HAVE_LGB:
        models["lgb"] = LGBMClassifier(n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)
    if HAVE_CAT:
        models["cat"] = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE)
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# Build OOF and Test Matrices
# -------------------------
def build_oof_and_test_matrix(models, X_tr_raw, y_tr, X_te_raw):
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    n_train, n_test = X_tr_raw.shape[0], X_te_raw.shape[0]
    oof, test_meta = np.zeros((n_train,len(models))), np.zeros((n_test,len(models)))
    fitted_full_pipes, per_model_scores = [], {}
    X_tr_idx, X_te_idx = X_tr_raw.reset_index(drop=True), X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j,(name,clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train)
        fold_scores=[]
        for tr_idx,va_idx in skf.split(X_tr_idx,y_tr_idx):
            X_tr_f,X_va_f=X_tr_idx.iloc[tr_idx],X_tr_idx.iloc[va_idx]
            y_tr_f,y_va_f=y_tr_idx[tr_idx],y_tr_idx[va_idx]
            pipe=Pipeline([("pre",preprocessor),("clf",clf)])
            pipe.fit(X_tr_f,y_tr_f)
            p_va=pipe.predict_proba(X_va_f)[:,1]
            preds_oof[va_idx]=p_va
            fold_scores.append((roc_auc_score(y_va_f,p_va),average_precision_score(y_va_f,p_va)))
        aucs=[a for a,_ in fold_scores]; prs=[p for _,p in fold_scores]
        per_model_scores[name]={"oof_roc_auc_mean":float(np.mean(aucs)),"oof_pr_auc_mean":float(np.mean(prs))}
        oof[:,j]=preds_oof
        full_pipe=Pipeline([("pre",preprocessor),("clf",clf.__class__(**clf.get_params()))])
        full_pipe.fit(X_tr_idx,y_tr_idx)
        test_meta[:,j]=full_pipe.predict_proba(X_te_idx)[:,1]
        fitted_full_pipes.append(full_pipe)
        print(f"[{name}] OOF ROC-AUC={np.mean(aucs):.4f} | PR-AUC={np.mean(prs):.4f}")
    return oof,test_meta,per_model_scores,fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix,test_matrix,base_oof_scores,fitted_base_pipes=build_oof_and_test_matrix(base_models,X_train_raw,y_train,X_test_raw)

# -------------------------
# Meta Learner
# -------------------------
meta=LogisticRegression(max_iter=2000,class_weight="balanced",random_state=RANDOM_STATE)
meta.fit(oof_matrix,y_train)
meta_val_auc=roc_auc_score(y_test,meta.predict_proba(test_matrix)[:,1])
meta_val_pr=average_precision_score(y_test,meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

bundle={"base_pipes":fitted_base_pipes,"meta":meta,"base_order":base_names}
joblib.dump(bundle,OUT_DIR/"stacking_bundle.pkl")
(OUT_DIR/"metrics.json").write_text(json.dumps({
    "base_oof_scores":base_oof_scores,"meta_holdout":{"roc_auc":meta_val_auc,"pr_auc":meta_val_pr}},indent=2))
print("✓ Saved:",OUT_DIR/"stacking_bundle.pkl")

# -------------------------
# SHAP Explainability
# -------------------------
def _to_dense(X): return X.toarray() if hasattr(X,"toarray") else X
def _float32(A): return np.asarray(_to_dense(A),dtype=np.float32)
def _is_tree(clf): return any(k in clf.__class__.__name__.lower() for k in ["xgb","lgbm","cat","forest","gradient"])
def _is_xgb(clf): return clf.__class__.__name__.lower().startswith("xgb")
def _names(pre): return pre.get_feature_names_out()

def _norm(vals,n_feat):
    arr=np.array(vals,dtype=object if isinstance(vals,list) else None)
    try: arr=np.array(vals)
    except: arr=np.stack([np.asarray(v) for v in vals],axis=0)
    arr=np.abs(arr)
    if arr.ndim==2: return arr
    if arr.ndim==3:
        if arr.shape[2]==n_feat: return arr.mean(axis=1)
        elif arr.shape[0] in (2,3): return arr.mean(axis=0)
    if isinstance(vals,list): return np.stack([np.asarray(v) for v in vals],axis=0).mean(axis=0)
    return np.asarray(vals)

def _bg(X,k):
    try: bg=shap.kmeans(X,k); return _float32(bg.data if hasattr(bg,"data") else bg)
    except: return _float32(shap.sample(X,min(k*10,X.shape[0])))

N_SAMPLE=800; K_BG=64; EXPLAIN_TOP=30
X_exp=X_train_raw.sample(n=min(N_SAMPLE,len(X_train_raw)),random_state=RANDOM_STATE)
global_imp=defaultdict(list); per_model_top10={}

for name,pipe in zip(base_names,fitted_base_pipes):
    try:
        pre,clf=pipe.named_steps["pre"],pipe.named_steps["clf"]
        X_enc=_float32(pre.transform(X_exp)); fn=_names(pre); n_feat=len(fn)
        if _is_tree(clf):
            bg_arr=_bg(X_enc,K_BG)
            expl=shap.TreeExplainer(clf.get_booster() if _is_xgb(clf) else clf,
                data=bg_arr,model_output="probability",feature_perturbation="interventional")
            vals=_norm(expl.shap_values(X_enc,check_additivity=False),n_feat)
        else:
            bg_arr=_float32(shap.sample(X_enc,min(128,X_enc.shape[0])))
            expl=shap.LinearExplainer(clf,bg_arr)
            vals=_norm(expl.shap_values(X_enc),n_feat)
        imp=pd.Series(vals.mean(axis=0),index=fn).sort_values(ascending=False)
        per_model_top10[name]=imp.head(10)
        (imp.head(EXPLAIN_TOP).sort_values(ascending=True)
          .plot(kind="barh",figsize=(6,8)))
        plt.tight_layout(); plt.savefig(EXPLAIN_DIR/f"base_{name}_bar.png",dpi=200); plt.close()
        for f,v in imp.items(): global_imp[f].append(v)
        print(f"✓ Fast SHAP computed for base model: {name}")
    except Exception as e: print(f"! SHAP failed for base model {name}: {e}")

if global_imp:
    agg={f:np.mean(vs) for f,vs in global_imp.items()}
    agg=pd.Series(agg).sort_values(ascending=False)
    top10=agg.head(10)
    pd.DataFrame(per_model_top10).fillna(0).to_csv(EXPLAIN_DIR/"per_model_top10.csv")
    agg.to_csv(EXPLAIN_DIR/"global_feature_importance.csv",header=["mean_abs_shap"])
    (top10.sort_values(ascending=True)
     .plot(kind="barh",figsize=(5,4)))
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"global_top10_bar.png",dpi=200); plt.close()
    print("✓ Saved per-model & aggregated feature importances.")
else: print("! No per-model SHAP importances aggregated.")

# Single-customer quick chart
try:
    best_base=base_names[0]
    row0=X_test_raw.iloc[[0]]
    pipe=fitted_base_pipes[0]
    pre,clf=pipe.named_steps["pre"],pipe.named_steps["clf"]
    X0=_float32(pre.transform(row0)); fn=_names(pre); n_feat=len(fn)
    bg0=_bg(_float32(pre.transform(X_train_raw.sample(400,random_state=RANDOM_STATE))),32)
    expl0=shap.TreeExplainer(clf.get_booster() if _is_xgb(clf) else clf,
        data=bg0,model_output="probability",feature_perturbation="interventional")
    vals0=_norm(expl0.shap_values(X0,check_additivity=False),n_feat)[0]
    s=pd.Series(vals0,index=fn).sort_values(ascending=False).head(15)
    s.sort_values(ascending=True).plot(kind="barh",figsize=(6,5))
    plt.title(f"Top local contributions ({best_base})")
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"single_customer_top_bar.png",dpi=200); plt.close()
    print(f"✓ Saved single-customer quick chart using base '{best_base}'.")
except Exception as e: print(f"! Single-customer quick chart failed: {e}")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR}")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
! SHAP failed for base model xgb: could not convert string to float: '[2.6542976E-1]'




✓ Fast SHAP computed for base model: lgb




✓ Fast SHAP computed for base model: cat




! SHAP failed for base model rf: Data must be 1-dimensional, got ndarray of shape (45, 2) instead
✓ Fast SHAP computed for base model: hgb
✓ Saved per-model & aggregated feature importances.
! Single-customer quick chart failed: could not convert string to float: '[2.6542976E-1]'

=== OUTPUTS ===
- Bundle: outputs/stacking_bundle.pkl
- Metrics: outputs/metrics.json
- Explain dir: outputs/explain


In [10]:
# =======================================================
# SHAP (final robust block): handles XGB fallback + RF shapes
# Replace your SHAP section with this one
# =======================================================
import shap, numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict

def _to_dense(X): return X.toarray() if hasattr(X,"toarray") else X
def _float32(A): return np.asarray(_to_dense(A), dtype=np.float32)
def _is_tree(clf): return any(k in clf.__class__.__name__.lower() for k in ["xgb","lgbm","cat","forest","gradient"])
def _is_xgb(clf): return clf.__class__.__name__.lower().startswith("xgb")
def _names(pre): return pre.get_feature_names_out()

def _bg(X, k, fallback_mult=10):
    # Try kmeans, else random sample; always ndarray float32
    try:
        bg = shap.kmeans(X, k)
        return _float32(bg.data if hasattr(bg,"data") else bg)
    except Exception:
        return _float32(shap.sample(X, min(k*fallback_mult, X.shape[0])))

def _norm(vals, n_feat):
    """Return (n_samples, n_features) no matter what the explainer returned."""
    # Try plain array first
    try:
        arr = np.array(vals)
    except Exception:
        arr = np.stack([np.asarray(v) for v in vals], axis=0)  # list-of-classes
    arr = np.abs(arr)

    if arr.ndim == 2:
        # (n_samples, n_features)  ✅
        return arr

    if arr.ndim == 3:
        # Handle three common layouts:
        # (n_samples, n_classes, n_features) → mean over classes
        if arr.shape[2] == n_feat and arr.shape[1] in (2,3):
            return arr.mean(axis=1)
        # (n_classes, n_samples, n_features) → mean over classes
        if arr.shape[0] in (2,3) and arr.shape[2] == n_feat:
            return arr.mean(axis=0)
        # (n_samples, n_features, n_classes) → mean over classes (last axis)
        if arr.shape[1] == n_feat and arr.shape[2] in (2,3):
            return arr.mean(axis=2)

    # list of per-class arrays (n_classes, n_samples, n_features)
    if isinstance(vals, list):
        stacked = np.stack([np.asarray(v) for v in vals], axis=0)
        return np.abs(stacked).mean(axis=0)

    # Fallback: assume already (n_samples, n_features)
    return np.asarray(vals)

# ---- (1) Meta-level SHAP (base learners as features) ----
explainer_meta = shap.LinearExplainer(meta, oof_matrix)
sv_meta = explainer_meta.shap_values(test_matrix)  # (n_test, n_models)

plt.figure()
shap.summary_plot(sv_meta, feature_names=base_names, show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR / "meta_summary_base_contributions.png", dpi=200); plt.close()

plt.figure()
shap.summary_plot(sv_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR / "meta_summary_bar_base_contributions.png", dpi=200); plt.close()

pd.Series(np.mean(np.abs(sv_meta), axis=0), index=base_names)\
  .sort_values(ascending=False)\
  .to_csv(EXPLAIN_DIR / "meta_base_importance.csv", header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# ---- (2) Per-base model SHAP on original features + aggregation ----
N_SAMPLE = min(800, len(X_train_raw))
K_BG = 64
EXPLAIN_TOP_BAR = 30

X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)

global_feat_importance = defaultdict(list)
per_model_top10 = {}
successful_base = None  # to pick a model for single-customer chart

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre, clf = pipe.named_steps["pre"], pipe.named_steps["clf"]
        X_enc = _float32(pre.transform(X_for_explain))
        fn = _names(pre); n_feat = len(fn)

        if _is_tree(clf):
            bg_arr = _bg(X_enc, K_BG)

            # Prefer TreeExplainer; for XGB fallback to Kernel if it throws conversion errors
            try:
                model_for_shap = clf.get_booster() if _is_xgb(clf) else clf
                expl = shap.TreeExplainer(
                    model_for_shap,
                    data=bg_arr,
                    model_output="probability",
                    feature_perturbation="interventional"
                )
                vals_raw = expl.shap_values(X_enc, check_additivity=False)
                vals = _norm(vals_raw, n_feat)
            except Exception as e_xgb:
                if _is_xgb(clf):
                    # Robust fallback: KernelExplainer on a smaller batch (fast enough)
                    print(f"… XGB TreeExplainer failed; falling back to KernelExplainer: {e_xgb}")
                    bg_arr = _bg(X_enc, 32)
                    pred = (lambda data: clf.predict_proba(data)[:,1])
                    expl = shap.KernelExplainer(pred, bg_arr)
                    X_batch = _float32(shap.sample(X_enc, min(300, X_enc.shape[0])))
                    vals_raw = expl.shap_values(X_batch)
                    vals = _norm(vals_raw, n_feat)
                    # Align to full sample length by repeating mean (keeps outputs consistent for saving charts)
                    if vals.shape[0] != X_enc.shape[0]:
                        vals = np.repeat(vals.mean(axis=0, keepdims=True), X_enc.shape[0], axis=0)
                else:
                    raise
        else:
            # LinearExplainer with small bg; fallback to Kernel if needed
            try:
                bg_arr = _float32(shap.sample(X_enc, min(128, X_enc.shape[0])))
                expl = shap.LinearExplainer(clf, bg_arr)
                vals_raw = expl.shap_values(X_enc)
                vals = _norm(vals_raw, n_feat)
            except Exception:
                bg_arr = _bg(X_enc, 32)
                pred = (lambda data: clf.predict_proba(data)[:,1])
                expl = shap.KernelExplainer(pred, bg_arr)
                X_batch = _float32(shap.sample(X_enc, min(400, X_enc.shape[0])))
                vals_raw = expl.shap_values(X_batch)
                vals = _norm(vals_raw, n_feat)
                if vals.shape[0] != X_enc.shape[0]:
                    vals = np.repeat(vals.mean(axis=0, keepdims=True), X_enc.shape[0], axis=0)

        imp = pd.Series(vals.mean(axis=0), index=fn).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        (imp.head(EXPLAIN_TOP_BAR).sort_values(ascending=True)
           .plot(kind="barh", figsize=(6, 8)))
        plt.tight_layout(); plt.savefig(EXPLAIN_DIR / f"base_{name}_bar.png", dpi=200); plt.close()

        for f, v in imp.items(): global_feat_importance[f].append(v)

        if successful_base is None:
            successful_base = name  # remember first model that succeeded for local chart

        print(f"✓ SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# Aggregate across models (mean |SHAP|)
if global_feat_importance:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10_all = agg.head(10)

    pd.DataFrame(per_model_top10).fillna(0.0).to_csv(EXPLAIN_DIR / "per_model_top10.csv")
    agg.to_csv(EXPLAIN_DIR / "global_feature_importance.csv", header=["mean_abs_shap"])
    top10_all.to_csv(EXPLAIN_DIR / "global_top10_features.csv", header=["mean_abs_shap"])

    (top10_all.sort_values(ascending=True)
        .plot(kind="barh", figsize=(5, 4)))
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR / "global_top10_bar.png", dpi=200); plt.close()

    print("✓ Saved per-model & aggregated feature importances.")
else:
    print("! No per-model SHAP importances aggregated.")

# ---- (3) Single-customer quick local chart (choose a successful model; avoid XGB path if it failed) ----
try:
    # Pick a model that succeeded; fallback to strongest by meta if available
    if successful_base is None:
        try:
            meta_imp = pd.read_csv(EXPLAIN_DIR / "meta_base_importance.csv", index_col=0).iloc[:,0]
            successful_base = meta_imp.idxmax()
        except Exception:
            successful_base = base_names[0]

    base_idx = base_names.index(successful_base)
    pipe = fitted_base_pipes[base_idx]
    pre, clf = pipe.named_steps["pre"], pipe.named_steps["clf"]
    fn = _names(pre); n_feat = len(fn)

    X0 = _float32(pre.transform(X_test_raw.iloc[[0]]))
    bg0 = _bg(_float32(pre.transform(X_train_raw.sample(min(400, len(X_train_raw)), random_state=RANDOM_STATE))), 32)

    vals0 = None
    if _is_tree(clf):
        try:
            model_for_shap = clf.get_booster() if _is_xgb(clf) else clf
            expl0 = shap.TreeExplainer(model_for_shap, data=bg0, model_output="probability", feature_perturbation="interventional")
            vraw = expl0.shap_values(X0, check_additivity=False)
            vals0 = _norm(vraw, n_feat)[0]
        except Exception as e_xgb:
            # Kernel fallback for local chart (fast)
            pred = (lambda data: clf.predict_proba(data)[:,1])
            expl0 = shap.KernelExplainer(pred, bg0)
            vraw = expl0.shap_values(X0)
            vals0 = _norm(vraw, n_feat)[0]
    else:
        try:
            expl0 = shap.LinearExplainer(clf, bg0)
            vraw = expl0.shap_values(X0)
            vals0 = _norm(vraw, n_feat)[0]
        except Exception:
            pred = (lambda data: clf.predict_proba(data)[:,1])
            expl0 = shap.KernelExplainer(pred, bg0)
            vraw = expl0.shap_values(X0)
            vals0 = _norm(vraw, n_feat)[0]

    s = pd.Series(vals0, index=fn).sort_values(ascending=False).head(15)
    s.sort_values(ascending=True).plot(kind="barh", figsize=(6,5))
    plt.title(f"Top local contributions for one customer ({successful_base})")
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR / "single_customer_top_bar.png", dpi=200); plt.close()
    print(f"✓ Saved single-customer quick chart using base '{successful_base}'.")
except Exception as e:
    print(f"! Single-customer quick chart failed: {e}")


✓ Saved meta SHAP summaries & importances.
… XGB TreeExplainer failed; falling back to KernelExplainer: could not convert string to float: '[2.6542976E-1]'


  0%|          | 0/300 [00:00<?, ?it/s]

✓ SHAP computed for base model: xgb




✓ SHAP computed for base model: lgb




✓ SHAP computed for base model: cat




✓ SHAP computed for base model: rf
✓ SHAP computed for base model: hgb
✓ Saved per-model & aggregated feature importances.


  0%|          | 0/1 [00:00<?, ?it/s]

✓ Saved single-customer quick chart using base 'xgb'.


In [11]:
# =========================================
# ONE-CELL COLAB: STACKING + SHAP (Force XGB Kernel)
# =========================================

!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
from typing import Dict, List
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# optional imports
try:
    from xgboost import XGBClassifier; HAVE_XGB = True
except: HAVE_XGB = False
try:
    from lightgbm import LGBMClassifier; HAVE_LGB = True
except: HAVE_LGB = False
try:
    from catboost import CatBoostClassifier; HAVE_CAT = True
except: HAVE_CAT = False
import shap

# -------------------------
# Config
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42
OUT_DIR = Path("outputs"); OUT_DIR.mkdir(exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(exist_ok=True)

# -------------------------
# Load dataset
# -------------------------
def find_file_recursively(filename, roots=["/content", "."]):
    for root in roots:
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"{filename} not found")

csv_path = find_file_recursively(CSV_FILE_NAME)
print(f"✓ Found dataset at: {csv_path}")

df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns: df.drop(columns=["customerID"], inplace=True)
y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", SimpleImputer(strategy="median"), num_cols)
], verbose_feature_names_out=False)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -------------------------
# Models
# -------------------------
def get_base_models():
    models = {}
    if HAVE_XGB:
        models["xgb"] = XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1)
    if HAVE_LGB:
        models["lgb"] = LGBMClassifier(n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)
    if HAVE_CAT:
        models["cat"] = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE)
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# Build OOF and Test Matrices
# -------------------------
def build_oof_and_test_matrix(models, X_tr_raw, y_tr, X_te_raw):
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    n_train, n_test = X_tr_raw.shape[0], X_te_raw.shape[0]
    oof, test_meta = np.zeros((n_train,len(models))), np.zeros((n_test,len(models)))
    fitted_full_pipes, per_model_scores = [], {}
    X_tr_idx, X_te_idx = X_tr_raw.reset_index(drop=True), X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j,(name,clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train)
        fold_scores=[]
        for tr_idx,va_idx in skf.split(X_tr_idx,y_tr_idx):
            X_tr_f,X_va_f=X_tr_idx.iloc[tr_idx],X_tr_idx.iloc[va_idx]
            y_tr_f,y_va_f=y_tr_idx[tr_idx],y_tr_idx[va_idx]
            pipe=Pipeline([("pre",preprocessor),("clf",clf)])
            pipe.fit(X_tr_f,y_tr_f)
            p_va=pipe.predict_proba(X_va_f)[:,1]
            preds_oof[va_idx]=p_va
            fold_scores.append((roc_auc_score(y_va_f,p_va),average_precision_score(y_va_f,p_va)))
        aucs=[a for a,_ in fold_scores]; prs=[p for _,p in fold_scores]
        per_model_scores[name]={"oof_roc_auc_mean":float(np.mean(aucs)),"oof_pr_auc_mean":float(np.mean(prs))}
        oof[:,j]=preds_oof
        full_pipe=Pipeline([("pre",preprocessor),("clf",clf.__class__(**clf.get_params()))])
        full_pipe.fit(X_tr_idx,y_tr_idx)
        test_meta[:,j]=full_pipe.predict_proba(X_te_idx)[:,1]
        fitted_full_pipes.append(full_pipe)
        print(f"[{name}] OOF ROC-AUC={np.mean(aucs):.4f} | PR-AUC={np.mean(prs):.4f}")
    return oof,test_meta,per_model_scores,fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix,test_matrix,base_oof_scores,fitted_base_pipes=build_oof_and_test_matrix(base_models,X_train_raw,y_train,X_test_raw)

# -------------------------
# Meta Learner
# -------------------------
meta=LogisticRegression(max_iter=2000,class_weight="balanced",random_state=RANDOM_STATE)
meta.fit(oof_matrix,y_train)
meta_val_auc=roc_auc_score(y_test,meta.predict_proba(test_matrix)[:,1])
meta_val_pr=average_precision_score(y_test,meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

bundle={"base_pipes":fitted_base_pipes,"meta":meta,"base_order":base_names}
joblib.dump(bundle,OUT_DIR/"stacking_bundle.pkl")
(OUT_DIR/"metrics.json").write_text(json.dumps({
    "base_oof_scores":base_oof_scores,"meta_holdout":{"roc_auc":meta_val_auc,"pr_auc":meta_val_pr}},indent=2))
print("✓ Saved:",OUT_DIR/"stacking_bundle.pkl")

# -------------------------
# SHAP Explainability (Force XGB Kernel)
# -------------------------
FORCE_XGB_KERNEL = True
def _to_dense(X): return X.toarray() if hasattr(X,"toarray") else X
def _float32(A): return np.asarray(_to_dense(A), dtype=np.float32)
def _is_tree(clf): return any(k in clf.__class__.__name__.lower() for k in ["xgb","lgbm","cat","forest","gradient"])
def _is_xgb(clf): return clf.__class__.__name__.lower().startswith("xgb")
def _names(pre): return pre.get_feature_names_out()
def _bg(X,k):
    try: bg=shap.kmeans(X,k); return _float32(bg.data if hasattr(bg,"data") else bg)
    except: return _float32(shap.sample(X,min(k*10,X.shape[0])))
def _norm(vals,n_feat):
    arr=np.array(vals,dtype=object if isinstance(vals,list) else None)
    try: arr=np.array(vals)
    except: arr=np.stack([np.asarray(v) for v in vals],axis=0)
    arr=np.abs(arr)
    if arr.ndim==2: return arr
    if arr.ndim==3:
        if arr.shape[2]==n_feat: return arr.mean(axis=1)
        elif arr.shape[0] in (2,3): return arr.mean(axis=0)
    if isinstance(vals,list): return np.stack([np.asarray(v) for v in vals],axis=0).mean(axis=0)
    return np.asarray(vals)

# meta-level
explainer_meta=shap.LinearExplainer(meta,oof_matrix)
sv_meta=explainer_meta.shap_values(test_matrix)
plt.figure(); shap.summary_plot(sv_meta,feature_names=base_names,show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"meta_summary_base_contributions.png",dpi=200); plt.close()
plt.figure(); shap.summary_plot(sv_meta,feature_names=base_names,plot_type="bar",show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"meta_summary_bar_base_contributions.png",dpi=200); plt.close()
pd.Series(np.mean(np.abs(sv_meta),axis=0),index=base_names).sort_values(ascending=False)\
    .to_csv(EXPLAIN_DIR/"meta_base_importance.csv",header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# per-base
N_SAMPLE=min(800,len(X_train_raw)); X_exp=X_train_raw.sample(n=N_SAMPLE,random_state=RANDOM_STATE)
global_imp=defaultdict(list); per_model_top10={}
for name,pipe in zip(base_names,fitted_base_pipes):
    try:
        pre,clf=pipe.named_steps["pre"],pipe.named_steps["clf"]
        X_enc=_float32(pre.transform(X_exp)); fn=_names(pre); n_feat=len(fn)
        if _is_tree(clf):
            if _is_xgb(clf) and FORCE_XGB_KERNEL:
                bg_arr=_bg(X_enc,32); pred=lambda data: clf.predict_proba(data)[:,1]
                expl=shap.KernelExplainer(pred,bg_arr)
                X_batch=_float32(shap.sample(X_enc,min(300,X_enc.shape[0])))
                vals_raw=expl.shap_values(X_batch); vals=_norm(vals_raw,n_feat)
                if vals.shape[0]!=X_enc.shape[0]: vals=np.repeat(vals.mean(axis=0,keepdims=True),X_enc.shape[0],axis=0)
            else:
                bg_arr=_bg(X_enc,64)
                expl=shap.TreeExplainer(clf,data=bg_arr,model_output="probability",feature_perturbation="interventional")
                vals=_norm(expl.shap_values(X_enc,check_additivity=False),n_feat)
        else:
            bg_arr=_bg(X_enc,64)
            expl=shap.LinearExplainer(clf,bg_arr)
            vals=_norm(expl.shap_values(X_enc),n_feat)
        imp=pd.Series(vals.mean(axis=0),index=fn).sort_values(ascending=False)
        per_model_top10[name]=imp.head(10)
        imp.head(30).sort_values(ascending=True).plot(kind="barh",figsize=(6,8))
        plt.tight_layout(); plt.savefig(EXPLAIN_DIR/f"base_{name}_bar.png",dpi=200); plt.close()
        for f,v in imp.items(): global_imp[f].append(v)
        print(f"✓ SHAP computed for base model: {name}")
    except Exception as e: print(f"! SHAP failed for base model {name}: {e}")

if global_imp:
    agg={f:np.mean(vs) for f,vs in global_imp.items()}
    agg=pd.Series(agg).sort_values(ascending=False)
    agg.to_csv(EXPLAIN_DIR/"global_feature_importance.csv",header=["mean_abs_shap"])
    agg.head(10).sort_values(ascending=True).plot(kind="barh",figsize=(5,4))
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"global_top10_bar.png",dpi=200); plt.close()
    print("✓ Saved per-model & aggregated feature importances.")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR}")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved meta SHAP summaries & importances.


  0%|          | 0/300 [00:00<?, ?it/s]

✓ SHAP computed for base model: xgb




✓ SHAP computed for base model: lgb




✓ SHAP computed for base model: cat




! SHAP failed for base model rf: Data must be 1-dimensional, got ndarray of shape (45, 2) instead
✓ SHAP computed for base model: hgb
✓ Saved per-model & aggregated feature importances.

=== OUTPUTS ===
- Bundle: outputs/stacking_bundle.pkl
- Metrics: outputs/metrics.json
- Explain dir: outputs/explain


In [12]:
# =========================================
# ONE-CELL COLAB: STACKING + SHAP (Force XGB Kernel + RF Safety Squeeze)
# =========================================

!pip -q install numpy pandas scikit-learn xgboost lightgbm catboost shap joblib matplotlib

import os, json, joblib, numpy as np, pandas as pd, matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
from typing import Dict, List

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Optional boosters
try:
    from xgboost import XGBClassifier; HAVE_XGB = True
except: HAVE_XGB = False
try:
    from lightgbm import LGBMClassifier; HAVE_LGB = True
except: HAVE_LGB = False
try:
    from catboost import CatBoostClassifier; HAVE_CAT = True
except: HAVE_CAT = False
import shap

# -------------------------
# Config
# -------------------------
CSV_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET = "Churn"
TEST_SIZE = 0.25
N_FOLDS = 5
RANDOM_STATE = 42
FORCE_XGB_KERNEL = True  # نستخدم KernelExplainer دائمًا مع XGB
OUT_DIR = Path("outputs"); OUT_DIR.mkdir(exist_ok=True)
EXPLAIN_DIR = OUT_DIR / "explain"; EXPLAIN_DIR.mkdir(exist_ok=True)

# -------------------------
# Load dataset
# -------------------------
def find_file_recursively(filename, roots=["/content", "."]):
    for root in roots:
        for r, _, files in os.walk(root):
            if filename in files:
                return os.path.join(r, filename)
    raise FileNotFoundError(f"{filename} not found")

csv_path = find_file_recursively(CSV_FILE_NAME)
print(f"✓ Found dataset at: {csv_path}")

df = pd.read_csv(csv_path)
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0.0)
if "customerID" in df.columns: df.drop(columns=["customerID"], inplace=True)
y = df[TARGET].astype(str).str.strip().map({"Yes":1,"No":0}).astype(int)
X = df.drop(columns=[TARGET])

cat_cols = [c for c in X.columns if X[c].dtype == "O"]
num_cols = [c for c in X.columns if c not in cat_cols]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", SimpleImputer(strategy="median"), num_cols)
], verbose_feature_names_out=False)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -------------------------
# Models
# -------------------------
def get_base_models():
    models = {}
    if HAVE_XGB:
        models["xgb"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.06,
            subsample=0.85, colsample_bytree=0.85, eval_metric="logloss",
            random_state=RANDOM_STATE, tree_method="hist", n_jobs=-1
        )
    if HAVE_LGB:
        models["lgb"] = LGBMClassifier(
            n_estimators=350, num_leaves=64, learning_rate=0.05,
            subsample=0.85, colsample_bytree=0.85, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
        )
    if HAVE_CAT:
        models["cat"] = CatBoostClassifier(
            iterations=400, depth=6, learning_rate=0.05,
            loss_function="Logloss", verbose=False, random_state=RANDOM_STATE
        )
    models["rf"] = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE)
    models["hgb"] = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.08, random_state=RANDOM_STATE)
    return models

base_models = get_base_models()
base_names = list(base_models.keys())
print("Base learners:", base_names)

# -------------------------
# Build OOF & Test Matrices
# -------------------------
def build_oof_and_test_matrix(models, X_tr_raw, y_tr, X_te_raw):
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    n_train, n_test = X_tr_raw.shape[0], X_te_raw.shape[0]
    oof, test_meta = np.zeros((n_train,len(models))), np.zeros((n_test,len(models)))
    fitted_full_pipes, per_model_scores = [], {}
    X_tr_idx, X_te_idx = X_tr_raw.reset_index(drop=True), X_te_raw.reset_index(drop=True)
    y_tr_idx = pd.Series(y_tr).reset_index(drop=True).values

    for j,(name,clf) in enumerate(models.items()):
        preds_oof = np.zeros(n_train)
        fold_scores=[]
        for tr_idx,va_idx in skf.split(X_tr_idx,y_tr_idx):
            X_tr_f,X_va_f=X_tr_idx.iloc[tr_idx],X_tr_idx.iloc[va_idx]
            y_tr_f,y_va_f=y_tr_idx[tr_idx],y_tr_idx[va_idx]
            pipe=Pipeline([("pre",preprocessor),("clf",clf)])
            pipe.fit(X_tr_f,y_tr_f)
            p_va=pipe.predict_proba(X_va_f)[:,1]
            preds_oof[va_idx]=p_va
            fold_scores.append((roc_auc_score(y_va_f,p_va),average_precision_score(y_va_f,p_va)))
        aucs=[a for a,_ in fold_scores]; prs=[p for _,p in fold_scores]
        per_model_scores[name]={"oof_roc_auc_mean":float(np.mean(aucs)),"oof_pr_auc_mean":float(np.mean(prs))}
        oof[:,j]=preds_oof
        full_pipe=Pipeline([("pre",preprocessor),("clf",clf.__class__(**clf.get_params()))])
        full_pipe.fit(X_tr_idx,y_tr_idx)
        test_meta[:,j]=full_pipe.predict_proba(X_te_idx)[:,1]
        fitted_full_pipes.append(full_pipe)
        print(f"[{name}] OOF ROC-AUC={np.mean(aucs):.4f} | PR-AUC={np.mean(prs):.4f}")
    return oof,test_meta,per_model_scores,fitted_full_pipes

print("→ Rebuilding OOF & test matrices ...")
oof_matrix,test_matrix,base_oof_scores,fitted_base_pipes = build_oof_and_test_matrix(base_models,X_train_raw,y_train,X_test_raw)

# -------------------------
# Meta Learner
# -------------------------
meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
meta.fit(oof_matrix,y_train)
meta_val_auc = roc_auc_score(y_test, meta.predict_proba(test_matrix)[:,1])
meta_val_pr  = average_precision_score(y_test, meta.predict_proba(test_matrix)[:,1])
print(f"Meta holdout ROC-AUC={meta_val_auc:.4f} | PR-AUC={meta_val_pr:.4f}")

joblib.dump({"base_pipes":fitted_base_pipes,"meta":meta,"base_order":base_names}, OUT_DIR/"stacking_bundle.pkl")
(OUT_DIR/"metrics.json").write_text(json.dumps({
    "base_oof_scores":base_oof_scores,
    "meta_holdout":{"roc_auc":float(meta_val_auc),"pr_auc":float(meta_val_pr)}
}, indent=2))
print("✓ Saved:", OUT_DIR/"stacking_bundle.pkl")

# -------------------------
# SHAP Explainability (Force XGB Kernel + RF Safety Squeeze)
# -------------------------
def _to_dense(X): return X.toarray() if hasattr(X,"toarray") else X
def _float32(A): return np.asarray(_to_dense(A), dtype=np.float32)
def _is_tree(clf): return any(k in clf.__class__.__name__.lower() for k in ["xgb","lgbm","cat","forest","gradient"])
def _is_xgb(clf): return clf.__class__.__name__.lower().startswith("xgb")
def _names(pre): return pre.get_feature_names_out()
def _bg(X,k):
    try:
        bg = shap.kmeans(X,k)
        return _float32(bg.data if hasattr(bg,"data") else bg)
    except:
        return _float32(shap.sample(X, min(k*10, X.shape[0])))

def _norm(vals, n_feat):
    # يُوحِّد الشكل إلى (n_samples, n_features)
    try: arr = np.array(vals)
    except: arr = np.stack([np.asarray(v) for v in vals], axis=0)
    arr = np.abs(arr)
    if arr.ndim == 2: return arr
    if arr.ndim == 3:
        # (N, C, F)
        if arr.shape[2] == n_feat and arr.shape[1] in (2,3): return arr.mean(axis=1)
        # (C, N, F)
        if arr.shape[0] in (2,3) and arr.shape[2] == n_feat: return arr.mean(axis=0)
        # (N, F, C)
        if arr.shape[1] == n_feat and arr.shape[2] in (2,3): return arr.mean(axis=2)
    if isinstance(vals, list):
        stacked = np.stack([np.asarray(v) for v in vals], axis=0)
        return np.abs(stacked).mean(axis=0)
    return np.asarray(vals)

# ---- (1) Meta-level SHAP (base learners as features) ----
explainer_meta = shap.LinearExplainer(meta, oof_matrix)
sv_meta = explainer_meta.shap_values(test_matrix)

plt.figure(); shap.summary_plot(sv_meta, feature_names=base_names, show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"meta_summary_base_contributions.png", dpi=200); plt.close()
plt.figure(); shap.summary_plot(sv_meta, feature_names=base_names, plot_type="bar", show=False)
plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"meta_summary_bar_base_contributions.png", dpi=200); plt.close()
pd.Series(np.mean(np.abs(sv_meta), axis=0), index=base_names).sort_values(ascending=False)\
  .to_csv(EXPLAIN_DIR/"meta_base_importance.csv", header=["mean_abs_shap"])
print("✓ Saved meta SHAP summaries & importances.")

# ---- (2) Per-base model SHAP + RF safety squeeze ----
N_SAMPLE = min(800, len(X_train_raw))
K_BG = 64
EXPLAIN_TOP_BAR = 30

X_for_explain = X_train_raw.sample(n=N_SAMPLE, random_state=RANDOM_STATE)
global_feat_importance = defaultdict(list)
per_model_top10 = {}

for name, pipe in zip(base_names, fitted_base_pipes):
    try:
        pre, clf = pipe.named_steps["pre"], pipe.named_steps["clf"]
        X_enc = _float32(pre.transform(X_for_explain))
        fn = _names(pre); n_feat = len(fn)

        if _is_tree(clf):
            if _is_xgb(clf) and FORCE_XGB_KERNEL:
                # XGB: KernelExplainer ثابت
                bg_arr = _bg(X_enc, 32)
                pred = lambda data: clf.predict_proba(data)[:,1]
                expl = shap.KernelExplainer(pred, bg_arr)
                X_batch = _float32(shap.sample(X_enc, min(300, X_enc.shape[0])))
                vals_raw = expl.shap_values(X_batch)
                vals = _norm(vals_raw, n_feat)
                if vals.shape[0] != X_enc.shape[0]:
                    vals = np.repeat(vals.mean(axis=0, keepdims=True), X_enc.shape[0], axis=0)
            else:
                # باقي الشجر: TreeExplainer سريع
                bg_arr = _bg(X_enc, K_BG)
                expl = shap.TreeExplainer(clf, data=bg_arr, model_output="probability", feature_perturbation="interventional")
                vals = _norm(expl.shap_values(X_enc, check_additivity=False), n_feat)
        else:
            # LinearExplainer (سريع)؛ fallback Kernel لو لزم
            try:
                bg_arr = _bg(X_enc, 64)
                expl = shap.LinearExplainer(clf, bg_arr)
                vals = _norm(expl.shap_values(X_enc), n_feat)
            except Exception:
                bg_arr = _bg(X_enc, 32)
                pred = lambda data: clf.predict_proba(data)[:,1]
                expl = shap.KernelExplainer(pred, bg_arr)
                X_batch = _float32(shap.sample(X_enc, min(400, X_enc.shape[0])))
                vals_raw = expl.shap_values(X_batch)
                vals = _norm(vals_raw, n_feat)
                if vals.shape[0] != X_enc.shape[0]:
                    vals = np.repeat(vals.mean(axis=0, keepdims=True), X_enc.shape[0], axis=0)

        # ===== RF SAFETY SQUEEZE (يعالج أشكال الأبعاد الغريبة) =====
        if "forest" in clf.__class__.__name__.lower():
            # 3D -> متوسط عبر محور الكلاسات أيًا كان مكانه
            if vals.ndim == 3 and vals.shape[2] in (2,3): vals = vals.mean(axis=2)
            if vals.ndim == 3 and vals.shape[0] in (2,3) and vals.shape[2] == n_feat: vals = vals.mean(axis=0)
            if vals.ndim == 3 and vals.shape[1] == n_feat and vals.shape[2] in (2,3): vals = vals.mean(axis=2)
            # (n_features, n_classes) -> (1, n_features)
            if vals.ndim == 2 and vals.shape[0] == n_feat and vals.shape[1] in (2,3):
                vals = vals.mean(axis=1, keepdims=True)
            # (n_features, n_samples) -> transpose
            if vals.ndim == 2 and vals.shape[0] == n_feat and vals.shape[1] != n_feat:
                vals = vals.T
            # Ensure 2D (n_samples, n_features)
            if vals.ndim != 2 or vals.shape[1] != n_feat:
                vals = np.atleast_2d(vals)
                if vals.shape[1] == n_feat:
                    pass
                elif vals.shape[0] == n_feat:
                    vals = vals.T
                else:
                    vals = np.atleast_2d(vals.mean(axis=tuple(range(vals.ndim - 1))))
                    if vals.shape[1] != n_feat:
                        m = min(vals.shape[1], n_feat)
                        tmp = np.zeros((1, n_feat), dtype=vals.dtype); tmp[0, :m] = vals[0, :m]
                        vals = tmp
        # ===========================================================

        imp = pd.Series(vals.mean(axis=0), index=fn).sort_values(ascending=False)
        per_model_top10[name] = imp.head(10)

        (imp.head(EXPLAIN_TOP_BAR).sort_values(ascending=True)
            .plot(kind="barh", figsize=(6,8)))
        plt.tight_layout(); plt.savefig(EXPLAIN_DIR/f"base_{name}_bar.png", dpi=200); plt.close()

        for f, v in imp.items(): global_feat_importance[f].append(v)
        print(f"✓ SHAP computed for base model: {name}")

    except Exception as e:
        print(f"! SHAP failed for base model {name}: {e}")

# Aggregate across models
if global_feat_importance:
    agg = {f: np.mean(vs) for f, vs in global_feat_importance.items()}
    agg = pd.Series(agg).sort_values(ascending=False)
    top10 = agg.head(10)
    pd.DataFrame(per_model_top10).fillna(0.0).to_csv(EXPLAIN_DIR/"per_model_top10.csv")
    agg.to_csv(EXPLAIN_DIR/"global_feature_importance.csv", header=["mean_abs_shap"])
    (top10.sort_values(ascending=True).plot(kind="barh", figsize=(5,4)))
    plt.tight_layout(); plt.savefig(EXPLAIN_DIR/"global_top10_bar.png", dpi=200); plt.close()
    print("✓ Saved per-model & aggregated feature importances.")

print("\n=== OUTPUTS ===")
print(f"- Bundle: {OUT_DIR/'stacking_bundle.pkl'}")
print(f"- Metrics: {OUT_DIR/'metrics.json'}")
print(f"- Explain dir: {EXPLAIN_DIR}")


✓ Found dataset at: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
Base learners: ['xgb', 'lgb', 'cat', 'rf', 'hgb']
→ Rebuilding OOF & test matrices ...
[xgb] OOF ROC-AUC=0.8304 | PR-AUC=0.6420




[lgb] OOF ROC-AUC=0.8230 | PR-AUC=0.6232
[cat] OOF ROC-AUC=0.8365 | PR-AUC=0.6508
[rf] OOF ROC-AUC=0.8214 | PR-AUC=0.6148
[hgb] OOF ROC-AUC=0.8255 | PR-AUC=0.6276
Meta holdout ROC-AUC=0.8384 | PR-AUC=0.6434
✓ Saved: outputs/stacking_bundle.pkl
✓ Saved meta SHAP summaries & importances.


  0%|          | 0/300 [00:00<?, ?it/s]

✓ SHAP computed for base model: xgb




✓ SHAP computed for base model: lgb




✓ SHAP computed for base model: cat




✓ SHAP computed for base model: rf
✓ SHAP computed for base model: hgb
✓ Saved per-model & aggregated feature importances.

=== OUTPUTS ===
- Bundle: outputs/stacking_bundle.pkl
- Metrics: outputs/metrics.json
- Explain dir: outputs/explain
