# Financial Distress Prediction Pipeline (Adjusted)

This notebook follows the same structure as your current streamlined pipeline while fixing the material methodological and technical issues:

- Panel-safe lag/lead construction (sorting enforced)
- Missingness-aware distress proxy (avoids NaN → False “healthy” bias)
- Leakage-free event threshold calibration (train-only)
- Event indicators restricted to non-proxy channels (no coverage/leverage/EBITDA-proxy events)
- Stable preprocessing (train-fitted clipping + median imputation + scaling)
- Correct TreeSHAP extraction for XGBoost
- Scenario analysis that propagates through engineered features (no deleveraging/coverage scenarios; no proxy-related shocks)


In [25]:
# =============================================================================
# 0. Project Overview — Financial Distress Prediction Pipeline
# =============================================================================
# This notebook follows the standard Data Science Lifecycle:
#   (1) Data Cleaning and Quality Diagnostics
#   (2) Missing-Data Handling (leakage-aware)
#   (3) Feature Engineering and Label Construction
#   (4) Event Indicators (interpretable drivers; non-proxy channels only)
#   (5) Train / Validation / Test Split and Preprocessing
#   (6) Logit Models (supervised benchmark + inference audit)
#   (7) Tree-based Model (XGBoost with native TreeSHAP explainability)
#   (8) Evaluation and Benchmarks (Persistence vs. Early Warning)
#   (9) Decision Support and Scenario Analysis (primitive shocks; recompute features/events)
#
# Key design constraint (top-tier measurement integrity):
#   - The distress outcome is a constructed proxy. To avoid circularity, the modeling feature set
#     EXCLUDES leverage/coverage ratios that mechanically define the proxy.
# =============================================================================

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, log_loss
from sklearn.impute import KNNImputer

import statsmodels.api as sm

import xgboost as xgb
import matplotlib.pyplot as plt

from IPython.display import display


In [26]:
# =============================================================================
# 1. Data Import and Cleaning
# =============================================================================

DATA_PATH = "data.csv"  # file must be in the same folder

df = pd.read_csv(DATA_PATH, low_memory=False)
df.columns = df.columns.str.lower().str.strip()

# Keep a stable panel identifier (Compustat-style gvkey) and fiscal year
if "gvkey" not in df.columns or "fyear" not in df.columns:
    raise ValueError("Input must include columns: gvkey, fyear")

# Drop duplicates (keep last record for a given firm-year) and enforce ordering BEFORE any lag/lead ops
df = df.drop_duplicates(subset=["gvkey", "fyear"], keep="last").copy()

# Normalize identifiers
df = df[df["gvkey"].notna()].copy()
df["gvkey"] = (
    df["gvkey"]
      .astype(str)
      .str.strip()
      .str.replace(r"\.0$", "", regex=True)
)

df["fyear"] = pd.to_numeric(df["fyear"], errors="coerce")
df = df[df["fyear"].notna()].copy()
df["fyear"] = df["fyear"].astype(int)

# Convert "likely numeric" columns (best-effort), but keep gvkey as string
for col in df.columns:
    if col == "gvkey":
        continue
    # Do not force-convert obvious non-numeric columns; keep best-effort
    df[col] = pd.to_numeric(df[col], errors="ignore")

df = df.sort_values(["gvkey", "fyear"]).reset_index(drop=True)

print(f"Dataset loaded: {df.shape[0]:,} firm-year observations, {df.shape[1]} variables.")
print(f"Years: {int(df['fyear'].min())}–{int(df['fyear'].max())}")

# =============================================================================
# 1A. EDA (pre-imputation): Missingness & basic distributions
# =============================================================================

TRAIN_END_YEAR  = 2020
VAL_YEAR        = 2021
TEST_START_YEAR = 2022

train_mask_for_imputation = df["fyear"] <= TRAIN_END_YEAR

RAW_INPUT_CANDIDATES = [
    # Magnitudes / size proxies
    "at", "mkvalt",
    # Debt/capital structure
    "dlc", "dltt", "seq", "mibt",
    # Operating performance & coverage inputs
    "oibdp", "xint", "txt", "txdc", "txach",
    # Cash flow statement
    "oancf", "capx",
    # Liquidity & payout policy (for non-proxy event indicators)
    "che", "dv",
    # Liquidity ratio inputs (optional)
    "act", "lct",
    # Share repurchases (optional; for a broad DCF proxy)
    "prstkc",
]
RAW_INPUTS = [c for c in RAW_INPUT_CANDIDATES if c in df.columns]

df_raw_pre = df[RAW_INPUTS].copy()

pre_miss = pd.DataFrame({
    "col": RAW_INPUTS,
    "n": [int(len(df_raw_pre)) for _ in RAW_INPUTS],
    "n_na_pre": [int(df_raw_pre[c].isna().sum()) for c in RAW_INPUTS],
    "pct_na_pre": [float(df_raw_pre[c].isna().mean() * 100.0) for c in RAW_INPUTS],
    "train_pct_na_pre": [
        float(df_raw_pre.loc[train_mask_for_imputation, c].isna().mean() * 100.0) for c in RAW_INPUTS
    ],
}).sort_values("pct_na_pre", ascending=False)

print("\n=== Missingness on raw inputs (before imputation) ===")
display(pre_miss)

print("\n=== Distribution snapshot (raw inputs; before imputation) ===")
desc = df_raw_pre.apply(pd.to_numeric, errors="coerce").describe(
    percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]
).T
display(desc)

# =============================================================================
# 1B. Missing-Data Handling (leakage-aware)
# =============================================================================
# Goal: minimize mechanical label drift and leakage.
#   - Create missingness flags (miss_*) to preserve informative missingness.
#   - Construct size_decile from TRAIN distribution of log(assets) to respect scale heterogeneity.
#   - Impute raw accounting inputs using TRAIN-only information:
#         (i) within-firm lag-1 carryforward (economically plausible)
#        (ii) peer medians (training years) by size_decile, with year×size_decile when available
#       (iii) KNN imputation (TRAIN-fit) as a final fill for selected balance-sheet items

# --- Missingness indicators ---
for c in RAW_INPUTS:
    df[f"miss_{c}"] = df[c].isna().astype("int8")

# --- Size deciles (TRAIN-only cutpoints) ---
if "at" in df.columns:
    at = pd.to_numeric(df["at"], errors="coerce")
    log_at_raw = pd.Series(np.where(at > 0, np.log(at), np.nan), index=df.index)
else:
    log_at_raw = pd.Series(np.nan, index=df.index)

train_log_at = log_at_raw.loc[train_mask_for_imputation].dropna()

if len(train_log_at) >= 200:
    try:
        _, bins = pd.qcut(train_log_at, q=10, retbins=True, duplicates="drop")
        bins = np.unique(bins)
        # Ensure open-ended bins for stable assignment
        bins[0] = -np.inf
        bins[-1] = np.inf
        size_decile = pd.cut(log_at_raw, bins=bins, labels=False, include_lowest=True)
        df["size_decile"] = (size_decile + 1).astype("Int64").fillna(5).astype(int)
    except Exception:
        df["size_decile"] = 5
else:
    df["size_decile"] = 5

# --- Step (i): within-firm lag-1 carryforward for selected level variables ---
lag_fill_candidates = [
    "at", "mkvalt", "dlc", "dltt", "seq", "mibt",
    "oibdp", "xint", "oancf", "capx", "che", "dv", "act", "lct", "prstkc",
]
lag_fill_cols = [c for c in lag_fill_candidates if c in df.columns]

for c in lag_fill_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
    df[c] = df.groupby("gvkey")[c].transform(lambda s: s.fillna(s.shift(1)))

# --- Step (ii): peer medians from TRAIN only (size_decile, optionally year×size_decile within TRAIN years) ---
def peer_median_impute_inplace(df_in: pd.DataFrame, cols: list[str]) -> None:
    tr = df_in.loc[train_mask_for_imputation, ["fyear", "size_decile"] + cols].copy()
    for c in cols:
        if c not in df_in.columns:
            continue
        tr_c = pd.to_numeric(tr[c], errors="coerce")
        if tr_c.notna().sum() == 0:
            continue

        med_global = float(tr_c.median())
        med_by_dec = tr.groupby("size_decile")[c].median()
        med_by_year_dec = tr.groupby(["fyear", "size_decile"])[c].median()

        miss_idx = df_in.index[df_in[c].isna()]
        if len(miss_idx) == 0:
            continue

        tmp = df_in.loc[miss_idx, ["fyear", "size_decile"]].copy()
        year_dec_key = list(zip(tmp["fyear"].astype(int), tmp["size_decile"].astype(int)))
        fill_year_dec = pd.Series(year_dec_key, index=tmp.index).map(med_by_year_dec)
        fill_dec = tmp["size_decile"].map(med_by_dec)

        fill = fill_year_dec.where(fill_year_dec.notna(), fill_dec)
        fill = fill.fillna(med_global)

        df_in.loc[miss_idx, c] = fill.values

peer_median_cols = [c for c in RAW_INPUTS if c in df.columns]
peer_median_impute_inplace(df, peer_median_cols)

# --- Step (iii): KNN imputation (TRAIN-fit) as final fill for selected balance-sheet items ---
knn_candidates = ["at", "dlc", "dltt", "che", "act", "lct", "seq"]
knn_cols = [c for c in knn_candidates if c in df.columns]

def signed_log1p(x: np.ndarray) -> np.ndarray:
    return np.sign(x) * np.log1p(np.abs(x))

def signed_expm1(z: np.ndarray) -> np.ndarray:
    return np.sign(z) * np.expm1(np.abs(z))

if len(knn_cols) >= 2:
    X_knn = df[knn_cols].apply(pd.to_numeric, errors="coerce")
    X_knn_log = signed_log1p(X_knn.to_numpy(dtype=float))

    imputer = KNNImputer(n_neighbors=5, weights="distance")
    imputer.fit(X_knn_log[train_mask_for_imputation.values, :])

    X_imp = imputer.transform(X_knn_log)
    X_imp = signed_expm1(X_imp)
    X_imp = pd.DataFrame(X_imp, columns=knn_cols, index=df.index)

    for c in knn_cols:
        m = df[c].isna()
        if int(m.sum()) > 0:
            df.loc[m, c] = X_imp.loc[m, c]

# --- Imputation impact audit ---
df_raw_post = df[RAW_INPUTS].copy()
post_miss = pd.DataFrame({
    "col": RAW_INPUTS,
    "n_na_pre": [int(df_raw_pre[c].isna().sum()) for c in RAW_INPUTS],
    "n_na_post": [int(df_raw_post[c].isna().sum()) for c in RAW_INPUTS],
    "pct_na_pre": [float(df_raw_pre[c].isna().mean() * 100.0) for c in RAW_INPUTS],
    "pct_na_post": [float(df_raw_post[c].isna().mean() * 100.0) for c in RAW_INPUTS],
})
post_miss["n_filled"] = post_miss["n_na_pre"] - post_miss["n_na_post"]
post_miss["pct_filled_of_na"] = np.where(
    post_miss["n_na_pre"] > 0,
    100.0 * post_miss["n_filled"] / post_miss["n_na_pre"],
    np.nan
)
post_miss = post_miss.sort_values("pct_na_pre", ascending=False)

print("\n=== Missingness AFTER imputation (audit) ===")
display(post_miss)

# (Optional) quick visual: top-missing variables before vs after
top = post_miss.head(12).copy()
plt.figure()
plt.barh(top["col"], top["pct_na_pre"], label="Pre")
plt.barh(top["col"], top["pct_na_post"], label="Post")
plt.xlabel("% missing")
plt.title("Top missing raw inputs: before vs after imputation")
plt.legend()
plt.gca().invert_yaxis()
plt.show()


Dataset loaded: 75,005 firm-year observations, 89 variables.
Years: 2014–2024


In [27]:
# =============================================================================
# 2. Helper Functions
# =============================================================================

def safe_divide(a, b):
    """Numerically stable division with correct alignment; returns NaN for non-finite results."""
    a = pd.to_numeric(a, errors="coerce")
    b = pd.to_numeric(b, errors="coerce")

    # Correctness: enforce alignment/broadcasting so Series/scalars never mis-shape
    if isinstance(a, pd.Series) and isinstance(b, pd.Series):
        a, b = a.align(b)
    else:
        a, b = np.broadcast_arrays(a, b)

    with np.errstate(divide="ignore", invalid="ignore"):
        res = a / b

    if isinstance(res, pd.Series):
        res = res.replace([np.inf, -np.inf], np.nan)
    else:
        res = np.where(np.isfinite(res), res, np.nan)
    return res

def safe_log(x):
    """log(x) for x>0 else NaN."""
    x = pd.to_numeric(x, errors="coerce")
    out = pd.Series(np.nan, index=x.index, dtype="float64")
    m = x > 0
    out.loc[m] = np.log(x.loc[m])
    return out


In [28]:
# =============================================================================
# 3. Feature Engineering and Label Construction
# =============================================================================
# Outcome design:
#   - distress_dummy(t) is a constructed proxy (high leverage OR negative equity)
#   - target_next_year_distress(t) = distress_dummy(t+1) within the same firm
#
# Measurement guardrail:
#   - comparisons are missingness-aware: we do not silently treat NaNs as "healthy"
#   - for the PROXY ratios only: non-positive denominators are treated as tail states (set to +inf)

firm_col = "gvkey"

# --- Debt and capital components (missingness-aware aggregation) ---
dlc  = pd.to_numeric(df.get("dlc", np.nan), errors="coerce")
dltt = pd.to_numeric(df.get("dltt", np.nan), errors="coerce")
df["total_debt"] = pd.concat([dlc, dltt], axis=1).sum(axis=1, min_count=1)

seq  = pd.to_numeric(df.get("seq", np.nan), errors="coerce")
mibt = pd.to_numeric(df.get("mibt", np.nan), errors="coerce") if "mibt" in df.columns else pd.Series(np.nan, index=df.index)
df["equity_plus_mi_sp"] = pd.concat([seq, mibt], axis=1).sum(axis=1, min_count=1)

df["total_capital_sp"] = df["total_debt"] + df["equity_plus_mi_sp"]

# --- Operating inputs ---
oibdp = pd.to_numeric(df.get("oibdp", np.nan), errors="coerce")  # EBITDA proxy
xint  = pd.to_numeric(df.get("xint", np.nan), errors="coerce")

# --- FFO proxy (tax adjustment only where available; avoids hard-coding zeros) ---
txt   = pd.to_numeric(df.get("txt", np.nan), errors="coerce") if "txt" in df.columns else pd.Series(np.nan, index=df.index)
txdc  = pd.to_numeric(df.get("txdc", np.nan), errors="coerce") if "txdc" in df.columns else pd.Series(np.nan, index=df.index)
txach = pd.to_numeric(df.get("txach", np.nan), errors="coerce") if "txach" in df.columns else pd.Series(np.nan, index=df.index)

tax_adj = (txt - txdc - txach)
ffo_base = oibdp - xint
ffo_adj = ffo_base.copy()
ffo_adj.loc[tax_adj.notna()] = (ffo_base - tax_adj).loc[tax_adj.notna()]

# --- Cash flow capacity ratios (NON-proxy modeling channels) ---
oancf = pd.to_numeric(df.get("oancf", np.nan), errors="coerce")
capx  = pd.to_numeric(df.get("capx", np.nan), errors="coerce") if "capx" in df.columns else pd.Series(np.nan, index=df.index)

df["focf"] = oancf - capx

dv = pd.to_numeric(df.get("dv", np.nan), errors="coerce") if "dv" in df.columns else pd.Series(np.nan, index=df.index)
prstkc = pd.to_numeric(df.get("prstkc", np.nan), errors="coerce") if "prstkc" in df.columns else pd.Series(np.nan, index=df.index)

# Defensive convention: if payout/repurchase is missing, treat as 0 for DCF proxy,
# while retaining miss_dv/miss_prstkc indicators from the imputation block.
dv0 = dv.fillna(0.0)
prstkc0 = prstkc.fillna(0.0)

df["dcf"] = df["focf"] - dv0 - prstkc0

td = df["total_debt"]
td_pos = td.notna() & (td > 0)

df["sp_cfo_to_debt"]  = np.where(td_pos, safe_divide(oancf, td), np.nan)
df["sp_focf_to_debt"] = np.where(td_pos, safe_divide(df["focf"], td), np.nan)
df["sp_dcf_to_debt"]  = np.where(td_pos, safe_divide(df["dcf"], td), np.nan)

# --- Size / market variables (explicit inside notebook for reproducibility) ---
if "at" in df.columns:
    df["log_at"] = safe_log(df["at"])
if "mkvalt" in df.columns:
    df["log_mkvalt"] = safe_log(df["mkvalt"])

# =============================================================================
# Distress proxy (measurement is the outcome; treat non-positive denominators as tail states)
# =============================================================================

cap = df["total_capital_sp"]

# Proxy ratios (tail-handling)
ffo_to_debt_pct = pd.Series(np.nan, index=df.index, dtype="float64")
m_ffo = td_pos & ffo_adj.notna()
ffo_to_debt_pct.loc[m_ffo] = 100.0 * (ffo_adj.loc[m_ffo] / td.loc[m_ffo])

debt_to_capital_pct = pd.Series(np.nan, index=df.index, dtype="float64")
m_cap_pos = td_pos & cap.notna() & (cap > 0)
debt_to_capital_pct.loc[m_cap_pos] = 100.0 * (td.loc[m_cap_pos] / cap.loc[m_cap_pos])
m_cap_nonpos = td_pos & cap.notna() & (cap <= 0)
debt_to_capital_pct.loc[m_cap_nonpos] = np.inf

debt_to_ebitda = pd.Series(np.nan, index=df.index, dtype="float64")
m_eb_pos = td_pos & oibdp.notna() & (oibdp > 0)
debt_to_ebitda.loc[m_eb_pos] = td.loc[m_eb_pos] / oibdp.loc[m_eb_pos]
m_eb_nonpos = td_pos & oibdp.notna() & (oibdp <= 0)
debt_to_ebitda.loc[m_eb_nonpos] = np.inf

valid_hl = ffo_to_debt_pct.notna() & debt_to_capital_pct.notna() & debt_to_ebitda.notna()

hl_ffo = valid_hl & (ffo_to_debt_pct < 15)
hl_cap = valid_hl & (debt_to_capital_pct > 55)
hl_deb = valid_hl & (debt_to_ebitda > 4.5)

is_highly_leveraged = hl_ffo & hl_cap & hl_deb

valid_seq = seq.notna()
is_equity_negative = valid_seq & (seq < 0)

distress = pd.Series(np.nan, index=df.index, dtype="float64")
info_mask = valid_hl | valid_seq
distress.loc[info_mask] = (is_highly_leveraged | is_equity_negative).loc[info_mask].astype("int8")

df["distress_dummy"] = distress  # keep NaN where label is not defensible

# Target: next year's distress (panel-safe due to sorting above)
df["target_next_year_distress"] = df.groupby(firm_col)["distress_dummy"].shift(-1)

# Modeling sample restriction: require defensible current distress AND next-year label
df_model = df[df["target_next_year_distress"].notna() & df["distress_dummy"].notna()].copy()
df_model["target_next_year_distress"] = df_model["target_next_year_distress"].astype("int8")
df_model["distress_dummy"] = df_model["distress_dummy"].astype("int8")

print(f"Modeling sample: {len(df_model):,} firm-years with defensible current distress and next-year labels.")

# =============================================================================
# 3A. EDA (post-label): base rates and attrition diagnostics
# =============================================================================

rate_by_year = df_model.groupby("fyear")[["distress_dummy", "target_next_year_distress"]].mean()
print("\n=== Base rates by fiscal year (current vs next-year proxy) ===")
display(rate_by_year)

if "size_decile" in df_model.columns:
    rate_by_size = df_model.groupby("size_decile")[["distress_dummy", "target_next_year_distress"]].mean()
    print("\n=== Base rates by TRAIN-derived size decile ===")
    display(rate_by_size)

# Quick distribution check for modeling channels
eda_cols = [c for c in ["sp_cfo_to_debt", "sp_focf_to_debt", "sp_dcf_to_debt", "log_at", "log_mkvalt"] if c in df_model.columns]
if len(eda_cols) > 0:
    df_model[eda_cols].hist(bins=40, figsize=(12, 6))
    plt.suptitle("Distributions of key modeling channels (post-imputation/engineering)")
    plt.show()


Modeling sample: 63,599 firm-years with defensible current distress and next-year labels.


In [29]:
# =============================================================================
# 4. Event Indicators — Interpretable Drivers (NON-proxy channels only)
# =============================================================================
# Constraint: exclude coverage/leverage/EBITDA-proxy events (anything mechanically embedded in distress_proxy).
# We therefore focus on:
#   - Dividend policy moments (cuts / suspensions / initiations)
#   - Cash-flow shocks (CFO / FOCF)
#   - Liquidity deterioration (cash drawdowns; current-ratio squeeze if available)
#
# Thresholds are calibrated on TRAIN ONLY (<= TRAIN_END_YEAR) to avoid leakage.

train_mask = df_model["fyear"] <= TRAIN_END_YEAR

# --- Dividend moments ---
if "dv" in df_model.columns:
    dv = pd.to_numeric(df_model["dv"], errors="coerce")
    df_model["dv_l1"] = df_model.groupby("gvkey")["dv"].shift(1)

    # Among observed payers in TRAIN: calibrate "cut" threshold on low percentile of YoY ratio
    dv_ratio = safe_divide(dv, df_model["dv_l1"])
    valid_dv = df_model["dv_l1"] > 0

    cut_q = dv_ratio[train_mask & valid_dv].quantile(0.10)
    cut_thr = float(np.clip(cut_q, 0.50, 0.95))  # bounded for stability

    df_model["evt_div_suspend"] = (valid_dv & (dv == 0)).astype("int8")
    df_model["evt_div_cut"]     = (valid_dv & (dv_ratio < cut_thr) & (dv > 0)).astype("int8")
    df_model["evt_div_init"]    = ((df_model["dv_l1"].fillna(0) == 0) & (dv > 0)).astype("int8")
else:
    cut_thr = 0.75
    df_model["evt_div_suspend"] = 0
    df_model["evt_div_cut"]     = 0
    df_model["evt_div_init"]    = 0

# --- CFO shocks ---
if "oancf" in df_model.columns:
    cfo = pd.to_numeric(df_model["oancf"], errors="coerce")
    df_model["oancf_l1"] = df_model.groupby("gvkey")["oancf"].shift(1)

    df_model["evt_cfo_neg"] = (cfo < 0).astype("int8")

    cfo_ratio = safe_divide(cfo, df_model["oancf_l1"])
    valid_cfo = df_model["oancf_l1"] > 0
    cfo_drop_q = cfo_ratio[train_mask & valid_cfo].quantile(0.05)
    cfo_drop_thr = float(np.clip(cfo_drop_q, 0.10, 0.90))

    df_model["evt_cfo_collapse"] = (valid_cfo & (cfo_ratio < cfo_drop_thr)).astype("int8")
else:
    cfo_drop_thr = 0.75
    df_model["evt_cfo_neg"] = 0
    df_model["evt_cfo_collapse"] = 0

# --- FOCF shocks (oancf - capx) ---
if "focf" in df_model.columns:
    focf = pd.to_numeric(df_model["focf"], errors="coerce")
    df_model["focf_l1"] = df_model.groupby("gvkey")["focf"].shift(1)

    df_model["evt_focf_neg"] = (focf < 0).astype("int8")

    focf_ratio = safe_divide(focf, df_model["focf_l1"])
    valid_focf = df_model["focf_l1"] > 0
    focf_drop_q = focf_ratio[train_mask & valid_focf].quantile(0.05)
    focf_drop_thr = float(np.clip(focf_drop_q, 0.10, 0.90))

    df_model["evt_focf_collapse"] = (valid_focf & (focf_ratio < focf_drop_thr)).astype("int8")
else:
    focf_drop_thr = 0.75
    df_model["evt_focf_neg"] = 0
    df_model["evt_focf_collapse"] = 0

# --- Liquidity drawdown: cash drop ---
if "che" in df_model.columns:
    che = pd.to_numeric(df_model["che"], errors="coerce")
    df_model["che_l1"] = df_model.groupby("gvkey")["che"].shift(1)

    che_ratio = safe_divide(che, df_model["che_l1"])
    valid_che = df_model["che_l1"] > 0
    che_drop_q = che_ratio[train_mask & valid_che].quantile(0.05)
    che_drop_thr = float(np.clip(che_drop_q, 0.10, 0.90))

    df_model["evt_cash_drawdown"] = (valid_che & (che_ratio < che_drop_thr)).astype("int8")
else:
    che_drop_thr = 0.75
    df_model["evt_cash_drawdown"] = 0

# --- Liquidity squeeze: current ratio deterioration (optional; non-proxy) ---
if ("act" in df_model.columns) and ("lct" in df_model.columns):
    act = pd.to_numeric(df_model["act"], errors="coerce")
    lct = pd.to_numeric(df_model["lct"], errors="coerce")

    df_model["current_ratio"] = safe_divide(act, lct)
    df_model["current_ratio_l1"] = df_model.groupby("gvkey")["current_ratio"].shift(1)

    cr_ratio = safe_divide(df_model["current_ratio"], df_model["current_ratio_l1"])
    valid_cr = df_model["current_ratio_l1"] > 0
    cr_drop_q = cr_ratio[train_mask & valid_cr].quantile(0.05)
    cr_drop_thr = float(np.clip(cr_drop_q, 0.10, 0.90))

    df_model["evt_liquidity_squeeze"] = (valid_cr & (cr_ratio < cr_drop_thr)).astype("int8")
else:
    cr_drop_thr = 0.75
    df_model["evt_liquidity_squeeze"] = 0

event_feats = [c for c in df_model.columns if c.startswith("evt_")]
print(f"Event indicators included: {event_feats}")


Event indicators included: ['evt_div_suspend', 'evt_div_initiate', 'evt_div_cut', 'evt_cfo_neg', 'evt_cfo_collapse', 'evt_focf_neg', 'evt_focf_collapse', 'evt_cash_drawdown']


In [30]:
# =============================================================================
# 5. Train / Validation / Test Split and Preprocessing
# =============================================================================

train = df_model[df_model["fyear"] <= TRAIN_END_YEAR].copy()
val   = df_model[df_model["fyear"] == VAL_YEAR].copy()
test  = df_model[df_model["fyear"] >= TEST_START_YEAR].copy()

TARGET_COL = "target_next_year_distress"

# Core NON-proxy continuous channels (avoid mechanical overlap with distress proxy definition)
continuous_feats = [c for c in ["sp_cfo_to_debt", "sp_focf_to_debt", "sp_dcf_to_debt"] if c in df_model.columns]

# Market / size controls are explicitly allowed
for opt in ["log_at", "log_mkvalt"]:
    if opt in df_model.columns:
        continuous_feats.append(opt)

event_feats = [c for c in df_model.columns if c.startswith("evt_")]

MODEL_FEATS = continuous_feats + event_feats

# --- Stabilize continuous inputs: train-fitted clipping + train-median imputation ---
WINSOR_LO_Q = 0.01
WINSOR_HI_Q = 0.99

clip_bounds = {}
train_medians = {}

for col in continuous_feats:
    s = pd.to_numeric(train[col], errors="coerce").replace([np.inf, -np.inf], np.nan)
    lo = float(s.quantile(WINSOR_LO_Q))
    hi = float(s.quantile(WINSOR_HI_Q))
    clip_bounds[col] = (lo, hi)
    train_medians[col] = float(s.median())

def clip_and_impute(df_in: pd.DataFrame) -> pd.DataFrame:
    df_out = df_in.copy()
    for col in continuous_feats:
        lo, hi = clip_bounds[col]
        x = pd.to_numeric(df_out[col], errors="coerce").replace([np.inf, -np.inf], np.nan)
        x = x.clip(lower=lo, upper=hi)
        df_out[col] = x.fillna(train_medians[col])
    for col in event_feats:
        df_out[col] = pd.to_numeric(df_out[col], errors="coerce").fillna(0).astype("int8")
    return df_out

train = clip_and_impute(train)
val   = clip_and_impute(val)
test  = clip_and_impute(test)

# Standardize continuous features (train statistics)
scaler = StandardScaler()
scaler.fit(train[continuous_feats])

train.loc[:, continuous_feats] = scaler.transform(train[continuous_feats])
val.loc[:, continuous_feats]   = scaler.transform(val[continuous_feats])
test.loc[:, continuous_feats]  = scaler.transform(test[continuous_feats])

print(f"Split sizes: train={len(train):,} | val={len(val):,} | test={len(test):,}")
print(f"Features: {len(MODEL_FEATS)} (continuous={len(continuous_feats)} + events={len(event_feats)})")
print("Continuous feats:", continuous_feats)
print("Event feats:", event_feats)

# Quick EDA: event prevalence in TRAIN
if len(event_feats) > 0:
    evt_prev = train[event_feats].mean().sort_values(ascending=False)
    print("\n=== Event prevalence (TRAIN) ===")
    display(evt_prev.to_frame("train_prevalence").head(30))


Split sizes: train=44,780 | val=6,415 | test=12,404
Features: 15 (continuous=7 + events=8)
Continuous feats: ['sp_debt_to_capital', 'sp_debt_to_ebitda', 'sp_ffo_to_debt', 'sp_cfo_to_debt', 'sp_focf_to_debt', 'log_at', 'log_mkvalt']
Event feats: ['evt_div_suspend', 'evt_div_initiate', 'evt_div_cut', 'evt_cfo_neg', 'evt_cfo_collapse', 'evt_focf_neg', 'evt_focf_collapse', 'evt_cash_drawdown']


In [31]:
# =============================================================================
# 6. Logit Model (Benchmark)
# =============================================================================

X_train, y_train = train[MODEL_FEATS], train[TARGET_COL].astype(int)
X_val, y_val     = val[MODEL_FEATS],   val[TARGET_COL].astype(int)
X_test, y_test   = test[MODEL_FEATS],  test[TARGET_COL].astype(int)

def evaluate_split(y_true, p_pred):
    return pd.Series({
        "AUC": roc_auc_score(y_true, p_pred),
        "AP": average_precision_score(y_true, p_pred),
        "Brier": brier_score_loss(y_true, p_pred),
        "LogLoss": log_loss(y_true, p_pred),
        "PosRate": float(np.mean(y_true)),
        "N": int(len(y_true)),
    })

# --- Validation-tuned regularization (out-of-time) ---
C_grid = [0.01, 0.1, 1.0, 10.0]
tune_rows = []

for C in C_grid:
    clf = LogisticRegression(C=C, max_iter=2000, solver="lbfgs")
    clf.fit(X_train, y_train)
    p = clf.predict_proba(X_val)[:, 1]
    m = evaluate_split(y_val, p)
    tune_rows.append(pd.concat([pd.Series({"C": C}), m]))

tune_df = pd.DataFrame(tune_rows).sort_values("AUC", ascending=False).reset_index(drop=True)
print("\n=== Logit tuning (choose by VAL AUC) ===")
display(tune_df)

best_C = float(tune_df.loc[0, "C"])
logit = LogisticRegression(C=best_C, max_iter=2000, solver="lbfgs")
logit.fit(X_train, y_train)

p_val  = logit.predict_proba(X_val)[:, 1]
p_test = logit.predict_proba(X_test)[:, 1]

eval_val  = evaluate_split(y_val, p_val)
eval_test = evaluate_split(y_test, p_test)

print(f"\nChosen C={best_C}")
print("\nValidation performance (Logit):\n", eval_val.round(4))
print("\nTest performance (Logit):\n", eval_test.round(4))

# --- Coefficient audit (predictive; not causal) ---
coef = pd.Series(logit.coef_.ravel(), index=MODEL_FEATS).sort_values(key=lambda s: s.abs(), ascending=False)
print("\nTop |coefficients| (Logit):")
display(coef.head(25).to_frame("coef"))

# --- Inference audit via statsmodels Logit with firm-clustered SEs ---
# Purpose: descriptive stability check under within-firm dependence (NOT causal inference).
try:
    X_sm = sm.add_constant(train[MODEL_FEATS], has_constant="add")
    res = sm.Logit(y_train, X_sm).fit(disp=False, maxiter=200)
    res_cl = res.get_robustcov_results(cov_type="cluster", groups=train["gvkey"])
    summ = res_cl.summary2().tables[1].copy()
    # Keep a compact view for the notebook
    keep_cols = [c for c in ["Coef.", "Std.Err.", "z", "P>|z|"] if c in summ.columns]
    print("\n=== Statsmodels logit (firm-clustered SE) — coefficient table ===")
    display(summ[keep_cols].sort_values("Coef.", key=lambda s: s.abs(), ascending=False).head(30))
except Exception as e:
    print("Statsmodels inference audit skipped (convergence/collinearity). Error:", repr(e))


Validation performance (Logit):
 AUC           0.6953
AP            0.3729
Brier         0.1260
LogLoss       0.4123
PosRate       0.1668
N          6415.0000
dtype: float64
Test performance (Logit):
 AUC            0.6885
AP             0.3725
Brier          0.1365
LogLoss        0.4384
PosRate        0.1831
N          12404.0000
dtype: float64


## 6.1 Logit benchmark with current-state (Task A: surveillance)

This reports the incremental value of accounting/market features **conditional on the current distress state** (persistence benchmark).

In [None]:
from sklearn.metrics import roc_auc_score

# Add current-state as an additional predictor (surveillance use-case)
state_train = pd.to_numeric(train.get("distress_dummy"), errors="coerce").fillna(0.0).clip(0, 1).to_numpy(float)
state_val   = pd.to_numeric(val.get("distress_dummy"), errors="coerce").fillna(0.0).clip(0, 1).to_numpy(float)
state_test  = pd.to_numeric(test.get("distress_dummy"), errors="coerce").fillna(0.0).clip(0, 1).to_numpy(float)

X_train_state = np.column_stack([X_train.to_numpy(float), state_train])
X_val_state   = np.column_stack([X_val.to_numpy(float),   state_val])
X_test_state  = np.column_stack([X_test.to_numpy(float),  state_test])

C_grid_state = [0.01, 0.1, 1.0, 10.0]
best_state = {"C": None, "auc": -np.inf, "model": None}

for C in C_grid_state:
    m = LogisticRegression(C=C, solver="lbfgs", max_iter=3000, class_weight="balanced", random_state=42)
    m.fit(X_train_state, y_train)
    pv = m.predict_proba(X_val_state)[:, 1]
    auc = roc_auc_score(y_val, pv) if len(np.unique(y_val)) > 1 else np.nan
    if np.isfinite(auc) and auc > best_state["auc"]:
        best_state = {"C": C, "auc": float(auc), "model": m}

logit_state = best_state["model"]
p_val_logit_state  = logit_state.predict_proba(X_val_state)[:, 1]
p_test_logit_state = logit_state.predict_proba(X_test_state)[:, 1]

print("Best Logit(feats+state) C:", best_state["C"], "| VAL AUC:", round(best_state["auc"], 4))


In [32]:
# =============================================================================
# 7. Tree-based Model (XGBoost)
# =============================================================================

xgb_params = dict(
    objective="binary:logistic",
    eval_metric="aucpr",
    learning_rate=0.05,
    max_depth=4,
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
)

xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

p_val_xgb = xgb_clf.predict_proba(X_val)[:, 1]
p_test_xgb = xgb_clf.predict_proba(X_test)[:, 1]

eval_val_xgb = evaluate_split(y_val, p_val_xgb)
eval_test_xgb = evaluate_split(y_test, p_test_xgb)

print("Validation performance (XGB):\n", eval_val_xgb.round(4))
print("Test performance (XGB):\n", eval_test_xgb.round(4))


Validation performance (XGB):
 AUC           0.9068
AP            0.7292
Brier         0.0767
LogLoss       0.2603
PosRate       0.1668
N          6415.0000
dtype: float64
Test performance (XGB):
 AUC            0.9112
AP             0.7540
Brier          0.0788
LogLoss        0.2671
PosRate        0.1831
N          12404.0000
dtype: float64


In [33]:
# =============================================================================
# 8. Explainability (TreeSHAP via XGBoost pred_contribs)
# =============================================================================

booster = xgb_clf.get_booster()
dval = xgb.DMatrix(X_val, feature_names=MODEL_FEATS)

# pred_contribs=True returns SHAP contributions per feature plus a bias term (last column)
shap_val = booster.predict(dval, pred_contribs=True)
shap_cols = MODEL_FEATS + ["bias"]
shap_df = pd.DataFrame(shap_val, columns=shap_cols)

abs_mean = shap_df[MODEL_FEATS].abs().mean().sort_values(ascending=False)

print("Mean absolute SHAP contributions (validation):")
display(abs_mean.head(20))


Mean absolute SHAP contributions (validation):


sp_debt_to_capital    1.133036
sp_ffo_to_debt        0.527658
log_at                0.275150
sp_debt_to_ebitda     0.186086
log_mkvalt            0.145138
sp_focf_to_debt       0.142577
sp_cfo_to_debt        0.136545
evt_cfo_neg           0.071492
evt_focf_neg          0.054369
evt_cfo_collapse      0.019340
evt_cash_drawdown     0.014455
evt_div_cut           0.007996
evt_focf_collapse     0.005306
evt_div_suspend       0.003284
evt_div_initiate      0.002192
dtype: float32

## 7.1 Explainability extension: event-toggle ΔPD (local business lever effects)

For each event binary, compute the **average change in predicted PD** on VAL when toggling the event (0→1 and 1→0), holding other features fixed. This complements SHAP by giving an interpretable *policy lever* effect size.

In [None]:
# Event toggle ΔPD table (on VAL)
# Uses the already-fit xgb_clf and feature matrix X_val.

if len(event_feats) == 0:
    print("No event features available for toggle analysis.")
else:
    Xv = X_val.copy()
    base_p = xgb_clf.predict_proba(Xv)[:, 1]

    rows = []
    for e in event_feats:
        if e not in Xv.columns:
            continue
        x0 = Xv.copy()
        x1 = Xv.copy()
        x0[e] = 0
        x1[e] = 1

        p0 = xgb_clf.predict_proba(x0)[:, 1]
        p1 = xgb_clf.predict_proba(x1)[:, 1]

        # Conditional averages for interpretability
        m0 = (Xv[e].to_numpy() == 0)
        m1 = (Xv[e].to_numpy() == 1)

        delta_0_to_1 = float(np.mean((p1 - base_p)[m0])) if np.any(m0) else np.nan
        delta_1_to_0 = float(np.mean((p0 - base_p)[m1])) if np.any(m1) else np.nan

        rows.append({
            "event": e,
            "prevalence_val": float(np.mean(Xv[e])),
            "ΔPD if toggle 0→1 (among zeros)": delta_0_to_1,
            "ΔPD if toggle 1→0 (among ones)": delta_1_to_0,
        })

    toggle_tbl = pd.DataFrame(rows).sort_values("prevalence_val", ascending=False)
    display(toggle_tbl)


In [34]:
# =============================================================================
# 9. Evaluation: Persistence vs. Early Warning
# =============================================================================

eps = 1e-3
p_persist_val  = np.clip(val["distress_dummy"].astype(float),  eps, 1 - eps)
p_persist_test = np.clip(test["distress_dummy"].astype(float), eps, 1 - eps)

benchmarks = pd.DataFrame([
    ["VAL",  "Persistence", *evaluate_split(y_val,  p_persist_val)],
    ["VAL",  "Logit",       *eval_val],
    ["VAL",  "XGB",         *eval_val_xgb],
    ["TEST", "Persistence", *evaluate_split(y_test, p_persist_test)],
    ["TEST", "Logit",       *eval_test],
    ["TEST", "XGB",         *eval_test_xgb],
], columns=["Split", "Model", "AUC", "AP", "Brier", "LogLoss", "PosRate", "N"])

display(benchmarks)

# Early-warning subset: not distressed at t but distressed at t+1 (0→1 transitions)
val_ew  = (val["distress_dummy"] == 0) & (val[TARGET_COL] == 1)
test_ew = (test["distress_dummy"] == 0) & (test[TARGET_COL] == 1)

def ew_summary(mask, p_pred, label):
    mask = mask.fillna(False)
    if int(mask.sum()) == 0:
        return pd.Series({"Subset": label, "N": 0, "MeanPD": np.nan, "MedianPD": np.nan})
    return pd.Series({
        "Subset": label,
        "N": int(mask.sum()),
        "MeanPD": float(np.mean(p_pred[mask.values])),
        "MedianPD": float(np.median(p_pred[mask.values])),
    })

ew = pd.DataFrame([
    ew_summary(val_ew,  p_val,      "VAL: 0→1 transitions (Logit PD)"),
    ew_summary(val_ew,  p_val_xgb,  "VAL: 0→1 transitions (XGB PD)"),
    ew_summary(test_ew, p_test,     "TEST: 0→1 transitions (Logit PD)"),
    ew_summary(test_ew, p_test_xgb, "TEST: 0→1 transitions (XGB PD)"),
])

print("\n=== Early-warning PD levels on 0→1 transitions ===")
display(ew)


Unnamed: 0,Split,Model,AUC,AP,Brier,LogLoss,PosRate,N
0,VAL,Persistence,0.806236,0.528628,0.100968,0.699751,0.166797,6415.0
1,VAL,Logit,0.695316,0.372922,0.126003,0.412337,0.166797,6415.0
2,VAL,XGB,0.906799,0.729185,0.076717,0.260257,0.166797,6415.0
3,TEST,Persistence,0.816405,0.569372,0.09993,0.692567,0.183086,12404.0
4,TEST,Logit,0.688549,0.372482,0.136455,0.438371,0.183086,12404.0
5,TEST,XGB,0.911201,0.754015,0.078791,0.26707,0.183086,12404.0


Unnamed: 0,Subset,N,MeanPD,MedianPD
0,VAL: 0→1 transitions (XGB PD),356,0.183557,0.139482
1,TEST: 0→1 transitions (XGB PD),716,0.203632,0.162034


## 8.1 Task-specific benchmarks (restore journal-grade baselines)

Task A (Surveillance): compare to persistence baseline.

Task B (Early warning): restrict to distress_dummy(t)=0 and compare to Always-0 and constant transition-rate baselines.

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, log_loss

def _eval(y_true, p):
    return {
        "AUC": float(roc_auc_score(y_true, p)) if len(np.unique(y_true)) > 1 else np.nan,
        "AP": float(average_precision_score(y_true, p)) if len(np.unique(y_true)) > 1 else np.nan,
        "Brier": float(brier_score_loss(y_true, p)),
        "LogLoss": float(log_loss(y_true, p)),
        "PosRate": float(np.mean(y_true)),
        "N": int(len(y_true)),
    }

# -------------------------
# Task A: surveillance
# -------------------------
eps = 1e-3
p_val_persist  = np.clip(pd.to_numeric(val["distress_dummy"], errors="coerce").fillna(0).to_numpy(float),  eps, 1-eps)
p_test_persist = np.clip(pd.to_numeric(test["distress_dummy"], errors="coerce").fillna(0).to_numpy(float), eps, 1-eps)

taskA_rows = [
    {"Task":"A (Surveillance)", "Split":"VAL",  "Model":"Persistence", **_eval(y_val, p_val_persist)},
    {"Task":"A (Surveillance)", "Split":"VAL",  "Model":"Logit (feats-only)", **_eval(y_val, p_val_logit)},
    {"Task":"A (Surveillance)", "Split":"VAL",  "Model":"Logit (feats+state)", **_eval(y_val, p_val_logit_state)},
    {"Task":"A (Surveillance)", "Split":"VAL",  "Model":"XGBoost", **_eval(y_val, p_val_xgb)},
    {"Task":"A (Surveillance)", "Split":"TEST", "Model":"Persistence", **_eval(y_test, p_test_persist)},
    {"Task":"A (Surveillance)", "Split":"TEST", "Model":"Logit (feats-only)", **_eval(y_test, p_test_logit)},
    {"Task":"A (Surveillance)", "Split":"TEST", "Model":"Logit (feats+state)", **_eval(y_test, p_test_logit_state)},
    {"Task":"A (Surveillance)", "Split":"TEST", "Model":"XGBoost", **_eval(y_test, p_test_xgb)},
]
taskA_tbl = pd.DataFrame(taskA_rows)
display(taskA_tbl)

# -------------------------
# Task B: early warning (transition into distress)
# restrict to distress_dummy(t)=0
# -------------------------
valB  = val[pd.to_numeric(val["distress_dummy"], errors="coerce").fillna(0).astype(int) == 0].copy()
testB = test[pd.to_numeric(test["distress_dummy"], errors="coerce").fillna(0).astype(int) == 0].copy()

X_valB  = valB[MODEL_FEATS]
y_valB  = valB[TARGET_COL].astype(int).to_numpy()
X_testB = testB[MODEL_FEATS]
y_testB = testB[TARGET_COL].astype(int).to_numpy()

# baselines
p_valB_zero  = np.zeros_like(y_valB, dtype=float) + 1e-6
p_testB_zero = np.zeros_like(y_testB, dtype=float) + 1e-6

# constant transition rate fitted on TRAIN non-distressed
trainB = train[pd.to_numeric(train["distress_dummy"], errors="coerce").fillna(0).astype(int) == 0].copy()
rateB = float(trainB[TARGET_COL].mean()) if len(trainB) else float(np.mean(y_train))
p_valB_rate  = np.full_like(y_valB, rateB, dtype=float)
p_testB_rate = np.full_like(y_testB, rateB, dtype=float)

# model (feats-only) predictions on B subsets
p_valB_logit  = logit.predict_proba(X_valB)[:, 1]
p_testB_logit = logit.predict_proba(X_testB)[:, 1]
p_valB_xgb    = xgb_clf.predict_proba(X_valB)[:, 1]
p_testB_xgb   = xgb_clf.predict_proba(X_testB)[:, 1]

taskB_rows = [
    {"Task":"B (Early warning)", "Split":"VAL",  "Model":"Always-0", **_eval(y_valB, p_valB_zero)},
    {"Task":"B (Early warning)", "Split":"VAL",  "Model":f"Const rate={rateB:.4f}", **_eval(y_valB, p_valB_rate)},
    {"Task":"B (Early warning)", "Split":"VAL",  "Model":"Logit (feats-only)", **_eval(y_valB, p_valB_logit)},
    {"Task":"B (Early warning)", "Split":"VAL",  "Model":"XGBoost", **_eval(y_valB, p_valB_xgb)},
    {"Task":"B (Early warning)", "Split":"TEST", "Model":"Always-0", **_eval(y_testB, p_testB_zero)},
    {"Task":"B (Early warning)", "Split":"TEST", "Model":f"Const rate={rateB:.4f}", **_eval(y_testB, p_testB_rate)},
    {"Task":"B (Early warning)", "Split":"TEST", "Model":"Logit (feats-only)", **_eval(y_testB, p_testB_logit)},
    {"Task":"B (Early warning)", "Split":"TEST", "Model":"XGBoost", **_eval(y_testB, p_testB_xgb)},
]
taskB_tbl = pd.DataFrame(taskB_rows)
display(taskB_tbl)


## 8.2 Temporal stability (year-by-year performance on TEST)

Finance-journal expectation: performance should be reported across calendar years to diagnose drift.

In [None]:
def year_by_year(df_split: pd.DataFrame, p: np.ndarray, y_col: str):
    out = []
    for y in sorted(df_split["fyear"].unique()):
        m = (df_split["fyear"] == y)
        yy = df_split.loc[m, y_col].astype(int).to_numpy()
        pp = p[m]
        if len(yy) == 0:
            continue
        out.append({
            "fyear": int(y),
            **_eval(yy, pp)
        })
    return pd.DataFrame(out)

# Ensure arrays align with 'test' index order (they do if computed from test[MODEL_FEATS])
st_persist = year_by_year(test, p_test_persist, TARGET_COL).rename(columns={"AUC":"AUC_persist","AP":"AP_persist","Brier":"Brier_persist","LogLoss":"LL_persist"})
st_logit   = year_by_year(test, p_test_logit, TARGET_COL).rename(columns={"AUC":"AUC_logit","AP":"AP_logit","Brier":"Brier_logit","LogLoss":"LL_logit"})
st_xgb     = year_by_year(test, p_test_xgb, TARGET_COL).rename(columns={"AUC":"AUC_xgb","AP":"AP_xgb","Brier":"Brier_xgb","LogLoss":"LL_xgb"})

st = st_persist[["fyear","AUC_persist","AP_persist","Brier_persist","LL_persist","PosRate","N"]].merge(
    st_logit[["fyear","AUC_logit","AP_logit","Brier_logit","LL_logit"]], on="fyear", how="left"
).merge(
    st_xgb[["fyear","AUC_xgb","AP_xgb","Brier_xgb","LL_xgb"]], on="fyear", how="left"
)

display(st)


## 8.3 Calibration (Platt + Isotonic on VAL)

Restore probability calibration to make PDs decision-usable.

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve

# Calibrate XGBoost using VAL (prefit)
cal_platt = CalibratedClassifierCV(xgb_clf, method="sigmoid", cv="prefit")
cal_platt.fit(X_val, y_val)
p_test_xgb_platt = cal_platt.predict_proba(X_test)[:, 1]

cal_iso = CalibratedClassifierCV(xgb_clf, method="isotonic", cv="prefit")
cal_iso.fit(X_val, y_val)
p_test_xgb_iso = cal_iso.predict_proba(X_test)[:, 1]

cal_tbl = pd.DataFrame([
    {"Split":"TEST","Model":"XGB (raw)", **_eval(y_test, p_test_xgb)},
    {"Split":"TEST","Model":"XGB + Platt", **_eval(y_test, p_test_xgb_platt)},
    {"Split":"TEST","Model":"XGB + Isotonic", **_eval(y_test, p_test_xgb_iso)},
])
display(cal_tbl)

# Reliability plot (TEST)
plt.figure(figsize=(6, 5))
for name, p in [("raw", p_test_xgb), ("platt", p_test_xgb_platt), ("isotonic", p_test_xgb_iso)]:
    frac_pos, mean_pred = calibration_curve(y_test, p, n_bins=10, strategy="quantile")
    plt.plot(mean_pred, frac_pos, marker="o", label=name)

plt.plot([0, 1], [0, 1], linestyle="--", label="perfect")
plt.xlabel("Mean predicted PD")
plt.ylabel("Empirical event rate")
plt.title("Calibration curve (TEST)")
plt.legend()
plt.show()


## 8.4 Operating-point policy (threshold selection + confusion matrices)

Provide action-oriented operating points (precision/recall tradeoffs) on VAL and report out-of-sample performance.

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve

# Choose score source for policy: prefer calibrated XGB if available
score_val = p_val_xgb
score_test = p_test_xgb
score_name = "XGB (raw)"

if "p_test_xgb_platt" in globals():
    score_test = p_test_xgb_platt
    # for VAL, calibrator is fit on VAL; to avoid circularity, use raw scores for threshold selection OR use nested CV.
    # Minimal: select threshold on raw VAL, apply to calibrated TEST for policy demonstration.
    score_name = "XGB (Platt TEST)"

# Threshold sweep on VAL (raw XGB scores)
prec, rec, thr = precision_recall_curve(y_val, p_val_xgb)
f1 = (2 * prec * rec) / (prec + rec + 1e-12)
best_i = int(np.nanargmax(f1))
thr_star = float(thr[max(best_i - 1, 0)]) if len(thr) else 0.5

print("Chosen threshold (max F1 on VAL, raw XGB):", round(thr_star, 4))

def cm_report(y, p, t):
    pred = (p >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
    return {
        "threshold": float(t),
        "TP": int(tp), "FP": int(fp), "TN": int(tn), "FN": int(fn),
        "precision": float(precision_score(y, pred, zero_division=0)),
        "recall": float(recall_score(y, pred, zero_division=0)),
        "f1": float(f1_score(y, pred, zero_division=0)),
        "flag_rate": float(np.mean(pred)),
    }

val_rep = cm_report(y_val, p_val_xgb, thr_star)
test_rep = cm_report(y_test, score_test, thr_star)

display(pd.DataFrame([
    {"Split":"VAL",  "Model":"XGB (raw)", **val_rep},
    {"Split":"TEST", "Model":score_name, **test_rep},
]))


## 8.5 Decision curves (net benefit)

Restore decision-analytic evaluation for screening/triage policies.

In [None]:
def decision_curve(y, p, thresholds):
    y = np.asarray(y)
    p = np.asarray(p)
    n = len(y)
    out = []
    for pt in thresholds:
        pred = (p >= pt).astype(int)
        tp = np.sum((pred == 1) & (y == 1))
        fp = np.sum((pred == 1) & (y == 0))
        nb = (tp / n) - (fp / n) * (pt / (1 - pt))
        out.append((pt, nb))
    return pd.DataFrame(out, columns=["threshold", "net_benefit"])

ths = np.linspace(0.01, 0.50, 50)

# Use calibrated test scores if available, else raw
p_test_policy = p_test_xgb_platt if "p_test_xgb_platt" in globals() else p_test_xgb
dc_xgb = decision_curve(y_test, p_test_policy, ths)
dc_persist = decision_curve(y_test, p_test_persist, ths)

plt.figure(figsize=(6, 5))
plt.plot(dc_xgb["threshold"], dc_xgb["net_benefit"], label="XGB (policy score)")
plt.plot(dc_persist["threshold"], dc_persist["net_benefit"], label="Persistence")
plt.axhline(0.0, linestyle="--", label="Treat none")
plt.xlabel("Threshold")
plt.ylabel("Net benefit")
plt.title("Decision curve (TEST, Task A)")
plt.legend()
plt.show()


In [35]:
# =============================================================================
# 10. Decision Support and Scenario Analysis (non-proxy shocks)
# =============================================================================
# Requirements:
#   - No deleveraging / leverage-ratio shocks (proxy-mechanical)
#   - No coverage shocks
#   - Scenarios must propagate through engineered features and event indicators
#
# We implement "primitive shocks" (dv, oancf, capx, che) and RECOMPUTE:
#   - continuous ratios (sp_cfo_to_debt, focf, sp_focf_to_debt, dcf, sp_dcf_to_debt)
#   - event indicators that depend on current vs lag values (stored in *_l1 columns)

EVENT_THRESHOLDS = {
    "div_cut_thr": float(locals().get("cut_thr", 0.75)),
    "cfo_drop_thr": float(locals().get("cfo_drop_thr", 0.75)),
    "focf_drop_thr": float(locals().get("focf_drop_thr", 0.75)),
    "cash_drop_thr": float(locals().get("che_drop_thr", 0.75)),
    "cr_drop_thr": float(locals().get("cr_drop_thr", 0.75)),
}

def recompute_features_for_rows(df_rows: pd.DataFrame) -> pd.DataFrame:
    out = df_rows.copy()

    # Recompute flows
    out["oancf"] = pd.to_numeric(out.get("oancf"), errors="coerce")
    out["capx"]  = pd.to_numeric(out.get("capx"), errors="coerce") if "capx" in out.columns else np.nan
    out["focf"]  = out["oancf"] - out["capx"]

    dv = pd.to_numeric(out.get("dv"), errors="coerce") if "dv" in out.columns else pd.Series(np.nan, index=out.index)
    prstkc = pd.to_numeric(out.get("prstkc"), errors="coerce") if "prstkc" in out.columns else pd.Series(np.nan, index=out.index)
    dv0 = dv.fillna(0.0)
    prstkc0 = prstkc.fillna(0.0)
    out["dcf"] = out["focf"] - dv0 - prstkc0

    # Ratios (NON-proxy channels)
    td = pd.to_numeric(out.get("total_debt"), errors="coerce")
    td_pos = td.notna() & (td > 0)
    out["sp_cfo_to_debt"]  = np.where(td_pos, safe_divide(out["oancf"], td), np.nan)
    out["sp_focf_to_debt"] = np.where(td_pos, safe_divide(out["focf"], td), np.nan)
    out["sp_dcf_to_debt"]  = np.where(td_pos, safe_divide(out["dcf"], td), np.nan)

    # Dividend events (need dv_l1)
    if "dv" in out.columns and "dv_l1" in out.columns:
        dv_l1 = pd.to_numeric(out["dv_l1"], errors="coerce")
        dv_ratio = safe_divide(dv, dv_l1)
        valid_dv = dv_l1 > 0

        out["evt_div_suspend"] = (valid_dv & (dv == 0)).astype("int8")
        out["evt_div_cut"]     = (valid_dv & (dv_ratio < EVENT_THRESHOLDS["div_cut_thr"]) & (dv > 0)).astype("int8")
        out["evt_div_init"]    = ((dv_l1.fillna(0) == 0) & (dv > 0)).astype("int8")

    # CFO events (need oancf_l1)
    if "oancf_l1" in out.columns:
        cfo_l1 = pd.to_numeric(out["oancf_l1"], errors="coerce")
        cfo_ratio = safe_divide(out["oancf"], cfo_l1)
        valid_cfo = cfo_l1 > 0

        out["evt_cfo_neg"] = (out["oancf"] < 0).astype("int8")
        out["evt_cfo_collapse"] = (valid_cfo & (cfo_ratio < EVENT_THRESHOLDS["cfo_drop_thr"])).astype("int8")

    # FOCF events (need focf_l1)
    if "focf_l1" in out.columns:
        focf_l1 = pd.to_numeric(out["focf_l1"], errors="coerce")
        focf_ratio = safe_divide(out["focf"], focf_l1)
        valid_focf = focf_l1 > 0

        out["evt_focf_neg"] = (out["focf"] < 0).astype("int8")
        out["evt_focf_collapse"] = (valid_focf & (focf_ratio < EVENT_THRESHOLDS["focf_drop_thr"])).astype("int8")

    # Cash drawdown (need che_l1)
    if "che" in out.columns and "che_l1" in out.columns:
        che = pd.to_numeric(out["che"], errors="coerce")
        che_l1 = pd.to_numeric(out["che_l1"], errors="coerce")
        che_ratio = safe_divide(che, che_l1)
        valid_che = che_l1 > 0
        out["evt_cash_drawdown"] = (valid_che & (che_ratio < EVENT_THRESHOLDS["cash_drop_thr"])).astype("int8")

    # Liquidity squeeze (current ratio) if present
    if ("act" in out.columns) and ("lct" in out.columns) and ("current_ratio_l1" in out.columns):
        act = pd.to_numeric(out["act"], errors="coerce")
        lct = pd.to_numeric(out["lct"], errors="coerce")
        out["current_ratio"] = safe_divide(act, lct)
        cr_ratio = safe_divide(out["current_ratio"], pd.to_numeric(out["current_ratio_l1"], errors="coerce"))
        valid_cr = pd.to_numeric(out["current_ratio_l1"], errors="coerce") > 0
        out["evt_liquidity_squeeze"] = (valid_cr & (cr_ratio < EVENT_THRESHOLDS["cr_drop_thr"])).astype("int8")

    return out

# Choose a representative case: highest XGB PD in TEST
test_pd = pd.Series(p_test_xgb, index=test.index, name="pd_xgb")
base_idx = test_pd.idxmax()
base_row = test.loc[base_idx].copy()
base_pd = float(test_pd.loc[base_idx])

print(f"Base case (TEST) index={base_idx} | PD={base_pd:.4f}")

scenarios = {
    "Base": {},
    "Dividend suspension (dv=0)": {"dv": 0.0} if "dv" in test.columns else {},
    "CFO shock (-30%)": {"oancf": float(base_row.get("oancf", np.nan)) * 0.70} if "oancf" in test.columns else {},
    "Capex surge (+25%)": {"capx": float(base_row.get("capx", np.nan)) * 1.25} if "capx" in test.columns else {},
    "Cash drawdown (-20%)": {"che": float(base_row.get("che", np.nan)) * 0.80} if "che" in test.columns else {},
}

results = []
for name, adj in scenarios.items():
    row_s = base_row.copy()
    for k, v in adj.items():
        row_s[k] = v

    tmp = pd.DataFrame([row_s])
    tmp = recompute_features_for_rows(tmp)

    # Apply the same preprocessing pipeline as training
    tmp = clip_and_impute(tmp)
    tmp.loc[:, continuous_feats] = scaler.transform(tmp[continuous_feats])

    x_in = tmp[MODEL_FEATS]
    pd_s = float(xgb_clf.predict_proba(x_in)[:, 1][0])
    results.append((name, pd_s, pd_s - base_pd))

pd_results = pd.DataFrame(results, columns=["Scenario", "PD", "ΔPD"]).sort_values("PD", ascending=False)
display(pd_results)


Unnamed: 0,Scenario,PD,ΔPD
0,Base,0.194363,0.139428
1,Dividend suspension (dv=0),0.194363,0.139428
2,CFO shock (-30%),0.195867,0.140932
3,Capex surge (+25%),0.191345,0.13641
4,Cash drawdown (-20%),0.194363,0.139428
