# Financial Distress Prediction Pipeline (Adjusted)

This notebook follows the same structure as your current streamlined pipeline while fixing the material methodological and technical issues:

- Panel-safe lag/lead construction (sorting enforced)
- Missingness-aware distress proxy (avoids NaN → False “healthy” bias)
- Leakage-free event threshold calibration (train-only)
- Event indicators restricted to non-proxy channels (no coverage/leverage/EBITDA-proxy events)
- Stable preprocessing (train-fitted clipping + median imputation + scaling)
- Correct TreeSHAP extraction for XGBoost
- Scenario analysis that propagates through engineered features (no deleveraging/coverage scenarios; no proxy-related shocks)


In [25]:
# =============================================================================
# 0. Project Overview — Financial Distress Prediction Pipeline
# =============================================================================
# This notebook follows the standard Data Science Lifecycle:
#   (1) Data Cleaning and Quality Diagnostics
#   (2) Feature Engineering and Label Construction
#   (3) Event Indicators (interpretable drivers)
#   (4) Train / Validation / Test Split and Preprocessing
#   (5) Logit Models (supervised benchmark)
#   (6) Tree-based Model (XGBoost with native TreeSHAP explainability)
#   (7) Evaluation and Benchmarks (Persistence vs. Early Warning)
#   (8) Decision Support and Scenario Analysis
#
# Adjustments (relative to the simplified script you shared):
#   - Enforce panel sorting BEFORE any groupby shift
#   - Missingness-aware distress label construction (avoid NaN -> False)
#   - Train-only quantile thresholds for event shocks (avoid leakage)
#   - Events exclude coverage/leverage/EBITDA-proxy channels
#   - Preprocessing: train-fitted clipping + median imputation + scaling
#   - Correct SHAP contributions extraction via Booster.predict(pred_contribs=True)
#   - Scenario analysis recomputes engineered features and events; no leverage/coverage scenarios
# =============================================================================

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, log_loss

import xgboost as xgb
import matplotlib.pyplot as plt

from IPython.display import display


In [26]:
# =============================================================================
# 1. Data Import and Cleaning
# =============================================================================

DATA_PATH = "data.csv"  # file must be in the same folder
df = pd.read_csv(DATA_PATH, low_memory=False)

# Basic formatting
df.columns = df.columns.str.lower().str.strip()

# Drop duplicates if any
df = df.drop_duplicates(subset=["gvkey", "fyear"], keep="last")

# Convert numeric columns (best-effort)
num_cols = df.select_dtypes(include=["float64", "int64", "int32", "float32"]).columns
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Panel safety: enforce stable ordering BEFORE any lag/lead
df = df.sort_values(["gvkey", "fyear"]).reset_index(drop=True)

print(f"Dataset loaded: {df.shape[0]:,} firm-year observations, {df.shape[1]} variables.")
print(f"Years: {int(df['fyear'].min())}–{int(df['fyear'].max())}")


Dataset loaded: 75,005 firm-year observations, 89 variables.
Years: 2014–2024


In [27]:
# =============================================================================
# 2. Helper Functions
# =============================================================================

def safe_divide(a, b):
    # Numerically stable division; returns NaN for non-finite results.
    a = pd.to_numeric(a, errors="coerce")
    b = pd.to_numeric(b, errors="coerce")
    with np.errstate(divide="ignore", invalid="ignore"):
        res = a / b
    res = res.replace([np.inf, -np.inf], np.nan)
    return res

def safe_log(x):
    x = pd.to_numeric(x, errors="coerce")
    out = pd.Series(np.nan, index=x.index, dtype="float64")
    m = x > 0
    out.loc[m] = np.log(x.loc[m])
    return out


In [28]:
# =============================================================================
# 3. Feature Engineering and Label Construction
# =============================================================================

# --- Debt and capital components (missingness-aware) ---
dlc = pd.to_numeric(df.get("dlc"), errors="coerce")
dltt = pd.to_numeric(df.get("dltt"), errors="coerce")

total_debt = dlc.fillna(0) + dltt.fillna(0)
# If both components missing, keep total_debt as missing (do NOT coerce to 0)
both_debt_missing = dlc.isna() & dltt.isna()
total_debt.loc[both_debt_missing] = np.nan
df["total_debt"] = total_debt

seq = pd.to_numeric(df.get("seq"), errors="coerce")
mibt = pd.to_numeric(df.get("mibt"), errors="coerce") if "mibt" in df.columns else pd.Series(np.nan, index=df.index)

equity_plus_mi = seq.fillna(0) + mibt.fillna(0)
both_eq_missing = seq.isna() & mibt.isna()
equity_plus_mi.loc[both_eq_missing] = np.nan
df["equity_plus_mi_sp"] = equity_plus_mi

df["total_capital_sp"] = df["total_debt"] + df["equity_plus_mi_sp"]

# --- Core ratios (continuous features) ---
oibdp = pd.to_numeric(df.get("oibdp"), errors="coerce")
xint  = pd.to_numeric(df.get("xint"), errors="coerce")

# FFO proxy: keep taxes adjustment only when available (avoid hard-coding zeros)
txt  = pd.to_numeric(df.get("txt"), errors="coerce") if "txt" in df.columns else pd.Series(np.nan, index=df.index)
txdc = pd.to_numeric(df.get("txdc"), errors="coerce") if "txdc" in df.columns else pd.Series(np.nan, index=df.index)
txach= pd.to_numeric(df.get("txach"), errors="coerce") if "txach" in df.columns else pd.Series(np.nan, index=df.index)

tax_adj = (txt - txdc - txach)
ffo_base = oibdp - xint
ffo_adj = ffo_base.copy()
ffo_adj.loc[tax_adj.notna()] = (ffo_base - tax_adj).loc[tax_adj.notna()]

df["sp_debt_to_capital"] = safe_divide(df["total_debt"], df["total_capital_sp"])
df["sp_debt_to_ebitda"]  = safe_divide(df["total_debt"], oibdp)

df["sp_ffo_to_debt"]     = safe_divide(ffo_adj, df["total_debt"])

oancf = pd.to_numeric(df.get("oancf"), errors="coerce")
capx  = pd.to_numeric(df.get("capx"), errors="coerce")
df["sp_cfo_to_debt"]     = safe_divide(oancf, df["total_debt"])

df["focf"] = oancf - capx
df["sp_focf_to_debt"]    = safe_divide(df["focf"], df["total_debt"])

# Optional size / market variables (allowed)
if "at" in df.columns:
    df["log_at"] = safe_log(df["at"])
if "mkvalt" in df.columns:
    df["log_mkvalt"] = safe_log(df["mkvalt"])

# --- Distress proxy (missingness-aware: do NOT let NaNs become "healthy") ---
td = df["total_debt"]
cap = df["total_capital_sp"]
eb = oibdp

ffo_to_debt_pct     = 100.0 * safe_divide(ffo_adj, td)
debt_to_capital_pct = 100.0 * safe_divide(td, cap)
debt_to_ebitda      = safe_divide(td, eb)

valid_hl = ffo_to_debt_pct.notna() & debt_to_capital_pct.notna() & debt_to_ebitda.notna()
hl_ffo = valid_hl & (ffo_to_debt_pct < 15)
hl_cap = valid_hl & (debt_to_capital_pct > 55)
hl_deb = valid_hl & (debt_to_ebitda > 4.5)
is_highly_leveraged = hl_ffo & hl_cap & hl_deb

valid_seq = seq.notna()
is_equity_negative = valid_seq & (seq < 0)

distress = pd.Series(np.nan, index=df.index, dtype="float64")
info_mask = valid_hl | valid_seq
distress.loc[info_mask] = (is_highly_leveraged | is_equity_negative).loc[info_mask].astype("int8")

df["distress_dummy"] = distress.astype("float64")  # keep NaN where label is not defensible

# Target: next year's distress (panel-safe due to sorting above)
df["target_next_year_distress"] = df.groupby("gvkey")["distress_dummy"].shift(-1)

# Modeling sample restriction: require (i) next-year label and (ii) current-year distress for persistence baseline
df_model = df[df["target_next_year_distress"].notna() & df["distress_dummy"].notna()].copy()
df_model["target_next_year_distress"] = df_model["target_next_year_distress"].astype("int8")
df_model["distress_dummy"] = df_model["distress_dummy"].astype("int8")

print(f"Modeling sample: {len(df_model):,} firm-years with defensible current distress and next-year labels.")


Modeling sample: 63,599 firm-years with defensible current distress and next-year labels.


In [29]:
# =============================================================================
# 4. Event Indicators — Interpretable Drivers (NON-proxy channels only)
# =============================================================================
# Constraint: exclude coverage/leverage/EBITDA-proxy events (anything mechanically embedded in distress_proxy).
# We therefore focus on:
#   - Dividend policy moments (cuts / suspensions / initiations)
#   - Cash-flow shocks (CFO / FOCF)
#   - Liquidity drawdowns (cash)
# Thresholds are calibrated on TRAIN ONLY (<= 2020) to avoid leakage.

# Define splits first for leakage-free calibration
train_mask = df_model["fyear"] <= 2020

# --- Dividend moments ---
if "dv" in df_model.columns:
    dv = pd.to_numeric(df_model["dv"], errors="coerce")
    df_model["dv_abs"] = dv.abs()
    df_model["dv_abs_l1"] = df_model.groupby("gvkey")["dv_abs"].shift(1)

    payer = (df_model["dv_abs_l1"] > 0.01) & df_model["dv_abs_l1"].notna()

    df_model["evt_div_suspend"]  = (payer & (df_model["dv_abs"] <= 0.01)).astype("int8")
    df_model["evt_div_initiate"] = ((~payer) & (df_model["dv_abs"] > 0.01) & df_model["dv_abs_l1"].notna()).astype("int8")

    div_chg = safe_divide(df_model["dv_abs"], df_model["dv_abs_l1"])
    cut_q = div_chg[train_mask & payer].quantile(0.10)
    cut_thr = float(np.clip(cut_q, 0.25, 0.90))
    df_model["evt_div_cut"] = (payer & (div_chg < cut_thr)).astype("int8")
else:
    df_model["evt_div_suspend"]  = 0
    df_model["evt_div_initiate"] = 0
    df_model["evt_div_cut"]      = 0

# --- CFO shocks ---
oancf = pd.to_numeric(df_model.get("oancf"), errors="coerce")
df_model["oancf_l1"] = df_model.groupby("gvkey")["oancf"].shift(1)

df_model["evt_cfo_neg"] = (oancf < 0).astype("int8")

cfo_ratio = safe_divide(oancf, df_model["oancf_l1"])
valid_cfo = df_model["oancf_l1"] > 0
cfo_drop_q = cfo_ratio[train_mask & valid_cfo].quantile(0.05)
cfo_drop_thr = float(np.clip(cfo_drop_q, 0.10, 0.90))
df_model["evt_cfo_collapse"] = (valid_cfo & (cfo_ratio < cfo_drop_thr)).astype("int8")

# --- FOCF shocks (oancf - capx) ---
if "capx" in df_model.columns:
    capx = pd.to_numeric(df_model["capx"], errors="coerce")
    focf = pd.to_numeric(df_model["focf"], errors="coerce")
    df_model["focf_l1"] = df_model.groupby("gvkey")["focf"].shift(1)

    df_model["evt_focf_neg"] = (focf < 0).astype("int8")

    focf_ratio = safe_divide(focf, df_model["focf_l1"])
    valid_focf = df_model["focf_l1"] > 0
    focf_drop_q = focf_ratio[train_mask & valid_focf].quantile(0.05)
    focf_drop_thr = float(np.clip(focf_drop_q, 0.10, 0.90))
    df_model["evt_focf_collapse"] = (valid_focf & (focf_ratio < focf_drop_thr)).astype("int8")
else:
    df_model["evt_focf_neg"] = 0
    df_model["evt_focf_collapse"] = 0

# --- Liquidity drawdown: cash drop ---
if "che" in df_model.columns:
    che = pd.to_numeric(df_model["che"], errors="coerce")
    df_model["che_l1"] = df_model.groupby("gvkey")["che"].shift(1)
    che_ratio = safe_divide(che, df_model["che_l1"])
    valid_che = df_model["che_l1"] > 0
    che_drop_q = che_ratio[train_mask & valid_che].quantile(0.05)
    che_drop_thr = float(np.clip(che_drop_q, 0.10, 0.90))
    df_model["evt_cash_drawdown"] = (valid_che & (che_ratio < che_drop_thr)).astype("int8")
else:
    df_model["evt_cash_drawdown"] = 0


event_feats = [c for c in df_model.columns if c.startswith("evt_")]
print(f"Event indicators included: {event_feats}")

Event indicators included: ['evt_div_suspend', 'evt_div_initiate', 'evt_div_cut', 'evt_cfo_neg', 'evt_cfo_collapse', 'evt_focf_neg', 'evt_focf_collapse', 'evt_cash_drawdown']


In [30]:
# =============================================================================
# 5. Train / Validation / Test Split and Preprocessing
# =============================================================================

train = df_model[df_model["fyear"] <= 2020].copy()
val   = df_model[df_model["fyear"] == 2021].copy()
test  = df_model[df_model["fyear"] >= 2022].copy()

TARGET_COL = "target_next_year_distress"

continuous_feats = [
    "sp_debt_to_capital", "sp_debt_to_ebitda",
    "sp_ffo_to_debt", "sp_cfo_to_debt", "sp_focf_to_debt"
]
for opt in ["log_at", "log_mkvalt"]:
    if opt in df_model.columns:
        continuous_feats.append(opt)

event_feats = [c for c in df_model.columns if c.startswith("evt_")]
MODEL_FEATS = continuous_feats + event_feats

# --- Stabilize ratios (train-fitted clipping) + train-median imputation ---
clip_bounds = {}
train_medians = {}

for col in continuous_feats:
    s = pd.to_numeric(train[col], errors="coerce")
    lo = s.quantile(0.01)
    hi = s.quantile(0.99)
    clip_bounds[col] = (float(lo), float(hi))
    train_medians[col] = float(s.median())

def clip_and_impute(df_in):
    df_out = df_in.copy()
    for col in continuous_feats:
        lo, hi = clip_bounds[col]
        df_out[col] = pd.to_numeric(df_out[col], errors="coerce").clip(lower=lo, upper=hi)
        df_out[col] = df_out[col].fillna(train_medians[col])
    for col in event_feats:
        df_out[col] = pd.to_numeric(df_out[col], errors="coerce").fillna(0).astype("int8")
    return df_out

train = clip_and_impute(train)
val   = clip_and_impute(val)
test  = clip_and_impute(test)

# Standardize continuous features (train statistics)
scaler = StandardScaler()
scaler.fit(train[continuous_feats])

train.loc[:, continuous_feats] = scaler.transform(train[continuous_feats])
val.loc[:, continuous_feats]   = scaler.transform(val[continuous_feats])
test.loc[:, continuous_feats]  = scaler.transform(test[continuous_feats])

print(f"Split sizes: train={len(train):,} | val={len(val):,} | test={len(test):,}")
print(f"Features: {len(MODEL_FEATS)} (continuous={len(continuous_feats)} + events={len(event_feats)})")
print("Continuous feats:", continuous_feats)
print("Event feats:", event_feats)


Split sizes: train=44,780 | val=6,415 | test=12,404
Features: 15 (continuous=7 + events=8)
Continuous feats: ['sp_debt_to_capital', 'sp_debt_to_ebitda', 'sp_ffo_to_debt', 'sp_cfo_to_debt', 'sp_focf_to_debt', 'log_at', 'log_mkvalt']
Event feats: ['evt_div_suspend', 'evt_div_initiate', 'evt_div_cut', 'evt_cfo_neg', 'evt_cfo_collapse', 'evt_focf_neg', 'evt_focf_collapse', 'evt_cash_drawdown']


In [31]:
# =============================================================================
# 6. Logit Model (Benchmark)
# =============================================================================

X_train, y_train = train[MODEL_FEATS], train[TARGET_COL].astype(int)
X_val, y_val     = val[MODEL_FEATS],   val[TARGET_COL].astype(int)
X_test, y_test   = test[MODEL_FEATS],  test[TARGET_COL].astype(int)

best_C = 0.1
logit = LogisticRegression(C=best_C, max_iter=1000, solver="lbfgs")
logit.fit(X_train, y_train)

p_val = logit.predict_proba(X_val)[:, 1]
p_test = logit.predict_proba(X_test)[:, 1]

def evaluate_split(y_true, p_pred):
    return pd.Series({
        "AUC": roc_auc_score(y_true, p_pred),
        "AP": average_precision_score(y_true, p_pred),
        "Brier": brier_score_loss(y_true, p_pred),
        "LogLoss": log_loss(y_true, p_pred),
        "PosRate": float(np.mean(y_true)),
        "N": int(len(y_true)),
    })

eval_val = evaluate_split(y_val, p_val)
eval_test = evaluate_split(y_test, p_test)

print("Validation performance (Logit):\n", eval_val.round(4))
print("Test performance (Logit):\n", eval_test.round(4))


Validation performance (Logit):
 AUC           0.6953
AP            0.3729
Brier         0.1260
LogLoss       0.4123
PosRate       0.1668
N          6415.0000
dtype: float64
Test performance (Logit):
 AUC            0.6885
AP             0.3725
Brier          0.1365
LogLoss        0.4384
PosRate        0.1831
N          12404.0000
dtype: float64


In [32]:
# =============================================================================
# 7. Tree-based Model (XGBoost)
# =============================================================================

xgb_params = dict(
    objective="binary:logistic",
    eval_metric="aucpr",
    learning_rate=0.05,
    max_depth=4,
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
)

xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

p_val_xgb = xgb_clf.predict_proba(X_val)[:, 1]
p_test_xgb = xgb_clf.predict_proba(X_test)[:, 1]

eval_val_xgb = evaluate_split(y_val, p_val_xgb)
eval_test_xgb = evaluate_split(y_test, p_test_xgb)

print("Validation performance (XGB):\n", eval_val_xgb.round(4))
print("Test performance (XGB):\n", eval_test_xgb.round(4))


Validation performance (XGB):
 AUC           0.9068
AP            0.7292
Brier         0.0767
LogLoss       0.2603
PosRate       0.1668
N          6415.0000
dtype: float64
Test performance (XGB):
 AUC            0.9112
AP             0.7540
Brier          0.0788
LogLoss        0.2671
PosRate        0.1831
N          12404.0000
dtype: float64


In [33]:
# =============================================================================
# 8. Explainability (TreeSHAP via XGBoost pred_contribs)
# =============================================================================

booster = xgb_clf.get_booster()
dval = xgb.DMatrix(X_val, feature_names=MODEL_FEATS)

# pred_contribs=True returns SHAP contributions per feature plus a bias term (last column)
shap_val = booster.predict(dval, pred_contribs=True)
shap_cols = MODEL_FEATS + ["bias"]
shap_df = pd.DataFrame(shap_val, columns=shap_cols)

abs_mean = shap_df[MODEL_FEATS].abs().mean().sort_values(ascending=False)

print("Mean absolute SHAP contributions (validation):")
display(abs_mean.head(20))


Mean absolute SHAP contributions (validation):


sp_debt_to_capital    1.133036
sp_ffo_to_debt        0.527658
log_at                0.275150
sp_debt_to_ebitda     0.186086
log_mkvalt            0.145138
sp_focf_to_debt       0.142577
sp_cfo_to_debt        0.136545
evt_cfo_neg           0.071492
evt_focf_neg          0.054369
evt_cfo_collapse      0.019340
evt_cash_drawdown     0.014455
evt_div_cut           0.007996
evt_focf_collapse     0.005306
evt_div_suspend       0.003284
evt_div_initiate      0.002192
dtype: float32

In [34]:
# =============================================================================
# 9. Evaluation: Persistence vs. Early Warning
# =============================================================================

eps = 1e-3
p_persist_val  = np.clip(val["distress_dummy"].astype(float),  eps, 1 - eps)
p_persist_test = np.clip(test["distress_dummy"].astype(float), eps, 1 - eps)

benchmarks = pd.DataFrame([
    ["VAL",  "Persistence", *evaluate_split(y_val,  p_persist_val)],
    ["VAL",  "Logit",       *eval_val],
    ["VAL",  "XGB",         *eval_val_xgb],
    ["TEST", "Persistence", *evaluate_split(y_test, p_persist_test)],
    ["TEST", "Logit",       *eval_test],
    ["TEST", "XGB",         *eval_test_xgb],
], columns=["Split", "Model", "AUC", "AP", "Brier", "LogLoss", "PosRate", "N"])

display(benchmarks)

# Early-warning subset: cases not distressed at t but distressed at t+1
val_ew  = (val["distress_dummy"] == 0) & (val[TARGET_COL] == 1)
test_ew = (test["distress_dummy"] == 0) & (test[TARGET_COL] == 1)

def ew_summary(mask, p_pred, label):
    if int(mask.sum()) == 0:
        return pd.Series({"Subset": label, "N": 0, "MeanPD": np.nan, "MedianPD": np.nan})
    return pd.Series({
        "Subset": label,
        "N": int(mask.sum()),
        "MeanPD": float(np.mean(p_pred[mask.values])),
        "MedianPD": float(np.median(p_pred[mask.values])),
    })

ew = pd.DataFrame([
    ew_summary(val_ew,  p_val_xgb,  "VAL: 0→1 transitions (XGB PD)"),
    ew_summary(test_ew, p_test_xgb, "TEST: 0→1 transitions (XGB PD)"),
])

display(ew)


Unnamed: 0,Split,Model,AUC,AP,Brier,LogLoss,PosRate,N
0,VAL,Persistence,0.806236,0.528628,0.100968,0.699751,0.166797,6415.0
1,VAL,Logit,0.695316,0.372922,0.126003,0.412337,0.166797,6415.0
2,VAL,XGB,0.906799,0.729185,0.076717,0.260257,0.166797,6415.0
3,TEST,Persistence,0.816405,0.569372,0.09993,0.692567,0.183086,12404.0
4,TEST,Logit,0.688549,0.372482,0.136455,0.438371,0.183086,12404.0
5,TEST,XGB,0.911201,0.754015,0.078791,0.26707,0.183086,12404.0


Unnamed: 0,Subset,N,MeanPD,MedianPD
0,VAL: 0→1 transitions (XGB PD),356,0.183557,0.139482
1,TEST: 0→1 transitions (XGB PD),716,0.203632,0.162034


In [35]:
# =============================================================================
# 10. Decision Support and Scenario Analysis (non-proxy shocks)
# =============================================================================
# Requirements:
#   - No deleveraging / leverage-ratio shocks (proxy-mechanical)
#   - No coverage shocks
#   - Scenarios must propagate through engineered features and event indicators
#
# We implement "primitive shocks" (dv, oancf, capx, che) and RECOMPUTE:
#   - continuous ratios (sp_cfo_to_debt, focf, sp_focf_to_debt, etc.)
#   - event indicators that depend on current vs lag values (stored in *_l1 columns)

EVENT_THRESHOLDS = {
    "div_cut_thr": float(locals().get("cut_thr", 0.75)),
    "cfo_drop_thr": float(locals().get("cfo_drop_thr", 0.75)),
    "focf_drop_thr": float(locals().get("focf_drop_thr", 0.75)),
    "cash_drop_thr": float(locals().get("che_drop_thr", 0.75)),
}

def recompute_features_for_rows(df_rows: pd.DataFrame) -> pd.DataFrame:
    out = df_rows.copy()

    out["sp_debt_to_capital"] = safe_divide(out["total_debt"], out["total_capital_sp"])
    out["sp_debt_to_ebitda"]  = safe_divide(out["total_debt"], out["oibdp"])

    xint = pd.to_numeric(out.get("xint"), errors="coerce")
    oibdp = pd.to_numeric(out.get("oibdp"), errors="coerce")

    txt  = pd.to_numeric(out.get("txt"), errors="coerce") if "txt" in out.columns else pd.Series(np.nan, index=out.index)
    txdc = pd.to_numeric(out.get("txdc"), errors="coerce") if "txdc" in out.columns else pd.Series(np.nan, index=out.index)
    txach= pd.to_numeric(out.get("txach"), errors="coerce") if "txach" in out.columns else pd.Series(np.nan, index=out.index)

    tax_adj = (txt - txdc - txach)
    ffo_base = oibdp - xint
    ffo_adj = ffo_base.copy()
    ffo_adj.loc[tax_adj.notna()] = (ffo_base - tax_adj).loc[tax_adj.notna()]

    out["sp_ffo_to_debt"] = safe_divide(ffo_adj, out["total_debt"])

    oancf = pd.to_numeric(out.get("oancf"), errors="coerce")
    capx  = pd.to_numeric(out.get("capx"), errors="coerce")

    out["sp_cfo_to_debt"]  = safe_divide(oancf, out["total_debt"])
    out["focf"]            = oancf - capx
    out["sp_focf_to_debt"] = safe_divide(out["focf"], out["total_debt"])

    # --- Events ---
    if "dv" in out.columns and "dv_abs_l1" in out.columns:
        dv = pd.to_numeric(out["dv"], errors="coerce")
        out["dv_abs"] = dv.abs()
        payer = (out["dv_abs_l1"] > 0.01) & out["dv_abs_l1"].notna()

        out["evt_div_suspend"]  = (payer & (out["dv_abs"] <= 0.01)).astype("int8")
        out["evt_div_initiate"] = ((~payer) & (out["dv_abs"] > 0.01) & out["dv_abs_l1"].notna()).astype("int8")

        div_chg = safe_divide(out["dv_abs"], out["dv_abs_l1"])
        out["evt_div_cut"] = (payer & (div_chg < EVENT_THRESHOLDS["div_cut_thr"])).astype("int8")

    if "oancf_l1" in out.columns:
        cfo_ratio = safe_divide(oancf, out["oancf_l1"])
        valid_cfo = out["oancf_l1"] > 0
        out["evt_cfo_neg"] = (oancf < 0).astype("int8")
        out["evt_cfo_collapse"] = (valid_cfo & (cfo_ratio < EVENT_THRESHOLDS["cfo_drop_thr"])).astype("int8")

    if "focf_l1" in out.columns:
        focf_ratio = safe_divide(out["focf"], out["focf_l1"])
        valid_focf = out["focf_l1"] > 0
        out["evt_focf_neg"] = (out["focf"] < 0).astype("int8")
        out["evt_focf_collapse"] = (valid_focf & (focf_ratio < EVENT_THRESHOLDS["focf_drop_thr"])).astype("int8")

    if "che" in out.columns and "che_l1" in out.columns:
        che = pd.to_numeric(out["che"], errors="coerce")
        che_ratio = safe_divide(che, out["che_l1"])
        valid_che = out["che_l1"] > 0
        out["evt_cash_drawdown"] = (valid_che & (che_ratio < EVENT_THRESHOLDS["cash_drop_thr"])).astype("int8")

    return out

base_row = test.iloc[0].copy()
base_pd = float(p_test_xgb[0])

scenarios = {
    "Base": {},
    "Dividend suspension (dv=0)": {"dv": 0.0} if "dv" in test.columns else {},
    "CFO shock (-30%)": {"oancf": float(base_row.get("oancf", np.nan)) * 0.70} if "oancf" in test.columns else {},
    "Capex surge (+25%)": {"capx": float(base_row.get("capx", np.nan)) * 1.25} if "capx" in test.columns else {},
    "Cash drawdown (-20%)": {"che": float(base_row.get("che", np.nan)) * 0.80} if "che" in test.columns else {},
}

results = []
for name, adj in scenarios.items():
    row_s = base_row.copy()
    for k, v in adj.items():
        row_s[k] = v

    tmp = pd.DataFrame([row_s])
    tmp = recompute_features_for_rows(tmp)

    tmp = clip_and_impute(tmp)
    tmp.loc[:, continuous_feats] = scaler.transform(tmp[continuous_feats])

    x_in = tmp[MODEL_FEATS]
    pd_s = float(xgb_clf.predict_proba(x_in)[:, 1][0])
    results.append((name, pd_s, pd_s - base_pd))

pd_results = pd.DataFrame(results, columns=["Scenario", "PD", "ΔPD"])
display(pd_results)


Unnamed: 0,Scenario,PD,ΔPD
0,Base,0.194363,0.139428
1,Dividend suspension (dv=0),0.194363,0.139428
2,CFO shock (-30%),0.195867,0.140932
3,Capex surge (+25%),0.191345,0.13641
4,Cash drawdown (-20%),0.194363,0.139428
