In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# =============================================================================
# Goal of this cell (beginner-friendly):
# 1) Load panel data (firm-year).
# 2) Clean types + de-duplicate firm-year rows.
# 3) Impute missing raw inputs using training-only information.
# 4) Build financial ratios ("features") + next-year distress target.
# 5) Create Train / Validation / Test splits by year.
# 6) Winsorize + standardize features, then print basic diagnostics.
# =============================================================================

# ----------------------------
# Configuration / Hyperparams
# ----------------------------
FILE_NAME = "../data.csv"

TRAIN_CUTOFF_LABEL_YEAR = 2022  # label_year <= cutoff → train/val pool; after cutoff → test
VAL_YEARS = 1  # last N years within the pool are validation
N_SPLITS_TIME_CV = 5  # rolling time-based folds for sanity checks

WINSOR_LOWER_Q = 0.01  # winsorization lower quantile (train-only)
WINSOR_UPPER_Q = 0.99  # winsorization upper quantile (train-only)

REQUIRED_KEYS = ["gvkey", "fyear"]


# ----------------------------
# Small helper utilities
# ----------------------------

def to_float_numpy(x) -> np.ndarray:
    """Convert series/array-like to float numpy array, coercing non-numeric to NaN."""
    s = pd.to_numeric(x, errors="coerce")
    return s.to_numpy(dtype=float) if hasattr(s, "to_numpy") else np.asarray(s, dtype=float)


def safe_divide(a, b) -> np.ndarray:
    """Elementwise divide a/b with NaN when division is invalid (0 or non-finite)."""
    a = to_float_numpy(a)
    b = to_float_numpy(b)
    out = np.full_like(a, np.nan, dtype=float)
    np.divide(a, b, out=out, where=(b != 0) & np.isfinite(a) & np.isfinite(b))
    return out


def rolling_year_folds(
        df_in: pd.DataFrame, year_col: str = "label_year", n_splits: int = 5, min_train_years: int = 3
) -> list[tuple[np.ndarray, np.ndarray, np.ndarray, int]]:
    """
    Create expanding-window time folds:
      train years: first (min_train_years + k) years
      val year:    next year
    Returns: list of (train_idx, val_idx, train_years, val_year)
    """
    years_sorted = np.sort(df_in[year_col].dropna().unique())
    if len(years_sorted) <= min_train_years:
        return []
    n_splits = min(n_splits, len(years_sorted) - min_train_years)

    folds_out = []
    for k in range(n_splits):
        train_years = years_sorted[: min_train_years + k]
        val_year = int(years_sorted[min_train_years + k])

        train_idx = df_in.index[df_in[year_col].isin(train_years)].to_numpy()
        val_idx = df_in.index[df_in[year_col] == val_year].to_numpy()
        folds_out.append((train_idx, val_idx, train_years, val_year))

    return folds_out


# =============================================================================
# 1) Load data + basic cleaning
# =============================================================================
df = pd.read_csv(FILE_NAME, low_memory=False)

# Convert datadate if present
if "datadate" in df.columns:
    df["datadate"] = pd.to_datetime(df["datadate"], errors="coerce")

# Create stable firm id + de-duplicate firm-year (keep last record)
df["firm_id"] = df["gvkey"]
df = (
    df.sort_values(["firm_id", "fyear"])
    .drop_duplicates(subset=["firm_id", "fyear"], keep="last")
    .reset_index(drop=True)
)

# Label year: predict distress in the next fiscal year
df["label_year"] = df["fyear"] + 1

# =============================================================================
# 2) Define train/val pool years (based on label_year)
# =============================================================================
pool_mask = df["label_year"] <= TRAIN_CUTOFF_LABEL_YEAR
pool_years = np.sort(df.loc[pool_mask, "label_year"].dropna().unique())
val_years = pool_years[-VAL_YEARS:] if len(pool_years) else np.array([], dtype=int)

# This mask is ONLY used for imputations (train-only information)
train_mask_for_imputation = pool_mask & (~df["label_year"].isin(val_years))

# =============================================================================
# 3) Missingness flags + imputation setup (+ EDA before/after imputation)
#    Imputation policy implemented:
#    - Do NOT impute: gvkey, datadate, fyear, conm, datafmt, indfmt, consol (drop rows if missing)
#    - ismod: mode (binary flag)
#    - Construct first: mkvalt from prcc_f * csho; dlcch/recch/invch/chech from level diffs; apalch proxy with Δap
#    - Stocks: within-firm history (ffill) -> industry-year ratio median (fallback: year-only if no industry col)
#    - Flows: ratio median (and optionally regression for xint etc.)
# =============================================================================

RAW_INPUTS_FOR_FE = [
    "aco", "act", "ao", "aoloch", "ap", "apalch", "aqc", "at", "caps", "capx", "ceq", "che", "chech", "csho", "cstk",
    "cstke",
    "datadate", "dlc", "dlcch", "dltis", "dltr", "dltt", "do", "dp", "dpc", "dv", "dvc", "dvp", "dvt", "esubc", "exre",
    "fiao", "fincf", "fopo", "fyear", "gvkey", "ib", "ibadj", "ibc", "intan", "invch", "invt", "ismod", "ivaco",
    "ivaeq",
    "ivao", "ivch", "ivncf", "ivstch", "lco", "lct", "lt", "mibt", "mkvalt", "niadj", "nopi", "oancf", "oibdp", "ppent",
    "prcc_c", "prcc_f", "prstkc", "pstk", "pstkn", "pstkr", "re", "recch", "rect", "seq", "siv", "spi", "sppe", "sppiv",
    "sstk", "tstk", "txach", "txbcof", "txdc", "txditc", "txp", "txt", "xi", "xido", "xidoc", "xint",
    # optional identifiers present in many extracts:
    "conm", "consol", "datafmt", "indfmt",
]
raw = [c for c in RAW_INPUTS_FOR_FE if c in df.columns]

# ---------------------------------------------------------------------------
# 3.0 Ensure keys exist + types
# ---------------------------------------------------------------------------
# firm_id used for panel operations
if "firm_id" not in df.columns:
    if "gvkey" in df.columns:
        df["firm_id"] = df["gvkey"]
    else:
        raise ValueError("Need either firm_id or gvkey in df to run panel imputations.")

# fyear numeric
if "fyear" in df.columns:
    df["fyear"] = pd.to_numeric(df["fyear"], errors="coerce")

# datadate datetime (if present)
if "datadate" in df.columns:
    df["datadate"] = pd.to_datetime(df["datadate"], errors="coerce")

# ---------------------------------------------------------------------------
# 3.1 Drop rows with missing critical identifiers (do not impute these)
# ---------------------------------------------------------------------------
NON_IMPUTE_DROP = [c for c in ["gvkey", "datadate", "fyear", "conm", "datafmt", "indfmt", "consol"] if c in df.columns]
if NON_IMPUTE_DROP:
    before_n = df.shape[0]
    df = df.dropna(subset=NON_IMPUTE_DROP).copy()
    after_n = df.shape[0]
    if after_n < before_n:
        print(
            f"[INFO] Dropped {before_n - after_n:,} rows due to missing non-imputable ID/meta fields: {NON_IMPUTE_DROP}")

# Rebuild raw after potential drop
raw = [c for c in RAW_INPUTS_FOR_FE if c in df.columns]

# ---------------------------------------------------------------------------
# 3.2 EDA BEFORE (snapshot)
# ---------------------------------------------------------------------------
df_raw_pre = df[raw].copy(deep=True)

pre_miss = pd.DataFrame(
    {
        "col": raw,
        "n": [int(df_raw_pre[c].shape[0]) for c in raw],
        "n_na_pre": [int(df_raw_pre[c].isna().sum()) for c in raw],
        "pct_na_pre": [float(df_raw_pre[c].isna().mean() * 100.0) for c in raw],
        "train_n": [int(train_mask_for_imputation.sum()) for _ in raw],
        "train_pct_na_pre": [
            float(df_raw_pre.loc[train_mask_for_imputation, c].isna().mean() * 100.0) for c in raw
        ],
    }
).sort_values("pct_na_pre", ascending=False)

print("\n=== EDA (BEFORE imputation): Missingness on raw inputs ===")
print(pre_miss.round(4).head(50))

# Numeric distribution summary (exclude obvious non-numeric)
if raw:
    x_pre = df_raw_pre[raw].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
    q_pre = x_pre.quantile([0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]).T
    pre_dist = pd.DataFrame(
        {
            "n_nonmiss_pre": x_pre.notna().sum(),
            "mean_pre": x_pre.mean(),
            "std_pre": x_pre.std(ddof=0),
            "min_pre": x_pre.min(),
            "p01_pre": q_pre[0.01],
            "p05_pre": q_pre[0.05],
            "p25_pre": q_pre[0.25],
            "p50_pre": q_pre[0.50],
            "p75_pre": q_pre[0.75],
            "p95_pre": q_pre[0.95],
            "p99_pre": q_pre[0.99],
            "max_pre": x_pre.max(),
        }
    )
    print("\n=== EDA (BEFORE imputation): Distribution summary (raw inputs) ===")
    print(pre_dist.round(4).sort_values("n_nonmiss_pre", ascending=True).head(50))

# ---------------------------------------------------------------------------
# 3.3 Missingness flags (ALWAYS create before imputations)
# ---------------------------------------------------------------------------
for c in raw:
    df[f"miss_{c}"] = df[c].isna().astype("int8")

# ---------------------------------------------------------------------------
# 3.4 Helper: pick an "industry" column if available, else fallback to year-only
# ---------------------------------------------------------------------------
INDUSTRY_CANDIDATES = ["sic", "naics", "gsector", "gind", "gsubind", "industry", "ff49", "ff12"]
industry_col = next((c for c in INDUSTRY_CANDIDATES if c in df.columns), None)
group_cols = ["fyear"] + ([industry_col] if industry_col is not None else [])
if industry_col is None:
    print("[WARN] No industry column found (sic/naics/gics/etc.). Using year-only medians for ratio imputations.")

# ---------------------------------------------------------------------------
# 3.5 Step 1: Construct / reconcile FIRST (no leakage: uses contemporaneous or lag only)
# ---------------------------------------------------------------------------
df = df.sort_values(["firm_id", "fyear"]).copy()

# 3.5.1 mkvalt construction from prcc_f * csho (if mkvalt missing)
if all(c in df.columns for c in ["mkvalt", "prcc_f", "csho"]):
    mkvalt_miss = df["mkvalt"].isna()
    mkvalt_calc = pd.to_numeric(df["prcc_f"], errors="coerce") * pd.to_numeric(df["csho"], errors="coerce")
    df.loc[mkvalt_miss & mkvalt_calc.notna(), "mkvalt"] = mkvalt_calc.loc[mkvalt_miss & mkvalt_calc.notna()]


# 3.5.2 Reconstruct change variables from level differences (fill only if change var is missing)
def _fill_change_from_levels(change_col, level_col):
    if change_col in df.columns and level_col in df.columns:
        miss = df[change_col].isna()
        lvl = pd.to_numeric(df[level_col], errors="coerce")
        lag_lvl = df.groupby("firm_id")[level_col].shift(1)
        lag_lvl = pd.to_numeric(lag_lvl, errors="coerce")
        recon = lvl - lag_lvl
        df.loc[miss & recon.notna(), change_col] = recon.loc[miss & recon.notna()]


_fill_change_from_levels("dlcch", "dlc")
_fill_change_from_levels("recch", "rect")
_fill_change_from_levels("invch", "invt")
_fill_change_from_levels("chech", "che")

# 3.5.3 apalch proxy with Δap if apalch missing
if "apalch" in df.columns and "ap" in df.columns:
    miss = df["apalch"].isna()
    ap = pd.to_numeric(df["ap"], errors="coerce")
    lag_ap = pd.to_numeric(df.groupby("firm_id")["ap"].shift(1), errors="coerce")
    recon = ap - lag_ap
    df.loc[miss & recon.notna(), "apalch"] = recon.loc[miss & recon.notna()]

# ---------------------------------------------------------------------------
# 3.6 Step 2: ismod mode imputation (binary flag)
# ---------------------------------------------------------------------------
if "ismod" in df.columns:
    tr_obs = df.loc[train_mask_for_imputation, "ismod"]
    tr_obs_num = pd.to_numeric(tr_obs, errors="coerce")
    # global mode on training
    if tr_obs_num.notna().any():
        mode_val = float(tr_obs_num.mode(dropna=True).iloc[0])
    else:
        mode_val = 0.0
    df.loc[df["ismod"].isna(), "ismod"] = mode_val

# ---------------------------------------------------------------------------
# 3.7 Step 3: Stocks — firm-history (ffill) -> group median of ratio (x/at)
# ---------------------------------------------------------------------------
STOCKS = [c for c in [
    "aco", "act", "ao", "ap", "at", "caps", "ceq", "che", "csho", "cstk", "dlc", "dltt", "intan", "invt", "lco", "lct",
    "lt",
    "mibt", "ppent", "pstk", "pstkn", "pstkr", "re", "rect", "seq", "tstk", "ivaeq", "mkvalt"
] if c in df.columns]

# firm-history forward fill within each firm (uses only past observations)
if STOCKS:
    df[STOCKS] = df.groupby("firm_id")[STOCKS].ffill()

# ratio median impute remaining missing stocks (fit on training only)
NONNEG_STOCKS = set(
    [c for c in STOCKS if c in {"aco", "act", "ao", "ap", "at", "caps", "ceq", "che", "csho", "cstk", "dlc", "dltt",
                                "intan", "invt", "lco", "lct", "lt", "mibt", "mkvalt", "ppent", "pstk", "pstkn",
                                "pstkr",
                                "rect", "seq", "tstk", "ivaeq"}])


def _fit_ratio_medians(train_df, col, size_col="at"):
    # returns (overall_ratio_median, series indexed by group tuple)
    s = pd.to_numeric(train_df[col], errors="coerce")
    size = pd.to_numeric(train_df[size_col], errors="coerce") if size_col in train_df.columns else None
    if size is None:
        # fallback: level median by groups
        grp_med = train_df.groupby(group_cols)[col].median()
        overall = float(s.median()) if s.notna().any() else 0.0
        return ("level", overall, grp_med)

    valid = s.notna() & size.notna() & (size > 0)
    if valid.sum() == 0:
        grp_med = train_df.groupby(group_cols)[col].median()
        overall = float(s.median()) if s.notna().any() else 0.0
        return ("level", overall, grp_med)

    ratio = (s[valid] / size[valid]).replace([np.inf, -np.inf], np.nan).dropna()
    overall = float(ratio.median()) if ratio.notna().any() else 0.0
    tmp = train_df.loc[valid, group_cols].copy()
    tmp["_ratio_"] = ratio.values
    grp_med = tmp.groupby(group_cols)["_ratio_"].median()
    return ("ratio", overall, grp_med)


def _apply_ratio_medians(df_all, col, fit_obj, size_col="at", nonneg=False):
    kind, overall, grp_med = fit_obj
    miss = df_all[col].isna()
    if not miss.any():
        return

    if kind == "ratio" and size_col in df_all.columns:
        size = pd.to_numeric(df_all.loc[miss, size_col], errors="coerce")
        # map group median ratio
        g = df_all.loc[miss, group_cols]
        mapped = pd.Series([np.nan] * miss.sum(), index=df_all.index[miss], dtype="float64")
        # build tuple key if multi-col grouping
        if len(group_cols) == 1:
            mapped = g[group_cols[0]].map(grp_med)
        else:
            keys = list(map(tuple, g[group_cols].to_numpy()))
            mapped = pd.Series(keys, index=g.index).map(grp_med)

        r = pd.to_numeric(mapped, errors="coerce").fillna(overall)
        fill = r * size
        # if size missing/<=0, fallback to level median within groups
        fill = fill.where(size.notna() & (size > 0), np.nan)
        df_all.loc[miss & fill.notna(), col] = fill.loc[miss & fill.notna()].to_numpy()

    # fallback: level median by group (still fit on training)
    # (use training group median of levels if ratio path didn't fill everything)
    miss2 = df_all[col].isna()
    if miss2.any():
        tr = df_all.loc[train_mask_for_imputation, [*group_cols, col]].copy()
        tr[col] = pd.to_numeric(tr[col], errors="coerce")
        lvl_overall = float(tr[col].median()) if tr[col].notna().any() else 0.0
        lvl_grp = tr.groupby(group_cols)[col].median()

        g2 = df_all.loc[miss2, group_cols]
        if len(group_cols) == 1:
            mapped2 = g2[group_cols[0]].map(lvl_grp)
        else:
            keys2 = list(map(tuple, g2[group_cols].to_numpy()))
            mapped2 = pd.Series(keys2, index=g2.index).map(lvl_grp)

        fill2 = pd.to_numeric(mapped2, errors="coerce").fillna(lvl_overall)
        if nonneg:
            fill2 = fill2.clip(lower=0.0)
        df_all.loc[miss2, col] = fill2.to_numpy()


# Fit on training
tr_all = df.loc[train_mask_for_imputation].copy()
ratio_fits = {}
for c in STOCKS:
    ratio_fits[c] = _fit_ratio_medians(tr_all, c, size_col="at")

# Apply to full df
for c in STOCKS:
    _apply_ratio_medians(df, c, ratio_fits[c], size_col="at", nonneg=(c in NONNEG_STOCKS))

# ---------------------------------------------------------------------------
# 3.8 Step 4: Flows / Income variables — ratio median (x/at), with debt-aware rules for xint
# ---------------------------------------------------------------------------
FLOWS = [c for c in [
    "ib", "ibadj", "ibc", "niadj", "nopi", "oibdp", "dp", "txt", "oancf", "fincf", "ivncf", "xint", "esubc"
] if c in df.columns]

# Debt-aware rule: if total debt == 0, xint can be set to 0 (keep miss flag already created)
if "xint" in df.columns and all(c in df.columns for c in ["dlc", "dltt"]):
    total_debt = pd.to_numeric(df["dlc"], errors="coerce").fillna(0.0) + pd.to_numeric(df["dltt"],
                                                                                       errors="coerce").fillna(0.0)
    xint_miss = df["xint"].isna()
    df.loc[xint_miss & (total_debt <= 0), "xint"] = 0.0

# Fit ratio medians on training and apply
flow_fits = {}
for c in FLOWS:
    flow_fits[c] = _fit_ratio_medians(tr_all, c, size_col="at")

for c in FLOWS:
    # flows can be negative; do not clip
    _apply_ratio_medians(df, c, flow_fits[c], size_col="at", nonneg=False)

# ---------------------------------------------------------------------------
# 3.9 EDA AFTER + change analysis
# ---------------------------------------------------------------------------
df_raw_post = df[raw].copy(deep=True)

post_miss = pd.DataFrame(
    {
        "col": raw,
        "n_na_post": [int(df_raw_post[c].isna().sum()) for c in raw],
        "pct_na_post": [float(df_raw_post[c].isna().mean() * 100.0) for c in raw],
        "train_pct_na_post": [
            float(df_raw_post.loc[train_mask_for_imputation, c].isna().mean() * 100.0) for c in raw
        ],
    }
)

changes = pre_miss.merge(post_miss, on="col", how="left")
changes["n_imputed"] = changes["n_na_pre"] - changes["n_na_post"]
changes["pct_points_na_reduction"] = changes["pct_na_pre"] - changes["pct_na_post"]

x_post = df_raw_post[raw].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
q_post = x_post.quantile([0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]).T
post_dist = pd.DataFrame(
    {
        "n_nonmiss_post": x_post.notna().sum(),
        "mean_post": x_post.mean(),
        "std_post": x_post.std(ddof=0),
        "min_post": x_post.min(),
        "p01_post": q_post[0.01],
        "p05_post": q_post[0.05],
        "p25_post": q_post[0.25],
        "p50_post": q_post[0.50],
        "p75_post": q_post[0.75],
        "p95_post": q_post[0.95],
        "p99_post": q_post[0.99],
        "max_post": x_post.max(),
    }
)

pre_dist_key = pre_dist[
    ["n_nonmiss_pre", "mean_pre", "std_pre", "p01_pre", "p50_pre", "p99_pre"]].copy() if raw else pd.DataFrame()
post_dist_key = post_dist[
    ["n_nonmiss_post", "mean_post", "std_post", "p01_post", "p50_post", "p99_post"]].copy() if raw else pd.DataFrame()

dist_delta = pre_dist_key.join(post_dist_key, how="outer")
dist_delta["delta_mean"] = dist_delta["mean_post"] - dist_delta["mean_pre"]
dist_delta["delta_std"] = dist_delta["std_post"] - dist_delta["std_pre"]
dist_delta["delta_p50"] = dist_delta["p50_post"] - dist_delta["p50_pre"]

# Imputed-only diagnostics
rows = []
for c in raw:
    pre_na_mask = df_raw_pre[c].isna()
    n_imp = int(pre_na_mask.sum())
    if n_imp == 0:
        rows.append((c, 0, np.nan, np.nan, np.nan, np.nan, np.nan))
        continue

    imp_vals = pd.to_numeric(df_raw_post.loc[pre_na_mask, c], errors="coerce").replace([np.inf, -np.inf], np.nan)
    obs_vals = pd.to_numeric(df_raw_pre.loc[~pre_na_mask, c], errors="coerce").replace([np.inf, -np.inf], np.nan)

    rows.append(
        (
            c,
            n_imp,
            float(imp_vals.mean()) if imp_vals.notna().any() else np.nan,
            float(imp_vals.median()) if imp_vals.notna().any() else np.nan,
            float(imp_vals.std(ddof=0)) if imp_vals.notna().any() else np.nan,
            float(obs_vals.mean()) if obs_vals.notna().any() else np.nan,
            float(obs_vals.median()) if obs_vals.notna().any() else np.nan,
        )
    )

imputed_only = pd.DataFrame(
    rows,
    columns=["col", "n_imputed", "imputed_mean", "imputed_median", "imputed_std", "observed_mean_pre",
             "observed_median_pre"],
).set_index("col")

print("\n=== EDA (AFTER imputation): Missingness on raw inputs + change ===")
cols_show = [
    "col", "n", "n_na_pre", "pct_na_pre", "n_na_post", "pct_na_post",
    "n_imputed", "pct_points_na_reduction", "train_pct_na_pre", "train_pct_na_post",
]
print(
    changes[cols_show]
    .sort_values(["n_imputed", "pct_points_na_reduction"], ascending=[False, False])
    .round(4)
    .head(50)
)

print("\n=== Change analysis: Distribution deltas (post - pre) on raw inputs ===")
print(
    dist_delta[["n_nonmiss_pre", "n_nonmiss_post", "delta_mean", "delta_std", "delta_p50"]]
    .sort_values("delta_mean", key=lambda s: s.abs(), ascending=False)
    .round(6)
    .head(50)
)

print("\n=== Change analysis: Imputed-only vs observed (pre) summary ===")
print(
    imputed_only.assign(
        mean_gap_imputed_minus_observed=lambda d: d["imputed_mean"] - d["observed_mean_pre"],
        median_gap_imputed_minus_observed=lambda d: d["imputed_median"] - d["observed_median_pre"],
    )
    .sort_values("n_imputed", ascending=False)
    .round(6)
    .head(50)
)

# =============================================================================
# 4) Feature engineering (ratios) + distress_dummy (FEATURE) + payout-cut TARGETS
# =============================================================================

# --- Core building blocks ---
dlc = pd.to_numeric(df.get("dlc", np.nan), errors="coerce")
dltt = pd.to_numeric(df.get("dltt", np.nan), errors="coerce")
df["total_debt"] = pd.concat([dlc, dltt], axis=1).sum(axis=1, min_count=1)

seq = pd.to_numeric(df.get("seq", np.nan), errors="coerce")
mibt = pd.to_numeric(df.get("mibt", 0.0), errors="coerce")
df["equity_plus_mi_sp"] = seq + mibt
df["total_capital_sp"] = df["total_debt"] + df["equity_plus_mi_sp"]
df["sp_debt_to_capital"] = safe_divide(df["total_debt"], df["total_capital_sp"])

oibdp = pd.to_numeric(df.get("oibdp", np.nan), errors="coerce")
xint = pd.to_numeric(df.get("xint", np.nan), errors="coerce")
df["sp_debt_to_ebitda"] = safe_divide(df["total_debt"], oibdp)

txt = pd.to_numeric(df.get("txt", np.nan), errors="coerce")
txdc = pd.to_numeric(df.get("txdc", 0.0), errors="coerce")
txach = pd.to_numeric(df.get("txach", 0.0), errors="coerce")
df["cash_tax_paid_proxy"] = txt - txdc - txach

df["ffo_proxy"] = oibdp - xint - pd.to_numeric(df["cash_tax_paid_proxy"], errors="coerce")
df["sp_ffo_to_debt"] = safe_divide(df["ffo_proxy"], df["total_debt"])

oancf = pd.to_numeric(df.get("oancf", np.nan), errors="coerce")
capx = pd.to_numeric(df.get("capx", np.nan), errors="coerce")
df["sp_cfo_to_debt"] = safe_divide(oancf, df["total_debt"])
df["focf"] = oancf - capx
df["sp_focf_to_debt"] = safe_divide(df["focf"], df["total_debt"])

# NOTE: dv & prstkc NICHT default=0 setzen (sonst verzerrst du Cuts, wenn missing ≠ 0)
dv_raw = pd.to_numeric(df.get("dv", np.nan), errors="coerce")
dvc = pd.to_numeric(df.get("dvc", np.nan), errors="coerce")
dvp = pd.to_numeric(df.get("dvp", np.nan), errors="coerce")
dvt = pd.to_numeric(df.get("dvt", np.nan), errors="coerce")

prstkc = pd.to_numeric(df.get("prstkc", np.nan), errors="coerce")

# --- Robust payout measures (positiv als "Betrag") ---
# Dividends: prefer dvt, else dvc+dvp, else dv
if "dvt" in df.columns:
    df["div_total"] = dvt.abs()
elif ("dvc" in df.columns) or ("dvp" in df.columns):
    df["div_total"] = (dvc.fillna(0.0) + dvp.fillna(0.0)).abs()
else:
    df["div_total"] = dv_raw.abs()

# Repurchases: prstkc (purchase of stock) as amount
df["rep_total"] = prstkc.abs()

# --- Additional payout intensities (useful predictors) ---
at = pd.to_numeric(df.get("at", np.nan), errors="coerce")
che = pd.to_numeric(df.get("che", np.nan), errors="coerce")

df["cash_to_assets"] = safe_divide(che, at)
df["ebitda_to_assets"] = safe_divide(oibdp, at)
df["cfo_to_assets"] = safe_divide(oancf, at)
df["capx_to_assets"] = safe_divide(capx, at)

df["div_to_assets"] = safe_divide(df["div_total"], at)
df["rep_to_assets"] = safe_divide(df["rep_total"], at)
df["payout_to_assets"] = safe_divide(df["div_total"].fillna(0.0) + df["rep_total"].fillna(0.0), at)

# Log transforms (log1p handles 0). Negative -> NaN.
for c in ["at", "mkvalt"]:
    if c in df.columns:
        s = pd.to_numeric(df[c], errors="coerce")
        df[f"log_{c}"] = np.where(s >= 0, np.log1p(s), np.nan)

# Interest coverage: EBITDA / |interest expense|
df["sp_interest_coverage"] = safe_divide(oibdp, xint.abs())

# --- Distress dummy stays as FEATURE (as in your code) ---
td = pd.to_numeric(df["total_debt"], errors="coerce").to_numpy(dtype=float)
cap = pd.to_numeric(df["total_capital_sp"], errors="coerce").to_numpy(dtype=float)
eb = pd.to_numeric(oibdp, errors="coerce").to_numpy(dtype=float)
ffo = pd.to_numeric(df["ffo_proxy"], errors="coerce").to_numpy(dtype=float)

ffo_to_debt_pct = 100.0 * safe_divide(ffo, td)
debt_to_capital_pct = 100.0 * safe_divide(td, cap)
debt_to_ebitda = safe_divide(td, eb)

hl_ffo = (td > 0) & (ffo_to_debt_pct < 15.0)
hl_cap = (cap > 0) & (debt_to_capital_pct > 55.0)
hl_deb = (td > 0) & (debt_to_ebitda > 4.5)

df["distress_dummy"] = (hl_ffo & hl_cap & hl_deb).astype("int8")

# --- Targets: dividend cut & repurchase cut (t -> t+1 within firm) ---
# Policy knobs:
CUT_FRAC_DIV = 0.25          # "cut" if next < (1 - 0.25)*current  (25% cut)
CUT_FRAC_REP = 0.25
MIN_PAYOUT = 1e-6            # treat very small as zero
ASSUME_MISSING_PAYOUT_ZERO = False
#  - False (conservative): only label when both years observed (non-NaN)
#  - True: treat NaN as 0 (aggressive; can inflate 'cuts' if missingness is reporting-related)

df = df.sort_values(["firm_id", "fyear"]).copy()

df["div_total_next"] = df.groupby("firm_id")["div_total"].shift(-1)
df["rep_total_next"] = df.groupby("firm_id")["rep_total"].shift(-1)

if ASSUME_MISSING_PAYOUT_ZERO:
    div_t  = df["div_total"].fillna(0.0)
    div_t1 = df["div_total_next"].fillna(0.0)
    rep_t  = df["rep_total"].fillna(0.0)
    rep_t1 = df["rep_total_next"].fillna(0.0)

    df["target_next_year_dividend_cut"] = ((div_t > MIN_PAYOUT) & (div_t1 < div_t * (1.0 - CUT_FRAC_DIV))).astype("int8")
    df["target_next_year_repurchase_cut"] = ((rep_t > MIN_PAYOUT) & (rep_t1 < rep_t * (1.0 - CUT_FRAC_REP))).astype("int8")
else:
    div_obs = df["div_total"].notna() & df["div_total_next"].notna()
    rep_obs = df["rep_total"].notna() & df["rep_total_next"].notna()

    df["target_next_year_dividend_cut"] = np.where(
        div_obs,
        ((df["div_total"] > MIN_PAYOUT) & (df["div_total_next"] < df["div_total"] * (1.0 - CUT_FRAC_DIV))).astype("int8"),
        np.nan,
    )
    df["target_next_year_repurchase_cut"] = np.where(
        rep_obs,
        ((df["rep_total"] > MIN_PAYOUT) & (df["rep_total_next"] < df["rep_total"] * (1.0 - CUT_FRAC_REP))).astype("int8"),
        np.nan,
    )

# Optional: Omission variants (often in payout-literature useful)
df["target_next_year_dividend_omission"] = np.where(
    df["div_total"].notna() & df["div_total_next"].notna(),
    ((df["div_total"] > MIN_PAYOUT) & (df["div_total_next"] <= MIN_PAYOUT)).astype("int8"),
    np.nan,
)

# =============================================================================
# 5-7) Splits + preprocessing + diagnostics for BOTH targets
# =============================================================================

TARGETS = ["target_next_year_dividend_cut", "target_next_year_repurchase_cut"]

# Candidate features (include distress_dummy + payout state + your credit features)
base_feats = [
    # Your original credit-ish block
    "sp_debt_to_capital",
    "sp_ffo_to_debt",
    "sp_cfo_to_debt",
    "sp_focf_to_debt",
    "sp_dcf_to_debt",
    "sp_debt_to_ebitda",
    "sp_interest_coverage",
    "log_at",
    "log_mkvalt",
    # Payout-policy relevant additions
    "distress_dummy",
    "cash_to_assets",
    "ebitda_to_assets",
    "cfo_to_assets",
    "capx_to_assets",
    "div_to_assets",
    "rep_to_assets",
    "payout_to_assets",
]

# Also allow missingness flags if present (informative: "not paying" vs "missing")
for c in ["dv", "dvc", "dvp", "dvt", "prstkc"]:
    mc = f"miss_{c}"
    if mc in df.columns and mc not in base_feats:
        base_feats.append(mc)

def run_pipeline_for_target(df_in: pd.DataFrame, target_col: str):
    d = df_in.copy()

    # Keep only rows with label present
    d = d.dropna(subset=[target_col]).copy()
    d[target_col] = pd.to_numeric(d[target_col], errors="coerce").astype("int8")

    # Final Train/Val/Test split (by label_year)
    train_pool = d[d["label_year"] <= TRAIN_CUTOFF_LABEL_YEAR].copy()
    test = d[d["label_year"] > TRAIN_CUTOFF_LABEL_YEAR].copy()

    years = np.sort(train_pool["label_year"].dropna().unique())
    val_years = years[-VAL_YEARS:] if len(years) else np.array([], dtype=int)

    val = train_pool[train_pool["label_year"].isin(val_years)].copy()
    train = train_pool[~train_pool["label_year"].isin(val_years)].copy()

    feats = [c for c in base_feats if c in train.columns and c in val.columns and c in test.columns]
    if not feats:
        raise ValueError(f"No overlapping features found for target={target_col}.")

    # Replace +/-inf with NaN
    for dd in (train, val, test):
        dd[feats] = dd[feats].replace([np.inf, -np.inf], np.nan)

    # Split features: binary-like vs continuous (winsorize+scale only continuous)
    bin_like = []
    for c in feats:
        if c == "distress_dummy" or c.startswith("miss_"):
            bin_like.append(c)

    cont_feats = [c for c in feats if c not in bin_like]

    # Impute: train-only
    fill_cont = train[cont_feats].median(numeric_only=True) if cont_feats else pd.Series(dtype=float)
    for dd in (train, val, test):
        if cont_feats:
            dd[cont_feats] = dd[cont_feats].fillna(fill_cont)
        if bin_like:
            dd[bin_like] = dd[bin_like].fillna(0).astype("int8")

    # Winsorize continuous using train-only quantiles
    bounds = {}
    for c in cont_feats:
        s = pd.to_numeric(train[c], errors="coerce")
        bounds[c] = (s.quantile(WINSOR_LOWER_Q), s.quantile(WINSOR_UPPER_Q))

    for dd in (train, val, test):
        for c, (lo, hi) in bounds.items():
            s = pd.to_numeric(dd[c], errors="coerce")
            dd[c] = s.clip(lo, hi)

    # Standardize continuous (fit train only)
    scaler = StandardScaler()
    if cont_feats:
        x_train = train[cont_feats].to_numpy(dtype=float)
        x_val = val[cont_feats].to_numpy(dtype=float)
        x_test = test[cont_feats].to_numpy(dtype=float)

        scaler.fit(x_train)
        train_z = scaler.transform(x_train)
        val_z = scaler.transform(x_val)
        test_z = scaler.transform(x_test)

        z_cols = [f"z_{c}" for c in cont_feats]
        train[z_cols] = train_z
        val[z_cols] = val_z
        test[z_cols] = test_z
    else:
        z_cols = []

    # Diagnostics
    print("\n" + "="*90)
    print(f"TARGET = {target_col}")
    print("Split:",
          f"train={len(train):,}", f"val={len(val):,}", f"test={len(test):,}",
          "| val_years:", list(val_years))

    def _overview(dd: pd.DataFrame, name: str) -> None:
        n_rows = len(dd)
        n_firms = dd["firm_id"].nunique() if "firm_id" in dd.columns else np.nan
        n_years = dd["fyear"].nunique() if "fyear" in dd.columns else np.nan
        rate = float(dd[target_col].mean())
        print(f"\n=== {name} === rows={n_rows:,} | firms={n_firms:,} | years={n_years} | target_rate={rate:.4f}")
        by_year = dd.groupby("label_year")[target_col].agg(["mean", "count"])
        print("\nTarget by label_year (tail):")
        print(by_year.tail(12))

    _overview(train, "TRAIN")
    _overview(val, "VAL")
    _overview(test, "TEST")

    # Correlations on TRAIN (continuous raw, not z)
    corr_cols = [target_col] + feats
    corr = (
        train[corr_cols]
        .corr(numeric_only=True)[target_col]
        .drop(target_col)
        .sort_values(key=np.abs, ascending=False)
    )
    print("\nTop correlations with target (TRAIN):")
    print(corr.head(25))

    # Rolling folds info (on train_pool)
    folds = rolling_year_folds(train_pool, n_splits=N_SPLITS_TIME_CV, min_train_years=3)
    for i, (tr_idx, va_idx, tr_years, va_year) in enumerate(folds, 1):
        print(
            f"Fold {i}: train_years={tr_years[0]}..{tr_years[-1]} (n={len(tr_idx)}), "
            f"val_year={va_year} (n={len(va_idx)})"
        )

    return {
        "train": train, "val": val, "test": test,
        "feats": feats, "cont_feats": cont_feats, "bin_like": bin_like,
        "z_cols": z_cols,
        "target": target_col,
        "scaler": scaler if cont_feats else None,
    }

artifacts = {}
for tgt in TARGETS:
    artifacts[tgt] = run_pipeline_for_target(df, tgt)

# Optional: quick sanity print for distress construction (unchanged)
print(pd.Series({
    "hl_ffo": hl_ffo.mean(),
    "hl_cap": hl_cap.mean(),
    "hl_deb": hl_deb.mean(),
    "hl_all": (hl_ffo & hl_cap & hl_deb).mean()
}))

# =============================================================================
# Decile tables: (A) Size deciles, (B) Distress-risk deciles
# Shows rates of: dividend cut, repurchase cut, distress_dummy
# =============================================================================

# Targets (may contain NaN depending on your conservative labeling policy)
DIV_TGT = "target_next_year_dividend_cut"
REP_TGT = "target_next_year_repurchase_cut"
DIST = "distress_dummy"

# --- 1) Size deciles (same idea as before) ---
if "log_at" not in df.columns:
    at_num = pd.to_numeric(df.get("at", np.nan), errors="coerce")
    df["log_at"] = np.where(at_num >= 0, np.log1p(at_num), np.nan)

df["size_decile"] = pd.qcut(df["log_at"], 10, duplicates="drop")

size_summary = (
    df.groupby("size_decile")
      .agg(
          n=("firm_id", "size"),
          n_firms=("firm_id", "nunique"),
          distress_rate=(DIST, "mean"),
          dividend_cut_rate=(DIV_TGT, "mean"),
          repurchase_cut_rate=(REP_TGT, "mean"),
      )
)

print("\n=== Rates by SIZE decile ===")
print(size_summary.round(4))


# --- 2) Distress-risk deciles (continuous score -> qcut) ---
# Build components if they are not already present as columns
if "debt_to_capital_pct" not in df.columns:
    df["debt_to_capital_pct"] = 100.0 * safe_divide(df["total_debt"], df["total_capital_sp"])

if "ffo_to_debt_pct" not in df.columns:
    df["ffo_to_debt_pct"] = 100.0 * safe_divide(df["ffo_proxy"], df["total_debt"])

if "debt_to_ebitda" not in df.columns:
    df["debt_to_ebitda"] = safe_divide(df["total_debt"], pd.to_numeric(df.get("oibdp", np.nan), errors="coerce"))

if "sp_interest_coverage" not in df.columns:
    oibdp_num = pd.to_numeric(df.get("oibdp", np.nan), errors="coerce")
    xint_num = pd.to_numeric(df.get("xint", np.nan), errors="coerce")
    df["sp_interest_coverage"] = safe_divide(oibdp_num, xint_num.abs())

# Distress-risk score (higher = more distressed)
# - Higher leverage => higher risk
# - Lower FFO/debt => higher risk (so subtract it)
# - Lower interest coverage => higher risk (so subtract log(1+coverage))
# Notes:
#   * This is for descriptive bucketing only (not “training-only” scaling).
#   * Winsorize score components lightly to avoid extreme outliers dominating.
score_df = df[["debt_to_capital_pct", "debt_to_ebitda", "ffo_to_debt_pct", "sp_interest_coverage"]].copy()

# Clean & winsorize components for stability (global, descriptive)
for c in score_df.columns:
    s = pd.to_numeric(score_df[c], errors="coerce").replace([np.inf, -np.inf], np.nan)
    lo, hi = s.quantile(0.01), s.quantile(0.99)
    score_df[c] = s.clip(lo, hi)

# Coverage transform (avoid log issues)
cov = score_df["sp_interest_coverage"].clip(lower=0)
cov_term = np.log1p(cov)  # higher coverage => safer

df["distress_risk_score"] = (
    0.6 * score_df["debt_to_capital_pct"] +
    0.6 * score_df["debt_to_ebitda"] -
    0.6 * score_df["ffo_to_debt_pct"] -
    0.4 * cov_term
)

# Create distress deciles (if too few unique values, qcut will drop bins)
df["distress_decile"] = pd.qcut(df["distress_risk_score"], 10, duplicates="drop")

distress_summary = (
    df.groupby("distress_decile")
      .agg(
          n=("firm_id", "size"),
          n_firms=("firm_id", "nunique"),
          distress_rate=(DIST, "mean"),
          dividend_cut_rate=(DIV_TGT, "mean"),
          repurchase_cut_rate=(REP_TGT, "mean"),
          mean_score=("distress_risk_score", "mean"),
      )
)

print("\n=== Rates by DISTRESS-RISK decile (higher decile = more distressed) ===")
print(distress_summary.round(4))




=== EDA (BEFORE imputation): Missingness on raw inputs ===
       col      n  n_na_pre  pct_na_pre  train_n  train_pct_na_pre
18   dlcch  75005     33143     44.1877    48458           42.9630
5   apalch  75005     30371     40.4920    48458           39.0214
75   txach  75005     22791     30.3860    48458           29.2501
48  ivstch  75005     19194     25.5903    48458           23.0282
66   recch  75005     12589     16.7842    48458           16.5938
53  mkvalt  75005     12350     16.4656    48458           17.0189
71    sppe  75005     12239     16.3176    48458           16.4307
1      act  75005     10721     14.2937    48458           14.6581
50     lct  75005     10695     14.2590    48458           14.5982
84    xint  75005     10536     14.0471    48458           13.9234
78  txditc  75005      9069     12.0912    48458           12.2044
79     txp  75005      7942     10.5886    48458           10.6917
29   esubc  75005      6798      9.0634    48458            8.8964
49

  df.groupby("size_decile")
  df.groupby("distress_decile")
