In [2]:
import numpy as np
import pandas as pd

FILE_NAME = "data.csv"

TRAIN_CUTOFF_LABEL_YEAR = 2022
VAL_YEARS = 1
N_SPLITS_TIME_CV = 5

WINSOR_LOWER_Q = 0.01
WINSOR_UPPER_Q = 0.99
EPS = 1e-8

NUMERIC_COLS = [
    "gvkey", "fyear", "ismod",
    "ib", "at", "dltt", "dlc", "seq", "mibt",
    "che", "act", "lct", "re",
    "oancf", "ivncf", "fincf",
    "oibdp", "xint", "capx",
    "dv", "prstkc",
    "txt", "txdc", "txach", "txp",
    "mkvalt", "csho", "prcc_f", "prcc_c",
    "dltis", "dltr", "sstk"
]
REQUIRED_KEYS = ["gvkey", "fyear"]


# --- helpers ---
def req(df, cols):
    miss = [c for c in cols if c not in df.columns]
    if miss:
        raise KeyError(f"Missing required column(s): {miss}")

def sdiv(a, b):
    a = pd.to_numeric(a, errors="coerce")
    b = pd.to_numeric(b, errors="coerce")
    return a / (b + EPS)

def log1p_col(df, cols):
    for c in cols:
        if c in df.columns:
            s = pd.to_numeric(df[c], errors="coerce")
            df[f"log_{c}"] = np.where(s >= 0, np.log1p(s), np.nan)
    return df

def fit_clip(train, cols):
    out = {}
    for c in cols:
        s = pd.to_numeric(train[c], errors="coerce")
        out[c] = (s.quantile(WINSOR_LOWER_Q), s.quantile(WINSOR_UPPER_Q))
    return out

def clip(df, bounds):
    for c, (lo, hi) in bounds.items():
        s = pd.to_numeric(df[c], errors="coerce")
        df[c] = s.clip(lo, hi)
    return df

def fit_z(train, cols):
    stats = {}
    for c in cols:
        s = pd.to_numeric(train[c], errors="coerce")
        mu, sd = s.mean(), s.std(ddof=0)
        stats[c] = (mu, sd if np.isfinite(sd) and sd > 0 else np.nan)
    return stats

def z(df, stats, pref="z_"):
    for c, (mu, sd) in stats.items():
        s = pd.to_numeric(df[c], errors="coerce")
        df[f"{pref}{c}"] = (s - mu) / (sd + EPS) if np.isfinite(sd) and sd > 0 else np.nan
    return df

def add_miss(df, cols):
    for c in cols:
        df[f"miss_{c}"] = df[c].isna().astype("int8")
    return df

def fit_year_med(train, c):
    return {"all": train[c].median(), "yr": train.groupby("fyear")[c].median()}

def apply_year_med(df, c, m):
    idx = df[c].isna()
    if idx.any():
        df.loc[idx, c] = df.loc[idx, "fyear"].map(m["yr"]).fillna(m["all"])
    return df

def fit_twfe(train, c):
    obs = train[["firm_id", "fyear", c]].dropna()
    return {
        "all": obs[c].mean(),
        "f": obs.groupby("firm_id")[c].mean(),
        "y": obs.groupby("fyear")[c].mean(),
    }

def apply_twfe(df, c, m, nonneg=False):
    idx = df[c].isna()
    if idx.any():
        pred = df.loc[idx, "firm_id"].map(m["f"]) + df.loc[idx, "fyear"].map(m["y"]) - m["all"]
        pred = pred.fillna(df.loc[idx, "fyear"].map(m["y"])).fillna(m["all"])
        if nonneg:
            pred = pred.clip(lower=0.0)
        df.loc[idx, c] = pred.to_numpy()
    return df

def icov(earn, intr):
    earn = pd.to_numeric(earn, errors="coerce").fillna(0.0)
    intr = pd.to_numeric(intr, errors="coerce").fillna(0.0)
    d = intr.abs()
    out = earn / (d + EPS)
    z0 = d <= EPS
    out = out.astype("float64")
    out[z0 & (earn >= 0)] = np.inf
    out[z0 & (earn < 0)] = -np.inf
    return out

def mk_feats(df):
    dlc = pd.to_numeric(df.get("dlc", 0.0), errors="coerce").fillna(0.0)
    dltt = pd.to_numeric(df.get("dltt", 0.0), errors="coerce").fillna(0.0)
    seq = pd.to_numeric(df.get("seq", 0.0), errors="coerce").fillna(0.0)
    mibt = pd.to_numeric(df.get("mibt", 0.0), errors="coerce").fillna(0.0)
    oibdp = pd.to_numeric(df.get("oibdp", 0.0), errors="coerce").fillna(0.0)
    xint = pd.to_numeric(df.get("xint", 0.0), errors="coerce").fillna(0.0)
    txt = pd.to_numeric(df.get("txt", 0.0), errors="coerce").fillna(0.0)
    txdc = pd.to_numeric(df.get("txdc", 0.0), errors="coerce").fillna(0.0)
    txach = pd.to_numeric(df.get("txach", 0.0), errors="coerce").fillna(0.0)
    oancf = pd.to_numeric(df.get("oancf", 0.0), errors="coerce").fillna(0.0)
    capx = pd.to_numeric(df.get("capx", 0.0), errors="coerce").fillna(0.0)
    dv = pd.to_numeric(df.get("dv", 0.0), errors="coerce").fillna(0.0)
    prstkc = pd.to_numeric(df.get("prstkc", 0.0), errors="coerce").fillna(0.0)

    df["total_debt"] = dlc + dltt
    df["sp_interest_coverage"] = icov(oibdp, xint)
    df["equity_plus_mi_sp"] = seq + mibt
    df["total_capital_sp"] = df["total_debt"] + df["equity_plus_mi_sp"]
    df["sp_debt_to_capital"] = sdiv(df["total_debt"], df["total_capital_sp"])
    df["sp_debt_to_ebitda"] = sdiv(df["total_debt"], oibdp)
    cash_tax = txt - txdc - txach
    ffo = oibdp - xint - cash_tax
    df["ffo_proxy"] = ffo
    df["sp_ffo_to_debt"] = sdiv(ffo, df["total_debt"])
    df["sp_cfo_to_debt"] = sdiv(oancf, df["total_debt"])
    focf = oancf - capx
    df["focf"] = focf
    df["sp_focf_to_debt"] = sdiv(focf, df["total_debt"])
    dcf = focf - dv - prstkc
    df["dcf"] = dcf
    df["sp_dcf_to_debt"] = sdiv(dcf, df["total_debt"])
    df["distress_dummy"] = ((df["sp_interest_coverage"] < 1.0) | (seq < 0)).astype("int8")
    return log1p_col(df, ["at", "mkvalt"])

def time_cv(df, n_splits, year_col="label_year"):
    yrs = np.array(sorted(pd.Series(df[year_col].unique()).dropna().astype(int)))
    if len(yrs) < 3:
        return []
    n_splits = int(min(n_splits, len(yrs) - 2))
    cuts = np.linspace(1, len(yrs) - 1, n_splits + 1, dtype=int)[1:]
    folds = []
    for k, c in enumerate(cuts, 1):
        tr_yrs = yrs[:c]
        va_yr = yrs[c]
        tr_idx = df.index[df[year_col].isin(tr_yrs)].to_numpy()
        va_idx = df.index[df[year_col].eq(va_yr)].to_numpy()
        if len(tr_idx) and len(va_idx):
            folds.append((k, tr_yrs.min(), tr_yrs.max(), int(va_yr), tr_idx, va_idx))
    return folds


# --- data ingestion ---
df = pd.read_csv(FILE_NAME, low_memory=False)
req(df, REQUIRED_KEYS)

if "datadate" in df.columns:
    df["datadate"] = pd.to_datetime(df["datadate"], errors="coerce")

for c in NUMERIC_COLS:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

for c in ["gvkey", "fyear", "ismod"]:
    if c in df.columns:
        df[c] = df[c].astype("Int64")

df["firm_id"] = df["gvkey"]
df = df.sort_values(["firm_id", "fyear"]).drop_duplicates(subset=["firm_id", "fyear"], keep="last").reset_index(drop=True)
df["label_year"] = df["fyear"] + 1


# --- splits ---
train_pool = df[df["label_year"] <= TRAIN_CUTOFF_LABEL_YEAR].copy()
test = df[df["label_year"] > TRAIN_CUTOFF_LABEL_YEAR].copy()

val_yrs = np.sort(train_pool["label_year"].dropna().unique())[-VAL_YEARS:]
train_mask = ~train_pool["label_year"].isin(val_yrs)
train = train_pool[train_mask].copy()
val = train_pool[~train_mask].copy()


# --- missingness handling (fit on train only) ---
raw = [c for c in ["at", "mkvalt", "seq", "dlc", "dltt", "oibdp", "xint", "oancf", "capx", "txt", "txdc", "txach", "dv", "prstkc", "mibt", "ismod"] if c in train_pool.columns]
for d in (train_pool, test):
    add_miss(d, raw)

fe_cols = [c for c in ["at", "mkvalt", "seq", "dlc", "dltt", "oibdp", "xint", "oancf", "capx", "txt"] if c in train.columns]
med_cols = [c for c in ["dv", "prstkc", "txdc", "txach", "mibt", "ismod"] if c in train.columns]

med = {c: fit_year_med(train, c) for c in med_cols}
fe = {c: fit_twfe(train, c) for c in fe_cols}
nonneg = {"at", "mkvalt", "dlc", "dltt", "xint", "capx"}

for d in (train_pool, test):
    for c in med_cols:
        apply_year_med(d, c, med[c])
    for c in fe_cols:
        apply_twfe(d, c, fe[c], nonneg=(c in nonneg))


# --- feature engineering & target ---
train_pool = mk_feats(train_pool)
test = mk_feats(test)

train_pool["target_next_year_distress"] = train_pool.groupby("firm_id")["distress_dummy"].shift(-1)
test["target_next_year_distress"] = test.groupby("firm_id")["distress_dummy"].shift(-1)

train_pool = train_pool.dropna(subset=["target_next_year_distress"]).copy()
test = test.dropna(subset=["target_next_year_distress"]).copy()

train = train_pool[train_pool["label_year"].isin(train["label_year"].unique())].copy()
val = train_pool[train_pool["label_year"].isin(val["label_year"].unique())].copy()


# --- transformations (fit on train only) ---
base = [
    "sp_debt_to_capital", "sp_ffo_to_debt", "sp_cfo_to_debt",
    "sp_focf_to_debt", "sp_dcf_to_debt", "sp_debt_to_ebitda", "sp_interest_coverage",
    "log_at", "log_mkvalt"
]
feat = [c for c in base if c in train.columns and c in val.columns and c in test.columns]

b = fit_clip(train, feat)
for d in (train, val, test):
    clip(d, b)

stats = fit_z(train, feat)
for d in (train, val, test):
    z(d, stats)


# --- time CV folds (train_pool years only, forward by label_year) ---
folds = time_cv(train, N_SPLITS_TIME_CV, year_col="label_year")
print(f"Train rows: {len(train):,} | Val rows: {len(val):,} | Test rows: {len(test):,}")
print("Time-CV folds:", [(k, tr_min, tr_max, va_yr) for (k, tr_min, tr_max, va_yr, _, _) in folds])


# --- minimal EDA ---
t = "target_next_year_distress"
c = train[[t] + feat].corr(numeric_only=True)[t].drop(t).sort_values(key=np.abs, ascending=False)
print(c)


Train rows: 44,678 | Val rows: 0 | Test rows: 12,404
Time-CV folds: [(1, np.int64(2015), np.int64(2016), 2017), (2, np.int64(2015), np.int64(2017), 2018), (3, np.int64(2015), np.int64(2018), 2019), (4, np.int64(2015), np.int64(2019), 2020), (5, np.int64(2015), np.int64(2020), 2021)]
log_at                 -0.348460
log_mkvalt             -0.302417
sp_ffo_to_debt         -0.205550
sp_focf_to_debt        -0.163570
sp_dcf_to_debt         -0.153777
sp_cfo_to_debt         -0.140211
sp_debt_to_capital     -0.111579
sp_debt_to_ebitda      -0.075864
sp_interest_coverage   -0.022543
Name: target_next_year_distress, dtype: float64


  diff_b_a = b - a
  diff_b_a = b - a
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
