In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =============================================================================
# 0. CONFIGURATION
# =============================================================================
# Path of the input CSV; it is read with pd.read_csv in the load cell.
FILE_NAME = "data.csv"

# IMPORTANT: because target is t+1, we split by label_year = fyear + 1
TRAIN_CUTOFF_LABEL_YEAR = 2022     # Train/Val labels up to and incl. 2022, Test labels after 2022
VAL_YEARS = 1                      # Hold out the last 1 label-year from the training pool as validation

# Rolling year-based CV folds (within train_pool); passed as n_splits to rolling_year_folds
N_SPLITS_TIME_CV = 5

# Lower / upper quantiles used as winsorization clip bounds (defaults of _fit_winsor_bounds)
WINSOR_LOWER_Q = 0.01
WINSOR_UPPER_Q = 0.99
# Small constant added to denominators in _safe_div and _apply_zscore
EPS = 1e-8

# Columns coerced with pd.to_numeric(errors='coerce') right after loading, when present in the CSV
NUMERIC_COLS = [
    'prcc_c', 'prcc_f', 'gvkey', 'fyear', 'ismod',
    'ib', 'at', 'dltt', 'dlc', 'che', 're', 'seq',
    'xrd', 'dv', 'ni', 'act', 'lct', 'oancf', 'ivncf', 'fincf',
    'oibdp', 'xint', 'mkvalt', 'capx'
]

# Columns that must exist in the CSV; checked by _ensure_required_columns after loading
REQUIRED_KEYS = ['gvkey', 'fyear']

In [None]:

# =============================================================================
# 1. HELPERS (No Imputation)
# =============================================================================
def _ensure_required_columns(df, required):
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}. Check your CSV schema.")
    return True

def _safe_div(numer, denom, eps=EPS):
    """Divide `numer` by `denom + eps` after coercing both to numeric.

    Values that cannot be parsed as numbers become NaN (errors='coerce') and
    propagate as NaN in the result.

    NOTE(review): `eps` is added to the denominator unconditionally, so a zero
    denominator yields numer / eps (a very large magnitude) rather than NaN,
    and a denominator equal to -eps still divides by zero.
    """
    top, bottom = (pd.to_numeric(part, errors='coerce') for part in (numer, denom))
    shifted_bottom = bottom + eps
    return top / shifted_bottom

def _add_log_features(df, cols):
    for c in cols:
        if c in df.columns:
            s = pd.to_numeric(df[c], errors='coerce')
            m = s >= 0
            out = pd.Series(np.nan, index=df.index, dtype='float64')
            out.loc[m] = np.log1p(s.loc[m])
            df[f'log_{c}'] = out
    return df

def _fit_winsor_bounds(train_df, cols, q_lo=WINSOR_LOWER_Q, q_hi=WINSOR_UPPER_Q):
    """Compute per-column (lower, upper) clip bounds from `train_df`.

    For each column in `cols` the bounds are the `q_lo` and `q_hi` quantiles of
    the numeric-coerced values. A column without any non-missing value gets
    (NaN, NaN). Returns a dict mapping column name -> (lower, upper).
    """
    def _bounds_for(name):
        values = pd.to_numeric(train_df[name], errors='coerce')
        if not values.notna().any():
            return (np.nan, np.nan)
        return (values.quantile(q_lo), values.quantile(q_hi))

    return {name: _bounds_for(name) for name in cols}

def _apply_winsor(df, bounds):
    for c, (lo, hi) in bounds.items():
        if c in df.columns and np.isfinite(lo) and np.isfinite(hi):
            s = pd.to_numeric(df[c], errors='coerce')
            df[c] = s.clip(lower=lo, upper=hi)
    return df

def _fit_scaler(train_df, cols):
    stats = {}
    for c in cols:
        s = pd.to_numeric(train_df[c], errors='coerce')
        mu = s.mean()
        sd = s.std(ddof=0)
        stats[c] = (mu, sd if np.isfinite(sd) and sd > 0 else np.nan)
    return stats

def _apply_zscore(df, stats, prefix="z_"):
    for c, (mu, sd) in stats.items():
        if c in df.columns:
            s = pd.to_numeric(df[c], errors='coerce')
            if np.isfinite(sd) and sd > 0:
                df[f"{prefix}{c}"] = (s - mu) / (sd + EPS)
            else:
                df[f"{prefix}{c}"] = np.nan
    return df

In [None]:


# =============================================================================
# 2. LOAD + BASIC DATA CLEANING (Formats, Duplicates)
# =============================================================================
df = pd.read_csv(FILE_NAME, low_memory=False)

_ensure_required_columns(df, REQUIRED_KEYS)

if 'datadate' in df.columns:
    df['datadate'] = pd.to_datetime(df['datadate'], errors='coerce')

# Coerce known numeric columns; unparseable entries become NaN (no imputation).
for col in NUMERIC_COLS:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Rows without a firm or fiscal-year key cannot be placed on a firm's timeline.
# If kept, they sort to the end of their firm group, get collapsed by
# drop_duplicates, and would feed the shift(-1) used to build the t+1 target.
n_missing_keys = int(df[REQUIRED_KEYS].isna().any(axis=1).sum())
if n_missing_keys:
    print(f"Dropping {n_missing_keys} row(s) with missing {REQUIRED_KEYS}.")

df['firm_id'] = df['gvkey']
# One row per (firm_id, fyear), ordered by firm then fiscal year; among
# duplicates the last row in sorted order is the one kept.
df = (
    df.dropna(subset=REQUIRED_KEYS)
      .sort_values(['firm_id', 'fyear'])
      .drop_duplicates(subset=['firm_id', 'fyear'], keep='last')
      .reset_index(drop=True)
)

In [13]:

# =============================================================================
# 3. TRAIN / VALIDATION / TEST SPLIT (Temporal, based on label_year)
# =============================================================================
# This cell reads df['label_year'], which is created together with
# 'target_next_year_distress' in the TARGET CONSTRUCTION cell. Fail with an
# actionable message instead of a bare KeyError when that cell has not run yet.
if 'label_year' not in df.columns:
    raise KeyError(
        "Column 'label_year' not found in df. Run the TARGET CONSTRUCTION cell "
        "(it creates 'label_year' and 'target_next_year_distress') before the split."
    )

# Everything labelled up to the cutoff is available for training/validation;
# later label-years form the untouched test set.
train_pool = df[df['label_year'] <= TRAIN_CUTOFF_LABEL_YEAR].copy()
test = df[df['label_year'] > TRAIN_CUTOFF_LABEL_YEAR].copy()

if train_pool.empty:
    raise ValueError("Training pool is empty after applying label_year cutoff. Check TRAIN_CUTOFF_LABEL_YEAR.")

unique_label_years = np.sort(train_pool['label_year'].dropna().unique())
# Clamp to [0, number of available years]. Slicing with [-VAL_YEARS:] would
# select *every* year when VAL_YEARS == 0, because -0 == 0.
n_val_years = min(max(int(VAL_YEARS), 0), len(unique_label_years))
val_years = unique_label_years[len(unique_label_years) - n_val_years:]

val = train_pool[train_pool['label_year'].isin(val_years)].copy()
train = train_pool[~train_pool['label_year'].isin(val_years)].copy()

# Winsor bounds and scaler statistics are fitted on `train`; an empty frame
# would silently produce all-NaN statistics downstream.
if train.empty:
    raise ValueError(
        "Training split is empty after holding out the validation label-years. "
        "Reduce VAL_YEARS or check TRAIN_CUTOFF_LABEL_YEAR."
    )

print("----- Split Summary (based on label_year = fyear+1) -----")
print(f"Train label_year max: {train['label_year'].max()} | n={len(train)}")
# .tolist() converts NumPy scalars to plain Python numbers for a readable list.
print(f"Val   label_years: {val_years.tolist()} | n={len(val)}")
print(f"Test  label_year min: {test['label_year'].min()} | n={len(test)}")


----- Split Summary (based on label_year = fyear+1) -----
Train label_year max: 2021 | n=44783
Val   label_years: [np.int64(2022)] | n=6415
Test  label_year min: 2023 | n=12404


In [None]:

# =============================================================================
# 4. EDA (NO IMPUTATION) + VISUALIZATIONS
# =============================================================================
# Share of missing values per column in TRAIN, largest first (nothing is imputed).
miss_rate = train.isna().mean().sort_values(ascending=False)
top_miss = miss_rate.head(25)

# Bar chart of the 25 columns with the highest missing share.
fig, ax = plt.subplots(figsize=(9, 4))
ax.bar(range(len(top_miss)), top_miss.values)
ax.set_xticks(list(range(len(top_miss))))
ax.set_xticklabels(top_miss.index, rotation=75, ha='right')
ax.set_ylabel("Missing share (train)")
ax.set_title("Top missingness rates (train, no imputation)")
fig.tight_layout()
plt.show()

def plot_distress_rate(split_df, name):
    """Plot the mean of 'target_next_year_distress' per 'label_year' for one split.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Frame containing the columns 'label_year' and
        'target_next_year_distress'.
    name : str
        Label of the split, appended to the figure title.

    Returns
    -------
    None. Nothing is drawn when `split_df` is empty.
    """
    if split_df.empty:
        return
    g = split_df.groupby('label_year')['target_next_year_distress'].mean()
    plt.figure(figsize=(7, 3))
    plt.plot(g.index, g.values, marker='o')
    # The em dash is written as an escape so the title cannot be mangled by a
    # wrong file encoding (the previous literal rendered as mojibake).
    plt.title(f"Distress rate by label_year \u2014 {name}")
    plt.xlabel("label_year")
    plt.ylabel("mean(target_next_year_distress)")
    plt.tight_layout()
    plt.show()

# Distress rate over label_year, one figure per split.
for split_df, split_name in ((train, "Train"), (val, "Validation"), (test, "Test")):
    plot_distress_rate(split_df, split_name)

# Ratio features to inspect; only those actually present in TRAIN are kept.
base_features = [
    c for c in (
        'roa', 'cf_roa', 'leverage', 'debt_to_equity', 'current_ratio',
        'cash_ratio', 're_to_assets', 'mkt_to_book', 'capex_ratio', 'interest_coverage',
    )
    if c in train.columns
]

# Raw (pre-winsorization) histogram and boxplot for every feature that has data.
for c in base_features:
    s = pd.to_numeric(train[c], errors='coerce').dropna()
    if not s.empty:
        fig, ax = plt.subplots(figsize=(8, 3))
        ax.hist(s.values, bins=40)
        ax.set_title(f"Histogram (Train, raw): {c}")
        fig.tight_layout()
        plt.show()

        fig, ax = plt.subplots(figsize=(8, 1.8))
        ax.boxplot(s.values, vert=False)
        ax.set_title(f"Boxplot (Train, raw): {c}")
        fig.tight_layout()
        plt.show()


In [None]:

# =============================================================================
# 6. TRANSFORMATIONS (fit on TRAIN only; apply to VAL/TEST)
# =============================================================================
# log1p features are row-wise, so they can be added to each split independently.
for split in (train, val, test):
    _add_log_features(split, cols=['at', 'mkvalt', 'sale'])

log_features = [
    f'log_{raw}' for raw in ('at', 'mkvalt', 'sale')
    if f'log_{raw}' in train.columns
]

# Only columns available in all three splits are winsorized / scaled.
winsor_cols = [
    c for c in base_features + log_features
    if c in train.columns and c in val.columns and c in test.columns
]

# Fit bounds on TRAIN only; apply to VAL/TEST (leakage-safe)
winsor_bounds = _fit_winsor_bounds(train, winsor_cols, q_lo=WINSOR_LOWER_Q, q_hi=WINSOR_UPPER_Q)
train, val, test = (_apply_winsor(part, winsor_bounds) for part in (train, val, test))
print("Winsorization done (bounds fit on TRAIN only).")

# Fit scaling on TRAIN only; apply to VAL/TEST (leakage-safe)
scaler_stats = _fit_scaler(train, winsor_cols)
train, val, test = (_apply_zscore(part, scaler_stats, prefix="z_") for part in (train, val, test))
print("Standardization done (stats fit on TRAIN only).")


In [None]:


# =============================================================================
# 4. TARGET CONSTRUCTION (t+1) + RAW FEATURE ENGINEERING (Ratios)
# =============================================================================
# NOTE: the split, EDA and transformation cells read 'label_year',
# 'target_next_year_distress' and the ratio columns created in this cell, so it
# has to be executed before them (right after the load cell).
# NOTE: the cell ends with `df = df.dropna(...)`, so re-running it without
# re-running the load cell removes one more trailing year per firm.


def _add_ratio(frame, name, numer_col, denom_col):
    """Store numer_col / denom_col (computed with _safe_div) in frame[name].

    The column is always created; it stays all-NaN when either input column
    is absent from `frame`. Mutates `frame` in place.
    """
    frame[name] = np.nan
    if numer_col in frame.columns and denom_col in frame.columns:
        frame[name] = _safe_div(frame[numer_col], frame[denom_col])


# --- Distress indicator for the current fiscal year ---------------------------
_add_ratio(df, 'interest_coverage', 'oibdp', 'xint')

# Distressed when interest coverage is below 1 OR shareholders' equity is negative.
# NOTE(review): comparisons against NaN are False, so a firm-year with missing
# inputs is labelled 0 (not distressed) rather than unknown - confirm intended.
cond_coverage = df['interest_coverage'] < 1.0
cond_insolvency = (df['seq'] < 0) if 'seq' in df.columns else pd.Series(False, index=df.index)
df['distress_dummy'] = (cond_coverage.fillna(False) | cond_insolvency.fillna(False)).astype(int)

# --- Raw ratio features (no imputation: missing inputs stay NaN) ---------------
_add_ratio(df, 'roa', 'ib', 'at')
_add_ratio(df, 'cf_roa', 'oancf', 'at')

# Total debt = long-term debt + debt in current liabilities; NaN only when every
# available component is missing (min_count=1).
debt_cols = [c for c in ('dltt', 'dlc') if c in df.columns]
if debt_cols:
    df['total_debt'] = (
        df[debt_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1, min_count=1)
    )
else:
    df['total_debt'] = np.nan

_add_ratio(df, 'leverage', 'total_debt', 'at')
_add_ratio(df, 'debt_to_equity', 'total_debt', 'seq')
_add_ratio(df, 'current_ratio', 'act', 'lct')
_add_ratio(df, 'cash_ratio', 'che', 'lct')
_add_ratio(df, 're_to_assets', 're', 'at')
_add_ratio(df, 'mkt_to_book', 'mkvalt', 'seq')
_add_ratio(df, 'capex_ratio', 'capx', 'at')

# --- Target: distress status in fiscal year t+1 of the same firm --------------
# Relies on df being ordered by (firm_id, fyear), as done in the load cell.
firm_groups = df.groupby('firm_id')
next_distress = firm_groups['distress_dummy'].shift(-1)
next_fyear = firm_groups['fyear'].shift(-1)
# shift(-1) returns the firm's next *available* row. Accept it as the label
# only when that row really is fiscal year t+1; otherwise a gap in the firm's
# history would attach a later year's outcome to label_year = fyear + 1.
df['target_next_year_distress'] = next_distress.where(next_fyear == df['fyear'] + 1)
df['label_year'] = df['fyear'] + 1
# Firm-years without an observed t+1 outcome cannot be used for training or evaluation.
df = df.dropna(subset=['target_next_year_distress']).reset_index(drop=True)


In [None]:

# =============================================================================
# 7. FEATURE SELECTION DIAGNOSTICS (TRAIN ONLY)
# =============================================================================
target_col = 'target_next_year_distress'

# Pairwise correlation of every winsorized feature with the target (TRAIN only).
print("\n----- Correlation with Target (TRAIN, winsorized raw features) -----")
corr_df = train[[target_col] + winsor_cols].corr()
for c in winsor_cols:
    r = corr_df.loc[target_col, c]
    print(f"{c:<20} r = {r: .4f}")

# Spread of each feature: interquartile range and sample standard deviation (ddof=1).
print("\n----- Variability Checks (TRAIN, winsorized raw features) -----")
for c in winsor_cols:
    s = pd.to_numeric(train[c], errors='coerce').dropna()
    if len(s) == 0:
        print(f"{c:<20} IQR: NA, Std. Dev.: NA")
    else:
        q25 = s.quantile(0.25)
        q75 = s.quantile(0.75)
        iqr = q75 - q25
        std = s.std(ddof=1)
        print(f"{c:<20} IQR: {iqr:.6f}, Std. Dev.: {std:.6f}")


In [None]:

# =============================================================================
# 8. ROLLING / FORWARD CV BY YEAR (NO MODEL TRAINING; JUST SPLITS)
# =============================================================================
def rolling_year_folds(df_in, year_col='label_year', n_splits=5, min_train_years=3):
    """Build expanding-window (forward-chaining) folds keyed on a year column.

    Fold k trains on the first `min_train_years + k` distinct years and
    validates on the single year that follows them. Folds start from the
    earliest years; when more years exist than `min_train_years + n_splits`,
    the latest years are not used as validation years.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing `year_col`.
    year_col : str
        Column holding the year of each row; missing values are ignored.
    n_splits : int
        Maximum number of folds. Fewer are returned when there are not enough
        distinct years.
    min_train_years : int
        Number of distinct years in the first fold's training window.

    Returns
    -------
    list of (train_index, val_index, train_years, val_year) tuples, where the
    indices are arrays of `df_in.index` labels. The list is empty when no year
    is left for validation after reserving `min_train_years` (previously this
    case raised IndexError).
    """
    years = np.sort(df_in[year_col].dropna().unique())
    # Years that remain as validation candidates after the initial training window.
    n_candidates = len(years) - min_train_years
    n_folds = min(n_splits, n_candidates)
    if n_folds <= 0:
        return []

    year_values = df_in[year_col]
    folds = []
    for k in range(n_folds):
        split_at = min_train_years + k
        train_years = years[:split_at]
        val_year = years[split_at]
        tr_idx = df_in.index[year_values.isin(train_years)].to_numpy()
        va_idx = df_in.index[year_values == val_year].to_numpy()
        folds.append((tr_idx, va_idx, train_years, val_year))
    return folds

print("\n----- Rolling Year CV folds (within TRAIN_POOL) -----")
# Work on a copy of the pool: it holds both train and val rows, never test rows.
train_pool_for_cv = train_pool.copy()  # includes both train+val (but excludes test)
year_folds = rolling_year_folds(
    train_pool_for_cv,
    year_col='label_year',
    n_splits=N_SPLITS_TIME_CV,
    min_train_years=3,
)

# One summary line per fold: training year range, validation year and row counts.
for i, (tr_idx, va_idx, tr_years, va_year) in enumerate(year_folds, start=1):
    first_year, last_year = tr_years[0], tr_years[-1]
    train_part = f"Fold {i}: train_years={first_year}..{last_year} (n={len(tr_idx)}), "
    val_part = f"val_year={va_year} (n={len(va_idx)})"
    print(train_part + val_part)

# NOTE (for later, when you train models):
# In each CV fold, winsorization and scaling must be FIT on fold-train and only
# APPLIED to fold-val to avoid leakage (standard leakage-safe preprocessing practice).


In [None]:
# Summary statistics of `df`. train/val/test were created as copies of slices
# of `df`, so the winsorization and z-scores applied to them are not reflected here.
df.describe()