# BL Monthly Rebalance Backtest (Modular Complete)
Monthly rebalance logic with:
- rebalance on first trading day of each month
- hold to month end
- option B rolling ridge prediction on rebalance dates only
- 504-day lookback window


## Imports


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

## Config


In [86]:
# Paths
EMBEDDING_PATH = "data/kospidaq_embeddings_OpenAI.xlsx"
RETURN_PATH    = "data/report_return_mapping.xlsx"
ADJ_CLOSE_PATH = "data/adj_close_wide_2014_2026.xlsx"
MCAP_PATH      = "data/market_cap_2014_2025.xlsx"
KOSPI_PATH     = "data/kospi_krx_20130102_20251230.xlsx"
RISK_FREE_PATH = "data/treasury3m_proxy_krx_20130102_20251230.xlsx"

# Core settings
DATA_START_DATE = "2014-01-02"
TARGET_RETURN_COL = "log_return_month"
WINDOW = 252
RIDGE_ALPHA = 10.0
MIN_TRAIN_SAMPLES = 500
MIN_REPORT_COUNT = 10

# Monthly rebalance rule
REBALANCE_FREQ = "MS"

# Risk aversion settings
RISK_FREE_COL = "yield_3m_proxy"
USE_DYNAMIC_RISK_AVERSION = True
RISK_AVERSION_FALLBACK = 0.01
LAMBDA_MIN_OBS = 252
LAMBDA_FLOOR = 0.01
RISK_FREE_TRADING_DAYS = 252

# BL and portfolio settings
TAU = 0.025
LONG_ONLY = True
WEIGHT_CLIP = None

# Cost settings
TRANSACTION_COST_BPS = 0.0
SELL_TAX_BPS = 0.0

# Annualization for monthly series
ANNUALIZATION = 12

# Export path
EXPORT_PATH = "outputs/BL_monthly_rebalance_results_2.xlsx"

## Load Inputs


In [None]:
df_embed = pd.read_excel(EMBEDDING_PATH)
print("df_embed shape:", df_embed.shape)
df_embed.head(2)

df_embed shape: (62450, 1538)


Unnamed: 0,date,ticker,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_1527,embedding_1528,embedding_1529,embedding_1530,embedding_1531,embedding_1532,embedding_1533,embedding_1534,embedding_1535,embedding_1536
0,2025-12-30,??뽰뵠?????而??곸궎,0.002818,-0.025671,0.01925,-0.015114,-0.021794,-4.1e-05,0.006391,-0.006771,...,0.004799,-0.021916,0.024311,0.007462,0.006357,0.001155,-0.020175,0.024161,-0.010189,-0.014638
1,2025-12-30,????쀫퉮??/td>,-0.008313,-0.029964,0.023021,-0.03166,-0.007354,-0.013455,-0.016913,-0.012554,...,0.004176,-0.007008,0.040848,0.010708,0.009853,-0.008992,-0.017814,0.002246,0.002209,0.006499


In [None]:
df_ret = pd.read_excel(RETURN_PATH)
print("df_ret shape:", df_ret.shape)
df_ret.head(2)

df_ret shape: (62450, 14)


Unnamed: 0,date,ticker,ticker_code,log_return_0,log_return_1,log_return_2,log_return_3,log_return_4,log_return_5,log_return_6,log_return_7,log_return_8,log_return_9,log_return_10
0,2025-12-30,??뽰뵠?????而??곸궎,420570,-0.00464,0.049897,0.026393,0.009259,-0.027108,-0.037919,-0.040338,,,,
1,2025-12-30,????쀫퉮??/td>,3490,-0.011025,-0.015643,0.002215,-0.00222,-0.020157,-0.04073,-0.020157,,,,


In [58]:
embedding_cols = [c for c in df_embed.columns if c.startswith("embedding_")]
print("embedding cols:", len(embedding_cols))
embedding_cols[:5]

embedding cols: 1536


['embedding_1', 'embedding_2', 'embedding_3', 'embedding_4', 'embedding_5']

## Preprocess Reports


In [59]:
# Build raw panel and normalize key columns
df_raw = pd.concat(
    [
        df_embed[["date", "ticker"] + embedding_cols],
        df_ret[["ticker_code"]],
    ],
    axis=1,
)

df_raw["date"] = pd.to_datetime(df_raw["date"], errors="coerce")
df_raw["ticker_code"] = pd.to_numeric(df_raw["ticker_code"], errors="coerce")

n_raw_before_drop = len(df_raw)
df_raw = df_raw.dropna(subset=["date", "ticker_code"]).copy()
df_raw["ticker_code"] = df_raw["ticker_code"].astype(int).astype(str).str.zfill(6)
n_raw_after_drop = len(df_raw)

print("rows before drop:", n_raw_before_drop)
print("rows after drop:", n_raw_after_drop)

rows before drop: 62450
rows after drop: 54372


In [74]:
# Keep raw report panel first.
# Monthly aggregation by interval will be applied after monthly calendar is built.
n_before_agg = len(df_raw)
n_after_agg = np.nan

preprocess_summary = pd.Series({
    "raw_reports": int(n_before_agg),
    "after_agg_reports": np.nan,
    "dropped_by_agg": np.nan,
})

print(preprocess_summary.to_string())
df_raw

raw_reports          54372.0
after_agg_reports        NaN
dropped_by_agg           NaN


Unnamed: 0,date,ticker,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_1528,embedding_1529,embedding_1530,embedding_1531,embedding_1532,embedding_1533,embedding_1534,embedding_1535,embedding_1536,ticker_code
0,2025-12-30,제이투케이바이오,0.002818,-0.025671,0.019250,-0.015114,-0.021794,-0.000041,0.006391,-0.006771,...,-0.021916,0.024311,0.007462,0.006357,0.001155,-2.017493e-02,0.024161,-0.010189,-0.014638,420570
1,2025-12-30,대한항공,-0.008313,-0.029964,0.023021,-0.031660,-0.007354,-0.013455,-0.016913,-0.012554,...,-0.007008,0.040848,0.010708,0.009853,-0.008992,-1.781376e-02,0.002246,0.002209,0.006499,003490
2,2025-12-30,LG이노텍,-0.009318,-0.016776,-0.005554,-0.026072,-0.033964,0.013587,-0.035452,-0.001840,...,-0.024268,0.028593,-0.001506,0.001988,0.008615,-3.305544e-02,0.008078,0.000484,-0.002896,011070
3,2025-12-30,삼양식품,-0.028252,-0.034510,0.015855,-0.028252,-0.029229,0.012529,-0.040398,0.004555,...,-0.024093,0.024159,0.019750,0.010548,-0.007373,-2.724863e-02,0.008594,0.016978,-0.004627,003230
4,2025-12-30,KT,-0.017662,-0.016917,0.008262,-0.041310,-0.009637,-0.007754,-0.035675,0.000825,...,-0.009075,0.055585,0.004348,0.012989,0.005492,-2.181962e-02,0.001752,0.014871,0.001962,030200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62444,2013-08-16,엔씨소프트,-0.019838,-0.016961,0.005584,-0.012578,0.000637,0.009023,-0.017938,0.001241,...,-0.024980,0.016988,-0.009193,-0.004752,-0.011886,-2.495316e-02,0.002259,0.011004,0.016880,036570
62446,2013-08-16,오리온,-0.005961,-0.020193,0.020931,-0.026096,-0.008882,-0.005729,-0.011586,0.007816,...,-0.027277,0.029947,-0.009727,0.003958,-0.001741,-1.214257e-02,0.028659,0.000126,-0.001907,271560
62447,2013-08-16,신세계,-0.022820,-0.032110,0.020616,-0.029352,-0.022124,0.019109,-0.011622,-0.000723,...,-0.024623,0.032316,-0.016416,0.014096,0.022510,-1.642850e-02,0.001619,0.028579,0.000686,004170
62448,2013-08-16,롯데칠성,-0.016238,-0.013909,0.027321,-0.023907,0.000069,0.006205,-0.013958,0.017274,...,-0.014649,0.027072,-0.011601,0.016860,-0.011021,-4.706540e-07,-0.000818,0.010358,0.006022,005300


## Load Market Data


In [None]:
px = pd.read_excel(ADJ_CLOSE_PATH, index_col=0)
px.index = pd.to_datetime(px.index)
px.columns = px.columns.astype(str).str.zfill(6)
px = px.sort_index()
print("px shape:", px.shape)
print("px range:", px.index.min(), "~", px.index.max())

px shape: (2951, 2761)
px range: 2014-01-02 00:00:00 ~ 2026-01-09 00:00:00


In [None]:
mc = pd.read_excel(MCAP_PATH, index_col=0)
mc.index = pd.to_datetime(mc.index)
mc.columns = mc.columns.astype(str).str.zfill(6)
mc = mc.sort_index()
print("mc shape:", mc.shape)
print("mc range:", mc.index.min(), "~", mc.index.max())

mc shape: (2945, 2761)
mc range: 2014-01-02 00:00:00 ~ 2025-12-30 00:00:00


In [61]:
rf_df = pd.read_excel(RISK_FREE_PATH, index_col=0)
rf_df.index = pd.to_datetime(rf_df.index)
rf_df = rf_df.sort_index()

rf_col = RISK_FREE_COL if RISK_FREE_COL in rf_df.columns else rf_df.columns[0]
rf_daily = (pd.to_numeric(rf_df[rf_col], errors="coerce") / 100.0 / RISK_FREE_TRADING_DAYS).rename("rf_daily")
rf_daily = rf_daily.sort_index()

print("rf col:", rf_col)
rf_daily.dropna().head()

rf col: yield_3m_proxy


date
2013-04-11    0.000111
2013-04-12    0.000111
2013-04-15    0.000111
2013-04-16    0.000111
2013-04-17    0.000111
Name: rf_daily, dtype: float64

In [62]:
df_kospi = pd.read_excel(KOSPI_PATH, index_col=0)
df_kospi.index = pd.to_datetime(df_kospi.index)
df_kospi = df_kospi.sort_index()

mkt_col = next((c for c in ["close", "CLSPRC_IDX"] if c in df_kospi.columns), df_kospi.columns[0])
print("kospi col:", mkt_col)
df_kospi[[mkt_col]].head()

kospi col: close


Unnamed: 0_level_0,close
date,Unnamed: 1_level_1
2013-04-11,1949.8
2013-04-12,1924.23
2013-04-15,1920.45
2013-04-16,1922.21
2013-04-17,1923.84


## Monthly Calendar and Target


In [63]:
trading_dates = pd.DatetimeIndex(px.index.unique()).sort_values()

month_start = pd.Series(trading_dates, index=trading_dates).groupby(trading_dates.to_period("M")).min()
month_end = pd.Series(trading_dates, index=trading_dates).groupby(trading_dates.to_period("M")).max()

calendar = pd.DataFrame({
    "month": month_start.index.astype(str),
    "rebalance_date": pd.to_datetime(month_start.values),
    "month_end_date": pd.to_datetime(month_end.values),
})
calendar = calendar[calendar["rebalance_date"] >= pd.to_datetime(DATA_START_DATE)].copy()
calendar = calendar.sort_values("rebalance_date").reset_index(drop=True)

print("calendar rows:", len(calendar))
calendar

calendar rows: 145


Unnamed: 0,month,rebalance_date,month_end_date
0,2014-01,2014-01-02,2014-01-29
1,2014-02,2014-02-03,2014-02-28
2,2014-03,2014-03-03,2014-03-31
3,2014-04,2014-04-01,2014-04-30
4,2014-05,2014-05-02,2014-05-30
...,...,...,...
140,2025-09,2025-09-01,2025-09-30
141,2025-10,2025-10-01,2025-10-31
142,2025-11,2025-11-03,2025-11-28
143,2025-12,2025-12-01,2025-12-30


In [64]:
# Monthly log return target: rebalance day close to month-end close
px_start = px.reindex(calendar["rebalance_date"].values)
px_end = px.reindex(calendar["month_end_date"].values)
px_start.index = calendar["rebalance_date"].values
px_end.index = calendar["rebalance_date"].values

ret_monthly_log_wide = np.log(px_end / px_start)
ret_monthly_log_wide.index.name = "date"

target_monthly_long = (
    ret_monthly_log_wide
    .stack()
    .rename(TARGET_RETURN_COL)
    .reset_index()
    .rename(columns={"level_1": "ticker_code"})
)
target_monthly_long["date"] = pd.to_datetime(target_monthly_long["date"])
target_monthly_long["ticker_code"] = target_monthly_long["ticker_code"].astype(str).str.zfill(6)

print("target rows:", len(target_monthly_long))
target_monthly_long

target rows: 313052


Unnamed: 0,date,ticker_code,log_return_month
0,2014-01-02,000020,0.103670
1,2014-01-02,000040,0.362629
2,2014-01-02,000050,0.008929
3,2014-01-02,000070,-0.068977
4,2014-01-02,000080,-0.016223
...,...,...,...
313047,2026-01-02,950170,0.155830
313048,2026-01-02,950190,-0.051725
313049,2026-01-02,950200,-0.032157
313050,2026-01-02,950210,-0.048348


In [65]:
# Monthly simple and excess returns for backtest
ret_monthly_simple_wide = np.expm1(ret_monthly_log_wide)

rf_period_rows = []
for _, row in calendar.iterrows():
    s = pd.to_datetime(row["rebalance_date"])
    e = pd.to_datetime(row["month_end_date"])
    rf_slice = rf_daily.loc[(rf_daily.index > s) & (rf_daily.index <= e)].dropna()
    rf_period = (1.0 + rf_slice).prod() - 1.0 if len(rf_slice) > 0 else 0.0
    rf_period_rows.append((s, float(rf_period)))

rf_period_map = pd.Series(dict(rf_period_rows), name="rf_period_month")
rf_period_map.index = pd.to_datetime(rf_period_map.index)

ret_monthly_excess_wide = ret_monthly_simple_wide.sub(rf_period_map, axis=0)

calendar = calendar.merge(
    rf_period_map.rename("rf_period_month").reset_index().rename(columns={"index": "rebalance_date"}),
    on="rebalance_date",
    how="left"
)
calendar

Unnamed: 0,month,rebalance_date,month_end_date,rf_period_month
0,2014-01,2014-01-02,2014-01-29,0.002001
1,2014-02,2014-02-03,2014-02-28,0.002000
2,2014-03,2014-03-03,2014-03-31,0.001789
3,2014-04,2014-04-01,2014-04-30,0.002211
4,2014-05,2014-05-02,2014-05-30,0.001895
...,...,...,...,...
140,2025-09,2025-09-01,2025-09-30,0.002135
141,2025-10,2025-10-01,2025-10-31,0.001718
142,2025-11,2025-11-03,2025-11-28,0.002033
143,2025-12,2025-12-01,2025-12-30,0.002255


## Build Model Dataset


In [75]:
# Build training pool: daily dedup reports mapped to monthly target
# 1) Daily dedup for training (same date + same ticker mean)
df_daily = (
    df_raw
    .groupby(["date", "ticker_code"], as_index=False)[embedding_cols]
    .mean()
    .sort_values(["date", "ticker_code"])
    .reset_index(drop=True)
)

intervals = pd.IntervalIndex.from_arrays(
    calendar["rebalance_date"],
    calendar["month_end_date"],
    closed="both"
)

# 2) Map each daily report row to its monthly target date (same month rebalance date)
interval_idx_daily = intervals.get_indexer(df_daily["date"])
valid_daily = interval_idx_daily >= 0

df_daily_map = df_daily.loc[valid_daily].copy()
df_daily_map["target_date"] = pd.to_datetime(
    calendar["rebalance_date"].to_numpy()[interval_idx_daily[valid_daily]]
)

df_train_pool = (
    df_daily_map
    .merge(
        target_monthly_long.rename(columns={"date": "target_date"}),
        on=["target_date", "ticker_code"],
        how="inner"
    )
    .sort_values(["date", "ticker_code"])
    .reset_index(drop=True)
)

# 3) Build monthly prediction pool:
#    previous-month reports -> average by (source_month, ticker) -> predict at next rebalance date
interval_idx_raw = intervals.get_indexer(df_raw["date"])
valid_raw = interval_idx_raw >= 0

df_raw_map = df_raw.loc[valid_raw].copy()
df_raw_map["source_rebalance_date"] = pd.to_datetime(
    calendar["rebalance_date"].to_numpy()[interval_idx_raw[valid_raw]]
)

df_prev_month_feat = (
    df_raw_map
    .groupby(["source_rebalance_date", "ticker_code"], as_index=False)[embedding_cols]
    .mean()
    .sort_values(["source_rebalance_date", "ticker_code"])
    .reset_index(drop=True)
)

rebalance_dates_sorted = calendar["rebalance_date"].sort_values().reset_index(drop=True)
next_rebalance_map = pd.Series(
    rebalance_dates_sorted.shift(-1).to_numpy(),
    index=rebalance_dates_sorted.to_numpy()
)

df_prev_month_feat["date"] = pd.to_datetime(df_prev_month_feat["source_rebalance_date"].map(next_rebalance_map))
df_prev_month_feat = df_prev_month_feat.dropna(subset=["date"]).drop(columns=["source_rebalance_date"])

df_test_monthly = (
    df_prev_month_feat
    .merge(target_monthly_long, on=["date", "ticker_code"], how="inner")
    .sort_values(["date", "ticker_code"])
    .reset_index(drop=True)
)

n_after_agg = len(df_prev_month_feat)
preprocess_summary = pd.Series({
    "raw_reports": int(len(df_raw)),
    "daily_dedup_reports": int(len(df_daily)),
    "train_pool_rows": int(len(df_train_pool)),
    "pred_pool_rows": int(len(df_prev_month_feat)),
    "model_rows": int(len(df_test_monthly)),
})
print(preprocess_summary.to_string())

print("df_train_pool rows:", len(df_train_pool))
print("df_test_monthly rows:", len(df_test_monthly))
df_test_monthly.head(10)


raw_reports            54372
daily_dedup_reports    38493
train_pool_rows        37355
pred_pool_rows         26469
model_rows             26072
df_train_pool rows: 37355
df_test_monthly rows: 26072


Unnamed: 0,ticker_code,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_1529,embedding_1530,embedding_1531,embedding_1532,embedding_1533,embedding_1534,embedding_1535,embedding_1536,date,log_return_month
0,120,0.001237,-0.025519,0.009442,-0.013639,-0.026916,-0.004066,-0.027104,0.002795,-0.020388,...,0.045289,0.006628,-0.003573,0.000729,-0.013485,-0.010402,0.010597,0.008999,2014-02-03,0.068993
1,150,-0.017579,-0.028504,0.012249,-0.026864,-0.02351,0.005178,-0.028533,0.003048,-0.006513,...,0.0328,-0.002488,0.009571,-0.002469,-0.018429,0.004957,-0.005431,0.015669,2014-02-03,0.05588
2,370,0.000745,-0.016019,0.006661,-0.035066,-0.02729,-0.005003,-0.025143,-0.00041,-0.018964,...,0.042443,0.004913,-0.013088,0.013542,-0.009076,-0.007748,0.005336,-0.004156,2014-02-03,-0.043624
3,660,-0.007077,-0.024764,0.008736,-0.032214,-0.010171,0.0013,-0.027821,-0.010803,-0.018509,...,0.032441,-3.2e-05,-0.0007,0.009581,-0.025287,0.005696,0.002748,0.010326,2014-02-03,0.020861
4,720,-0.007243,-0.02133,0.021736,-0.019284,-0.014303,0.004713,-0.025762,0.008584,-0.012706,...,0.028884,-0.003981,-0.006561,0.002548,-0.007256,0.002811,0.008781,0.004525,2014-02-03,0.005152
5,1040,-0.009315,-0.030433,0.021934,-0.028544,-0.006529,0.010518,-0.014468,-0.019249,-0.025084,...,0.019737,-0.016584,-0.011879,0.007556,-0.019353,-0.002436,0.001888,0.005227,2014-02-03,0.048204
6,3220,-0.00168,-0.007432,0.010585,-0.029215,-0.012215,0.015092,-0.016615,0.008685,-0.027786,...,0.050371,-0.006242,0.01332,0.015214,-0.029161,-0.016265,-0.001162,0.0145,2014-02-03,-0.032691
7,3490,-0.015566,-0.033127,0.00846,-0.029787,-0.007924,-0.002016,-0.024157,-0.006592,-0.011185,...,0.03009,-0.012906,-0.004077,0.003866,-0.02357,0.013833,-0.000169,0.000659,2014-02-03,-0.032795
8,4370,-0.014342,-0.03769,0.010164,-0.037646,-0.028582,-0.000682,-0.03913,0.004267,-0.019075,...,0.029348,0.011495,0.01076,0.005157,-0.013182,-0.008861,0.003875,0.017366,2014-02-03,0.107593
9,5380,-0.016516,-0.025759,0.016236,-0.035214,-0.008838,-0.000508,-0.023442,0.001586,-0.022367,...,0.0299,-0.00272,0.005267,0.007437,-0.010019,0.011057,0.000815,-0.000308,2014-02-03,0.056678


In [76]:
# Cumulative report count for dynamic universe (use raw report-level history)
report_daily = df_raw[["date", "ticker_code"]].sort_values(["ticker_code", "date"]).copy()
report_daily["cum_reports"] = report_daily.groupby("ticker_code").cumcount() + 1

def eligible_tickers_asof(asof_date, min_report_count=10):
    tmp = report_daily[report_daily["date"] <= pd.to_datetime(asof_date)]
    if len(tmp) == 0:
        return []
    last_cnt = tmp.groupby("ticker_code", as_index=False)["cum_reports"].max()
    out = last_cnt.loc[last_cnt["cum_reports"] >= min_report_count, "ticker_code"].tolist()
    return sorted(out)

sample_asof = calendar["rebalance_date"].iloc[0]
sample_prev = trading_dates[trading_dates.get_loc(sample_asof) - 1] if trading_dates.get_loc(sample_asof) > 0 else sample_asof
print("sample asof:", sample_prev)
print("eligible tickers:", len(eligible_tickers_asof(sample_prev, MIN_REPORT_COUNT)))

sample asof: 2014-01-02 00:00:00
eligible tickers: 26


## Rolling Ridge Option B


In [77]:
def run_ridge_monthly_predict_option_b(
    df_train_pool,
    df_test_monthly,
    embedding_cols,
    target_col,
    rebalance_dates,
    trading_dates,
    eligible_fn,
    window=504,
    alpha=10.0,
    min_report_count=10,
):
    """
    Option B (monthly):
    For each rebalance date t:
    - Train: report_date in [t-window, t-1] using daily dedup rows (df_train_pool)
    - Predict: previous-month aggregated report embeddings already mapped to date t (df_test_monthly)

    Rule:
    - Strict rolling window. Skip dates where idx < window.
    - No MIN_TRAIN_SAMPLES threshold is applied.
    - Fit is attempted when both train and test blocks are non-empty.
    """
    pred_rows = []
    stat_rows = []

    tdates = pd.DatetimeIndex(trading_dates)
    rdates = pd.DatetimeIndex(pd.to_datetime(rebalance_dates)).sort_values()

    for rdate in tqdm(rdates, desc="Ridge monthly option B"):
        if rdate not in tdates:
            continue
        idx = tdates.get_loc(rdate)
        if not isinstance(idx, (int, np.integer)):
            idx = idx.start
        if idx <= 0:
            continue

        # Strict 504-day rolling requirement
        if idx < window:
            prev_date = tdates[idx - 1]
            stat_rows.append({
                "date": rdate,
                "window_start": pd.NaT,
                "window_end_prev": prev_date,
                "n_eligible_tickers": 0,
                "n_train_reports_window": 0,
                "n_test_reports_date": 0,
                "used_for_fit": False,
            })
            continue

        prev_date = tdates[idx - 1]
        start_idx = idx - window
        window_start = tdates[start_idx]

        eligible = eligible_fn(prev_date, min_report_count)
        if len(eligible) == 0:
            stat_rows.append({
                "date": rdate,
                "window_start": window_start,
                "window_end_prev": prev_date,
                "n_eligible_tickers": 0,
                "n_train_reports_window": 0,
                "n_test_reports_date": 0,
                "used_for_fit": False,
            })
            continue

        train = df_train_pool[
            (df_train_pool["date"] >= window_start)
            & (df_train_pool["date"] <= prev_date)
            & (df_train_pool["ticker_code"].isin(eligible))
        ].dropna(subset=embedding_cols + [target_col]).copy()

        test = df_test_monthly[
            (df_test_monthly["date"] == rdate)
            & (df_test_monthly["ticker_code"].isin(eligible))
        ].dropna(subset=embedding_cols).copy()

        used = bool((len(train) > 0) and (len(test) > 0))
        stat_rows.append({
            "date": rdate,
            "window_start": window_start,
            "window_end_prev": prev_date,
            "n_eligible_tickers": int(len(eligible)),
            "n_train_reports_window": int(len(train)),
            "n_test_reports_date": int(len(test)),
            "used_for_fit": used,
        })

        if not used:
            continue

        model = Ridge(alpha=alpha, fit_intercept=True)
        X_train = train[embedding_cols].to_numpy(dtype=float)
        y_train = train[target_col].to_numpy(dtype=float)
        X_test = test[embedding_cols].to_numpy(dtype=float)
        model.fit(X_train, y_train)

        out = test[["date", "ticker_code", target_col]].copy()
        out["pred_return"] = model.predict(X_test)
        pred_rows.append(out)

    df_pred = pd.concat(pred_rows, ignore_index=True) if len(pred_rows) > 0 else pd.DataFrame()
    df_stat = pd.DataFrame(stat_rows).sort_values("date").reset_index(drop=True)
    return df_pred, df_stat

In [78]:
rebalance_dates = calendar["rebalance_date"].tolist()

df_pred, ridge_monthly_report_count = run_ridge_monthly_predict_option_b(
    df_train_pool=df_train_pool,
    df_test_monthly=df_test_monthly,
    embedding_cols=embedding_cols,
    target_col=TARGET_RETURN_COL,
    rebalance_dates=rebalance_dates,
    trading_dates=trading_dates,
    eligible_fn=eligible_tickers_asof,
    window=WINDOW,
    alpha=RIDGE_ALPHA,
    min_report_count=MIN_REPORT_COUNT,
)

print("pred rows:", len(df_pred))
print("pred dates:", df_pred["date"].nunique() if len(df_pred) > 0 else 0)
ridge_monthly_report_count

Ridge monthly option B:   0%|          | 0/145 [00:00<?, ?it/s]

pred rows: 16438
pred dates: 120


Unnamed: 0,date,window_start,window_end_prev,n_eligible_tickers,n_train_reports_window,n_test_reports_date,used_for_fit
0,2014-02-03,NaT,2014-01-29,0,0,0,False
1,2014-03-03,NaT,2014-02-28,0,0,0,False
2,2014-04-01,NaT,2014-03-31,0,0,0,False
3,2014-05-02,NaT,2014-04-30,0,0,0,False
4,2014-06-02,NaT,2014-05-30,0,0,0,False
...,...,...,...,...,...,...,...
139,2025-09-01,2023-08-03,2025-08-29,620,10767,302,True
140,2025-10-01,2023-09-05,2025-09-30,623,10587,177,True
141,2025-11-03,2023-10-05,2025-10-31,627,10867,234,True
142,2025-12-01,2023-11-03,2025-11-28,635,10927,342,True


In [72]:
if len(df_pred) == 0:
    print("No prediction rows. Check WINDOW, data coverage, and target alignment.")
else:
    df_pred.head(10)

## Risk Aversion


In [73]:
mkt_ret = pd.to_numeric(df_kospi[mkt_col], errors="coerce").pct_change().rename("mkt_ret")
risk_df = pd.concat([mkt_ret, rf_daily], axis=1, join="inner").dropna()
risk_df["excess_ret"] = risk_df["mkt_ret"] - risk_df["rf_daily"]

if USE_DYNAMIC_RISK_AVERSION:
    mean_excess = risk_df["excess_ret"].rolling(WINDOW, min_periods=LAMBDA_MIN_OBS).mean()
    var_mkt = risk_df["mkt_ret"].rolling(WINDOW, min_periods=LAMBDA_MIN_OBS).var()
    lambda_daily = mean_excess / var_mkt
    lambda_daily = lambda_daily.replace([np.inf, -np.inf], np.nan)
    lambda_daily = lambda_daily.ffill().fillna(float(RISK_AVERSION_FALLBACK))
    lambda_daily = lambda_daily.clip(lower=float(LAMBDA_FLOOR)).rename("lambda")
else:
    lambda_daily = pd.Series(float(RISK_AVERSION_FALLBACK), index=risk_df.index, name="lambda")

risk_aversion_summary = pd.Series({
    "use_dynamic": USE_DYNAMIC_RISK_AVERSION,
    "window": WINDOW,
    "lambda_min_obs": LAMBDA_MIN_OBS,
    "lambda_floor": LAMBDA_FLOOR,
    "samples": int(len(lambda_daily)),
    "lambda_mean": float(lambda_daily.mean()),
    "lambda_min": float(lambda_daily.min()),
    "lambda_max": float(lambda_daily.max()),
})
print(risk_aversion_summary.to_string())

use_dynamic            True
window                  504
lambda_min_obs          252
lambda_floor           0.01
samples                3110
lambda_mean        1.766762
lambda_min             0.01
lambda_max        12.504487


## BL Helper Functions


In [79]:
def black_litterman_posterior(Pi, Sigma, P, Q, Omega, tau=0.025):
    Sigma_t = tau * Sigma
    A = np.linalg.inv(Sigma_t) + P.T @ np.linalg.inv(Omega) @ P
    b = np.linalg.inv(Sigma_t) @ Pi + P.T @ np.linalg.inv(Omega) @ Q
    return np.linalg.solve(A, b)

def compute_bl_weights(mu_bl, Sigma, risk_aversion, long_only=True, weight_clip=None):
    w = np.linalg.pinv(Sigma) @ mu_bl / max(float(risk_aversion), 1e-8)
    if long_only:
        w = np.clip(w, 0.0, None)
    if weight_clip is not None:
        w = np.clip(w, -abs(weight_clip), abs(weight_clip))
    s = np.sum(w)
    if np.isclose(s, 0.0):
        w = np.ones_like(w) / len(w)
    else:
        w = w / s
    return w

def compute_prior_at_date(view_date, tickers_univ, px, mc, rf_daily, lambda_daily, window=504):
    tdates = pd.DatetimeIndex(px.index)
    if view_date not in tdates:
        raise ValueError("view_date not in price index")

    idx = tdates.get_loc(view_date)
    if not isinstance(idx, (int, np.integer)):
        idx = idx.start
    if idx <= 0:
        raise ValueError("no previous trading day")

    prev_date = tdates[idx - 1]

    price_hist = px.loc[:prev_date, tickers_univ].tail(window + 1)
    ret_hist = price_hist.pct_change().dropna(how="all")
    if len(ret_hist) < 2:
        raise ValueError("not enough return history")

    rf_hist = rf_daily.reindex(ret_hist.index).ffill().fillna(0.0)
    ret_excess = ret_hist.sub(rf_hist, axis=0)

    Sigma_df = ret_excess.cov().reindex(index=tickers_univ, columns=tickers_univ).fillna(0.0)
    Sigma = Sigma_df.to_numpy(dtype=float)

    mc_hist = mc.loc[:prev_date, tickers_univ].ffill()
    if len(mc_hist) == 0:
        raise ValueError("no market cap history")
    mc_row = mc_hist.iloc[-1].fillna(0.0).to_numpy(dtype=float)
    mc_sum = np.sum(mc_row)
    if np.isclose(mc_sum, 0.0):
        w_mkt = np.ones(len(tickers_univ)) / len(tickers_univ)
    else:
        w_mkt = mc_row / mc_sum

    lambda_t = float(lambda_daily.asof(prev_date))
    if (not np.isfinite(lambda_t)) or (lambda_t <= 0):
        lambda_t = float(RISK_AVERSION_FALLBACK)

    Pi = lambda_t * (Sigma @ w_mkt)
    return {
        "Pi": Pi,
        "Sigma": Sigma,
        "w_mkt": w_mkt,
        "lambda_t": lambda_t,
        "prev_date": prev_date,
    }

## Omega MSE by Rebalance Date


In [80]:
if len(df_pred) == 0:
    mse_rebal = pd.Series(dtype=float, name="mse_rebal")
else:
    tmp = df_pred.dropna(subset=[TARGET_RETURN_COL, "pred_return"]).copy()
    tmp["date"] = pd.to_datetime(tmp["date"])
    tmp["rf_period_month"] = tmp["date"].map(rf_period_map).astype(float)
    tmp["y_true_excess_simple"] = np.expm1(tmp[TARGET_RETURN_COL]) - tmp["rf_period_month"]
    tmp["y_pred_excess_simple"] = np.expm1(tmp["pred_return"]) - tmp["rf_period_month"]
    tmp = tmp.replace([np.inf, -np.inf], np.nan).dropna(subset=["y_true_excess_simple", "y_pred_excess_simple"])

    if len(tmp) == 0:
        mse_rebal = pd.Series(dtype=float, name="mse_rebal")
    else:
        mse_rebal = (
            tmp.groupby("date")
            .apply(lambda g: mean_squared_error(g["y_true_excess_simple"], g["y_pred_excess_simple"]))
            .rename("mse_rebal")
            .astype(float)
            .sort_index()
        )
        mse_rebal = mse_rebal.where(np.isfinite(mse_rebal), np.nan).ffill().fillna(1e-8).clip(lower=1e-8)

print("mse dates:", len(mse_rebal))
mse_rebal.head()

mse dates: 120


date
2016-02-01    0.010072
2016-03-02    0.009472
2016-04-01    0.006033
2016-05-02    0.006172
2016-06-01    0.005223
Name: mse_rebal, dtype: float64

## Monthly Backtest


In [81]:
if len(df_pred) > 0:
    df_view = df_pred.dropna(subset=["pred_return"]).copy()
    df_view_by_date = {pd.to_datetime(d): g.copy() for d, g in df_view.groupby("date")}
else:
    df_view = pd.DataFrame()
    df_view_by_date = {}

month_end_map = calendar.set_index("rebalance_date")["month_end_date"]

results = []
weights_hist = []
prev_w = None
prev_tickers = None

if len(df_view_by_date) == 0:
    print("No prediction rows available for backtest.")
else:
    run_dates = sorted(set(calendar["rebalance_date"]).intersection(df_view_by_date.keys()))

    for view_date in tqdm(run_dates, desc="Backtest monthly"):
        view_date = pd.to_datetime(view_date)
        view_block = df_view_by_date[view_date].copy()

        tickers_view = sorted(set(view_block["ticker_code"]).intersection(px.columns).intersection(mc.columns))
        if len(tickers_view) < 2:
            continue

        if view_date not in ret_monthly_excess_wide.index:
            continue
        tickers_view = [t for t in tickers_view if t in ret_monthly_excess_wide.columns]
        if len(tickers_view) < 2:
            continue

        pred_s = (
            view_block.set_index("ticker_code")["pred_return"]
            .reindex(tickers_view)
            .astype(float)
        )
        if pred_s.isna().all():
            continue

        rf_m = float(rf_period_map.asof(view_date)) if len(rf_period_map) > 0 else 0.0
        if not np.isfinite(rf_m):
            rf_m = 0.0

        Q_log = pred_s.to_numpy(dtype=float)
        Q = np.expm1(Q_log) - rf_m

        try:
            prior = compute_prior_at_date(
                view_date=view_date,
                tickers_univ=tickers_view,
                px=px,
                mc=mc,
                rf_daily=rf_daily,
                lambda_daily=lambda_daily,
                window=WINDOW,
            )
        except Exception:
            continue

        Pi = prior["Pi"]
        Sigma = prior["Sigma"]
        w_mkt = prior["w_mkt"]
        lambda_t = prior["lambda_t"]

        n = len(tickers_view)
        k = len(tickers_view)
        P = np.eye(k, n)

        mse_t = float(mse_rebal.asof(view_date)) if len(mse_rebal) > 0 else 1e-8
        if (not np.isfinite(mse_t)) or (mse_t <= 0):
            mse_t = 1e-8
        Omega = np.eye(k) * mse_t

        try:
            mu_bl = black_litterman_posterior(Pi=Pi, Sigma=Sigma, P=P, Q=Q, Omega=Omega, tau=TAU)
            w = compute_bl_weights(mu_bl=mu_bl, Sigma=Sigma, risk_aversion=lambda_t, long_only=LONG_ONLY, weight_clip=WEIGHT_CLIP)
        except Exception:
            continue

        ret_vec = ret_monthly_excess_wide.loc[view_date, tickers_view].astype(float).to_numpy()
        mask = np.isfinite(ret_vec)
        if mask.sum() < 2:
            continue

        ret_vec = ret_vec[mask]
        tickers_ret = np.array(tickers_view)[mask]
        w = w[mask]
        w = w / w.sum() if np.sum(w) != 0 else np.ones_like(w) / len(w)

        w_bm = w_mkt[mask]
        w_bm = w_bm / w_bm.sum() if np.sum(w_bm) != 0 else np.ones_like(w_bm) / len(w_bm)

        if prev_w is None:
            turnover = float(np.sum(np.abs(w)))
            sell_turnover = float(np.sum(w))
        else:
            if (prev_tickers is not None) and np.array_equal(prev_tickers, tickers_ret):
                delta = w - prev_w
                turnover = float(np.sum(np.abs(delta)))
                sell_turnover = float(np.sum(np.clip(-delta, 0.0, None)))
            else:
                turnover = float(np.sum(np.abs(w)))
                sell_turnover = float(np.sum(w))

        cost = turnover * (TRANSACTION_COST_BPS / 10000.0) + sell_turnover * (SELL_TAX_BPS / 10000.0)
        port_ret = float(np.dot(w, ret_vec)) - cost
        bm_ret = float(np.dot(w_bm, ret_vec))

        results.append({
            "rebalance_date": view_date,
            "month_end_date": pd.to_datetime(month_end_map.asof(view_date)),
            "net_ret": port_ret,
            "bm_ret": bm_ret,
            "cost": cost,
            "n_assets": int(len(w)),
            "lambda": float(lambda_t),
            "omega_mse": float(mse_t),
            "rf_period_month": float(rf_m),
        })

        weights_hist.append(pd.DataFrame({
            "date": view_date,
            "ticker_code": tickers_ret,
            "weight": w,
        }))

        prev_w = w
        prev_tickers = tickers_ret

if len(results) == 0:
    df_bt = pd.DataFrame()
    weights_history = pd.DataFrame()
else:
    df_bt = pd.DataFrame(results).sort_values("rebalance_date").reset_index(drop=True)
    weights_history = pd.concat(weights_hist, ignore_index=True)

df_bt

Backtest monthly:   0%|          | 0/120 [00:00<?, ?it/s]

Unnamed: 0,rebalance_date,month_end_date,net_ret,bm_ret,cost,n_assets,lambda,omega_mse,rf_period_month
0,2016-02-01,2016-02-29,-0.056438,-0.001940,0.0,81,0.010000,0.010072,0.001105
1,2016-03-02,2016-03-31,0.019417,-0.001336,0.0,94,0.010000,0.009472,0.001356
2,2016-04-01,2016-04-29,0.013515,0.006820,0.0,70,0.010000,0.006033,0.001215
3,2016-05-02,2016-05-31,-0.015071,0.006890,0.0,96,0.010000,0.006172,0.001192
4,2016-06-01,2016-06-30,-0.024535,-0.030126,0.0,107,0.010000,0.005223,0.001127
...,...,...,...,...,...,...,...,...,...
114,2025-09-01,2025-09-30,0.071548,0.084388,0.0,302,2.238722,0.025583,0.002135
115,2025-10-01,2025-10-31,0.127678,0.221698,0.0,177,3.331861,0.025329,0.001718
116,2025-11-03,2025-11-28,-0.057294,-0.077556,0.0,234,6.460066,0.029766,0.002033
117,2025-12-01,2025-12-30,0.023498,0.076328,0.0,342,5.808110,0.012517,0.002255


## Summary


In [82]:
if len(df_bt) == 0:
    print("No backtest result.")
    summary = pd.Series(dtype=float)
else:
    bt = df_bt.copy().sort_values("rebalance_date")
    bt["cum_net"] = (1.0 + bt["net_ret"]).cumprod()
    bt["cum_bm"] = (1.0 + bt["bm_ret"]).cumprod()

    n = len(bt)
    ann_ret = bt["cum_net"].iloc[-1] ** (ANNUALIZATION / n) - 1.0
    ann_vol = bt["net_ret"].std(ddof=1) * np.sqrt(ANNUALIZATION)
    sharpe = ann_ret / ann_vol if ann_vol > 0 else np.nan

    ann_ret_bm = bt["cum_bm"].iloc[-1] ** (ANNUALIZATION / n) - 1.0
    ann_vol_bm = bt["bm_ret"].std(ddof=1) * np.sqrt(ANNUALIZATION)
    sharpe_bm = ann_ret_bm / ann_vol_bm if ann_vol_bm > 0 else np.nan

    summary = pd.Series({
        "n_months": int(n),
        "ann_return": float(ann_ret),
        "ann_vol": float(ann_vol),
        "ann_sharpe": float(sharpe) if np.isfinite(sharpe) else np.nan,
        "ann_return_bm": float(ann_ret_bm),
        "ann_vol_bm": float(ann_vol_bm),
        "ann_sharpe_bm": float(sharpe_bm) if np.isfinite(sharpe_bm) else np.nan,
        "final_cum_net": float(bt["cum_net"].iloc[-1]),
        "final_cum_bm": float(bt["cum_bm"].iloc[-1]),
    })
    print(summary.to_string())

summary

n_months         119.000000
ann_return         0.027070
ann_vol            0.197403
ann_sharpe         0.137128
ann_return_bm      0.060920
ann_vol_bm         0.191425
ann_sharpe_bm      0.318245
final_cum_net      1.303262
final_cum_bm       1.797570


n_months         119.000000
ann_return         0.027070
ann_vol            0.197403
ann_sharpe         0.137128
ann_return_bm      0.060920
ann_vol_bm         0.191425
ann_sharpe_bm      0.318245
final_cum_net      1.303262
final_cum_bm       1.797570
dtype: float64

In [85]:
if len(weights_history) == 0:
    weights_monthly = pd.DataFrame()
else:
    weights_monthly = (
        weights_history
        .pivot_table(index="date", columns="ticker_code", values="weight", aggfunc="mean")
        .sort_index()
        .fillna(0.0)
    )
weights_monthly

ticker_code,000080,000100,000120,000150,000210,000250,000270,000370,000490,000640,...,452430,453340,456040,458870,460860,462870,473980,475150,483650,950170
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-02-01,0.018257,0.000000,0.027991,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2016-03-02,0.019828,0.041080,0.004452,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2016-04-01,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2016-05-02,0.015319,0.017862,0.023363,0.010924,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2016-06-01,0.012230,0.022730,0.025274,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-09-01,0.000436,0.000000,0.001392,0.000000,0.0,0.0,0.000000,0.002028,0.0,0.000000,...,0.001390,0.000000,0.0,0.000000,0.000000,0.003603,0.000000,0.005816,0.000000,0.000000
2025-10-01,0.000000,0.005896,0.003470,0.007167,0.0,0.0,0.012624,0.000000,0.0,0.004134,...,0.005091,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.003279,0.000357
2025-11-03,0.000000,0.000000,0.000889,0.005903,0.0,0.0,0.010382,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.003123,0.001214,0.000000,0.000000,0.000000,0.000000,0.001360
2025-12-01,0.000560,0.002850,0.002799,0.004888,0.0,0.0,0.005523,0.002362,0.0,0.000000,...,0.001905,0.003273,0.0,0.003671,0.000958,0.001736,0.000000,0.005011,0.000402,0.000000


## Export


In [84]:
config_df = pd.DataFrame({
    "key": [
        "EMBEDDING_PATH", "RETURN_PATH", "ADJ_CLOSE_PATH", "MCAP_PATH", "KOSPI_PATH", "RISK_FREE_PATH",
        "DATA_START_DATE", "TARGET_RETURN_COL", "WINDOW", "RIDGE_ALPHA", "MIN_TRAIN_SAMPLES", "MIN_REPORT_COUNT",
        "RISK_FREE_COL", "TAU", "LONG_ONLY", "WEIGHT_CLIP",
        "TRANSACTION_COST_BPS", "SELL_TAX_BPS", "ANNUALIZATION", "EXPORT_PATH",
    ],
    "value": [
        EMBEDDING_PATH, RETURN_PATH, ADJ_CLOSE_PATH, MCAP_PATH, KOSPI_PATH, RISK_FREE_PATH,
        DATA_START_DATE, TARGET_RETURN_COL, WINDOW, RIDGE_ALPHA, MIN_TRAIN_SAMPLES, MIN_REPORT_COUNT,
        RISK_FREE_COL, TAU, LONG_ONLY, WEIGHT_CLIP,
        TRANSACTION_COST_BPS, SELL_TAX_BPS, ANNUALIZATION, EXPORT_PATH,
    ]
})

export_path = Path(EXPORT_PATH)
export_path.parent.mkdir(parents=True, exist_ok=True)

mse_rebal_df = mse_rebal.to_frame() if isinstance(mse_rebal, pd.Series) and len(mse_rebal) > 0 else pd.DataFrame()
lambda_daily_df = lambda_daily.to_frame(name="lambda") if isinstance(lambda_daily, pd.Series) and len(lambda_daily) > 0 else pd.DataFrame()

with pd.ExcelWriter(export_path, engine="openpyxl") as writer:
    config_df.to_excel(writer, sheet_name="config", index=False)
    preprocess_summary.to_frame(name="value").to_excel(writer, sheet_name="preprocess_summary")
    calendar.to_excel(writer, sheet_name="monthly_calendar", index=False)

    if len(ridge_monthly_report_count) > 0:
        ridge_monthly_report_count.to_excel(writer, sheet_name="ridge_monthly_report_count", index=False)
    if len(df_pred) > 0:
        df_pred.to_excel(writer, sheet_name="ridge_predictions", index=False)
    if len(lambda_daily_df) > 0:
        lambda_daily_df.to_excel(writer, sheet_name="risk_aversion_daily")
    if len(mse_rebal_df) > 0:
        mse_rebal_df.to_excel(writer, sheet_name="omega_mse_rebalance")
    if len(df_bt) > 0:
        df_bt.to_excel(writer, sheet_name="backtest_monthly", index=False)
    if len(weights_history) > 0:
        weights_history.to_excel(writer, sheet_name="weights_monthly_long", index=False)
    if len(weights_monthly) > 0:
        weights_monthly.to_excel(writer, sheet_name="weights_monthly")
    if isinstance(summary, pd.Series) and len(summary) > 0:
        summary.to_frame(name="value").to_excel(writer, sheet_name="performance_summary")

print(f"saved: {export_path}")

saved: outputs\BL_monthly_rebalance_results_2.xlsx


## Workflow Check


In [None]:
workflow_check = pd.Series({
    "raw_report_rows": int(len(df_raw)),
    "pred_monthly_feature_rows": int(n_after_agg),
    "calendar_rows": int(len(calendar)),
    "train_pool_rows": int(len(df_train_pool)),
    "test_pool_rows": int(len(df_test_monthly)),
    "pred_rows": int(len(df_pred)),
    "pred_dates": int(df_pred["date"].nunique()) if len(df_pred) > 0 else 0,
    "mse_dates": int(len(mse_rebal)) if isinstance(mse_rebal, pd.Series) else 0,
    "backtest_rows": int(len(df_bt)),
    "weights_rows": int(len(weights_history)),
})
print(workflow_check.to_string())
workflow_check