# Thesis Results Notebook (Real Outputs)
Dieses Notebook lädt **Stage‑B Outputs** aus `outputs/` und erstellt die Tabellen/Grafiken für deine Results-Sektion.

**Du musst nur** die Run‑Ordnernamen und Labels in den Mappings anpassen.

Erwartete Dateien pro Run:
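- `outputs/stageB/<run_id>/monthly/preds.csv` (mit Spalten: `date_t_plus_1`, `y_true`, `y_pred`, `is_active`; für die Incumbent Timeline zusätzlich `t`)
- optional: `outputs/stageB/<run_id>/monthly/scores.csv` (für Incumbent Timeline; benötigte Spalten: `t`, `config_id`, `active_idx`, `switched`, `wrmse_window`)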


In [2]:

# === 0) Imports ===
import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

# Optional (for MCS)
# NOTE(review): labelled optional, but the import is unconditional, so this
# cell fails when `arch` is not installed.
from arch.bootstrap import MCS

# Notebook-wide plotting and display defaults.
plt.style.use("seaborn-v0_8-whitegrid")
np.set_printoptions(suppress=True)
pd.set_option("display.max_columns", 200)


In [None]:

# === 1) Projekt-Root finden & src importieren ===
def _locate_repo_root(start: Path) -> Path:
    cur = start.resolve()
    for _ in range(8):
        if (cur / "src").exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()

# Start the search at the current working directory and walk upwards until a
# folder containing `src` is found.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
# Publish the root through the environment.
# NOTE(review): presumably read by src.config — confirm.
os.environ["PROJECT_ROOT"] = str(PROJECT_ROOT)

# Make `src` importable as a top-level package.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import STAGEB_DIR, OUTPUTS

# Output folders for exported tables and figures; created if missing.
EVAL_OUT = OUTPUTS / "evaluation_results"
FIG_OUT = EVAL_OUT / "figures"
TAB_OUT = EVAL_OUT / "tables"
FIG_OUT.mkdir(parents=True, exist_ok=True)
TAB_OUT.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("StageB dir:   ", STAGEB_DIR)
print("Eval out:     ", EVAL_OUT)


## 2) Konfiguration: Run‑Mappings
- `run_id`: Ordnername in `outputs/stageB/<run_id>/...`
- `label`: Anzeige‑Name (Plots/Tabellen)
- `family`: Modellfamilie (für Aggregationen/Plots)
- `setup`: `Setup I`, `Setup II`, `Setup III` (oder `Baseline`)

In [None]:

# === 2) EDIT THIS: Model Registry ===
# Each entry maps a Stage-B run folder (`run_id`) to its display label,
# model family and setup. Runs whose files cannot be loaded are skipped later.
MODEL_REGISTRY = [
    # Setup I (ifo only)
    dict(run_id="Model1_without_target", label="Model 1 (ifo only)", family="FamilyA", setup="Setup I"),
    dict(run_id="Model2_without_target", label="Model 2 (ifo only)", family="FamilyB", setup="Setup I"),

    # Setup II (ifo + target blocks)
    dict(run_id="Model1_with_target", label="Model 1 (+target blocks)", family="FamilyA", setup="Setup II"),
    dict(run_id="Model2_with_target", label="Model 2 (+target blocks)", family="FamilyB", setup="Setup II"),

    # Setup III (few features / FI selection)
    dict(run_id="Model1_fi", label="Model 1 (FI top)", family="FamilyA", setup="Setup III"),
    dict(run_id="Model2_fi", label="Model 2 (FI top)", family="FamilyB", setup="Setup III"),
]

# === 2b) EDIT THIS: Baselines (optional) ===
BASELINE_REGISTRY = [
    dict(run_id="Baseline_random_walk", label="Random Walk", family="Baseline", setup="Baseline"),
    dict(run_id="Baseline_ar1", label="AR(1)", family="Baseline", setup="Baseline"),
    dict(run_id="Baseline_expmean", label="Exp. Mean", family="Baseline", setup="Baseline"),
]

# Order in which setups are processed for tables and figures.
SETUP_ORDER = ["Setup I", "Setup II", "Setup III"]

# Significance / tests
ALPHA_SIG = 0.05  # significance level for DM decisions and Holm correction
HAC_LAGS = 3      # `maxlags` of the HAC covariance in the DM regression

# MCS
MCS_ALPHA = 0.10   # size of the MCS test
MCS_BLOCK_LEN = 6  # bootstrap block length passed to the MCS

# Rolling plot
ROLLING_WINDOW_MONTHS = 24


## 3) Loader: aktive Stage‑B Prognosen (`is_active=True`)
Wir verwenden die **aktive Policy‑Sequenz** (wie in deinen Test‑Notebooks).

In [None]:

def _stageb_preds_path(run_id: str) -> Path:
    """Return the path of ``monthly/preds.csv`` inside the Stage-B folder of *run_id*."""
    return STAGEB_DIR.joinpath(run_id, "monthly", "preds.csv")

def load_active_stageb_predictions(run_id: str) -> pd.DataFrame:
    """Load the active Stage-B forecasts of one run.

    Reads the run's ``preds.csv``, keeps the rows whose ``is_active`` equals
    ``True`` and returns them indexed by ``date_t_plus_1`` (ascending) with the
    columns ``y_true`` and ``y_pred``. If a date occurs more than once, the row
    that appears last in the file is kept.

    Raises:
        FileNotFoundError: if the predictions file does not exist.
        ValueError: if the file contains no active rows.
    """
    path = _stageb_preds_path(run_id)
    if not path.exists():
        raise FileNotFoundError(f"Missing: {path}")

    df = pd.read_csv(path, parse_dates=["date_t_plus_1"])
    # Explicit comparison (not truthiness) so missing flags are treated as inactive.
    df = df[df["is_active"] == True].copy()
    if df.empty:
        raise ValueError(f"No is_active==True rows found in: {path}")

    # A stable sort keeps rows with equal dates in file order, so that
    # ``keep="last"`` below deterministically selects the last row written.
    # The default sort algorithm gives no ordering guarantee for ties.
    df = df.sort_values("date_t_plus_1", kind="mergesort").set_index("date_t_plus_1")
    df = df[["y_true", "y_pred"]].copy()
    df = df[~df.index.duplicated(keep="last")]
    return df

def load_many_active_predictions(registry):
    """Load the active predictions of every registry entry, keyed by ``run_id``.

    Entries that fail to load are reported with a ``[WARN]`` line and are left
    out of the returned dict.
    """
    loaded = {}
    for entry in registry:
        run_id = entry["run_id"]
        try:
            frame = load_active_stageb_predictions(run_id)
        except Exception as err:
            print(f"[WARN] {run_id}: {err}")
        else:
            loaded[run_id] = frame
    return loaded

# Shared cache of active predictions (run_id -> DataFrame) used by all later cells.
stageb_active = load_many_active_predictions(MODEL_REGISTRY + BASELINE_REGISTRY)
print(f"Loaded active Stage-B runs: {len(stageb_active)}")


## 4) Metrics & Tests (RMSE/MAE, DM, MCS)

In [None]:

def rmse(y_true, y_pred) -> float:
    """Return the root mean squared error between *y_true* and *y_pred*."""
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    mean_sq = np.mean(residuals ** 2)
    return float(np.sqrt(mean_sq))

def mae(y_true, y_pred) -> float:
    """Return the mean absolute error between *y_true* and *y_pred*."""
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(residuals)))

def losses_se(df_pred: pd.DataFrame) -> pd.Series:
    """Return the per-row squared forecast error ``(y_true - y_pred) ** 2``."""
    forecast_error = df_pred["y_true"] - df_pred["y_pred"]
    return forecast_error ** 2

def dm_test_hac(loss_a: pd.Series, loss_b: pd.Series, hac_lags: int = HAC_LAGS, alternative: str = "two-sided"):
    """Diebold-Mariano test as a HAC-robust regression of the loss differential on a constant.

    The two loss series are aligned on their index and rows with a missing
    value in either series are dropped. The differential is
    ``d_t = loss_a - loss_b``, so a negative mean favours A.

    Args:
        loss_a: Per-period losses of forecast A.
        loss_b: Per-period losses of forecast B.
        hac_lags: ``maxlags`` for the HAC covariance estimator.
        alternative: ``"two-sided"``, ``"less"`` (H1: mean(d) < 0) or
            ``"greater"`` (H1: mean(d) > 0).

    Returns:
        ``(t_statistic, p_value, n_observations)``.

    Raises:
        ValueError: if *alternative* is not one of the three options.
    """
    paired = pd.concat([loss_a.rename("a"), loss_b.rename("b")], axis=1).dropna()
    d = paired["a"] - paired["b"]

    const = np.ones((len(d), 1))
    model = sm.OLS(d.values, const)
    fit = model.fit(cov_type="HAC", cov_kwds={"maxlags": hac_lags})
    t = float(fit.tvalues[0])
    p_two_sided = float(fit.pvalues[0])  # statsmodels reports the two-sided value
    half = p_two_sided / 2

    if alternative == "two-sided":
        p = p_two_sided
    elif alternative == "less":
        # Evidence for "A better than B" only when the statistic is negative.
        p = half if t < 0 else 1 - half
    elif alternative == "greater":
        # Evidence for "A worse than B" only when the statistic is positive.
        p = half if t > 0 else 1 - half
    else:
        raise ValueError("alternative must be 'two-sided', 'less', or 'greater'")

    return t, float(p), int(len(d))

def compute_overall_metrics(df_pred: pd.DataFrame) -> dict:
    """Summarise one prediction frame.

    Returns a dict with ``rmse``, ``mae``, the number of rows ``n`` and the
    first/last index value as ``start``/``end``.
    """
    actual = df_pred["y_true"].to_numpy()
    forecast = df_pred["y_pred"].to_numpy()
    index = df_pred.index
    return dict(
        rmse=rmse(actual, forecast),
        mae=mae(actual, forecast),
        n=int(len(df_pred)),
        start=index.min(),
        end=index.max(),
    )

def align_losses_matrix(run_ids):
    """Build a loss matrix with one squared-error column per run.

    Columns are named after the run ids; only index values for which every
    run has a loss are kept.
    """
    columns = [losses_se(stageb_active[run_id]).rename(run_id) for run_id in run_ids]
    combined = pd.concat(columns, axis=1)
    return combined.dropna()

def compute_mcs_inclusion(loss_mat: pd.DataFrame, alpha: float = MCS_ALPHA, block_size: int = MCS_BLOCK_LEN):
    """Determine which models belong to the Model Confidence Set (MCS).

    Args:
        loss_mat: Per-period losses, one column per model.
        alpha: Size of the MCS test, passed to ``arch`` as ``size``.
        block_size: Block length of the stationary bootstrap.

    Returns:
        ``(included, excluded, proc)``: two sets of column names plus the
        ``arch`` ``MCS`` object. When *loss_mat* has fewer than two columns no
        test is run: all columns are returned as included and ``proc`` is
        ``None``.
    """
    # With a single candidate there is nothing to eliminate, so it is
    # trivially its own confidence set.
    # NOTE(review): arch is expected to reject fewer than two columns — confirm.
    if loss_mat.shape[1] < 2:
        return set(loss_mat.columns), set(), None

    # The resampling scheme belongs in `bootstrap`; `method` selects the test
    # statistic ("R" or "max") and is left at the library default here.
    # NOTE(review): the earlier call passed "stationary" as `method`, which is
    # not a valid statistic name; arch appears to fall back to the "max"
    # statistic in that case, so MCS membership may shift — re-check results.
    proc = MCS(loss_mat, size=alpha, block_size=block_size, bootstrap="stationary")
    proc.compute()
    return set(proc.included), set(proc.excluded), proc

def fmt_p(p):
    """Format a p-value for tables and plot annotations.

    Missing values become an empty string, values below 0.01 are shown as
    ``"<0.01"``, values above 0.99 as ``"1.00"``, everything else with two
    decimals.
    """
    if pd.isna(p):
        return ""
    if p < 0.01:
        return "<0.01"
    return "1.00" if p > 0.99 else f"{p:.2f}"


## 5) Table R1: pro Setup – RMSE, MAE, DM vs best competitor, MCS badge
**DM ist hier pairwise** (jeder gegen den besten im Setup).

In [None]:

def build_table_r1_for_setup(setup: str) -> pd.DataFrame:
    """Build Table R1 for one setup.

    For every loaded model of *setup* the table holds RMSE, MAE and the number
    of observations, a two-sided DM p-value against the model with the lowest
    RMSE (missing for that model itself) and a flag telling whether the model
    is in the Model Confidence Set. Rows are ordered by ascending RMSE.

    Raises:
        ValueError: if no run of *setup* has been loaded.
    """
    entries = [
        entry for entry in MODEL_REGISTRY
        if entry["setup"] == setup and entry["run_id"] in stageb_active
    ]
    if not entries:
        raise ValueError(f"No runs loaded for setup={setup}")

    records = []
    for entry in entries:
        stats = compute_overall_metrics(stageb_active[entry["run_id"]])
        records.append({
            "run_id": entry["run_id"],
            "label": entry["label"],
            "family": entry["family"],
            "setup": entry["setup"],
            "rmse": stats["rmse"],
            "mae": stats["mae"],
            "n": stats["n"],
        })
    table = pd.DataFrame(records).sort_values("rmse")
    best_run = table.iloc[0]["run_id"]

    def _p_vs_best(run_id):
        # The reference model is not tested against itself.
        if run_id == best_run:
            return np.nan
        _, p_value, _ = dm_test_hac(
            losses_se(stageb_active[run_id]),
            losses_se(stageb_active[best_run]),
            alternative="two-sided",
        )
        return p_value

    table["dm_p_vs_best"] = table["run_id"].map(_p_vs_best)

    # MCS over the losses of all models in this setup, aligned on common dates.
    included, _, _ = compute_mcs_inclusion(align_losses_matrix(table["run_id"].tolist()))
    table["mcs_in"] = table["run_id"].map(lambda run_id: run_id in included)

    display_names = {
        "label": "Model",
        "family": "Family",
        "rmse": "RMSE",
        "mae": "MAE",
        "dm_p_vs_best": "DM p (vs best)",
        "mcs_in": "MCS",
        "n": "N",
    }
    return table[list(display_names)].rename(columns=display_names)

# Build Table R1 for every setup; setups that cannot be built (e.g. no runs
# loaded) are reported and skipped so the remaining ones are still produced.
tables_r1 = {}
for s in SETUP_ORDER:
    try:
        tables_r1[s] = build_table_r1_for_setup(s)
        print(f"[OK] Table R1 for {s}: {tables_r1[s].shape}")
    except Exception as e:
        print(f"[SKIP] {s}: {e}")

# Last expression of the cell: shows the dict of tables in the notebook.
tables_r1


In [None]:

# Display + export Table R1
for setup, df in tables_r1.items():
    # One CSV per setup; spaces in the setup name become underscores.
    out_csv = TAB_OUT / f"Table_R1_{setup.replace(' ', '_')}.csv"
    df.to_csv(out_csv, index=False)

    print("\n", "="*80)
    print(f"Table R1 – {setup}")
    # Formatting applies to the displayed table only; the CSV keeps raw values.
    display(df.style.format({"RMSE":"{:.3f}", "MAE":"{:.3f}", "DM p (vs best)": fmt_p}))
    print("Saved:", out_csv)


## 6) Figure R1 (pro Setup): Lollipop – RMSE pro Modell (MCS markiert)

In [None]:

def plot_mcs_lollipop_for_setup(setup: str, filename: str | None = None):
    """Draw the RMSE lollipop chart (Figure R1) for one setup.

    Models are taken from ``tables_r1[setup]``. Models inside the MCS get a
    filled diamond, the others a hollow circle; where a DM p-value exists it
    is written next to the marker. If *filename* is given the figure is saved
    under ``FIG_OUT``.
    """
    table = tables_r1[setup].copy().sort_values("RMSE", ascending=False).reset_index(drop=True)
    n_models = len(table)
    positions = np.arange(n_models)

    left = table["RMSE"].min() - 0.02
    right = table["RMSE"].max() + 0.05

    fig, ax = plt.subplots(figsize=(10, 0.5*n_models + 2), dpi=120)
    ax.hlines(y=positions, xmin=left, xmax=table["RMSE"], alpha=0.35, linewidth=1)

    rows = zip(table["RMSE"], table["DM p (vs best)"], table["MCS"])
    for pos, (rmse_value, p_value, mcs_flag) in enumerate(rows):
        in_mcs = bool(mcs_flag)
        ax.plot(
            rmse_value, pos,
            marker="D" if in_mcs else "o",
            markersize=9,
            markerfacecolor="black" if in_mcs else "white",
            markeredgecolor="black",
            markeredgewidth=1.5,
            linestyle="",
        )
        if pd.notna(p_value):
            ax.text(rmse_value + 0.002, pos, f"p={fmt_p(p_value)}", va="center", fontsize=9)

    ax.set_yticks(positions)
    ax.set_yticklabels(table["Model"], fontsize=11, fontweight="bold")
    ax.set_xlabel("RMSE (active Stage‑B sequence)")
    ax.set_title(f"Figure R1 – Model comparison + MCS ({setup})", pad=12)
    ax.set_xlim(left, right)
    ax.grid(True, axis="x", linestyle=":", alpha=0.6)

    plt.tight_layout()
    if filename:
        target = FIG_OUT / filename
        fig.savefig(target, bbox_inches="tight")
        print("Saved:", target)
    plt.show()

# Render and save Figure R1 for every setup that produced a table.
for s in tables_r1:
    plot_mcs_lollipop_for_setup(s, filename=f"Figure_R1_Lollipop_{s.replace(' ','_')}.png")


## 7) Setup‑Vergleich über Familien: DM‑Plot mit Setup I als Benchmark
Pro Familie wird das beste Modell je Setup (min RMSE) genommen. Dann wird die RMSE‑Änderung vs Setup I (%) geplottet inkl. one‑sided DM‑Signifikanz.

In [None]:

def best_run_per_family_and_setup() -> pd.DataFrame:
    """Pick, for every (family, setup) pair, the loaded model with the lowest RMSE.

    Returns one row per pair with the columns ``family``, ``setup``,
    ``run_id``, ``label``, ``rmse``, ``mae`` and ``n``.

    Raises:
        ValueError: if none of the registered models has been loaded.
    """
    records = []
    for entry in MODEL_REGISTRY:
        run_id = entry["run_id"]
        if run_id in stageb_active:
            stats = compute_overall_metrics(stageb_active[run_id])
            records.append({
                "run_id": run_id,
                "label": entry["label"],
                "family": entry["family"],
                "setup": entry["setup"],
                "rmse": stats["rmse"],
                "mae": stats["mae"],
                "n": stats["n"],
            })

    summary = pd.DataFrame(records)
    if summary.empty:
        raise ValueError("No models loaded.")

    # After sorting by RMSE the first row of each group is its best run.
    ranked = summary.sort_values("rmse")
    return ranked.groupby(["family", "setup"], as_index=False).first()

# Best run per (family, setup); reused by the setup, rolling and baseline plots.
df_best = best_run_per_family_and_setup()
df_best


In [None]:

def plot_setup_gain_vs_setupI(df_best: pd.DataFrame, filename: str | None = None) -> pd.DataFrame:
    """Plot the RMSE change of each family's best run relative to its Setup I run.

    Every row of *df_best* is joined with the Setup I row of the same family;
    families without a Setup I row drop out of the inner merge. For non-Setup-I
    rows a one-sided DM test (H1: the row's run has lower loss than the
    Setup I run) is computed, and filled markers indicate ``p < ALPHA_SIG``.

    Args:
        df_best: One row per (family, setup) with at least the columns
            ``family``, ``setup``, ``run_id`` and ``rmse``.
        filename: If given, the figure is saved under ``FIG_OUT``.

    Returns:
        The merged frame with the added columns ``pct_change_vs_I``,
        ``dm_p_vs_I`` and ``sig_vs_I``.
    """
    # Reference rows: the Setup I run of each family.
    ref = df_best[df_best["setup"] == "Setup I"][["family", "run_id", "rmse"]].rename(
        columns={"run_id":"run_id_ref", "rmse":"rmse_ref"}
    )
    df = df_best.merge(ref, on="family", how="inner")
    # Negative values mean the run improves on Setup I.
    df["pct_change_vs_I"] = (df["rmse"] - df["rmse_ref"]) / df["rmse_ref"] * 100.0

    pvals, sigs = [], []
    for _, row in df.iterrows():
        # Setup I is the benchmark itself, so it is not tested.
        if row["setup"] == "Setup I":
            pvals.append(np.nan)
            sigs.append(False)
            continue
        _, p, _ = dm_test_hac(
            losses_se(stageb_active[row["run_id"]]),
            losses_se(stageb_active[row["run_id_ref"]]),
            alternative="less"  # H1: setup run better than setup I
        )
        pvals.append(p)
        sigs.append(p < ALPHA_SIG)

    df["dm_p_vs_I"] = pvals
    df["sig_vs_I"] = sigs

    # Families are ordered on the y-axis by their first appearance after
    # sorting all non-Setup-I rows by percentage change.
    fam_order = (
        df[df["setup"] != "Setup I"]
          .sort_values("pct_change_vs_I", ascending=True)["family"]
          .unique()
          .tolist()
    )
    y_map = {f:i for i,f in enumerate(fam_order)}

    # Marker shape encodes the setup; any other setup falls back to "o".
    marker_map = {"Setup II":"o", "Setup III":"D"}

    fig, ax = plt.subplots(figsize=(10, 0.6*len(fam_order)+2), dpi=120)
    ax.axvline(0, color="black", linewidth=1.5)
    ax.grid(True, axis="x", linestyle="--", alpha=0.5)

    for _, row in df[df["setup"] != "Setup I"].iterrows():
        yy = y_map[row["family"]]
        x = row["pct_change_vs_I"]
        m = marker_map.get(row["setup"], "o")
        # Filled marker = significant improvement over Setup I.
        face = "black" if row["sig_vs_I"] else "white"
        ax.hlines(yy, 0, x, alpha=0.20, linewidth=2)
        ax.plot(x, yy, marker=m, markersize=9, markerfacecolor=face, markeredgecolor="black", markeredgewidth=1.8, linestyle="")
        # The label is placed on the side of the marker pointing away from zero.
        ax.text(x + (0.8 if x>=0 else -0.8), yy, f"{row['setup']}: p={fmt_p(row['dm_p_vs_I'])}",
                va="center", ha="left" if x>=0 else "right", fontsize=9)

    ax.set_yticks(range(len(fam_order)))
    ax.set_yticklabels(fam_order, fontsize=11, fontweight="bold")
    ax.set_xlabel("RMSE change vs Setup I (%)  (negative = improvement)")
    ax.set_title("Setup comparison per family (best per setup; DM vs Setup I)", pad=12)
    plt.tight_layout()

    if filename:
        out = FIG_OUT / filename
        fig.savefig(out, bbox_inches="tight")
        print("Saved:", out)

    plt.show()
    return df

# Draw and save the setup comparison; keep the underlying numbers for inspection.
df_setup_gain = plot_setup_gain_vs_setupI(df_best, filename="DM_Plot_Setup_Comparison_vs_SetupI.png")
df_setup_gain.head()


## 8) Figure R2 (pro Setup): 24M Rolling RMSE – pro Familie (best run)
(Eher Appendix, aber hier direkt erzeugt.)

In [None]:

def rolling_rmse_from_preds(df_pred: pd.DataFrame, window: int = ROLLING_WINDOW_MONTHS) -> pd.Series:
    """Return the rolling RMSE over *window* rows.

    Values are missing until a full window is available. The series is named
    ``roll_rmse_<window>``.
    """
    squared_errors = losses_se(df_pred)
    rolling_mse = squared_errors.rolling(window=window, min_periods=window).mean()
    rolling_rmse = rolling_mse ** 0.5
    return rolling_rmse.rename(f"roll_rmse_{window}")

def plot_rolling_rmse_by_family_per_setup(df_best: pd.DataFrame, window: int = ROLLING_WINDOW_MONTHS, filename_prefix: str = "Figure_R2_RollingRMSE"):
    """Plot and save one rolling-RMSE figure per setup, with one line per family.

    Setups without rows in *df_best* are skipped. Each figure is written to
    ``FIG_OUT`` as ``<filename_prefix>_<setup>.png`` with spaces in the setup
    name replaced by underscores.
    """
    for setup in SETUP_ORDER:
        subset = df_best[df_best["setup"] == setup].copy()
        if subset.empty:
            continue

        fig, ax = plt.subplots(figsize=(11, 6), dpi=120)
        for run_id, family in zip(subset["run_id"], subset["family"]):
            curve = rolling_rmse_from_preds(stageb_active[run_id], window=window)
            ax.plot(curve.index, curve.values, label=family)

        ax.set_title(f"{window}M rolling RMSE by family (best run) – {setup}")
        ax.set_ylabel("Rolling RMSE")
        ax.set_xlabel("Date")
        ax.legend(ncol=2, frameon=True)
        ax.grid(True, axis="y", linestyle=":", alpha=0.6)
        plt.tight_layout()

        target = FIG_OUT / f"{filename_prefix}_{setup.replace(' ','_')}.png"
        fig.savefig(target, bbox_inches="tight")
        print("Saved:", target)
        plt.show()

# Produce Figure R2 for all setups with the default window and file prefix.
plot_rolling_rmse_by_family_per_setup(df_best)


## 9) Baselines: Table R3 + Relative RMSE Matrix (Appendix)

In [None]:

def build_table_r3_baselines() -> pd.DataFrame:
    """Build Table R3: RMSE, MAE and N for every loaded baseline, best first.

    Returns:
        A frame with the columns ``run_id``, ``Model``, ``RMSE``, ``MAE`` and
        ``N``, sorted by ascending RMSE.

    Raises:
        ValueError: if none of the registered baselines has been loaded.
    """
    rows = []
    for r in BASELINE_REGISTRY:
        rid = r["run_id"]
        if rid not in stageb_active:
            continue
        m = compute_overall_metrics(stageb_active[rid])
        rows.append(dict(run_id=rid, Model=r["label"], RMSE=m["rmse"], MAE=m["mae"], N=m["n"]))

    # Without this guard an empty frame has no "RMSE" column and the sort
    # below would fail with an uninformative KeyError.
    if not rows:
        raise ValueError("No baselines loaded.")
    return pd.DataFrame(rows).sort_values("RMSE")

# Build, export and display the baseline table; on any failure the section is
# skipped and `table_r3` is set to None.
# NOTE(review): BEST_BASELINE_RID / BEST_BASELINE_LABEL stay undefined when
# this block is skipped.
try:
    table_r3 = build_table_r3_baselines()
    out_csv = TAB_OUT / "Table_R3_Baselines.csv"
    table_r3.to_csv(out_csv, index=False)
    display(table_r3.style.format({"RMSE":"{:.3f}", "MAE":"{:.3f}"}))
    print("Saved:", out_csv)

    # The table is sorted by RMSE, so the first row is the best baseline.
    BEST_BASELINE_RID = table_r3.iloc[0]["run_id"]
    BEST_BASELINE_LABEL = table_r3.iloc[0]["Model"]
    print(f"Best baseline: {BEST_BASELINE_LABEL} (run_id={BEST_BASELINE_RID})")
except Exception as e:
    print("[SKIP] Baselines:", e)
    table_r3 = None


In [None]:

def relative_rmse_matrix(models_registry, baselines_registry) -> pd.DataFrame:
    """Tabulate model RMSE relative to each baseline.

    Rows are the loaded baselines, identified by label. The first column holds
    the baseline's own RMSE; every further column is one loaded model and
    contains ``model RMSE / baseline RMSE``.

    Raises:
        ValueError: if no model or no baseline has been loaded.
    """
    loaded_models = [m for m in models_registry if m["run_id"] in stageb_active]
    loaded_bases = [b for b in baselines_registry if b["run_id"] in stageb_active]
    if not (loaded_models and loaded_bases):
        raise ValueError("Need at least 1 model and 1 baseline loaded.")

    rmse_by_model = {}
    for m in loaded_models:
        rmse_by_model[m["label"]] = compute_overall_metrics(stageb_active[m["run_id"]])["rmse"]

    abs_col = "Abs RMSE (Benchmark)"
    matrix = pd.DataFrame(
        index=[b["label"] for b in loaded_bases],
        columns=[abs_col, *rmse_by_model],
        dtype=float,
    )

    for b in loaded_bases:
        base_label = b["label"]
        base_rmse = compute_overall_metrics(stageb_active[b["run_id"]])["rmse"]
        matrix.loc[base_label, abs_col] = base_rmse
        for model_label, model_rmse in rmse_by_model.items():
            matrix.loc[base_label, model_label] = model_rmse / base_rmse

    return matrix

# Build, export and display the relative RMSE matrix; skipped with a message
# when it cannot be built (e.g. no baseline loaded).
try:
    rel_mat = relative_rmse_matrix(MODEL_REGISTRY, BASELINE_REGISTRY)
    out_csv = TAB_OUT / "Appendix_Relative_RMSE_Matrix.csv"
    # The index (baseline labels) is written as the first CSV column.
    rel_mat.to_csv(out_csv)
    display(rel_mat.style.format("{:.3f}"))
    print("Saved:", out_csv)
except Exception as e:
    print("[SKIP] Relative RMSE matrix:", e)


## 10) Baselines: one‑sided DM je Benchmark + Holm‑Bonferroni (FWER)
Pro Familie wird die **beste Kombination über alle Setups** gewählt (min RMSE).

In [None]:

def best_run_per_family_overall(df_best: pd.DataFrame) -> pd.DataFrame:
    """Reduce *df_best* to the lowest-RMSE row of each family.

    Adds a ``Model_Label`` column of the form ``"<family> (S<setup>)"``, where
    the ``"Setup "`` prefix of the setup name is shortened to ``"S"``.
    """
    def _short_label(row):
        setup_tag = row["setup"].replace("Setup ", "S")
        return f"{row['family']} ({setup_tag})"

    ranked = df_best.sort_values("rmse")
    winners = ranked.groupby("family", as_index=False).first()
    winners["Model_Label"] = winners.apply(_short_label, axis=1)
    return winners

def plot_dm_vs_each_baseline_with_holm(df_best: pd.DataFrame, filename: str | None = None):
    """Plot one panel per loaded baseline comparing each family's best run against it.

    For every baseline, each family's overall best run is tested with a
    one-sided DM test (H1: the model has lower loss than the baseline). The
    p-values within a panel are Holm-adjusted across families. A filled
    diamond marks models that are significant after adjustment and have a
    lower RMSE than the baseline.

    Args:
        df_best: Best run per (family, setup), as used by
            ``best_run_per_family_overall``.
        filename: If given, the figure is saved under ``FIG_OUT``.

    Raises:
        ValueError: if no baseline has been loaded.
    """
    base_rows = [r for r in BASELINE_REGISTRY if r["run_id"] in stageb_active]
    if not base_rows:
        raise ValueError("No baselines loaded.")

    fam_best = best_run_per_family_overall(df_best)
    panels = len(base_rows)

    fig, axes = plt.subplots(1, panels, figsize=(6*panels, 6), dpi=120)
    # With a single panel matplotlib returns one Axes, not an array.
    if panels == 1:
        axes = [axes]

    for ax, b in zip(axes, base_rows):
        b_rid = b["run_id"]
        b_label = b["label"]
        b_rmse = compute_overall_metrics(stageb_active[b_rid])["rmse"]

        p_raw = []
        rows = []
        for _, r in fam_best.iterrows():
            rid = r["run_id"]
            # H1: model better => loss(model) < loss(baseline)
            _, p, _ = dm_test_hac(losses_se(stageb_active[rid]), losses_se(stageb_active[b_rid]), alternative="less")
            p_raw.append(p)
            rows.append(dict(Model_Label=r["Model_Label"], RMSE=r["rmse"], P_raw=p))

        # Holm correction is applied per baseline, across the families.
        reject, p_adj, _, _ = multipletests(p_raw, alpha=ALPHA_SIG, method="holm")
        dfp = pd.DataFrame(rows)
        dfp["P_adj"] = p_adj
        dfp["Significant"] = reject
        # After the reset the row index doubles as the y position.
        dfp = dfp.sort_values("RMSE", ascending=False).reset_index(drop=True)

        y = np.arange(len(dfp))
        # Vertical line = RMSE of the baseline; stems connect each model to it.
        ax.axvline(b_rmse, color="black", linewidth=2, alpha=0.8)
        ax.hlines(y=y, xmin=dfp["RMSE"], xmax=b_rmse, alpha=0.25, linewidth=1)

        for i, row in dfp.iterrows():
            better = row["RMSE"] < b_rmse
            # Highlight only when the test rejects AND the RMSE is actually lower.
            sig = bool(row["Significant"]) and better
            marker = "D" if sig else "o"
            face = "black" if sig else "white"
            ax.plot(row["RMSE"], i, marker=marker, markersize=9,
                    markerfacecolor=face, markeredgecolor="black", markeredgewidth=1.5, linestyle="")
            ax.text(row["RMSE"] - 0.002, i, f"p_adj={fmt_p(row['P_adj'])}", va="center", ha="right", fontsize=9)

        ax.set_yticks(y)
        ax.set_yticklabels(dfp["Model_Label"], fontsize=10, fontweight="bold")
        ax.set_title(f"vs. {b_label}")
        ax.set_xlabel("RMSE (lower is better)")
        ax.grid(True, axis="x", linestyle=":", alpha=0.6)

    plt.suptitle("One-sided pairwise DM tests vs baselines (Holm-Bonferroni across families)", y=1.02)
    plt.tight_layout()

    if filename:
        out = FIG_OUT / filename
        fig.savefig(out, bbox_inches="tight")
        print("Saved:", out)

    plt.show()

# Not wrapped in try/except: this raises ValueError if no baseline is loaded.
plot_dm_vs_each_baseline_with_holm(df_best, filename="Baselines_DM_Holm_Panels.png")


## 11) Online Policy Timeline (Appendix)
Zeigt die aktive Konfiguration (Spalte `active_idx`, im Plot als „Active config_id“ beschriftet) über die Zeit + Switch‑Marker + `wrmse_window`.

In [None]:

def load_stageb_scores(run_id: str) -> pd.DataFrame:
    """Read ``monthly/scores.csv`` of a Stage-B run.

    Raises:
        FileNotFoundError: if the file does not exist (the path is the message).
    """
    scores_file = STAGEB_DIR / run_id / "monthly" / "scores.csv"
    if scores_file.exists():
        return pd.read_csv(scores_file)
    raise FileNotFoundError(scores_file)

def plot_incumbent_timeline(run_id: str, title: str | None = None, filename: str | None = None):
    """Plot the active configuration over time for one run, with switch markers and WRMSE.

    Reads ``scores.csv`` (columns used: ``t``, ``config_id``, ``active_idx``,
    ``switched``, ``wrmse_window``) and ``preds.csv`` (columns used: ``t``,
    ``date_t_plus_1``) of *run_id*.

    Args:
        run_id: Stage-B run folder name.
        title: Figure title; defaults to ``"Incumbent timeline – <run_id>"``.
        filename: If given, the figure is saved under ``FIG_OUT``.

    Raises:
        FileNotFoundError: if ``scores.csv`` is missing (from ``load_stageb_scores``).
    """
    df = load_stageb_scores(run_id).copy()
    # One row per period t: the first row after sorting by (t, config_id).
    # NOTE(review): assumes the per-t values plotted below are the same for
    # all config rows of a period — confirm against the scores schema.
    df = df.sort_values(["t", "config_id"]).groupby("t", as_index=False).first()

    # map t -> date from preds file (for nicer axis)
    dfp = pd.read_csv(_stageb_preds_path(run_id), parse_dates=["date_t_plus_1"]).sort_values("date_t_plus_1")
    t_to_date = dfp.drop_duplicates("t").set_index("t")["date_t_plus_1"]
    # Periods without a matching t in preds.csv get a missing date.
    df["date"] = df["t"].map(t_to_date)

    fig, ax1 = plt.subplots(figsize=(11, 4), dpi=120)
    # NOTE(review): the y-axis is labelled "config_id" but the plotted column is `active_idx`.
    ax1.plot(df["date"], df["active_idx"], drawstyle="steps-mid", linewidth=2)
    ax1.set_ylabel("Active config_id")
    ax1.set_xlabel("Date")
    ax1.grid(True, axis="y", linestyle=":", alpha=0.6)

    # Mark the periods flagged as a switch.
    switches = df[df["switched"] == True]
    ax1.scatter(switches["date"], switches["active_idx"], s=60, marker="D")

    # Second y-axis for the WRMSE of the window.
    ax2 = ax1.twinx()
    ax2.plot(df["date"], df["wrmse_window"], linestyle="--", linewidth=1.5, alpha=0.8)
    ax2.set_ylabel("WRMSE window (active)")

    ax1.set_title(title or f"Incumbent timeline – {run_id}")
    plt.tight_layout()

    if filename:
        out = FIG_OUT / filename
        fig.savefig(out, bbox_inches="tight")
        print("Saved:", out)

    plt.show()

# Timelines are drawn only for the first two registry entries that were loaded.
SELECTED_RUN_IDS = [r["run_id"] for r in MODEL_REGISTRY[:2] if r["run_id"] in stageb_active]
for rid in SELECTED_RUN_IDS:
    # scores.csv is optional per run, so a failure skips only that run.
    try:
        plot_incumbent_timeline(rid, filename=f"Appendix_IncumbentTimeline_{rid}.png")
    except Exception as e:
        print(f"[SKIP] timeline for {rid}: {e}")


## 12) LGBM‑Spezialfall (Setup I): „alle ifo“ vs „nicht alle ifo“ (nur Text/DM)
Trage unten zwei Run‑IDs ein und der one‑sided DM‑Test wird ausgegeben.

In [None]:

# EDIT THIS (optional)
# Run ids compared below; they must already be keys of `stageb_active`,
# otherwise the report is skipped.
LGBM_SETUP1_ALL_IFO_RUN = "LGBM_all_ifo_setup1"
LGBM_SETUP1_SUBSET_IFO_RUN = "LGBM_subset_ifo_setup1"

def report_dm_two_runs(run_a: str, run_b: str, label_a: str, label_b: str, one_sided_a_better: bool = True):
    """Print a DM test of run A against run B together with both RMSEs.

    Args:
        run_a: Run id of forecast A; must be a key of ``stageb_active``.
        run_b: Run id of forecast B; must be a key of ``stageb_active``.
        label_a: Display name of A in the printed report.
        label_b: Display name of B in the printed report.
        one_sided_a_better: If true, test H1 "A has lower loss than B"
            (``alternative="less"``); otherwise run the two-sided test.

    Prints a ``[SKIP]`` line and returns when either run is not loaded.
    """
    if not (run_a in stageb_active and run_b in stageb_active):
        print("[SKIP] One of the runs is not loaded.")
        return

    preds_a = stageb_active[run_a]
    preds_b = stageb_active[run_b]

    if one_sided_a_better:
        alt = "less"
    else:
        alt = "two-sided"
    # Loss differential is A minus B, so "less" means A is the better forecast.
    t, p, n = dm_test_hac(losses_se(preds_a), losses_se(preds_b), alternative=alt)

    m_a = compute_overall_metrics(preds_a)
    m_b = compute_overall_metrics(preds_b)

    print(f"DM test: {label_a} vs {label_b}  (alt={alt}, HAC={HAC_LAGS}, n={n})")
    print(f"  RMSE({label_a})={m_a['rmse']:.3f} | RMSE({label_b})={m_b['rmse']:.3f}")
    print(f"  t={t:.3f}, p={p:.4f}")

# One-sided test: is the "all ifo" run better than the "subset ifo" run?
report_dm_two_runs(
    LGBM_SETUP1_ALL_IFO_RUN,
    LGBM_SETUP1_SUBSET_IFO_RUN,
    label_a="LGBM (all ifo, S1)",
    label_b="LGBM (subset ifo, S1)",
    one_sided_a_better=True
)


## EN: Screening 7000 vs 700 (Setup I) – one-sided DM (is 7000 better?)


In [1]:

# NOTE: this cell depends on names defined in earlier cells (`stageb_active`,
# `load_active_stageb_predictions`, `report_dm_two_runs`). Run on a fresh
# kernel it fails with "NameError: name 'stageb_active' is not defined".
EN_700_RUN = "elastic_net_with_target_700"
EN_7000_RUN = "elastic_net_with_targetfeatures_7000"

# Ensure both are loaded (reload only if missing)
for rid in [EN_700_RUN, EN_7000_RUN]:
    if rid not in stageb_active:
        try:
            stageb_active[rid] = load_active_stageb_predictions(rid)
            print(f"[OK] loaded: {rid}")
        except Exception as e:
            print(f"[FAIL] {rid}: {e}")

# H1: 7000 better than 700  => mean(loss_7000 - loss_700) < 0
# If a run failed to load above, the report prints a [SKIP] line instead.
report_dm_two_runs(
    run_a=EN_7000_RUN,
    run_b=EN_700_RUN,
    label_a="Elastic Net (7000 features)",
    label_b="Elastic Net (700 features)",
    one_sided_a_better=True
)


NameError: name 'stageb_active' is not defined