In [None]:
# Step0 Setup
from pathlib import Path
import sys, json, yaml, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reproducibility: seed NumPy's legacy global RNG from one named constant so the
# value is easy to find, change and report.
SEED = 42
np.random.seed(SEED)
# NOTE(review): this silences every warning category for the whole session,
# including pandas/NumPy deprecation notices — narrow the filter if those matter.
warnings.filterwarnings("ignore")

# Locate the project root: the nearest directory holding both 'config' and 'src'
def find_project_root(start: Path) -> Path:
    """Return ``start`` or its closest ancestor that contains 'config' and 'src'.

    The path is resolved first; candidates are then checked from the innermost
    directory outward, so the nearest match wins.

    Raises:
        RuntimeError: if no candidate contains both entries.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    root = next(
        (
            candidate
            for candidate in candidates
            if all((candidate / marker).exists() for marker in ("config", "src"))
        ),
        None,
    )
    if root is None:
        raise RuntimeError("Project root with 'config' and 'src' not found.")
    return root
ROOT = find_project_root(Path.cwd())

# Artifact directories, created up front so later cells can write without checks.
# 'reports/99_master' is listed because the master-table CSV exports further down
# write into it; pandas' to_csv does not create missing parent directories.
ARTIFACT_DIRS = [
    ROOT/"artifacts/data",
    ROOT/"artifacts/models",
    ROOT/"artifacts/forecasts",
    ROOT/"artifacts/metrics",
    ROOT/"artifacts/reports",
    ROOT/"artifacts/reports/99_master",
    ROOT/"artifacts/tmp"
]
for artifact_dir in ARTIFACT_DIRS:
    artifact_dir.mkdir(parents=True, exist_ok=True)

# Utility to load YAML config
def _load_config(cfg_path: Path) -> dict:
    """Parse the YAML file at ``cfg_path`` and return its content.

    Args:
        cfg_path: Location of the YAML file.

    Returns:
        The parsed document, or an empty dict when the document is empty/falsy.

    Raises:
        FileNotFoundError: if ``cfg_path`` does not exist.
    """
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config not found: {cfg_path}")
    # Decode explicitly as UTF-8: Path.read_text() otherwise falls back to the
    # locale's preferred encoding, which can garble non-ASCII YAML.
    return yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}

# Read the data and training configuration files (used for run metadata)
data_cfg = _load_config(ROOT.joinpath("config", "data_config.yaml"))
train_cfg = _load_config(ROOT.joinpath("config", "train_config.yaml"))


In [None]:
# Step1 Load & normalize artifacts
# Load feature groups (TECH, MACRO etc). An empty YAML document parses to None,
# so fall back to an empty mapping to keep later `.get(...)` lookups safe.
feat_groups = yaml.safe_load(
    (ROOT/"artifacts/config"/"feature_groups.yaml").read_text(encoding="utf-8")
) or {}

# Load and consolidate metrics JSONs
metrics_list = []
metrics_dir = ROOT/"artifacts"/"metrics"
# Sorted so that row order (and every ranking/tie-break derived from it) does
# not depend on filesystem enumeration order.
for metrics_path in sorted(metrics_dir.glob("*.json")):
    # The context manager guarantees the file handle is closed.
    with metrics_path.open(encoding="utf-8") as fh:
        payload = json.load(fh)
    # If nested, get test metrics (assumption)
    record = dict(payload["test"] if "test" in payload else payload)
    # Add model identifier (the file name without extension)
    record["model_name"] = metrics_path.stem
    metrics_list.append(record)
metrics_df = pd.json_normalize(metrics_list)
# Ensure relevant columns
for col in ["accuracy","f1","auc","brier","sharpe","sortino","rachev"]:
    if col not in metrics_df.columns:
        metrics_df[col] = np.nan
# Without any metrics file the frame has no 'model_name' column at all; add an
# empty one so that later column lookups do not raise KeyError.
if "model_name" not in metrics_df.columns:
    metrics_df["model_name"] = pd.Series(dtype="object")

# Map a model name onto its model family via keyword matching
def assign_class(name: str) -> str:
    """Classify ``name`` into a model family by case-insensitive substring search.

    Rules are tried top to bottom and the first family with a matching keyword
    wins; names matching no keyword are labelled "Other".
    """
    # NOTE(review): keywords are plain substrings, so short ones such as "rf"
    # also hit unrelated names (e.g. "overfit") — confirm against the naming scheme.
    keyword_rules = (
        ("Baseline", ("naive", "persistence", "baseline", "randomwalk")),
        ("Linear", ("linear", "ridge", "ols")),
        ("ARIMA", ("arima",)),
        ("RandomForest", ("forest", "random", "rf")),
        ("LSTM", ("lstm",)),
        ("Ensemble", ("ensemble",)),
    )
    lowered = name.lower()
    for family, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return family
    return "Other"
metrics_df["model_class"] = metrics_df["model_name"].apply(assign_class)

# Load forecast CSVs (predictions); sorted for a deterministic row order.
forecasts = []
for forecast_path in sorted((ROOT/"artifacts"/"forecasts").glob("*.csv")):
    forecast = pd.read_csv(forecast_path, parse_dates=True, index_col=0)
    forecast["model_name"] = forecast_path.stem
    forecasts.append(forecast)
forecasts_df = pd.concat(forecasts, ignore_index=False) if forecasts else pd.DataFrame()

# Load feature importances / coefficients; sorted for a deterministic row order.
fi_list = []
fi_dir = ROOT/"artifacts"/"feature_importance"
for fi_path in sorted(fi_dir.glob("*.csv")):
    fi_frame = pd.read_csv(fi_path)
    fi_frame["model_name"] = fi_path.stem
    fi_list.append(fi_frame)
fi_df = pd.concat(fi_list, ignore_index=True) if fi_list else pd.DataFrame()
# Normalize importances per model so each model's shares sum to 1.
# The files may hold signed coefficients; dividing signed values by their signed
# sum can explode (sum near zero) or flip signs, so shares are computed on
# magnitudes. For non-negative importances this equals x / x.sum().
if not fi_df.empty and "importance" in fi_df.columns:
    magnitude = fi_df["importance"].abs()
    model_totals = magnitude.groupby(fi_df["model_name"]).transform("sum")
    # A model whose importances are all zero gets NaN shares instead of inf.
    fi_df["importance"] = magnitude / model_totals.where(model_totals > 0)
# Assume fi_df columns: feature, importance, model_name


In [None]:
# Step2 F2 – Best model class
HIGHER_IS_BETTER = ["accuracy","f1","auc","sharpe","sortino","rachev"]
ALL_METRICS = ["accuracy","f1","auc","brier","sharpe","sortino","rachev"]
N_BOOTSTRAP = 1000          # resamples per (class, metric) for the mean CI
CI_PERCENTILES = [2.5, 97.5]  # bounds of the 95% percentile interval

# Ranking by metrics (higher is better except Brier where lower is better)
metrics_sorted = {
    metric: metrics_df.sort_values(metric, ascending=False)["model_name"].tolist()
    for metric in HIGHER_IS_BETTER
}
metrics_sorted["brier"] = metrics_df.sort_values("brier", ascending=True)["model_name"].tolist()

# Class aggregation: compute mean, median and bootstrap CIs for each metric.
# Re-seeding here makes the cell yield identical CIs however often it is re-run.
agg_stats = []
np.random.seed(42)
for cls, group in metrics_df.groupby("model_class"):
    row = {"model_class": cls}
    for col in ALL_METRICS:
        vals = group[col].dropna().values
        if len(vals) == 0:
            # No model of this class reports the metric.
            row.update({f"{col}_mean": np.nan, f"{col}_med": np.nan,
                        f"{col}_ci_low": np.nan, f"{col}_ci_high": np.nan})
            continue
        # Percentile bootstrap of the mean. With a single model per class the
        # interval collapses to that one value.
        boots = [
            np.random.choice(vals, size=len(vals), replace=True).mean()
            for _ in range(N_BOOTSTRAP)
        ]
        ci_low, ci_high = np.percentile(boots, CI_PERCENTILES)
        row[f"{col}_mean"] = vals.mean()
        row[f"{col}_med"] = np.median(vals)
        row[f"{col}_ci_low"] = ci_low
        row[f"{col}_ci_high"] = ci_high
    agg_stats.append(row)
by_class_df = pd.DataFrame(agg_stats)

# Save by_model and by_class master tables. to_csv does not create missing
# parent directories, so make sure the target folder exists first.
master_dir = ROOT/"artifacts"/"reports"/"99_master"
master_dir.mkdir(parents=True, exist_ok=True)
metrics_export = metrics_df[["model_name","model_class"] + ALL_METRICS].copy()
metrics_export.to_csv(master_dir/"by_model.csv", index=False)
by_class_df.to_csv(master_dir/"by_class.csv", index=False)


In [None]:
# Step3 F1 – Integrated vs. isolated
# Derive the feature set from the model name (names are assumed to contain 'Tech' and/or 'Macro')
def feature_set(name: str) -> str:
    """Label ``name`` as "Integrated", "Tech" or "Macro" via case-insensitive substring checks.

    Names mentioning both groups — or neither — are reported as "Integrated".
    """
    lowered = name.lower()
    has_tech = "tech" in lowered
    has_macro = "macro" in lowered
    if has_tech != has_macro:
        # exactly one of the two keywords is present
        return "Tech" if has_tech else "Macro"
    # both or neither keyword present → integrated (the default when unclear)
    return "Integrated"

metrics_df["featureset"] = metrics_df["model_name"].apply(feature_set)

# Compare integrated vs Tech/Macro for each class
# (label used in the output column name, source column in metrics_df)
compare_metrics = (("F1", "f1"), ("AUC", "auc"), ("Sharpe", "sharpe"))
f1_diff_rows = []
for cls in metrics_df["model_class"].unique():
    df_cls = metrics_df[metrics_df["model_class"] == cls]
    # Mean per feature set (several models may share a setting). reindex adds an
    # all-NaN row for any setting the class lacks, so the deltas below come out
    # as NaN without explicit emptiness checks — the mean of an empty selection
    # is a NaN-filled Series, never an "empty" one.
    set_means = (
        df_cls.groupby("featureset")[[col for _, col in compare_metrics]]
        .mean()
        .reindex(["Integrated", "Tech", "Macro"])
    )
    df_res = {"model_class": cls}
    for rival in ("Tech", "Macro"):
        for label, col in compare_metrics:
            df_res[f"Δ{label}_vs_{rival}"] = float(
                set_means.at["Integrated", col] - set_means.at[rival, col]
            )
    # Placeholder for Diebold-Mariano and McNemar p-values
    # Actual calculation would require model predictions/losses
    df_res.update({
        "DM_pval_Tech": np.nan,
        "DM_pval_Macro": np.nan,
        "McNemar_pval_Tech": np.nan,
        "McNemar_pval_Macro": np.nan
    })
    f1_diff_rows.append(df_res)
f1_diff_df = pd.DataFrame(f1_diff_rows)
# to_csv does not create missing parent directories.
f1_master_dir = ROOT/"artifacts"/"reports"/"99_master"
f1_master_dir.mkdir(parents=True, exist_ok=True)
f1_diff_df.to_csv(f1_master_dir/"f1_integrated_vs_isolated.csv", index=False)


In [None]:
# Step4 F3 – Feature relevance
from collections import Counter

TOP_N_FEATURES = 10        # size of each class's top-feature list
CONSENSUS_MIN_CLASSES = 3  # a feature is "consensus" if it is in this many classes' top lists

# Defaults, so that the plotting and summary cells further down still find
# these names when no feature-importance files were loaded.
macro_vs_tech = np.nan
agg_imp = pd.DataFrame(columns=["feature"])
top_feats = {}
consensus = []

if not fi_df.empty:
    # Merge model class into feature importances
    # NOTE(review): this relies on importance file stems matching metrics file
    # stems exactly; unmatched models get a NaN class and drop out of the groupby.
    class_map = metrics_df.set_index("model_name")["model_class"].to_dict()
    fi_df["model_class"] = fi_df["model_name"].map(class_map)
    fi_df["featureset"] = fi_df["model_name"].apply(feature_set)
    # Calculate total tech vs macro importance in integrated models
    # (`or` guards cover an empty YAML file and keys present with a null value)
    groups = feat_groups or {}
    tech_feats = set(groups.get("TECH") or [])
    macro_feats = set(groups.get("MACRO") or [])
    fi_int = fi_df[fi_df["featureset"]=="Integrated"]
    tech_importance = fi_int.loc[fi_int["feature"].isin(tech_feats), "importance"].sum()
    macro_importance = fi_int.loc[fi_int["feature"].isin(macro_feats), "importance"].sum()
    macro_vs_tech = (macro_importance / tech_importance) if tech_importance>0 else np.nan

    # Aggregate importance by feature and model class (all feature sets)
    agg_imp = fi_df.groupby(["feature","model_class"])["importance"].sum().unstack(fill_value=0)
    agg_imp = agg_imp.reset_index().rename_axis(None, axis=1)
    # Determine top features per class (Integrated models only)
    for cls in fi_df["model_class"].dropna().unique():
        df_cls = fi_df[(fi_df["model_class"]==cls)&(fi_df["featureset"]=="Integrated")]
        top_feats[cls] = list(
            df_cls.groupby("feature")["importance"].sum().nlargest(TOP_N_FEATURES).index
        )
    # Consensus top features (those appearing in enough classes' top lists);
    # Counter keeps first-appearance order, which fixes the order of `consensus`.
    all_top = [feat for feats in top_feats.values() for feat in feats]
    consensus = [feat for feat, n_classes in Counter(all_top).items()
                 if n_classes >= CONSENSUS_MIN_CLASSES]
    # Export aggregated importances (to_csv does not create missing directories)
    f3_master_dir = ROOT/"artifacts"/"reports"/"99_master"
    f3_master_dir.mkdir(parents=True, exist_ok=True)
    agg_imp.to_csv(f3_master_dir/"f3_aggregated_importance.csv", index=False)


In [None]:
# Step5 Visualizations
# 5.1 Heatmap: Model classes vs Metrics (mean of each class)
heat_cols = [c+"_mean" for c in ["accuracy","f1","auc","brier","sharpe","sortino","rachev"]]
# An empty by_class_df has no 'model_class' column, so guard before indexing.
heat_data = (
    by_class_df.set_index("model_class")[heat_cols]
    if not by_class_df.empty
    else pd.DataFrame(columns=heat_cols)
)
if heat_data.empty:
    print("5.1 heatmap skipped: no class-level metrics available.")
else:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.heatmap(heat_data, annot=True, fmt=".3f", cmap="coolwarm", ax=ax)
    # Only the class means are drawn, so the title must not promise CIs.
    ax.set_title("Model Class Performance Metrics (Mean)")
    # 'tight' keeps the rotated metric labels from being clipped.
    fig.savefig(ROOT/"artifacts"/"reports"/"99_metrics_heatmap.png", bbox_inches="tight")
    plt.close(fig)

# 5.2 Cumulative returns: placeholder logic
# (Assumes forecasts_df has columns 'model_name', 'y_ret_next', 'y_direction_next')
# NOTE(review): if 'y_direction_next' is the realised label rather than a model
# prediction, this "strategy" has perfect foresight — confirm the forecast schema.
# NOTE(review): (1 + y_ret_next) presumes returns are fractions, not percent — confirm.
if not forecasts_df.empty:
    required_cols = {"model_name", "y_ret_next", "y_direction_next"}
    missing_cols = required_cols - set(forecasts_df.columns)
    if missing_cols:
        raise KeyError(f"forecasts_df is missing required columns: {sorted(missing_cols)}")
    # Example: cumulative returns of best single model vs best ensemble vs buy&hold
    # Identify best model by Sharpe; idxmax is undefined when every Sharpe is NaN.
    sharpe = metrics_df["sharpe"]
    best_model = metrics_df.loc[sharpe.idxmax(), "model_name"] if sharpe.notna().any() else None
    # Compute strategy returns (assuming y_direction next as strategy signal):
    # a 0/1 direction is mapped to a -1/+1 position.
    forecasts_df["strategy_return"] = forecasts_df["y_ret_next"] * (2*forecasts_df["y_direction_next"]-1)
    candidates = ([best_model] if best_model is not None else []) + ["Ensemble_Best", "BuyHold"]
    available_models = set(forecasts_df["model_name"])
    cum_returns = {}
    for model in candidates:
        if model in available_models:
            dfm = forecasts_df[forecasts_df["model_name"]==model]
            cum_returns[model] = (1+dfm["strategy_return"]).cumprod()
    # Plot cumulative returns
    fig, ax = plt.subplots()
    for model, series in cum_returns.items():
        ax.plot(series.index, series.values, label=model)
    if cum_returns:
        # legend() on an axes without labelled artists only emits a warning
        ax.legend()
    ax.set_title("Cumulative Returns")
    fig.savefig(ROOT/"artifacts"/"reports"/"99_cum_returns.png")
    plt.close(fig)

# 5.3 Bar plot: Integrated advantage per class for ΔF1, ΔAUC, ΔSharpe
delta_cols = ["ΔF1_vs_Tech","ΔAUC_vs_Tech","ΔSharpe_vs_Tech"]
bar_width = 0.2
if f1_diff_df.empty:
    # An empty comparison table has none of the Δ columns; plotting would raise KeyError.
    print("5.3 bar plot skipped: no integrated-vs-isolated comparison rows.")
else:
    fig, ax = plt.subplots(figsize=(6,4))
    positions = np.arange(len(f1_diff_df))
    for offset, delta_col in enumerate(delta_cols):
        ax.bar(positions + offset*bar_width, f1_diff_df[delta_col], width=bar_width, label=delta_col)
    # Centre each tick under the middle bar of its three-bar group.
    ax.set_xticks(positions + bar_width)
    ax.set_xticklabels(f1_diff_df["model_class"])
    ax.set_ylabel("Integrated - Tech")
    ax.legend()
    ax.set_title("Integrated vs Tech Advantage per Class")
    fig.savefig(ROOT/"artifacts"/"reports"/"99_integrated_advantage.png")
    plt.close(fig)

# 5.4 Top features (Integrated) aggregated by class
# Using consensus features importance
# NOTE(review): `agg_imp` sums importances over all feature sets, while
# `consensus` is derived from Integrated models only — confirm the title fits.
if fi_df.empty:
    # Without importance files there is nothing to aggregate or plot.
    print("5.4 top-features plot skipped: no feature-importance files loaded.")
else:
    mean_imp = agg_imp.set_index("feature")
    mean_imp = mean_imp.loc[consensus] if consensus else mean_imp
    mean_imp = mean_imp.div(mean_imp.sum(axis=0), axis=1)  # relative importance
    if mean_imp.empty:
        print("5.4 top-features plot skipped: no class-level importances to plot.")
    else:
        # Pass the axes explicitly: DataFrame.plot() without `ax` opens its own
        # figure, which would leave a pre-created figure empty and unclosed.
        fig, ax = plt.subplots(figsize=(6,4))
        mean_imp.plot(kind="bar", ax=ax)
        ax.set_ylabel("Importance")
        ax.set_title("Top Features (Integrated) by Class")
        fig.savefig(ROOT/"artifacts"/"reports"/"99_top_features.png")
        plt.close(fig)


In [None]:
# Step6 Repro-Block
# `or {}` also covers a 'dataset' key that is present but null in the YAML.
dataset_cfg = data_cfg.get("dataset") or {}
print("Date range:", dataset_cfg.get("start_date"), "→", dataset_cfg.get("end_date"))
# Report the value that is passed to np.random.seed(...) in this notebook.
# Reading it back from np.random.get_state() is unreliable: the first element
# of the Mersenne-Twister key matches the seed only directly after seeding and
# is overwritten once the generator has produced enough draws.
REPORTED_SEED = 42  # keep in sync with the np.random.seed(...) calls above
print("Seed:", REPORTED_SEED)
print(f"Library versions: pandas {pd.__version__}, numpy {np.__version__}")
if "train_years" in train_cfg:
    print("Training window (years):", train_cfg["train_years"])
print("Validation step (months):", train_cfg.get("step_months"))
print("Embargo period (months):", train_cfg.get("embargo_months"))
# List used artifact files (sorted within each group for a stable listing)
used_files = sorted((ROOT/"artifacts"/"metrics").glob("*.json")) \
           + sorted((ROOT/"artifacts"/"forecasts").glob("*.csv")) \
           + sorted((ROOT/"artifacts"/"feature_importance").glob("*.csv"))
print("Loaded artifact files:")
for used_file in used_files:
    print(used_file.relative_to(ROOT))


In [None]:
# Step7 Overview 99 & exports
# Determine best model class (highest mean AUC); undefined when no class has an AUC.
if "auc_mean" in by_class_df.columns and by_class_df["auc_mean"].notna().any():
    best_class = by_class_df.loc[by_class_df["auc_mean"].idxmax(), "model_class"]
else:
    best_class = None
# Column names are spelled out because they are mixed-case: deriving them with
# metric.upper() yields "ΔSHARPE_vs_Tech", but the column is "ΔSharpe_vs_Tech".
gain_columns = {"f1": "ΔF1_vs_Tech", "auc": "ΔAUC_vs_Tech", "sharpe": "ΔSharpe_vs_Tech"}
integrated_gains = {
    metric: (f1_diff_df[col].mean() if col in f1_diff_df.columns else np.nan)
    for metric, col in gain_columns.items()
}
# The feature-relevance results only exist when importance files were loaded.
top_feats_integrated = list(consensus) if not fi_df.empty else []
macro_vs_tech_contrib = macro_vs_tech if not fi_df.empty else np.nan

summary = pd.DataFrame([{
    "best_model_class": best_class,
    "integrated_gain_f1": integrated_gains.get("f1"),
    "integrated_gain_auc": integrated_gains.get("auc"),
    "integrated_gain_sharpe": integrated_gains.get("sharpe"),
    "top_features_integrated": ";".join(map(str, top_feats_integrated)) if top_feats_integrated else "",
    "macro_vs_tech_contrib": macro_vs_tech_contrib
}])
summary.to_csv(ROOT/"artifacts"/"reports"/"99_summary_overview.csv", index=False)
# Save summary as image (table)
fig, ax = plt.subplots(figsize=(6,1))
ax.axis("off")
ax.table(cellText=summary.values, colLabels=summary.columns, loc="center")
fig.savefig(ROOT/"artifacts"/"reports"/"99_summary_overview.png")
plt.close(fig)

# (Optional) Generate a minimal HTML report. The charset is declared and the
# file is written as UTF-8 so non-ASCII feature names survive on any platform.
html_content = f"""<html><head><meta charset="utf-8"></head><body>
<h2>Model Evaluation Summary</h2>
{summary.to_html(index=False)}
</body></html>"""
with open(ROOT/"artifacts"/"reports"/"99_summary.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_content)
