In [1]:
# Step0: Setup (robuste Pfade, Imports, Logging, Feature-Set-YAML)
from pathlib import Path
import pandas as pd
import numpy as np
import yaml, pickle, logging, warnings
from typing import Dict, List, Tuple
from scipy.stats import norm

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.stats.contingency_tables import mcnemar

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, log_loss
)

import matplotlib.pyplot as plt

# Blanket filter: every warning category is ignored for the rest of the session.
# NOTE(review): this presumably also hides model-fit/convergence warnings – confirm that is intended.
warnings.filterwarnings("ignore")
# Configure the root logger at INFO level with a compact "LEVEL: message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
# Named logger used by this notebook for its own messages.
logger = logging.getLogger("arimax")

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The resolved ``start`` directory and each of its parents are checked
    against three markers, in decreasing order of specificity. A more
    specific marker found further up wins over a weaker marker found nearby,
    because every marker is tried on the whole chain before the next one:

    1. ``artifacts/data/features_monthly.parquet`` exists,
    2. ``artifacts/data`` exists,
    3. both ``config`` and ``src`` exist.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory (nearest to ``start``) matching the strongest
        marker that matches anywhere in the chain.

    Raises:
        AssertionError: If no directory in the chain matches any marker.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)

    # Ordered from most to least specific; evaluated lazily per candidate.
    markers = (
        lambda d: (d / "artifacts" / "data" / "features_monthly.parquet").exists(),
        lambda d: (d / "artifacts" / "data").exists(),
        lambda d: (d / "config").exists() and (d / "src").exists(),
    )
    for looks_like_root in markers:
        match = next((d for d in candidates if looks_like_root(d)), None)
        if match is not None:
            return match
    raise AssertionError("Project root not found – expected 'artifacts/data' or 'config'+'src' somewhere above.")

# Resolve the project root once and derive every artifact directory from it.
ROOT = find_project_root(Path.cwd())
ARTIFACTS = ROOT / "artifacts"
DATA_DIR, CONF_DIR, FORECASTS_DIR, METRICS_DIR, MODELS_DIR, REPORTS_DIR = (
    ARTIFACTS / _sub
    for _sub in ("data", "config", "forecasts", "metrics", "models", "reports")
)

# Create any missing artifact directory (including parents); existing ones are left alone.
for _artifact_dir in (DATA_DIR, CONF_DIR, FORECASTS_DIR, METRICS_DIR, MODELS_DIR, REPORTS_DIR):
    _artifact_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"ROOT={ROOT} | DATA_DIR={DATA_DIR}")

# Load the feature-set configuration; write a default file first if none exists yet.
FEATS_CFG = CONF_DIR / "features_config.yaml"
if not FEATS_CFG.exists():
    default_cfg_feats = {
        "TECH": ["3M_SMA_Return","12M_SMA_Return","3M_Momentum","Volatility_6M","Return_Lag1"],
        "MACRO": ["FedFunds_Delta_bps","Inflation_YoY_pct","UnemploymentRate","VIX","EPU_US","FSI","Gold_USD_oz","WTI_Spot","USD_per_EUR"],
    }
    # INTEGRATED is the concatenation TECH + MACRO (a new list, so the YAML has no aliases).
    default_cfg_feats["INTEGRATED"] = [*default_cfg_feats["TECH"], *default_cfg_feats["MACRO"]]
    FEATS_CFG.write_text(yaml.safe_dump(default_cfg_feats, sort_keys=False), encoding="utf-8")

# An empty YAML document loads as None; fall back to an empty mapping in that case.
cfg_feats = yaml.safe_load(FEATS_CFG.read_text(encoding="utf-8")) or {}

# Feature lists per set; INTEGRATED defaults to TECH + MACRO when the key is absent.
TECH_FEATS = cfg_feats.get("TECH", [])
MACRO_FEATS = cfg_feats.get("MACRO", [])
INTEGRATED_FEATS = cfg_feats.get("INTEGRATED", TECH_FEATS + MACRO_FEATS)

# Column names of the regression target and the direction target.
TARGET_RET = "y_return_next_pct"
TARGET_DIR = "y_direction_next"

def _sappend(s: pd.Series, item: pd.Series) -> pd.Series:
    return pd.concat([s, item])

def _dappend(df: pd.DataFrame, row_df: pd.DataFrame) -> pd.DataFrame:
    return pd.concat([df, row_df])


INFO: ROOT=C:\Users\gamer\Desktop\AktienPrognose | DATA_DIR=C:\Users\gamer\Desktop\AktienPrognose\artifacts\data


In [2]:
# Step1: Data / feature sets (load, align, splits, sanity)
def _read_time_indexed(path: Path) -> pd.DataFrame:
    """Read a parquet file, convert its index with ``pd.to_datetime`` and sort by index."""
    frame = pd.read_parquet(path)
    frame.index = pd.to_datetime(frame.index)
    return frame.sort_index()


features_path = DATA_DIR / "features_monthly.parquet"
raw_path = DATA_DIR / "raw_data.parquet"

# The feature table is mandatory.
assert features_path.exists(), f"Missing file: {features_path}"
features_df = _read_time_indexed(features_path)

# The raw table is optional; raw_df stays None when the file is absent.
raw_df = _read_time_indexed(raw_path) if raw_path.exists() else None

# Inclusive date bounds of the train and test periods.
TRAIN_START, TRAIN_END = "2009-02-28", "2019-12-31"
TEST_START, TEST_END   = "2020-01-31", "2025-05-31"

# Label-based slices; copies so later edits never touch features_df.
train_df = features_df.loc[TRAIN_START:TRAIN_END].copy()
test_df  = features_df.loc[TEST_START:TEST_END].copy()

def _X(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    cols = [c for c in cols if c in df.columns]
    return df[cols].astype(float)

# Feature-set name -> configured column list; insertion order is TECH, MACRO, INTEGRATED.
_FEATURE_SETS = {
    "TECH": TECH_FEATS,
    "MACRO": MACRO_FEATS,
    "INTEGRATED": INTEGRATED_FEATS,
}

# Per feature set: float feature matrix with every row containing a NaN dropped.
X_train_sets = {
    set_name: _X(train_df, set_cols).dropna()
    for set_name, set_cols in _FEATURE_SETS.items()
}
X_test_sets = {
    set_name: _X(test_df, set_cols).dropna()
    for set_name, set_cols in _FEATURE_SETS.items()
}

def _align_targets(df: pd.DataFrame, idx: pd.Index) -> Tuple[pd.Series, pd.Series]:
    """Return the return target and the direction target of ``df`` aligned to ``idx``.

    ``df`` is reindexed to ``idx``; the ``TARGET_RET`` column is cast to float
    and the ``TARGET_DIR`` column to int.

    Returns:
        ``(returns, direction)`` as two Series indexed by ``idx``.
    """
    aligned = df.reindex(idx)
    returns = aligned[TARGET_RET].astype(float)
    direction = aligned[TARGET_DIR].astype(int)
    return returns, direction

# Full-length targets for both splits (not restricted to any feature set's rows).
y_train_all, y_train_dir_all = (
    train_df[TARGET_RET].astype(float),
    train_df[TARGET_DIR].astype(int),
)
y_test_all, y_test_dir_all = (
    test_df[TARGET_RET].astype(float),
    test_df[TARGET_DIR].astype(int),
)

# Sanity check: every row kept in a feature set must have both targets available.
for fs, Xtr in X_train_sets.items():
    _yr, _yd = _align_targets(train_df, Xtr.index)
    assert not (_yr.isna().any() or _yd.isna().any()), f"Targets contain NaN in {fs} train"
for fs, Xte in X_test_sets.items():
    _yr, _yd = _align_targets(test_df, Xte.index)
    assert not (_yr.isna().any() or _yd.isna().any()), f"Targets contain NaN in {fs} test"


In [3]:
# Step2: TSCV-Splitter (5 Folds, Val=12M, Embargo=1M)
from typing import Iterable

def build_folds(
    years: Iterable[int] = (2015, 2016, 2017, 2018, 2019),
) -> List[Tuple[pd.Timestamp, pd.Timestamp, pd.Timestamp]]:
    """Build expanding-window time-series CV folds, one per validation year.

    For a validation year ``y`` the fold is:

    * training data up to and including ``(y-1)-11-30``,
    * December of ``y-1`` left out as a one-month embargo,
    * validation window ``y-01-31`` .. ``y-12-31``.

    Args:
        years: Validation years. Defaults to 2015-2019, which reproduces the
            previously hard-coded five folds.

    Returns:
        List of ``(train_end, val_start, val_end)`` timestamps in the order of ``years``.
    """
    return [
        (
            pd.Timestamp(f"{y-1}-11-30"),  # train through November of the previous year
            pd.Timestamp(f"{y}-01-31"),    # validation starts with the January row
            pd.Timestamp(f"{y}-12-31"),    # ... and ends with the December row
        )
        for y in years
    ]

tscv_folds = build_folds()

def slice_xy(X: pd.DataFrame, y: pd.Series, start: pd.Timestamp, end: pd.Timestamp) -> Tuple[pd.DataFrame, pd.Series]:
    """Return the rows of ``X`` and ``y`` whose ``X`` index lies in ``[start, end]`` (inclusive).

    The boolean mask is computed from ``X.index`` only and then applied to
    both ``X`` and ``y``.
    """
    not_before = X.index >= start
    not_after = X.index <= end
    in_window = not_before & not_after
    X_part = X.loc[in_window]
    y_part = y.loc[in_window]
    return X_part, y_part


In [4]:
# Step3 (Fix): ARIMAX OOF → calibration → OOF threshold | use .forecast(steps=...) to avoid exog shape mismatch
#
# For every feature set:
#   1. per CV fold, fit each ARIMA(p,d,q)+exog candidate on the fold's training rows,
#      keep the lowest-AIC model (F1 on the validation rows breaks AIC ties) and
#      store its out-of-fold (OOF) point forecasts,
#   2. fit a logistic-regression calibrator mapping point forecast -> P(up) on all OOF rows,
#   3. pick the probability threshold that maximises F1 on the OOF rows.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score

# Candidate orders: full 3x3x3 grid over p, d, q in {0, 1, 2}.
param_grid = [(p,d,q) for p in [0,1,2] for d in [0,1,2] for q in [0,1,2]]

calibrators, thresholds = {}, {}
best_orders_cv, oof_store = {}, {}

for fs, Xtr in X_train_sets.items():
    ytr_reg, ytr_dir = _align_targets(train_df, Xtr.index)

    fold_rows = []
    oof_frames = []

    for (train_end, val_start, val_end) in tscv_folds:
        X_train_fold = Xtr.loc[:train_end]
        y_train_fold = ytr_reg.loc[:train_end]
        # validation window based on available X rows
        X_val_fold = Xtr.loc[(Xtr.index >= val_start) & (Xtr.index <= val_end)]
        # Direction labels come from the dataset's TARGET_DIR column, i.e. the same
        # labels the test evaluation and the permutation-importance cell use,
        # instead of being re-derived from the sign of the return.
        y_val_dir = ytr_dir.reindex(X_val_fold.index).astype(int)

        if len(X_val_fold) == 0:
            continue

        best_aic, best_order, best_pred, best_f1 = np.inf, None, None, -np.inf

        for (p,d,q) in param_grid:
            try:
                res = ARIMA(endog=y_train_fold, exog=X_train_fold, order=(p,d,q)).fit()
                aic = res.aic
                # IMPORTANT: forecast with exact step-count to match exog rows
                pred = res.forecast(steps=len(X_val_fold), exog=X_val_fold)
                pred.index = X_val_fold.index
            except Exception as exc:
                # Best-effort grid search: a failing candidate is skipped, but the
                # failure is recorded (visible when the logger is set to DEBUG).
                logger.debug("ARIMA%s failed for %s, fold ending %s: %s", (p, d, q), fs, val_end.date(), exc)
                continue

            f1 = f1_score(y_val_dir, (pred>=0).astype(int))
            if (aic < best_aic) or (np.isclose(aic, best_aic) and f1 > best_f1):
                best_aic, best_order, best_pred, best_f1 = aic, (p,d,q), pred.copy(), f1

        if best_pred is None:
            # fallback AR(1) when every grid candidate failed
            res = ARIMA(endog=y_train_fold, exog=X_train_fold, order=(1,0,0)).fit()
            best_order, best_aic = (1,0,0), res.aic
            best_pred = res.forecast(steps=len(X_val_fold), exog=X_val_fold)
            best_pred.index = X_val_fold.index

        fold_rows.append((f"{val_start.date()}_{val_end.date()}", best_order, float(best_aic)))
        oof_frames.append(pd.DataFrame({
            "date": best_pred.index,
            "y_point": best_pred.values,
            "y_true_dir": y_val_dir.values
        }))

    if not oof_frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate" error.
        raise ValueError(f"No validation rows available for feature set '{fs}' - cannot build OOF predictions.")

    best_orders_cv[fs] = fold_rows
    oof_df = pd.concat(oof_frames, axis=0).set_index("date").sort_index()

    # Calibrator: one-feature logistic regression, point forecast -> P(direction == 1).
    cal = LogisticRegression(max_iter=2000)
    cal.fit(oof_df[["y_point"]].values, oof_df["y_true_dir"].values)
    calibrators[fs] = cal

    # Threshold search on the calibrated OOF probabilities (grid 0.01 .. 0.99, step 0.01).
    oof_proba = cal.predict_proba(oof_df[["y_point"]].values)[:,1]
    y_true_oof = oof_df["y_true_dir"].values
    best_th, best_f1 = 0.5, -np.inf
    for th in np.linspace(0.01, 0.99, 99):
        y_hat = (oof_proba >= th).astype(int)
        f1 = f1_score(y_true_oof, y_hat)
        if f1 > best_f1:
            best_f1, best_th = f1, th
    thresholds[fs] = float(best_th)
    oof_store[fs] = oof_df.assign(y_proba=oof_proba)

    pd.DataFrame(fold_rows, columns=["val_window","best_order","best_aic"]).to_csv(
        METRICS_DIR / f"cv_details_arima_x_{fs}.csv", index=False
    )


In [5]:
# Step4: Walk-forward test (expanding origin) + persist models
#
# For every feature set: choose the (p,d,q) order with the lowest AIC on the full
# training period, then step through the test dates one at a time. At each date the
# model is refit on all data seen so far, a one-step forecast is produced, and the
# realised observation is appended to the training data for the next step.
best_orders_full: Dict[str, Tuple[int,int,int]] = {}
forecasts: Dict[str, pd.DataFrame] = {}
last_models: Dict[str, object] = {}

for fs, Xtr in X_train_sets.items():
    ytr_reg, _ = _align_targets(train_df, Xtr.index)
    # Best (p,d,q) on the full training period by AIC
    best_aic = np.inf; best_order = None
    for (p,d,q) in param_grid:
        try:
            res = ARIMA(endog=ytr_reg, exog=Xtr, order=(p,d,q)).fit()
        except Exception as exc:
            # Best-effort search: skip the candidate but keep a trace of the failure.
            logger.debug("ARIMA%s failed on full train for %s: %s", (p, d, q), fs, exc)
            continue
        if res.aic < best_aic:
            best_aic, best_order = res.aic, (p,d,q)
    if best_order is None:
        best_order = (1,0,0)
    best_orders_full[fs] = best_order

    Xte = X_test_sets[fs].copy()
    yte_ret, yte_dir = _align_targets(test_df, Xte.index)

    train_y = ytr_reg.copy()
    train_X = Xtr.copy()
    dates = Xte.index
    y_point_preds = []

    for dt in dates:
        res = ARIMA(endog=train_y, exog=train_X, order=best_order).fit()
        # One-step-ahead forecast with exactly one exog row. Using .forecast(steps=1)
        # (as the CV and importance cells do) instead of a date-based
        # .predict(start=dt, end=dt) avoids depending on the date index resolving
        # `dt` to the next step and on the exog row count matching that span.
        y_point = res.forecast(steps=1, exog=Xte.loc[dt:dt]).iloc[0]
        y_point_preds.append(y_point)
        if dt in yte_ret.index:
            # Expanding origin: add the realised target and its features before the next refit.
            train_y = _sappend(train_y, pd.Series({dt: yte_ret.loc[dt]}))
            train_X = _dappend(train_X, Xte.loc[dt:dt])
        last_models[fs] = res

    # Point forecasts -> calibrated P(up) -> class label using the OOF-tuned threshold.
    y_point_preds = np.array(y_point_preds, dtype=float)
    proba = calibrators[fs].predict_proba(y_point_preds.reshape(-1,1))[:,1]
    y_pred = (proba >= thresholds[fs]).astype(int)

    fc_df = pd.DataFrame({
        "date": dates,
        "y_true": yte_dir.values.astype(int),
        "y_pred": y_pred.astype(int),
        "y_proba": proba.astype(float),
        "y_point": y_point_preds.astype(float)
    }).set_index("date")
    forecasts[fs] = fc_df
    fc_df.to_csv(FORECASTS_DIR / f"arima_x_{fs}.csv")

    # Persist the last walk-forward fit and the calibrator for this feature set.
    with open(MODELS_DIR / f"arimax_last_{fs}.pkl", "wb") as f:
        pickle.dump(last_models[fs], f)
    with open(MODELS_DIR / f"calibrator_{fs}.pkl", "wb") as f:
        pickle.dump(calibrators[fs], f)


In [6]:
# Step5: Metriken, Signifikanz (DM 0/1-Loss vs Always-Up & vs Linear-Logit; McNemar vs Always-Up), JSON
import json

def dm_test_01_loss(y_true: np.ndarray, y_pred_a: np.ndarray, y_pred_b: np.ndarray) -> float:
    """Two-sided Diebold-Mariano-style test on 0/1 loss between two classifiers.

    The loss differential is ``d_t = 1[y_t != a_t] - 1[y_t != b_t]``. The
    statistic is ``mean(d) / (std(d, ddof=1) / sqrt(n))`` and the p-value is
    taken from the standard normal distribution.

    Degenerate inputs are handled explicitly instead of dividing by zero:

    * fewer than two observations: the statistic is undefined -> ``nan``;
    * ``d`` is zero everywhere (both forecasts have identical losses): no
      evidence of a difference -> ``1.0``;
    * ``d`` is constant and non-zero: the statistic is infinite -> ``0.0``.

    Args:
        y_true: True class labels.
        y_pred_a: Predicted labels of model A.
        y_pred_b: Predicted labels of model B.

    Returns:
        Two-sided p-value as a Python float.
    """
    y_true = np.asarray(y_true)
    loss_a = (y_true != np.asarray(y_pred_a)).astype(int)
    loss_b = (y_true != np.asarray(y_pred_b)).astype(int)
    d = loss_a - loss_b

    n = len(d)
    if n < 2:
        return float("nan")

    mean_d = float(np.mean(d))
    std_d = float(np.std(d, ddof=1))
    if std_d == 0.0:
        # Zero variance: either no difference at all, or the same difference every time.
        return 1.0 if mean_d == 0.0 else 0.0

    dm_stat = mean_d / (std_d / np.sqrt(n))
    # Survival function is numerically more accurate than 1 - cdf for large |stat|.
    return float(2.0 * norm.sf(abs(dm_stat)))

# One summary record per feature set; becomes the rows of the summary table below.
metrics_summary = []

for fs, fc in forecasts.items():
    # Test-period labels, thresholded predictions and calibrated probabilities.
    y_true = fc["y_true"].values.astype(int)
    y_pred = fc["y_pred"].values.astype(int)
    y_proba = fc["y_proba"].values.astype(float)

    # Classification metrics; AUC is computed on the probabilities, the rest on hard labels.
    F1 = f1_score(y_true, y_pred)
    AUC = roc_auc_score(y_true, y_proba)
    ACC = accuracy_score(y_true, y_pred)
    PREC = precision_score(y_true, y_pred, zero_division=0)
    REC = recall_score(y_true, y_pred, zero_division=0)

    # Baselines
    # Always-up baseline: predicts class 1 for every test row.
    always_up = np.ones_like(y_true, dtype=int)

    # Linear-logit baseline (trained on the full training period)
    Xtr = X_train_sets[fs]; _, ytr_dir = _align_targets(train_df, Xtr.index)
    logit = LogisticRegression(max_iter=2000)
    logit.fit(Xtr.values, ytr_dir.values)
    # Align the test features to the forecast dates before scoring the baseline.
    Xte = X_test_sets[fs].reindex(fc.index)
    logit_proba = logit.predict_proba(Xte.values)[:,1]
    logit_pred = (logit_proba >= 0.5).astype(int)

    # p-values of the 0/1-loss test: model vs. always-up and model vs. logit baseline.
    p_dm_vs_always = dm_test_01_loss(y_true, y_pred, always_up)
    p_dm_vs_logit  = dm_test_01_loss(y_true, y_pred, logit_pred)

    # McNemar vs. always-up: only the discordant counts are filled in.
    # b01 = model correct & baseline wrong, b10 = model wrong & baseline correct;
    # the diagonal (concordant) cells are set to 0.
    b01 = np.sum((y_pred == y_true) & (always_up != y_true))
    b10 = np.sum((y_pred != y_true) & (always_up == y_true))
    tbl = [[0, b01],[b10, 0]]
    mcn = mcnemar(tbl, exact=False, correction=True)

    # Everything is cast to plain float/str so the record can be written as JSON.
    met = {
        "F1": float(F1), "AUC": float(AUC), "Accuracy": float(ACC),
        "Precision": float(PREC), "Recall": float(REC),
        "threshold": float(thresholds[fs]),
        "oof_f1": float(f1_score(oof_store[fs]["y_true_dir"], (oof_store[fs]["y_proba"]>=thresholds[fs]).astype(int))),
        "oof_auc": float(roc_auc_score(oof_store[fs]["y_true_dir"], oof_store[fs]["y_proba"])),
        "cv_details_path": str((METRICS_DIR / f"cv_details_arima_x_{fs}.csv").as_posix()),
        "calibrator_path": str((MODELS_DIR / f"calibrator_{fs}.pkl").as_posix()),
        "dm_p_vs_always": float(p_dm_vs_always),
        "dm_p_vs_logit": float(p_dm_vs_logit),
        "mcnemar_p_vs_always": float(mcn.pvalue),
    }
    metrics_summary.append({"FeatureSet": fs, **met})

    # Persist the fitted logit baseline and the per-feature-set metrics.
    with open(MODELS_DIR / f"logit_baseline_{fs}.pkl", "wb") as f:
        pickle.dump(logit, f)
    with open(METRICS_DIR / f"arima_x_{fs}.json", "w") as f:
        json.dump(met, f, indent=2)

# Summary table: one row per feature set, written with FeatureSet as the index column.
summary_df = pd.DataFrame(metrics_summary).set_index("FeatureSet")
summary_df.to_csv(REPORTS_DIR / "21_summary_table.csv")

# Simple heatmap without seaborn: rows = feature sets, columns = the five test metrics.
vals = summary_df[["F1","AUC","Accuracy","Precision","Recall"]].values
fig, ax = plt.subplots(figsize=(7,3))
heat = ax.imshow(vals, aspect="auto")
fig.colorbar(heat, ax=ax)
ax.set_xticks(range(5))
ax.set_xticklabels(["F1","AUC","ACC","PREC","REC"])
ax.set_yticks(range(len(summary_df.index)))
ax.set_yticklabels(summary_df.index)
ax.set_title("ARIMAX Metrics by Feature Set")
fig.tight_layout()
fig.savefig(REPORTS_DIR / "21_summary_table.png")
plt.close(fig)


In [7]:
# Step6 (Fix): Use .forecast(steps=...) and align indices to avoid exog shape mismatch
#
# Per feature set: export the coefficients of the last walk-forward fit, then compute
# a permutation importance on the last validation window. One feature at a time is
# shuffled within the validation rows and the increase in log-loss of the calibrated
# probabilities is recorded.
LAST_VAL_YEAR = 2019
last_train_end = pd.Timestamp(f"{LAST_VAL_YEAR-1}-11-30")
last_val_start = pd.Timestamp(f"{LAST_VAL_YEAR}-01-31")
last_val_end   = pd.Timestamp(f"{LAST_VAL_YEAR}-12-31")

# Dedicated, seeded generator so the shuffles (and therefore the importance table and
# plot) are identical on every Restart & Run All.
PERM_SEED = 42
perm_rng = np.random.default_rng(PERM_SEED)

for fs in X_train_sets.keys():
    # Coefficients of last walk-forward fit
    res_last = last_models[fs]
    coef = pd.Series(res_last.params, name="coef")
    coef.index.name = "param"
    coef.to_csv(REPORTS_DIR / f"21_coeffs_{fs}.csv")

    # Permutation importance (delta log-loss) on the last validation window
    Xtr = X_train_sets[fs]
    ytr_reg, _ = _align_targets(train_df, Xtr.index)

    X_train_fold = Xtr.loc[:last_train_end]
    y_train_fold = ytr_reg.loc[:last_train_end]
    X_val_fold   = Xtr.loc[last_val_start:last_val_end]
    if X_val_fold.empty:
        continue
    y_val_dir    = train_df.loc[X_val_fold.index, TARGET_DIR].astype(int)

    # Use the order selected on the full training period; fall back to AR(1) if that fit fails.
    order = best_orders_full.get(fs, (1,0,0))
    try:
        res_fold = ARIMA(endog=y_train_fold, exog=X_train_fold, order=order).fit()
    except Exception as exc:
        logger.debug("ARIMA%s failed for %s on the last fold, falling back to (1,0,0): %s", order, fs, exc)
        res_fold = ARIMA(endog=y_train_fold, exog=X_train_fold, order=(1,0,0)).fit()

    # IMPORTANT: forecast with exact step-count to match exog rows
    y_point_val = res_fold.forecast(steps=len(X_val_fold), exog=X_val_fold)
    y_point_val.index = X_val_fold.index

    # Baseline log-loss with unpermuted features.
    base_proba = calibrators[fs].predict_proba(y_point_val.values.reshape(-1,1))[:,1]
    base_ll = log_loss(y_val_dir.values, base_proba, labels=[0,1])

    deltas = []
    for feat in X_val_fold.columns:
        X_perm = X_val_fold.copy()
        X_perm[feat] = perm_rng.permutation(X_perm[feat].values)
        y_point_perm = res_fold.forecast(steps=len(X_perm), exog=X_perm)
        y_point_perm.index = X_perm.index
        proba_perm = calibrators[fs].predict_proba(y_point_perm.values.reshape(-1,1))[:,1]
        ll_perm = log_loss(y_val_dir.values, proba_perm, labels=[0,1])
        deltas.append((feat, float(ll_perm - base_ll)))

    # Keep at most the 20 features with the largest log-loss increase.
    imp_df = pd.DataFrame(deltas, columns=["feature","delta_logloss"]).sort_values("delta_logloss", ascending=False).head(20)
    imp_df.to_csv(REPORTS_DIR / f"21_importance_{fs}.csv", index=False)

    plt.figure(figsize=(7,6))
    # Reversed so the most important feature is drawn at the top of the bar chart.
    plt.barh(imp_df["feature"][::-1], imp_df["delta_logloss"][::-1])
    plt.xlabel("Δ log-loss (higher = more important)")
    # The importance is measured on the validation rows, not on the training window.
    plt.title(f"Permutation Importance (last validation window) – {fs}")
    plt.tight_layout()
    # NOTE(review): unlike the other report files this PNG has no "21_" prefix;
    # the name is left unchanged in case something downstream reads it - confirm.
    plt.savefig(REPORTS_DIR / f"importance_{fs}.png")
    plt.close()


In [8]:
# Step7: Plots (hit rate over time, cumulative return)
def _save_series_plot(x, y, title, ylabel, out_path, ylim=None):
    """Draw ``y`` over ``x`` as a marked line on an 8x3 figure, save it to ``out_path`` and close it."""
    plt.figure(figsize=(8,3))
    plt.plot(x, y, marker="o")
    if ylim is not None:
        plt.ylim(*ylim)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


for fs, fc in forecasts.items():
    dates = fc.index
    y_true = fc["y_true"].to_numpy().astype(int)
    y_pred = fc["y_pred"].to_numpy().astype(int)

    # Running share of correct direction calls up to and including each date.
    hits = (y_true == y_pred).astype(int)
    cum_hit = np.cumsum(hits) / np.arange(1, len(hits) + 1)
    _save_series_plot(
        dates, cum_hit,
        title=f"Cumulative Hit Rate – {fs}",
        ylabel="Hit rate",
        out_path=REPORTS_DIR / f"21_hitrate_{fs}.png",
        ylim=(0, 1),
    )

    # Strategy return: the period's return when the prediction is 1, otherwise 0.
    # Returns are divided by 100 before compounding.
    y_ret = test_df.loc[dates, TARGET_RET].to_numpy().astype(float)
    strat_ret = np.where(y_pred==1, y_ret, 0.0)
    cum_ret = np.cumprod(1.0 + strat_ret/100.0) - 1.0
    _save_series_plot(
        dates, cum_ret,
        title=f"Cumulative Return – {fs}",
        ylabel="Cumulative return",
        out_path=REPORTS_DIR / f"21_cumret_{fs}.png",
    )


In [9]:
# Step8: Persistence (thresholds, orders, paths)
# Order tuples are converted to lists before they are dumped.
_orders_full = {name: list(order) for name, order in best_orders_full.items()}
_orders_cv = {
    name: [(window, list(order), aic) for (window, order, aic) in rows]
    for name, rows in best_orders_cv.items()
}
_paths = {
    "forecasts_dir": str(FORECASTS_DIR),
    "metrics_dir": str(METRICS_DIR),
    "models_dir": str(MODELS_DIR),
    "reports_dir": str(REPORTS_DIR),
    "features_cfg": str(FEATS_CFG),
    "features_parquet": str(features_path),
    # None when the optional raw file is not present.
    "raw_parquet": str(raw_path) if raw_path.exists() else None,
}
persist = {
    "thresholds": thresholds,
    "best_orders_full": _orders_full,
    "best_orders_cv": _orders_cv,
    "paths": _paths,
}
with (MODELS_DIR / "arimax_persistence.yaml").open("w", encoding="utf-8") as f:
    yaml.safe_dump(persist, f, sort_keys=False, allow_unicode=True)


In [10]:
# Step9: Overview 21 (summary & probabilities plot)
# Re-read the summary table and add rank columns (1 = best; ties share the lowest rank).
summary = pd.read_csv(REPORTS_DIR / "21_summary_table.csv")
summary = summary.assign(
    Rank_F1=summary["F1"].rank(ascending=False, method="min"),
    Rank_AUC=summary["AUC"].rank(ascending=False, method="min"),
)
summary.to_csv(REPORTS_DIR / "21_summary_table_ranked.csv", index=False)

# One line per feature set: calibrated probability over the test dates.
fig, ax = plt.subplots(figsize=(10,6))
for fs, fc in forecasts.items():
    ax.plot(fc.index, fc["y_proba"].to_numpy(), label=fs)
ax.legend()
ax.set_title("P(up) over Test Period")
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig(REPORTS_DIR / "21_probabilities_over_time.png")
plt.close(fig)
