In [34]:
# --- Block 5: Walk-Forward Cross-Validation + Hyperparameter-Search ---
import os, sys, json, time, logging, glob
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put the parent of the current working directory on sys.path (once), so that
# modules living one level above the notebook folder can be imported.
ROOT = os.path.abspath("..")
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

# Quieten TensorFlow logging. The environment variable is deliberately set
# before the import below.
# NOTE(review): presumably TensorFlow's native runtime reads it at import time — confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)

In [35]:
# ---- Load core config ---------------------------------------------------
config_path = os.path.join(ROOT, "config.json")
with open(config_path, "r") as f:
    C = json.load(f)

# Mandatory keys: a missing entry raises KeyError right here.
TICKER = C["ticker"]
START = C["start"]
END = C["end"]
INTERVAL = C["interval"]
HORIZON = int(C["horizon"])
LOOKBACK_DEFAULT = int(C["lookback"])

# Optional keys with their fallback values.
BATCH = int(C.get("batch", 64))
SEED = int(C.get("seed", 42))
FEATURESET = C.get("featureset", "v2")
EPS_MODE = C.get("epsilon_mode", "abs")
EPSILON = float(C.get("epsilon", 0.0005))
RESULTS_DIR = Path(C.get("results_dir", "../results"))

# Seed NumPy and TensorFlow for reproducible runs.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Every WFCV run writes into its own time-stamped folder (with a plots/ subfolder).
run_stamp = time.strftime("%Y-%m-%d_%H-%M-%S_wfcv")
RUN_DIR = RESULTS_DIR / run_stamp
plots_dir = RUN_DIR / "plots"
plots_dir.mkdir(parents=True, exist_ok=True)
print("WFCV_RUN_DIR:", RUN_DIR)

WFCV_RUN_DIR: ..\results\2025-10-19_16-45-57_wfcv


In [36]:
# ---- Optional: quick smoke test ----------------------------------------
# Set FAST to True only when a deliberately short run (3 folds, fewer epochs) is wanted.
FAST = False
# EPOCHS_GRID is the Block-5 epoch budget (independent of Block 3);
# N_FOLDS is the number of walk-forward folds (the full run really uses 5).
EPOCHS_GRID, N_FOLDS = (25, 3) if FAST else (60, 5)

In [37]:
# ---- Data & features ----------------------------------------------------
import yaml, re, glob, os

# Feature/label metadata for the configured feature set. When the file is
# missing, `meta` stays empty and the label fields stay None, so the label
# definition has to be inferred from existing CSV files further below.
yaml_path = f"../data/features_{FEATURESET}.yml"
meta = {}
label_h = label_mode = label_eps = None

if os.path.exists(yaml_path):
    with open(yaml_path, "r") as f:
        meta = yaml.safe_load(f) or {}
    # `or {}` also covers a "label:" key that is present but empty (parsed as
    # None); with a plain .get("label", {}) that case raised AttributeError.
    lab = meta.get("label") or {}
    label_h    = lab.get("horizon", None)
    label_mode = lab.get("mode", None)
    label_eps  = lab.get("epsilon", None)

def _parse_h_meps_from_name(path: str):
    mH = re.search(r"_cls_h(\d+)_", path)
    me = re.search(r"_(abs|rel)(\d+p\d+)\.csv$", path)
    H  = int(mH.group(1)) if mH else None
    md = me.group(1) if me else None
    eps= float(me.group(2).replace("p",".")) if me else None
    return H, md, eps

def _infer_from_existing_files(tkr, itv, start, end, mode_hint=None, eps_hint=None):
    """
    Infer the label definition from training CSVs that already exist in ../data.

    Looks for files named ``<tkr>_<itv>_<start>_<end>_cls_h<H>_<tag>.csv`` and
    parses the most recently modified one with `_parse_h_meps_from_name`.

    Parameters
    ----------
    tkr, itv, start, end
        Ticker, interval and date range as they appear in the file name.
    mode_hint, eps_hint : optional
        When BOTH are given, only files whose tag equals
        ``f"{mode_hint}{eps_hint}"`` (decimal point written as ``p``) are
        considered. A single hint on its own does not restrict the search.

    Returns
    -------
    tuple or None
        ``(H, mode, eps)`` from the newest matching file, or None when no file
        matches.
    """
    # The label tag (e.g. "abs0p0005") sits between "_cls_h<H>_" and ".csv",
    # so it needs its own wildcard. The previous pattern ended in "_.csv" and
    # therefore never matched a real file.
    pat = f"../data/{tkr}_{itv}_{start}_{end}_cls_h*_*.csv"
    cands = sorted(glob.glob(pat), key=os.path.getmtime)
    # Restrict to the requested tag when mode and epsilon are both known.
    if mode_hint and (eps_hint is not None):
        tag = f"{mode_hint}{str(eps_hint).replace('.','p')}"
        cands = [c for c in cands if c.endswith(f"_{tag}.csv")]
    if not cands:
        return None
    # Sorted by modification time ascending, so the last entry is the newest.
    return _parse_h_meps_from_name(cands[-1])

# 1) Primary source: the label definition recorded in the YAML metadata
H_FOR_FILE    = int(label_h)    if label_h    is not None else None
MODE_FOR_FILE = str(label_mode) if label_mode is not None else None
EPS_FOR_FILE  = float(label_eps) if label_eps is not None else None

# 2) Otherwise: derive the missing parts from existing CSVs (passing on any
#    mode/epsilon already known from the YAML as a hint)
if (H_FOR_FILE is None) or (MODE_FOR_FILE is None) or (EPS_FOR_FILE is None):
    inferred = _infer_from_existing_files(TICKER, INTERVAL, START, END,
                                          mode_hint=MODE_FOR_FILE, eps_hint=EPS_FOR_FILE)
    if inferred is not None:
        H_i, M_i, E_i = inferred
        # YAML values win; only the gaps are filled from the file name.
        H_FOR_FILE    = H_FOR_FILE    if H_FOR_FILE    is not None else H_i
        MODE_FOR_FILE = MODE_FOR_FILE if MODE_FOR_FILE is not None else M_i
        EPS_FOR_FILE  = EPS_FOR_FILE  if EPS_FOR_FILE  is not None else E_i

# 3) Still undetermined -> fail explicitly with a hint for the user
if (H_FOR_FILE is None) or (MODE_FOR_FILE is None) or (EPS_FOR_FILE is None):
    raise RuntimeError(
        "Label-Definition (H/mode/epsilon) konnte nicht aus YAML oder existierenden CSV-Dateien bestimmt werden.\n"
        f"Erwartet YAML unter: {yaml_path}\n"
        "Oder eine Datei wie: ../data/"
        f"{TICKER}_{INTERVAL}_{START}_{END}_cls_h<H>_<abs|rel><epsilon_mit_p>.csv"
    )

# File-name tag for the label definition, decimal point written as "p".
eps_tag   = f"{MODE_FOR_FILE}{str(EPS_FOR_FILE).replace('.','p')}"
TRAIN_CSV = f"../data/{TICKER}_{INTERVAL}_{START}_{END}_cls_h{H_FOR_FILE}_{eps_tag}.csv"

if not os.path.exists(TRAIN_CSV):
    # Fallback: accept the newest file with the same label tag but any horizon.
    pat = f"../data/{TICKER}_{INTERVAL}_{START}_{END}_cls_h*_{eps_tag}.csv"
    candidates = sorted(glob.glob(pat), key=os.path.getmtime)
    if candidates:
        missing_csv = TRAIN_CSV
        TRAIN_CSV = candidates[-1]
        # The substitute file may carry a different horizon. Make that visible
        # and keep H_FOR_FILE in sync with the data that is actually loaded, so
        # the run metadata written later does not record a stale horizon.
        H_found = _parse_h_meps_from_name(TRAIN_CSV)[0]
        if (H_found is not None) and (H_found != H_FOR_FILE):
            print(f"WARNUNG: {missing_csv} nicht gefunden - verwende {TRAIN_CSV} "
                  f"(Horizon {H_found} statt {H_FOR_FILE}).")
            H_FOR_FILE = H_found
    else:
        raise FileNotFoundError(
            f"Train CSV nicht gefunden: {TRAIN_CSV}\n"
            f"Gesucht nach Pattern: {pat}\n"
            "Hinweis: Block 2 mit dieser Label-Definition laufen lassen."
        )

print("Loaded TRAIN_CSV:", TRAIN_CSV)
# First CSV column becomes the (date-parsed) index; rows are put in index order.
df = pd.read_csv(TRAIN_CSV, index_col=0, parse_dates=True)
df = df.sort_index()

OHLCV = {"open", "high", "low", "close", "volume"}
if not meta:
    # No metadata available: every column that is neither raw OHLCV nor the target counts as a feature.
    non_feature_cols = OHLCV | {"target"}
    FEATURES_ALL = [col for col in df.columns if col not in non_feature_cols]
else:
    # Metadata available: keep its feature list, restricted to columns present in the CSV.
    FEATURES_ALL = [col for col in meta.get("features", []) if col in df.columns]
assert len(FEATURES_ALL) > 0, "Keine Features gefunden."

X_full = df.loc[:, FEATURES_ALL].copy()
y_full = df["target"].astype(int).copy()

pos_rate = round(y_full.mean(), 3)
print("Label pos_rate:", pos_rate, "| n:", len(y_full))


Loaded TRAIN_CSV: ../data/AAPL_1d_2012-01-01_2025-09-01_cls_h5_abs0p0005.csv
Label pos_rate: 0.563 | n: 3402


In [38]:
# ---- Walk-Forward Splits (fix: 5 Folds, val=20%, min_train=45%) -------
def make_wf_splits(n, n_folds=5, val_frac=0.20, min_train_frac=0.45):
    """
    Build walk-forward splits with an expanding training window.

    Each fold uses
      train = [0 : train_end)
      val   = [train_end : val_end)
    The validation ends are spread evenly (np.linspace) between the earliest
    admissible end (min_train + val_len) and n; the last fold ends at n.

    Parameters
    ----------
    n : int
        Number of rows in the dataset.
    n_folds : int
        Number of folds to produce.
    val_frac : float
        Validation length as a fraction of n (at least 60 rows).
    min_train_frac : float
        Minimum training length as a fraction of n (at least 200 rows).

    Returns
    -------
    list of (slice, slice)
        One (train_slice, val_slice) pair per fold, positional (iloc) indices.

    Raises
    ------
    ValueError
        If n is too small for the requested minimum train and validation lengths.
    RuntimeError
        If fewer than n_folds distinct folds can be produced.

    Notes
    -----
    Reads the module-level LOOKBACK_DEFAULT as a lower bound for train_end.
    NOTE(review): train and validation are directly adjacent (no gap). If the
    target looks several bars ahead, the last training labels overlap the
    validation window — confirm against the label construction in Block 2.
    """
    val_len   = max(60, int(round(n * val_frac)))             # at least ~3 months
    min_train = max(200, int(round(n * min_train_frac)))
    start_val_end = min_train + val_len
    if start_val_end + 1 > n:
        raise ValueError(f"Dataset zu kurz für val_frac/min_train_frac: n={n}, "
                         f"min_train={min_train}, val_len={val_len}")

    # Spread the validation ends evenly, the last one landing on n.
    val_ends = np.linspace(start_val_end, n, num=n_folds, endpoint=True).astype(int)

    # Duplicates appear only when the admissible range is shorter than
    # n_folds - 1 rows. The former step-based fallback could not help then:
    # its ends ran past n, were clipped back to n and produced identical folds
    # that passed the count check below. Deduplicate and let that check fail.
    val_ends = np.unique(val_ends)

    stops = []
    for ve in val_ends[:n_folds]:
        ve = int(ve)                         # plain Python ints inside the slices
        te = ve - val_len
        te = max(te, LOOKBACK_DEFAULT + 1)   # leave enough rows for windowing
        if te <= 0 or ve <= te or ve > n:
            continue
        stops.append((slice(0, te), slice(te, ve)))
    if len(stops) != n_folds:
        raise RuntimeError(f"Erzeugte nur {len(stops)} von {n_folds} Folds. "
                           f"Bitte val_frac/min_train_frac prüfen.")
    return stops

# Build the walk-forward folds over the full frame.
n = len(df)
splits = make_wf_splits(n=n, n_folds=N_FOLDS, val_frac=0.20, min_train_frac=0.45)
n_splits_built = len(splits)
print("Anzahl Folds:", n_splits_built)
# Explicit guard in the notebook itself: stop unless exactly N_FOLDS folds came back.
if not (n_splits_built == N_FOLDS):
    raise RuntimeError("Es sind nicht exakt die gewünschten Folds entstanden.")

Anzahl Folds: 5


In [39]:
# ---- Hilfsfunktionen: Windowing + Pipeline -----------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, average_precision_score, roc_auc_score
from tensorflow.keras import layers, regularizers, callbacks, optimizers, models

def make_windows(X_df, y_ser, lookback):
    """
    Turn a feature frame and its aligned label series into overlapping windows.

    Window j covers rows [j, j + lookback) of X_df and is paired with the label
    of its last row (j + lookback - 1).

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows in time order.
    y_ser : pandas.Series
        One label per row of X_df.
    lookback : int
        Window length in rows, >= 1.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        X of shape (len(X_df) - lookback + 1, lookback, n_features), float32,
        and y of shape (len(X_df) - lookback + 1,), int32.

    Raises
    ------
    ValueError
        If lookback < 1, if X_df and y_ser differ in length, or if X_df has
        fewer than `lookback` rows (no complete window possible).
    """
    n_rows = len(X_df)
    if lookback < 1:
        raise ValueError(f"lookback muss >= 1 sein, erhalten: {lookback}")
    if len(y_ser) != n_rows:
        raise ValueError(f"X_df und y_ser müssen gleich lang sein: {n_rows} vs. {len(y_ser)}")
    if n_rows < lookback:
        raise ValueError(f"Zu wenige Zeilen für Windowing: {n_rows} < lookback={lookback}")

    Xv = X_df.values.astype(np.float32)
    yv = y_ser.values.astype(np.int32)
    # sliding_window_view appends the window axis last: (n_windows, n_features, lookback).
    # Swap the last two axes and materialize a contiguous (writable) array.
    views = np.lib.stride_tricks.sliding_window_view(Xv, lookback, axis=0)
    X_win = np.ascontiguousarray(views.transpose(0, 2, 1))
    return X_win, yv[lookback - 1:]

def build_model(n_features, width1=64, width2=32, dropout=0.10, lr=5e-4, use_gru=True):
    """
    Build and compile the two-layer recurrent binary classifier.

    Layer stack: variable-length input with `n_features` channels, recurrent
    layer (`width1`, returns sequences), LayerNormalization, recurrent layer
    (`width2`), LayerNormalization, Dense(16, relu, L2 1e-5), Dense(1, sigmoid).

    Parameters
    ----------
    n_features : int
        Number of feature channels per time step.
    width1, width2 : int
        Units of the first and second recurrent layer.
    dropout : float
        Passed to both recurrent layers as `recurrent_dropout` only.
    lr : float
        Adam learning rate.
    use_gru : bool
        True selects GRU cells, False LSTM cells.

    Returns
    -------
    A Keras Sequential model compiled with binary cross-entropy and two AUC
    metrics named "auc" (default curve) and "auprc" (PR curve).
    """
    if use_gru:
        rnn = layers.GRU
    else:
        rnn = layers.LSTM

    stack = [
        layers.Input(shape=(None, n_features)),
        rnn(width1, return_sequences=True, recurrent_dropout=dropout),
        layers.LayerNormalization(),
        rnn(width2, recurrent_dropout=dropout),
        layers.LayerNormalization(),
        layers.Dense(16, activation="relu", kernel_regularizer=regularizers.l2(1e-5)),
        layers.Dense(1, activation="sigmoid"),
    ]
    m = models.Sequential(stack)

    adam = optimizers.Adam(learning_rate=lr)
    tracked = [
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.AUC(name="auprc", curve="PR"),
    ]
    m.compile(optimizer=adam, loss="binary_crossentropy", metrics=tracked)
    return m

def fit_eval_fold(X_tr, y_tr, X_va, y_va, lookback, hp, seed=SEED, batch=BATCH, epochs=EPOCHS_GRID):
    """
    Train one model on a single walk-forward fold and score it on the fold's validation part.

    Steps: fit a StandardScaler on the training rows only, window both parts
    with `make_windows`, train the network from `build_model` with early
    stopping on `val_auprc`, then compute metrics on the validation windows.

    Parameters
    ----------
    X_tr, X_va : pandas.DataFrame
        Training / validation feature rows.
    y_tr, y_va : pandas.Series
        Labels aligned with X_tr / X_va.
    lookback : int
        Window length handed to `make_windows`.
    hp : dict
        Needs the keys "width1", "width2", "dropout", "lr" and "cell"
        ("GRU" selects GRU cells, any other value LSTM).
    seed, batch, epochs
        Random seed, batch size and maximum number of epochs. The defaults are
        bound when this cell is executed; re-run the cell after changing
        SEED / BATCH / EPOCHS_GRID.

    Returns
    -------
    dict
        Keys "mcc" and "bal_acc" (predictions thresholded at 0.5), "auprc",
        "auroc" and "epochs_trained". "auroc" is NaN when the validation
        windows contain a single class only.

    Notes
    -----
    NOTE(review): the validation data drives early stopping / LR scheduling and
    is also the data the returned metrics are computed on, so the scores may be
    optimistic — confirm whether a separate hold-out is wanted.
    """
    tf.keras.backend.clear_session()

    # Fit the scaler on the training rows only
    scaler = StandardScaler()
    X_tr_s = pd.DataFrame(scaler.fit_transform(X_tr), index=X_tr.index, columns=X_tr.columns)
    X_va_s = pd.DataFrame(scaler.transform(X_va),     index=X_va.index, columns=X_va.columns)

    Xtr_win, ytr = make_windows(X_tr_s, y_tr, lookback)
    Xva_win, yva = make_windows(X_va_s, y_va, lookback)

    # Only the training windows are shuffled; validation keeps its order.
    ds_tr = tf.data.Dataset.from_tensor_slices((Xtr_win, ytr)) \
            .shuffle(len(Xtr_win), seed=seed) \
            .batch(batch).prefetch(tf.data.AUTOTUNE)
    ds_va = tf.data.Dataset.from_tensor_slices((Xva_win, yva)) \
            .batch(batch).prefetch(tf.data.AUTOTUNE)

    # Re-seed right before the model is built.
    tf.keras.utils.set_random_seed(seed)
    model = build_model(
        n_features=Xtr_win.shape[-1],
        width1=hp["width1"], width2=hp["width2"],
        dropout=hp["dropout"], lr=hp["lr"], use_gru=(hp["cell"]=="GRU")
    )

    cbs = [
        callbacks.EarlyStopping(monitor="val_auprc", mode="max", patience=6, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor="val_auprc", mode="max", factor=0.5, patience=3, min_lr=1e-5),
        callbacks.TerminateOnNaN(),
    ]

    print(f"  -> fit (epochs={epochs}, batch={batch}) ...", flush=True)
    hist = model.fit(ds_tr, validation_data=ds_va, epochs=epochs, verbose=1 if FAST else 0, callbacks=cbs)

    yva_proba = model.predict(ds_va, verbose=0).ravel()
    yva_pred  = (yva_proba >= 0.5).astype(int)

    # ROC AUC is undefined for a single-class fold; scikit-learn versions that
    # raise in that case would abort the whole search. Record NaN instead.
    if np.unique(yva).size < 2:
        auroc = float("nan")
    else:
        auroc = float(roc_auc_score(yva, yva_proba))

    metrics = dict(
        mcc=float(matthews_corrcoef(yva, yva_pred)),
        bal_acc=float(balanced_accuracy_score(yva, yva_pred)),
        auprc=float(average_precision_score(yva, yva_proba)),
        auroc=auroc,
        epochs_trained=int(len(hist.history["loss"]))
    )
    tf.keras.backend.clear_session()
    return metrics

In [40]:
# --- Grids ---------------------------------------------------------------
# FAST and full runs currently share the same single candidate; the two
# branches are kept apart so the full grid can grow independently.
if FAST:
    LOOKBACK_GRID = [LOOKBACK_DEFAULT]
    HP_GRID = [
        dict(width1=32, width2=16, dropout=0.10, lr=5e-4, cell="GRU"),
    ]
else:
    LOOKBACK_GRID = [LOOKBACK_DEFAULT]  # further lookbacks can be added here
    HP_GRID = [
        dict(width1=32, width2=16, dropout=0.10, lr=5e-4, cell="GRU"),
        # further optional candidates:
        # dict(width1=64, width2=32, dropout=0.10, lr=5e-4, cell="LSTM"),
    ]

# "mom_only" keeps the columns whose name contains "logret" or "macd", plus three named indicators.
FEATURE_SUBSETS = {
    "all": FEATURES_ALL,
    "mom_only": [
        c for c in FEATURES_ALL
        if any(token in c for token in ("logret", "macd")) or c in {"sma_diff", "rsi_14", "bb_pos"}
    ],
}

print("Sanity check vor der Suche:")
sanity_rows = [
    ("n Zeilen df", len(df)),
    ("FEATURESET", FEATURESET),
    ("FEATURES_ALL", len(FEATURES_ALL)),
    ("FEATURE_SUBSETS", list(FEATURE_SUBSETS.keys())),
    ("LOOKBACK_GRID", LOOKBACK_GRID),
    ("HP_GRID", len(HP_GRID)),
    ("#splits", len(splits)),
]
for label, value in sanity_rows:
    print(f"  {label}:", value)
if len(splits) > 0:
    tr_s, va_s = splits[0]
    print("  Fold0 sizes (train/val):", tr_s.stop, va_s.stop - va_s.start)

Sanity check vor der Suche:
  n Zeilen df: 3402
  FEATURESET: v2
  FEATURES_ALL: 11
  FEATURE_SUBSETS: ['all', 'mom_only']
  LOOKBACK_GRID: [60]
  HP_GRID: 1
  #splits: 5
  Fold0 sizes (train/val): 1531 680


In [41]:
# ---- Search --------------------------------------------------------------
from time import perf_counter
print("Starte Suche ...", flush=True)
records = []
total_combos = 0

def _size(slc):
    """Length of a slice; a missing start counts as 0."""
    start = slc.start or 0
    return slc.stop - start

t0 = perf_counter()
for feat_name, FEATS in FEATURE_SUBSETS.items():
    if len(FEATS) == 0:
        print(f"[{feat_name}] übersprungen: 0 Features.")
        continue

    for lookback in LOOKBACK_GRID:
        # Minimum row counts a fold needs so that windowing leaves enough samples.
        min_train_rows = lookback + 50
        min_val_rows = lookback + 10

        for hp in HP_GRID:
            total_combos += 1
            ran_folds = 0
            # Log prefixes: the short one for the summary line, the detailed one per fold.
            combo_tag = f"{feat_name} | LB={lookback} | {hp['cell']} {hp['width1']}/{hp['width2']}"
            detail_tag = f"[{combo_tag} dp={hp['dropout']} lr={hp['lr']}]"

            for fold_id, (tr_s, va_s) in enumerate(splits, start=1):
                X_tr = X_full.iloc[tr_s][FEATS]
                y_tr = y_full.iloc[tr_s]
                X_va = X_full.iloc[va_s][FEATS]
                y_va = y_full.iloc[va_s]

                too_short = (len(X_tr) < min_train_rows) or (len(X_va) < min_val_rows)
                if too_short:
                    print(f"{detail_tag} Skip-Fold (zu kurz): "
                          f"train={len(X_tr)}, val={len(X_va)}, needed>={min_train_rows}/{min_val_rows}")
                    continue

                mets = fit_eval_fold(X_tr, y_tr, X_va, y_va, lookback, hp, epochs=EPOCHS_GRID)
                ran_folds += 1

                # One record per (combination, fold); key order defines the CSV column order.
                rec = dict(
                    feature_set=FEATURESET,
                    features_used=feat_name,
                    n_features=len(FEATS),
                    lookback=lookback,
                )
                rec.update(hp)
                rec["fold"] = fold_id
                rec.update(mets)
                records.append(rec)
                print(f"{detail_tag} Fold{fold_id}: "
                      f"MCC={mets['mcc']:.3f} AUPRC={mets['auprc']:.3f}")

            print(f"--> Summary [{combo_tag}] ran_folds={ran_folds}/{len(splits)}")

t1 = perf_counter()
elapsed = t1 - t0
print(f"\nSuche fertig: combos={total_combos}, records={len(records)}, Dauer={elapsed:.1f}s")

results = pd.DataFrame.from_records(records)
csv_path = RUN_DIR / "wfcv_results.csv"
results.to_csv(csv_path, index=False)
print("Geschrieben:", csv_path)

Starte Suche ...
  -> fit (epochs=60, batch=64) ...
[all | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold1: MCC=-0.086 AUPRC=0.625
  -> fit (epochs=60, batch=64) ...
[all | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold2: MCC=-0.031 AUPRC=0.611
  -> fit (epochs=60, batch=64) ...
[all | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold3: MCC=0.074 AUPRC=0.535
  -> fit (epochs=60, batch=64) ...
[all | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold4: MCC=0.024 AUPRC=0.563
  -> fit (epochs=60, batch=64) ...
[all | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold5: MCC=-0.037 AUPRC=0.524
--> Summary [all | LB=60 | GRU 32/16] ran_folds=5/5
  -> fit (epochs=60, batch=64) ...
[mom_only | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold1: MCC=0.022 AUPRC=0.673
  -> fit (epochs=60, batch=64) ...
[mom_only | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold2: MCC=0.073 AUPRC=0.653
  -> fit (epochs=60, batch=64) ...
[mom_only | LB=60 | GRU 32/16 dp=0.1 lr=0.0005] Fold3: MCC=0.107 AUPRC=0.538
  -> fit (epochs=60, batch=64) ...
[mom_only | LB=60 | GR

In [43]:
# ---- Aggregation & best config -------------------------------------------
# One aggregated row per configuration (everything except the fold id).
agg_cols = [
    "feature_set", "features_used", "n_features", "lookback",
    "width1", "width2", "dropout", "lr", "cell",
]
fold_stats = {
    "mcc_mean": ("mcc", "mean"),
    "mcc_std": ("mcc", "std"),
    "auprc_mean": ("auprc", "mean"),
    "auprc_std": ("auprc", "std"),
    "auroc_mean": ("auroc", "mean"),
    "balacc_mean": ("bal_acc", "mean"),
    "n_folds": ("mcc", "count"),
}
agg = results.groupby(agg_cols).agg(**fold_stats).reset_index()

# Ranking: mcc_mean (desc) -> auprc_mean (desc) -> mcc_std (asc)
rank_keys = ["mcc_mean", "auprc_mean", "mcc_std"]
rank_ascending = [False, False, True]
agg = agg.sort_values(rank_keys, ascending=rank_ascending)
agg.to_csv(RUN_DIR / "wfcv_results_agg.csv", index=False)

# The top-ranked row is the best configuration (empty dict when nothing was aggregated).
if len(agg):
    best = agg.iloc[0].to_dict()
else:
    best = {}
with open(RUN_DIR / "best_config.json", "w") as f:
    json.dump(best, f, indent=2)

# (Optional) top 5
agg.head(5).to_csv(RUN_DIR / "wfcv_results_top5.csv", index=False)

print("Best config:", best)

Best config: {'feature_set': 'v2', 'features_used': 'mom_only', 'n_features': 9, 'lookback': 60, 'width1': 32, 'width2': 16, 'dropout': 0.1, 'lr': 0.0005, 'cell': 'GRU', 'mcc_mean': 0.0452054373880242, 'mcc_std': 0.04308407858833591, 'auprc_mean': 0.6119489902015658, 'auprc_std': 0.053643328854503874, 'auroc_mean': 0.5534801696414117, 'balacc_mean': 0.5103286957052815, 'n_folds': 5}


In [44]:
# ---- Score grids as plots --------------------------------------------------
# Rows: lookback; columns: (features_used, cell, width1).
pivot_layout = dict(index="lookback", columns=["features_used", "cell", "width1"])
pivot_mcc = agg.pivot_table(values="mcc_mean", **pivot_layout)
pivot_au = agg.pivot_table(values="auprc_mean", **pivot_layout)

for name, pivot in (("score_grid_mcc.png", pivot_mcc), ("score_grid_auprc.png", pivot_au)):
    fig, ax = plt.subplots(figsize=(10, 5))
    im = ax.imshow(pivot.values, aspect="auto")
    fig.colorbar(im, ax=ax)
    ax.set_yticks(range(len(pivot.index)))
    ax.set_yticklabels(pivot.index)
    ax.set_xticks(range(pivot.shape[1]))
    ax.set_xticklabels([str(c) for c in pivot.columns], rotation=45, ha="right")
    # The title is the file name without extension, underscores turned into spaces.
    ax.set_title(name.replace("_", " ").replace(".png", ""))
    fig.tight_layout()
    fig.savefig(RUN_DIR / "plots" / name, dpi=160)
    plt.close(fig)

In [45]:
# ---- Run-info dump --------------------------------------------------------
# "yaml" only when the YAML file exists and supplied a horizon; otherwise the
# definition is reported as inferred from a CSV file name.
yaml_gave_label = os.path.exists(yaml_path) and (label_h is not None)
label_resolution = {
    "source": "yaml" if yaml_gave_label else "inferred_from_csv",
    "yaml_path": yaml_path,
}
label_definition = {"horizon": H_FOR_FILE, "mode": MODE_FOR_FILE, "epsilon": EPS_FOR_FILE}

run_info = {
    "seed": SEED,
    "epochs_grid": EPOCHS_GRID,
    "n_folds": N_FOLDS,
    "val_frac": 0.20,
    "min_train_frac": 0.45,
    "lookback_grid": LOOKBACK_GRID,
    "hp_grid": HP_GRID,
    "feature_subsets": list(FEATURE_SUBSETS.keys()),
    "train_csv": TRAIN_CSV,
    "label_resolution": label_resolution,
    "labels": label_definition,
}
with open(RUN_DIR / "wfcv_run_info.json", "w") as f:
    json.dump(run_info, f, indent=2)

print("\nBlock 5 abgeschlossen. Artefakte:")
artifact_names = (
    "wfcv_results.csv",
    "wfcv_results_agg.csv",
    "best_config.json",
    "wfcv_results_top5.csv",
    "plots",
)
for artifact in artifact_names:
    print(" -", RUN_DIR / artifact)


Block 5 abgeschlossen. Artefakte:
 - ..\results\2025-10-19_16-45-57_wfcv\wfcv_results.csv
 - ..\results\2025-10-19_16-45-57_wfcv\wfcv_results_agg.csv
 - ..\results\2025-10-19_16-45-57_wfcv\best_config.json
 - ..\results\2025-10-19_16-45-57_wfcv\wfcv_results_top5.csv
 - ..\results\2025-10-19_16-45-57_wfcv\plots
