In [15]:
# --- Block 5: Walk-Forward Cross-Validation + Hyperparameter-Search ---
import os, sys, json, time
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

ROOT = os.path.abspath("..")
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

In [16]:
import os, logging
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)


In [17]:
# ---- Config laden ------------------------------------------------------
with open(os.path.join(ROOT, "config.json"), "r") as f:
    C = json.load(f)

TICKER, START, END, INTERVAL = C["ticker"], C["start"], C["end"], C["interval"]
HORIZON  = int(C["horizon"])
LOOKBACK_DEFAULT = int(C["lookback"])  # kann im Grid überschrieben werden
BATCH    = int(C.get("batch", 64))
EPOCHS   = int(C.get("epochs", 60))
SEED     = int(C.get("seed", 42))
FEATURESET = C.get("featureset", "v2")  # "v1" oder "v2"
EPS_MODE   = C.get("epsilon_mode", "abs")
EPSILON    = float(C.get("epsilon", 0.0005))

np.random.seed(SEED)
import tensorflow as tf
tf.random.set_seed(SEED)

RESULTS_DIR = Path(C.get("results_dir", "../results"))
RUN_DIR = RESULTS_DIR / time.strftime("%Y-%m-%d_%H-%M-%S_wfcv")
(RUN_DIR / "plots").mkdir(parents=True, exist_ok=True)
print("WFCV_RUN_DIR:", RUN_DIR)

WFCV_RUN_DIR: ..\results\2025-10-18_16-11-46_wfcv


In [18]:
# Quick toggle für schnelle, sichtbare Läufe
FAST = True  # <- für Smoke-Test einschalten; für den „vollen“ Run auf False

if FAST:
    EPOCHS = min(EPOCHS, 25)        # kürzer trainieren
    C["wfcv_folds"] = 3             # weniger Folds


In [19]:
# ---- Daten & Features --------------------------------------------------
import yaml, os, glob

# aus config lesen
EPS_MODE   = C.get("epsilon_mode", "abs")
EPSILON    = float(C.get("epsilon", 0.001))
FEATURESET = C.get("featureset", "v2")

# YAML-Metadaten aus Block 2 bevorzugen (garantiert konsistent zu erzeugtem CSV)
yaml_path = f"../data/features_{FEATURESET}.yml"
meta = {}
label_h = label_mode = label_eps = None
if os.path.exists(yaml_path):
    with open(yaml_path, "r") as f:
        meta = yaml.safe_load(f) or {}
    lab = (meta or {}).get("label", {})
    label_h   = lab.get("horizon", None)
    label_mode= lab.get("mode", None)
    label_eps = lab.get("epsilon", None)

H_FOR_FILE     = int(label_h) if label_h is not None else int(C["horizon"])
MODE_FOR_FILE  = str(label_mode) if label_mode is not None else EPS_MODE
EPS_FOR_FILE   = float(label_eps) if label_eps is not None else EPSILON

eps_tag   = f"{MODE_FOR_FILE}{str(EPS_FOR_FILE).replace('.','p')}"
TRAIN_CSV = f"../data/{TICKER}_{INTERVAL}_{START}_{END}_cls_h{H_FOR_FILE}_{eps_tag}.csv"

if not os.path.exists(TRAIN_CSV):
    # Fallback: suche passendes File per glob (selber Ticker/Zeitraum, selbe eps_tag)
    pat = f"../data/{TICKER}_{INTERVAL}_{START}_{END}_cls_h*_{eps_tag}.csv"
    candidates = sorted(glob.glob(pat), key=os.path.getmtime)
    if candidates:
        TRAIN_CSV = candidates[-1]
    else:
        raise FileNotFoundError(
            f"Train CSV nicht gefunden: {TRAIN_CSV}\n"
            f"Gesucht nach Pattern: {pat}\n"
            f"Hinweis: Block 2 mit HORIZON/epsilon erneut laufen lassen."
        )

print("Loaded TRAIN_CSV:", TRAIN_CSV)
df = pd.read_csv(TRAIN_CSV, index_col=0, parse_dates=True).sort_index()

OHLCV = {"open","high","low","close","volume"}

# Feature-Liste aus YAML, Fallback: alle nicht-OHLCV/target
if meta:
    FEATURES_ALL = [c for c in meta.get("features", []) if c in df.columns]
else:
    FEATURES_ALL = [c for c in df.columns if c not in (OHLCV | {"target"})]
assert len(FEATURES_ALL) > 0, "Keine Features gefunden."

X_full = df[FEATURES_ALL].copy()
y_full = df["target"].astype(int).copy()


Loaded TRAIN_CSV: ../data/AAPL_1d_2012-01-01_2025-09-01_cls_h5_abs0p0005.csv


In [20]:
print("Label pos_rate:", round(y_full.mean(), 3), "| n:", len(y_full))

Label pos_rate: 0.563 | n: 3402


In [21]:
# ---- Walk-Forward Splits -----------------------------------------------
def make_wf_splits(n, n_folds=5, val_frac=0.15, min_train_frac=0.50):
    """Rolling WF: Train = [0:train_end], Val = (train_end:train_end+val_len].
       Schritte so gewählt, dass wir n_folds valide Fenster bekommen."""
    val_len = max(60, int(n * val_frac))     # min ~3 Monate (bei 1d ~60)
    min_train = max(200, int(n * min_train_frac))
    stops = []
    # letztes Val-Ende darf nicht über n hinausgehen
    last_val_end = n
    # wir verteilen n_folds Stopps gleichmäßig zwischen [min_train+val_len, n]
    span = last_val_end - (min_train + val_len)
    if span < n_folds:
        n_folds = max(1, span // max(1, val_len//2))
    for k in range(n_folds):
        val_end = min_train + val_len + int((k+1) * span / (n_folds))
        train_end = val_end - val_len
        train_slice = slice(0, train_end)             # [0, train_end)
        val_slice   = slice(train_end, val_end)       # [train_end, val_end)
        stops.append((train_slice, val_slice))
    return stops

n = len(df)
splits = make_wf_splits(n, n_folds=int(C.get("wfcv_folds", 5)))
print("Anzahl Folds:", len(splits))

Anzahl Folds: 3


In [22]:
# ---- Hilfsfunktionen: Windowing + Pipeline -----------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    balanced_accuracy_score, matthews_corrcoef,
    average_precision_score, roc_auc_score
)

def make_windows(X_df, y_ser, lookback):
    Xv = X_df.values.astype(np.float32)
    yv = y_ser.values.astype(np.int32)
    xs, ys = [], []
    for i in range(lookback-1, len(X_df)):
        xs.append(Xv[i - lookback + 1 : i + 1])
        ys.append(yv[i])
    return np.stack(xs, axis=0), np.array(ys)

from tensorflow.keras import layers, regularizers, callbacks, optimizers, models

def build_model(n_features, width1=64, width2=32, dropout=0.10, lr=5e-4, use_gru=True):
    rnn = layers.GRU if use_gru else layers.LSTM
    m = models.Sequential([
        layers.Input(shape=(None, n_features)),
        rnn(width1, return_sequences=True, recurrent_dropout=dropout),
        layers.LayerNormalization(),
        rnn(width2, recurrent_dropout=dropout),
        layers.LayerNormalization(),
        layers.Dense(16, activation="relu", kernel_regularizer=regularizers.l2(1e-5)),
        layers.Dense(1, activation="sigmoid"),
    ])
    m.compile(
        optimizer=optimizers.Adam(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(name="auc"),
                 tf.keras.metrics.AUC(name="auprc", curve="PR")]
    )
    return m

def fit_eval_fold(X_tr, y_tr, X_va, y_va, lookback, hp, seed=SEED, batch=BATCH, epochs=EPOCHS):
    tf.keras.backend.clear_session()

    # Scaler nur auf Train
    scaler = StandardScaler()
    X_tr_s = pd.DataFrame(scaler.fit_transform(X_tr), index=X_tr.index, columns=X_tr.columns)
    X_va_s = pd.DataFrame(scaler.transform(X_va),     index=X_va.index, columns=X_va.columns)

    Xtr_win, ytr = make_windows(X_tr_s, y_tr, lookback)
    Xva_win, yva = make_windows(X_va_s, y_va, lookback)

    ds_tr = tf.data.Dataset.from_tensor_slices((Xtr_win, ytr)).shuffle(len(Xtr_win), seed=seed)\
            .batch(batch).prefetch(tf.data.AUTOTUNE)
    ds_va = tf.data.Dataset.from_tensor_slices((Xva_win, yva)).batch(batch).prefetch(tf.data.AUTOTUNE)

    tf.keras.utils.set_random_seed(seed)
    model = build_model(
        n_features=Xtr_win.shape[-1],
        width1=hp["width1"], width2=hp["width2"],
        dropout=hp["dropout"], lr=hp["lr"], use_gru=(hp["cell"]=="GRU")
    )

    cbs = [
        callbacks.EarlyStopping(monitor="val_auprc", mode="max", patience=6, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor="val_auprc", mode="max", factor=0.5, patience=3, min_lr=1e-5),
        callbacks.TerminateOnNaN(),
    ]

    print(f"  -> fit (epochs={epochs}, batch={batch}) ...", flush=True)
    hist = model.fit(ds_tr, validation_data=ds_va, epochs=epochs, verbose=1 if FAST else 0, callbacks=cbs)

    yva_proba = model.predict(ds_va, verbose=0).ravel()
    yva_pred  = (yva_proba >= 0.5).astype(int)

    metrics = dict(
        mcc=float(matthews_corrcoef(yva, yva_pred)),
        bal_acc=float(balanced_accuracy_score(yva, yva_pred)),
        auprc=float(average_precision_score(yva, yva_proba)),
        auroc=float(roc_auc_score(yva, yva_proba)),
        epochs_trained=int(len(hist.history["loss"]))
    )
    tf.keras.backend.clear_session()
    return metrics

In [23]:
LOOKBACK_GRID = [60] if FAST else [60, 90]
HP_GRID = [
    dict(width1=32, width2=16, dropout=0.10, lr=5e-4, cell="GRU"),
] if FAST else [
    dict(width1=32, width2=16, dropout=0.10, lr=5e-4, cell="GRU"),
    dict(width1=64, width2=32, dropout=0.10, lr=5e-4, cell="LSTM"),
]

FEATURE_SUBSETS = {
    "all": FEATURES_ALL,
} if FAST else {
    "all": FEATURES_ALL,
    "mom_only": [c for c in FEATURES_ALL if ("logret" in c) or ("macd" in c) or (c in {"sma_diff","rsi_14","bb_pos"})],
}

In [24]:
# Optional: Feature-Subsets (Momentum vs. alles)
FEATURE_SUBSETS = {
    "all": FEATURES_ALL,
    # Momentum-/Trendnah: Returns, MACD-Familie, SMA-Diff, RSI, BB-Position
    "mom_only": [c for c in FEATURES_ALL
                 if ("logret" in c) or ("macd" in c) or (c in {"sma_diff","rsi_14","bb_pos"})],
}

In [25]:
print("Sanity check vor der Suche:")
print("  n Zeilen df:", len(df))
print("  FEATURESET:", FEATURESET)
print("  FEATURES_ALL:", len(FEATURES_ALL))
print("  FEATURE_SUBSETS:", list(FEATURE_SUBSETS.keys()) if 'FEATURE_SUBSETS' in globals() else None)
print("  LOOKBACK_GRID:", LOOKBACK_GRID if 'LOOKBACK_GRID' in globals() else None)
print("  HP_GRID:", len(HP_GRID) if 'HP_GRID' in globals() else None)
print("  #splits:", len(splits) if 'splits' in globals() else None)
if 'splits' in globals() and len(splits)>0:
    tr_s, va_s = splits[0]
    print("  Fold0 sizes (train/val):", tr_s.stop, va_s.stop - va_s.start)


Sanity check vor der Suche:
  n Zeilen df: 3402
  FEATURESET: v2
  FEATURES_ALL: 11
  FEATURE_SUBSETS: ['all', 'mom_only']
  LOOKBACK_GRID: [60]
  HP_GRID: 1
  #splits: 3
  Fold0 sizes (train/val): 2098 510


In [26]:
# ---- Suche ---------------------------------------------------------------
from time import perf_counter
print("Starte Suche ...", flush=True)
records = []
total_combos = 0

def _size(slc):  # helper für readable sizes
    return slc.stop - (slc.start or 0)

t0 = perf_counter()
for feat_name, FEATS in FEATURE_SUBSETS.items():
    if len(FEATS) == 0:
        print(f"[{feat_name}] übersprungen: 0 Features.")
        continue

    for lookback in LOOKBACK_GRID:
        for hp in HP_GRID:
            total_combos += 1
            fold_id = 0
            skipped_too_short = 0
            ran_folds = 0

            for tr_s, va_s in splits:
                fold_id += 1
                X_tr, y_tr = X_full.iloc[tr_s][FEATS], y_full.iloc[tr_s]
                X_va, y_va = X_full.iloc[va_s][FEATS], y_full.iloc[va_s]

                # genug Daten für Windowing?
                if len(X_tr) < (lookback + 50) or len(X_va) < (lookback + 10):
                    skipped_too_short += 1
                    # Optional: einmalig begründen
                    if skipped_too_short == 1:
                        print(f"[{feat_name} | LB={lookback} | {hp['cell']} {hp['width1']}/{hp['width2']} "
                              f"dp={hp['dropout']} lr={hp['lr']}] Skip-Fold (zu kurz): "
                              f"train={len(X_tr)}, val={len(X_va)}, needed>={(lookback+50)}/{(lookback+10)}")
                    continue

                mets = fit_eval_fold(X_tr, y_tr, X_va, y_va, lookback, hp)
                ran_folds += 1

                rec = {
                    "feature_set": FEATURESET,
                    "features_used": feat_name,
                    "n_features": len(FEATS),
                    "lookback": lookback,
                    **hp,
                    "fold": fold_id,
                    **mets
                }
                records.append(rec)
                print(f"[{feat_name} | LB={lookback} | {hp['cell']} {hp['width1']}/{hp['width2']} "
                      f"dp={hp['dropout']} lr={hp['lr']}] Fold{fold_id}: "
                      f"MCC={mets['mcc']:.3f} AUPRC={mets['auprc']:.3f}")

            # Kombinations-Zusammenfassung
            print(f"--> Summary [{feat_name} | LB={lookback} | {hp['cell']} {hp['width1']}/{hp['width2']}] "
                  f"ran_folds={ran_folds}, skipped_too_short={skipped_too_short}")

t1 = perf_counter()
print(f"\nSuche fertig: combos={total_combos}, records={len(records)}, Dauer={t1-t0:.1f}s")

results = pd.DataFrame.from_records(records)
csv_path = RUN_DIR / "wfcv_results.csv"
results.to_csv(csv_path, index=False)
print("Geschrieben:", csv_path)


Starte Suche ...
  -> fit (epochs=25, batch=64) ...
Epoch 1/25
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - auc: 0.5046 - auprc: 0.5797 - loss: 0.7429 - val_auc: 0.4137 - val_auprc: 0.4655 - val_loss: 0.7400 - learning_rate: 5.0000e-04
Epoch 2/25
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - auc: 0.5411 - auprc: 0.6138 - loss: 0.6814 - val_auc: 0.4068 - val_auprc: 0.4662 - val_loss: 0.7512 - learning_rate: 5.0000e-04
Epoch 3/25
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - auc: 0.6004 - auprc: 0.6631 - loss: 0.6664 - val_auc: 0.4283 - val_auprc: 0.4777 - val_loss: 0.7597 - learning_rate: 5.0000e-04
Epoch 4/25
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - auc: 0.6250 - auprc: 0.6909 - loss: 0.6592 - val_auc: 0.4290 - val_auprc: 0.4741 - val_loss: 0.7609 - learning_rate: 5.0000e-04
Epoch 5/25
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - a

In [28]:
# ---- Aggregation & Best-Config -----------------------------------------
agg_cols = ["feature_set","features_used","n_features","lookback","width1","width2","dropout","lr","cell"]
agg = (results.groupby(agg_cols)
       .agg(mcc_mean=("mcc","mean"), mcc_std=("mcc","std"),
            auprc_mean=("auprc","mean"), auprc_std=("auprc","std"),
            auroc_mean=("auroc","mean"),
            balacc_mean=("bal_acc","mean"),
            n_folds=("mcc","count"))
       .reset_index())

# Primär: mcc_mean (desc), Sekundär: auprc_mean (desc), Tertiär: mcc_std (asc)
agg = agg.sort_values(["mcc_mean","auprc_mean","mcc_std"], ascending=[False, False, True])
agg.to_csv(RUN_DIR / "wfcv_results_agg.csv", index=False)

best = agg.iloc[0].to_dict()
with open(RUN_DIR / "best_config.json", "w") as f:
    json.dump(best, f, indent=2)
print("Best config:", best)

# (Optional) Top-5 als Quick-View Datei für Review:
agg.head(5).to_csv(RUN_DIR / "wfcv_results_top5.csv", index=False)


Best config: {'feature_set': 'v2', 'features_used': 'all', 'n_features': 11, 'lookback': 60, 'width1': 32, 'width2': 16, 'dropout': 0.1, 'lr': 0.0005, 'cell': 'GRU', 'mcc_mean': 0.06302429207806833, 'mcc_std': 0.07025573661938782, 'auprc_mean': 0.5348673311040054, 'auprc_std': 0.05858423890602883, 'auroc_mean': 0.5107069887569304, 'balacc_mean': 0.5252016599121944, 'n_folds': 3}


In [29]:
# ---- einfache Score-Grids als Plot -------------------------------------
pivot_mcc = agg.pivot_table(index="lookback",
                            columns=["features_used", "cell", "width1"],
                            values="mcc_mean")
pivot_au  = agg.pivot_table(index="lookback",
                            columns=["features_used", "cell", "width1"],
                            values="auprc_mean")

for name, pivot in [("score_grid_mcc.png", pivot_mcc), ("score_grid_auprc.png", pivot_au)]:
    plt.figure(figsize=(10,5))
    im = plt.imshow(pivot.values, aspect="auto")
    plt.colorbar(im)
    plt.yticks(range(len(pivot.index)), pivot.index)
    plt.xticks(range(pivot.shape[1]), [str(c) for c in pivot.columns], rotation=45, ha="right")
    plt.title(name.replace("_"," ").replace(".png",""))
    plt.tight_layout()
    plt.savefig(RUN_DIR / "plots" / name, dpi=160)
    plt.close()

print("\nBlock 5 abgeschlossen. Artefakte:")
print(" -", RUN_DIR / "wfcv_results.csv")
print(" -", RUN_DIR / "wfcv_results_agg.csv")
print(" -", RUN_DIR / "best_config.json")
print(" -", RUN_DIR / "plots")


Block 5 abgeschlossen. Artefakte:
 - ..\results\2025-10-18_16-11-46_wfcv\wfcv_results.csv
 - ..\results\2025-10-18_16-11-46_wfcv\wfcv_results_agg.csv
 - ..\results\2025-10-18_16-11-46_wfcv\best_config.json
 - ..\results\2025-10-18_16-11-46_wfcv\plots
