In [90]:
import os, sys, json, time, glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, average_precision_score,
    classification_report, confusion_matrix, brier_score_loss,
    balanced_accuracy_score, matthews_corrcoef
)
from sklearn.calibration import calibration_curve
from sklearn.isotonic import IsotonicRegression
import joblib

In [91]:
# --- Setup & load config ---
# Make the parent directory of the notebook importable (presumably the project
# root that holds the shared modules -- confirm against the repo layout).
ROOT = os.path.abspath("..")
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

# Decode the JSON config as UTF-8 explicitly; without `encoding` the platform
# default (e.g. cp1252 on Windows) would be used and non-ASCII values could be
# mis-decoded or raise.
with open(os.path.join(ROOT, "config.json"), "r", encoding="utf-8") as f:
    C = json.load(f)

# Data-set identification (also used to build the CSV file name later on)
TICKER   = C["ticker"]
START    = C["start"]
END      = C["end"]
INTERVAL = C["interval"]

# Modelling hyper-parameters; cast to int because JSON may carry them as floats/strings
HORIZON  = int(C["horizon"])
LOOKBACK = int(C["lookback"])
BATCH    = int(C["batch"])
EPOCHS   = int(C["epochs"])
SEED     = int(C.get("seed", 42))

# Directory that contains one sub-directory per training run; created if absent
RESULTS_DIR = Path(C.get("results_dir", "../results"))
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

def _latest_run_dir(results_dir: Path) -> Path:
    runs = sorted(results_dir.glob("*_lstm"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not runs:
        raise FileNotFoundError("Kein RUN_DIR gefunden. Bitte Block 3 trainieren.")
    return runs[0]

# Evaluate whichever run directory was modified most recently.
RUN_DIR = _latest_run_dir(results_dir=RESULTS_DIR)
print("RUN_DIR:", RUN_DIR)

RUN_DIR: ..\results\2025-10-03_13-48-57_lstm


In [92]:
# --- Load artifacts ---
# Prefer "best.keras" when the run directory contains it; otherwise fall back
# to "model.keras".
_best_ckpt  = RUN_DIR / "best.keras"
MODEL_PATH  = _best_ckpt if _best_ckpt.exists() else (RUN_DIR / "model.keras")
SCALER_PATH = RUN_DIR / "scaler.joblib"
CFG_PATH    = RUN_DIR / "config.json"

# Fail with the list of missing files. An explicit raise (instead of `assert`)
# also survives `python -O` and matches the error style of `_latest_run_dir`.
_missing = [str(p) for p in (MODEL_PATH, SCALER_PATH, CFG_PATH) if not p.exists()]
if _missing:
    raise FileNotFoundError(f"Fehlende Artefakte: {_missing}")

with open(CFG_PATH, "r", encoding="utf-8") as f:
    RCFG = json.load(f)

# Important: compile=False -- the model is only used for inference here
model  = keras.models.load_model(MODEL_PATH, compile=False)
# NOTE: joblib.load unpickles arbitrary objects; only load run directories you trust.
scaler = joblib.load(SCALER_PATH)

# --- Load data (CSV file named after ticker/interval/date range/horizon) ---
TRAIN_CSV = f"../data/{TICKER}_{INTERVAL}_{START}_{END}_cls_h{HORIZON}.csv"
df = pd.read_csv(TRAIN_CSV, index_col=0, parse_dates=True).sort_index()

In [93]:
# Determine the feature columns: everything except OHLCV + target, optionally
# restricted to an explicitly requested list.
OHLCV = {"open","high","low","close","volume"}
candidates = [c for c in df.columns if c not in (OHLCV | {"target"})]

# `FEATURES` used to be read here before it was ever assigned, which raises a
# NameError on a fresh kernel. Resolve the requested list explicitly instead.
# NOTE(review): the config key name "features" is an assumption -- confirm
# against what the training block writes into config.json.
_requested = RCFG.get("features") or C.get("features")
if not _requested:
    # Fall back to the column names the scaler was fitted on (scikit-learn sets
    # `feature_names_in_` only when it was fitted on a DataFrame).
    _scaler_cols = getattr(scaler, "feature_names_in_", None)
    _requested = [str(c) for c in _scaler_cols] if _scaler_cols is not None else []

if _requested:
    # Keep the requested order, drop anything that is not an eligible column.
    FEATURES = [c for c in _requested if c in candidates]
else:
    FEATURES = candidates
assert len(FEATURES) > 0, f"Keine Features gefunden. Kandidaten: {candidates}"

X = df[FEATURES].copy()
y = df["target"].astype(int).copy()

In [94]:
# --- Chronological splits (70/15/15) ---
# Rows are taken in index order; the test split receives whatever remains
# after the truncated train/validation sizes.
n = len(df)
n_train = int(n * 0.70)
n_val   = int(n * 0.15)
n_test  = n - n_train - n_val

_val_end = n_train + n_val
train_idx, val_idx, test_idx = (
    slice(0, n_train),
    slice(n_train, _val_end),
    slice(_val_end, n),
)

X_train, X_val, X_test = (X.iloc[part] for part in (train_idx, val_idx, test_idx))
y_train, y_val, y_test = (y.iloc[part] for part in (train_idx, val_idx, test_idx))

In [95]:
# --- Scaling ---
# The scaler loaded from the run directory is only applied (transform), never
# re-fitted here; each result keeps the split's index and the feature names.
X_train_s, X_val_s, X_test_s = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=FEATURES)
    for part in (X_train, X_val, X_test)
)

In [96]:
# --- Consistency checks: feature list vs. model ---
# 1) Pin the order as a plain list so scaler and model see the same column order
FEATURES = list(FEATURES)
print("FEATURES (eval):", FEATURES)

# 2) Check the model's input dimension (last axis of the input shape)
model_in = getattr(model.input_shape, "__iter__", None)
if model_in:
    in_dim = model.input_shape[-1]
else:
    in_dim = model.layers[0].input_shape[-1]

_mismatch_msg = (
    f"Feature-Mismatch: Model expects {in_dim} features, but FEATURES has {len(FEATURES)}.\n"
    "→ Dies passiert z.B., wenn du Block 2/3 mit einem anderen Feature-Set neu trainiert hast, "
    "aber hier noch ein älteres RUN_DIR verwendest."
)
assert in_dim == len(FEATURES), _mismatch_msg

FEATURES (eval): ['logret_1d', 'sma_diff']


In [97]:
# --- Windowing (wie Block 3) ---
def make_windows(X_df: pd.DataFrame, y_ser: pd.Series, lookback: int):
    """Build sliding windows of ``lookback`` consecutive rows.

    Window ``j`` covers rows ``j .. j + lookback - 1`` of ``X_df`` and is
    labelled with the target of its last row.

    Args:
        X_df: Feature frame; rows are taken in their existing order.
        y_ser: Target series with the same length as ``X_df``.
        lookback: Number of rows per window (>= 1).

    Returns:
        Tuple ``(X_win, y_win, idx_end)`` where ``X_win`` is a float32 array of
        shape ``(n_windows, lookback, n_features)``, ``y_win`` an int32 array of
        shape ``(n_windows,)`` and ``idx_end`` a ``DatetimeIndex`` holding the
        index value of each window's last row. When the frame has fewer than
        ``lookback`` rows, correctly shaped empty results are returned.

    Raises:
        ValueError: If ``lookback < 1`` or the lengths of ``X_df`` and
            ``y_ser`` differ.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")
    if len(X_df) != len(y_ser):
        raise ValueError(f"len(X_df)={len(X_df)} differs from len(y_ser)={len(y_ser)}")

    X_values = X_df.values.astype(np.float32)
    y_values = y_ser.values.astype(np.int32)
    n = len(X_df)

    # Too few rows for a single window: np.stack would raise on an empty list,
    # so return empty arrays that still carry the expected shapes/dtypes.
    if n < lookback:
        empty_X = np.empty((0, lookback, X_values.shape[1]), dtype=np.float32)
        return empty_X, np.empty((0,), dtype=np.int32), pd.DatetimeIndex([])

    first_end = lookback - 1  # position of the last row of the first window
    windows = np.stack(
        [X_values[end - lookback + 1 : end + 1] for end in range(first_end, n)],
        axis=0,
    )
    # Labels and timestamps belong to the last row of each window.
    return windows, y_values[first_end:], pd.DatetimeIndex(X_df.index[first_end:])

# Window every split separately so no window spans a split boundary.
Xtr_win, ytr, idx_tr = make_windows(X_df=X_train_s, y_ser=y_train, lookback=LOOKBACK)
Xva_win, yva, idx_va = make_windows(X_df=X_val_s,   y_ser=y_val,   lookback=LOOKBACK)
Xte_win, yte, idx_te = make_windows(X_df=X_test_s,  y_ser=y_test,  lookback=LOOKBACK)

def to_ds(X, y, batch, shuffle):
    """Wrap arrays in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: Feature array; sliced along its first axis.
        y: Label array aligned with ``X``.
        batch: Batch size.
        shuffle: When true, shuffle once with a buffer covering all samples
            (seeded with ``SEED``, not reshuffled between iterations).

    Returns:
        Dataset yielding ``(X_batch, y_batch)`` pairs.
    """
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(X), seed=SEED, reshuffle_each_iteration=False)
    # Honour the `batch` argument; the global BATCH was used here before, so
    # the parameter was silently ignored.
    return ds.batch(batch).prefetch(tf.data.AUTOTUNE)

# Evaluation datasets keep their order (no shuffling) so predictions line up
# with the window labels and timestamps.
ds_val  = to_ds(Xva_win, yva, batch=BATCH, shuffle=False)
ds_test = to_ds(Xte_win, yte, batch=BATCH, shuffle=False)

In [98]:
# --- Raw (uncalibrated) probabilities ---
# Model outputs are flattened to 1-D so they align with the label vectors.
_val_raw  = model.predict(ds_val,  verbose=0)
_test_raw = model.predict(ds_test, verbose=0)
y_val_proba, y_test_proba = _val_raw.ravel(), _test_raw.ravel()

In [99]:
# === Kalibrierung: Isotonic vs. Platt (Temperature-like) ======================
from sklearn.linear_model import LogisticRegression
from scipy.special import logit, expit

# 1) Isotonic regression fitted on the validation probabilities; values
#    outside the fitted range are clipped.
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(y_val_proba, yva)
val_iso, test_iso = iso.transform(y_val_proba), iso.transform(y_test_proba)

# 2) Platt / temperature-like: logistic regression on the logits,
#    i.e. sigmoid(a*logit(p)+b). Probabilities are clipped away from 0/1 so
#    the logit stays finite.
_val_proba_clipped = np.clip(y_val_proba, 1e-6, 1-1e-6)
logit_val = logit(_val_proba_clipped)
lr_platt = LogisticRegression(solver="lbfgs")
lr_platt.fit(logit_val.reshape(-1,1), yva)
def platt_transform(p):
    """Map raw probabilities through the fitted Platt model ``lr_platt``.

    ``p`` is clipped to [1e-6, 1-1e-6], converted to logits and passed to the
    logistic regression; the positive-class probability is returned.
    """
    clipped = np.clip(p, 1e-6, 1-1e-6)
    logits_2d = logit(clipped).reshape(-1,1)
    class_proba = lr_platt.predict_proba(logits_2d)
    return class_proba[:,1]

val_platt  = platt_transform(y_val_proba)
test_platt = platt_transform(y_test_proba)

# Pick the calibrator with the lower validation Brier score (ties go to Platt).
# NOTE(review): both calibrators were fitted on this same validation data, so
# the comparison is in-sample -- confirm this is intended.
brier_iso   = brier_score_loss(yva, val_iso)
brier_platt = brier_score_loss(yva, val_platt)
use_platt = brier_platt <= brier_iso
if use_platt:
    calib_name, y_val_cal, y_test_cal = "platt", val_platt, test_platt
else:
    calib_name, y_val_cal, y_test_cal = "isotonic", val_iso, test_iso

# Persist the chosen calibrator: the isotonic model only when it was selected,
# the Platt slope/intercept always.
_calib_payload = {
    "type": calib_name,
    "iso": None if use_platt else iso,
    "platt_coef": (float(lr_platt.coef_[0,0]), float(lr_platt.intercept_[0])),
}
joblib.dump(_calib_payload, RUN_DIR / "calibrator.pkl")

print(f"Kalibrator gewählt: {calib_name} | Brier(val) iso={brier_iso:.4f}, platt={brier_platt:.4f}")

# === Schwelle via Expected Return auf VAL (mit Pos-Rate-Bounds) ===============
def choose_thr_expected_return(y_true, y_prob, fwd_ret, bounds=(0.2,0.8)):
    """Pick the probability threshold that maximises the mean signalled return.

    Candidates are 0.0, every distinct value of ``y_prob`` and 1.0. For each
    candidate ``t`` the signal is ``y_prob >= t``; candidates whose signal rate
    lies outside ``bounds`` are skipped. The score is the mean over all
    samples of ``signal * fwd_ret``. On ties the lowest threshold wins.

    Args:
        y_true: Unused; kept so the call signature stays unchanged.
        y_prob: Predicted probabilities, 1-D.
        fwd_ret: Forward returns aligned with ``y_prob``. Non-finite entries
            (e.g. NaN where no forward price exists) count as zero return,
            matching the ``fillna(0)`` used for the equity curves. Previously
            a single NaN turned every score into NaN, so no candidate was ever
            accepted and the fallback was returned silently.
        bounds: Inclusive ``(low, high)`` limits for the signal rate.

    Returns:
        ``(threshold, expected_return)``; ``(0.5, -1e9)`` when no candidate
        satisfies ``bounds`` or the input is empty.

    Raises:
        ValueError: If ``y_prob`` and ``fwd_ret`` have different shapes.
    """
    y_prob = np.asarray(y_prob, dtype=float)
    fwd_ret = np.asarray(fwd_ret, dtype=float)
    if y_prob.shape != fwd_ret.shape:
        raise ValueError(f"y_prob shape {y_prob.shape} differs from fwd_ret shape {fwd_ret.shape}")

    best_t, best_er = 0.5, -1e9
    if y_prob.size == 0:
        return best_t, best_er

    realised = np.where(np.isfinite(fwd_ret), fwd_ret, 0.0)
    lo, hi = bounds
    for t in np.r_[0.0, np.unique(y_prob), 1.0]:
        sig = (y_prob >= t).astype(int)
        pr = sig.mean()
        if not (lo <= pr <= hi):
            continue
        er = float((sig * realised).mean())  # mean return per sample when trading the signal
        if er > best_er:
            best_er, best_t = er, float(t)
    return best_t, best_er

# Forward log-returns over HORIZON rows, looked up at the window end
# timestamps of the validation and test splits.
close = df["close"].copy()
_fwd_all = np.log(close.shift(-HORIZON)) - np.log(close)
fwd_val = _fwd_all.reindex(idx_va)
fwd_tst = _fwd_all.reindex(idx_te)

thr, er_val = choose_thr_expected_return(
    y_true=yva, y_prob=y_val_cal, fwd_ret=fwd_val.values, bounds=(0.2,0.8)
)
print(f"Threshold@ER(val) = {thr:.3f} | E[ret]_val={er_val:.6f}")

# Read the labels back from the very dataset that is fed to the model, so
# labels and predictions cannot drift apart by an off-by-one.
# (The loop variable is named `labels` to avoid shadowing the target series `y`.)
y_test_from_ds = np.concatenate([labels.numpy() for _, labels in ds_test], axis=0).astype(int)

y_test_proba = model.predict(ds_test, verbose=0).ravel()
# Apply the calibrator that was selected on validation. A `calibrator` object
# is never created in this notebook, so referencing one fails on a fresh kernel.
y_test_proba_cal = platt_transform(y_test_proba) if use_platt else iso.transform(y_test_proba)

# Make sure labels and probabilities line up
assert len(y_test_from_ds) == len(y_test_proba_cal), (
    f"len(y_true)={len(y_test_from_ds)} vs len(proba)={len(y_test_proba_cal)}"
)

# Apply the threshold
y_test_pred = (y_test_proba_cal >= thr).astype(int)

# Threshold-dependent metrics
cm  = confusion_matrix(y_test_from_ds, y_test_pred)
rep = classification_report(y_test_from_ds, y_test_pred, digits=3, output_dict=True)
bal_acc = balanced_accuracy_score(y_test_from_ds, y_test_pred)
mcc = matthews_corrcoef(y_test_from_ds, y_test_pred)

# Threshold-free metrics on the calibrated probabilities
fpr, tpr, _ = roc_curve(y_test_from_ds, y_test_proba_cal)
roc_auc = auc(fpr, tpr)
prec, rec, _ = precision_recall_curve(y_test_from_ds, y_test_proba_cal)
ap = average_precision_score(y_test_from_ds, y_test_proba_cal)
brier = brier_score_loss(y_test_from_ds, y_test_proba_cal)

# === Decile-Lift & Precision@k ===============================================
def decile_lift(y_true, y_prob, k=10):
    """Positive rate, size and lift per score bucket.

    Samples are ranked by descending ``y_prob`` and cut into ``k`` buckets by
    rank (bucket 1 holds the highest scores). The result is indexed by
    ``decile`` with columns ``pos_rate``, ``count`` and ``lift`` (bucket
    positive rate divided by the overall positive rate).
    """
    ranked = pd.DataFrame({"y":y_true, "p":y_prob}).sort_values("p", ascending=False)
    n_rows = len(ranked)
    bucket_size = n_rows / k
    rank_pos = np.arange(n_rows) + 1
    bucket = np.ceil(rank_pos / bucket_size).astype(int)
    ranked["decile"] = bucket.clip(1, k)

    stats = ranked.groupby("decile")["y"].agg(["mean","count"])
    stats = stats.rename(columns={"mean":"pos_rate"})
    base_rate = ranked["y"].mean()
    stats["lift"] = stats["pos_rate"] / base_rate
    return stats

def precision_at_k(y_true, y_prob, ks=(25,50,100)):
    """Share of positives among the ``k`` highest-scored samples.

    Args:
        y_true: Binary labels (array-like, including pandas Series).
        y_prob: Scores aligned position-by-position with ``y_true``.
        ks: Cut-offs; a ``k`` larger than the sample count uses all samples.

    Returns:
        Dict ``{"P@k": precision}``; NaN when no sample is selected
        (empty input or ``k <= 0``).

    Raises:
        ValueError: If ``y_true`` and ``y_prob`` differ in length.
    """
    # Work on plain arrays: indexing a pandas Series with positions would be
    # interpreted as label lookup and pick the wrong rows or raise KeyError.
    labels = np.asarray(y_true)
    scores = np.asarray(y_prob, dtype=float)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(f"len(y_true)={labels.shape[0]} differs from len(y_prob)={scores.shape[0]}")

    order = np.argsort(-scores)
    out = {}
    for k in ks:
        # Clamp to [0, n] so a non-positive k cannot slice from the end.
        top = order[:max(0, min(k, len(order)))]
        out[f"P@{k}"] = float(labels[top].mean()) if top.size else float("nan")
    return out

# Ranking diagnostics on the calibrated test probabilities
deciles = decile_lift(yte, y_test_cal, k=10)
p_at = precision_at_k(yte, y_test_cal, ks=(25,50,100))

_decile_csv = RUN_DIR / "decile_lift_test.csv"
deciles.to_csv(_decile_csv)

Kalibrator gewählt: isotonic | Brier(val) iso=0.2440, platt=0.2498
Threshold@ER(val) = 0.500 | E[ret]_val=0.000813


In [100]:
# --- Test evaluation (calibrated probabilities at threshold `thr`) ---
y_test_pred = (y_test_proba_cal >= thr).astype(int)

# Threshold-free metrics computed from the calibrated probabilities
fpr, tpr, _ = roc_curve(yte, y_test_proba_cal)
roc_auc = auc(fpr, tpr)
prec, rec, _ = precision_recall_curve(yte, y_test_proba_cal)
ap = average_precision_score(yte, y_test_proba_cal)
brier = brier_score_loss(yte, y_test_proba_cal)

# Metrics that depend on the hard predictions
cm = confusion_matrix(yte, y_test_pred)
rep = classification_report(yte, y_test_pred, digits=3, output_dict=True)
bal_acc = balanced_accuracy_score(yte, y_test_pred)
mcc = matthews_corrcoef(yte, y_test_pred)

print("Confusion matrix (test):\n", cm)
print("MCC:", round(mcc,3), "BalancedAcc:", round(bal_acc,3), "AUROC:", round(roc_auc,3), "AUPRC:", round(ap,3))

Confusion matrix (test):
 [[ 61 147]
 [ 63 181]]
MCC: 0.039 BalancedAcc: 0.518 AUROC: 0.519 AUPRC: 0.55


In [101]:
# --- (Optional) Block-Bootstrap-CI für MCC (zeitsensitiv) ---
rng = np.random.default_rng(SEED)
def block_bootstrap_mcc(y_true, y_prob, threshold, n=300, block=LOOKBACK):
    """Percentiles (2.5, 50, 97.5) of MCC over block-bootstrap resamples.

    Each of the ``n`` resamples draws ``len(y_true) // block`` (at least one)
    block start positions from the module-level ``rng`` and concatenates
    ``block`` consecutive positions per start, truncated at the end of the data.
    ``y_prob`` is thresholded at ``threshold`` before MCC is computed.
    """
    n_obs = len(y_true)
    n_blocks = max(1, n_obs // block)
    start_limit = max(1, n_obs - block + 1)  # exclusive upper bound for block starts

    mcc_samples = []
    for _ in range(n):
        block_starts = rng.integers(0, start_limit, size=n_blocks)
        resampled = np.concatenate(
            [np.arange(first, min(first + block, n_obs)) for first in block_starts]
        )
        pred = (y_prob[resampled] >= threshold).astype(int)
        mcc_samples.append(matthews_corrcoef(y_true[resampled], pred))
    return np.percentile(mcc_samples, [2.5, 50, 97.5]).astype(float)

mcc_ci = block_bootstrap_mcc(yte, y_test_proba_cal, thr, n=300, block=LOOKBACK)
# Convert to built-in floats before rounding: NumPy 2 scalars otherwise print
# as "np.float64(...)" inside the list.
print("MCC Bootstrap CI [2.5,50,97.5]:", [round(float(x), 3) for x in mcc_ci])

MCC Bootstrap CI [2.5,50,97.5]: [np.float64(-0.046), np.float64(0.029), np.float64(0.107)]


In [102]:
# --- Plots ---
# Every figure is written to FIG_DIR and closed right away.
FIG_DIR = RUN_DIR / "figures"
FIG_DIR.mkdir(exist_ok=True, parents=True)

# ROC curve with the chance diagonal
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC (Test)"); plt.legend()
plt.tight_layout(); plt.savefig(FIG_DIR / "roc_test.png", dpi=160); plt.close()

# Precision-recall curve with the positive rate as baseline. The curve used to
# be rendered twice into the same "pr_test.png"; only this final version (with
# the baseline) ever survived, so it is drawn once.
pos_rate_test = float(yte.mean())
plt.figure(figsize=(6,4))
plt.plot(rec, prec, label=f"AP={ap:.3f}")
plt.hlines(pos_rate_test, xmin=0, xmax=1, linestyles="--", label=f"Baseline={pos_rate_test:.3f}")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("Precision-Recall (Test)")
plt.legend()
plt.tight_layout(); plt.savefig(FIG_DIR / "pr_test.png", dpi=160); plt.close()

# Reliability diagram (10 quantile bins) against the perfect-calibration diagonal
prob_true, prob_pred = calibration_curve(yte, y_test_proba_cal, n_bins=10, strategy="quantile")
plt.figure(figsize=(6,4))
plt.plot([0,1],[0,1],"--")
plt.plot(prob_pred, prob_true, marker="o")
plt.xlabel("Vorhergesagt"); plt.ylabel("Tatsächlich"); plt.title("Kalibrierung (Test)")
plt.tight_layout(); plt.savefig(FIG_DIR / "calibration_test.png", dpi=160); plt.close()

# Confusion matrix with the cell counts written into the cells
plt.figure(figsize=(4.5,4))
plt.imshow(cm, interpolation="nearest"); plt.title("Confusion Matrix (Test)"); plt.colorbar()
ticks = np.arange(2); plt.xticks(ticks, ["0","1"]); plt.yticks(ticks, ["0","1"])
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j], ha="center", va="center")
plt.xlabel("Predicted"); plt.ylabel("True")
plt.tight_layout(); plt.savefig(FIG_DIR / "cm_test.png", dpi=160); plt.close()

# Distribution of calibrated probabilities with the decision threshold
plt.figure(figsize=(6,4))
plt.hist(y_test_proba_cal, bins=30)
plt.axvline(thr, linestyle="--")
plt.title("P(y=1) Verteilung (Test, kalibriert)")
plt.tight_layout(); plt.savefig(FIG_DIR / "proba_hist_test.png", dpi=160); plt.close()

In [103]:
# --- Predictions CSV (test split, calibrated) ---
# One row per test window, keyed by the window's end timestamp.
_pred_columns = {
    "timestamp": idx_te,
    "y_true": yte,
    "y_proba_cal": y_test_proba_cal,
    "y_pred": y_test_pred,
}
preds_test = pd.DataFrame(_pred_columns).set_index("timestamp")
_preds_csv = RUN_DIR / "preds_test.csv"
preds_test.to_csv(_preds_csv)

In [104]:
# --- Simple trading strategy & backtest (calibrated probabilities at `thr`) ---
# Forward log-return over HORIZON rows, aligned to the test window end timestamps.
close = df["close"].copy()
_horizon_logret = np.log(close.shift(-HORIZON)) - np.log(close)
fwd_logret = _horizon_logret.reindex(idx_te)

# --- Conservative variant: the signal is applied one row later (T+1) ---
_is_long = preds_test["y_proba_cal"] >= thr
signals_t = _is_long.astype(int).reindex(idx_te)
signals_t1 = signals_t.shift(1).fillna(0)  # no trade on the first row
fwd_logret_t1 = _horizon_logret.reindex(idx_te)

# Equity curve: exponentiated cumulative sum of the signalled log-returns
_t1_logret = (signals_t1 * fwd_logret_t1).fillna(0)
equity_t1 = _t1_logret.cumsum().apply(np.exp)

def _sharpe(logrets, periods_per_year=252):
    if len(logrets) < 2: return float("nan")
    mu = logrets.mean() * periods_per_year
    sigma = logrets.std(ddof=1) * np.sqrt(periods_per_year)
    return float(mu / (sigma + 1e-12))

def _cagr(equity_series, periods_per_year=252):
    if len(equity_series) < 2: return float("nan")
    T = len(equity_series) / periods_per_year
    return float((equity_series.iloc[-1] / equity_series.iloc[0])**(1.0/T) - 1.0)

# The names `signals`, `strategy_logret`, `equity` and `bh_equity` are used
# below but were never defined in this notebook (NameError on a fresh kernel).
# NOTE(review): these definitions are an assumption; they mirror how
# `equity_t1` is built (cumulative sum of HORIZON-row forward log-returns).
# With HORIZON > 1 consecutive forward returns overlap, so all three curves
# share the same overstatement -- confirm the intended convention.
signals = signals_t                                   # entry on the signal row itself
strategy_logret = signals * fwd_logret                # return earned only while long
equity = strategy_logret.fillna(0).cumsum().apply(np.exp)
bh_equity = fwd_logret.fillna(0).cumsum().apply(np.exp)  # always long

backtest = {
    "n_trades": int(signals.sum()),
    "avg_holding_h": HORIZON,
    "strategy": {
        "CAGR": _cagr(equity),
        "Sharpe": _sharpe(strategy_logret.dropna()),
        "final_equity": float(equity.iloc[-1]),
    },
    "buy_hold": {
        "CAGR": _cagr(bh_equity),
        "final_equity": float(bh_equity.iloc[-1]),
    },
}

# Equity curves: entry at t, entry at t+1, and buy & hold
plt.figure(figsize=(8,4))
plt.plot(equity.index, equity.values, label="Entry@t (optimistisch)")
plt.plot(equity_t1.index, equity_t1.values, label="Entry@t+1 (konservativ)")
plt.plot(bh_equity.index, bh_equity.values, label="Buy & Hold", linestyle="--")
plt.title(f"Equity Curves (H={HORIZON})")
plt.legend(); plt.tight_layout()
plt.savefig(FIG_DIR / "equity_curves_t_vs_t1.png", dpi=160); plt.close()

# Forward-return distribution split by signal value
plt.figure(figsize=(6,4))
plt.boxplot([fwd_logret[signals==0].dropna(), fwd_logret[signals==1].dropna()], tick_labels=["Signal=0","Signal=1"])
plt.title("Forward Log-Return nach Signal")
plt.tight_layout(); plt.savefig(FIG_DIR / "forward_returns_by_signal.png", dpi=160); plt.close()

In [105]:
# === Before/after calibration: histogram + Brier score ========================
brier_raw = brier_score_loss(yte, y_test_proba)

plt.figure(figsize=(6,4))
for _probs, _label in ((y_test_proba, "raw"), (y_test_cal, f"calibrated ({calib_name})")):
    plt.hist(_probs, bins=30, alpha=0.6, label=_label)
plt.axvline(thr, linestyle="--", label=f"thr={thr:.3f}")
plt.title("P(y=1) – raw vs. calibrated (Test)")
plt.legend()
plt.tight_layout()
plt.savefig(RUN_DIR / "figures/proba_hist_raw_vs_cal.png", dpi=160)
plt.close()

print(f"Brier raw={brier_raw:.4f} → calibrated={brier:.4f}")
print("Precision@k:", p_at)
print("Top-Deciles (head):\n", deciles.head(3))

Brier raw=0.2504 → calibrated=0.2484
Precision@k: {'P@25': 0.6, 'P@50': 0.5, 'P@100': 0.5}
Top-Deciles (head):
         pos_rate  count      lift
decile                           
1       0.577778     45  1.070310
2       0.288889     45  0.535155
3       0.622222     45  1.152641


In [106]:
# --- Write results ---
# Fixes relative to the previous version of this cell:
#  * `score_val`, `brier_cal` and `y_val_proba_cal` were never defined; the
#    values actually computed in this notebook are used instead.
#  * The "calibration" key appeared twice in the literal (the first entry was
#    silently discarded); it is written once.
#  * The calibration method is the one actually selected, and the path is the
#    file that is actually written ("calibrator.pkl").
#  * The threshold is chosen by expected return, not MCC, so it is labelled so.
out = {
    "config": RCFG,
    "features_used": FEATURES,
    "threshold_selection": {
        "strategy": "max_expected_return_with_pos_rate_bounds",
        "threshold": float(thr),
        "pos_rate_bounds": [0.2, 0.8],
        "val_expected_return": float(er_val),
    },
    "calibration": {
        "method": calib_name,
        "path": str(RUN_DIR / "calibrator.pkl"),
        "brier_raw": float(brier_raw),
        "brier_cal": float(brier),
    },
    "metrics": {
        "test": {
            "roc_auc": float(roc_auc),
            "auprc": float(ap),
            "brier": float(brier),
            "balanced_accuracy": float(bal_acc),
            "mcc": float(mcc),
            "mcc_bootstrap_ci": [float(mcc_ci[0]), float(mcc_ci[1]), float(mcc_ci[2])],
            "confusion_matrix": cm.tolist(),
            "report": rep,
        }
    },
    "backtest": backtest,
    "diagnostics": {
        # Mean calibrated probability per split and share of test rows at/above thr
        "val_pos_rate_cal": float(y_val_cal.mean()),
        "test_pos_rate_cal": float(y_test_proba_cal.mean()),
        "test_pred_pos_rate_at_thr": float((y_test_proba_cal >= thr).mean()),
    },
    "backtest_conservative": {
        "final_equity": float(equity_t1.iloc[-1]),
    },
}
with open(RUN_DIR / "evaluation.json", "w", encoding="utf-8") as f:
    json.dump(out, f, indent=2)

print("\nBlock 4 abgeschlossen.")
print("Artefakte:")
print(" -", RUN_DIR / "preds_test.csv")
print(" -", RUN_DIR / "evaluation.json")
print(" -", RUN_DIR / "figures")


Block 4 abgeschlossen.
Artefakte:
 - ..\results\2025-10-03_13-48-57_lstm\preds_test.csv
 - ..\results\2025-10-03_13-48-57_lstm\evaluation.json
 - ..\results\2025-10-03_13-48-57_lstm\figures
