In [1]:
# %% [step0-setup]

"""
Setup & Konstanten:

* Importe, Logging, Pfade, Typen
* Globale Parameter (Zeiträume, CV-Größen, Grid, Feature-Gruppen-YAML)
* Hilfsfunktionen für IO, Index-Checks, Pfaderstellung
"""
from __future__ import annotations

import json
import logging
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
import yaml
from matplotlib import pyplot as plt
from numpy.typing import ArrayLike
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (
    accuracy_score,
    auc,
    brier_score_loss,
    f1_score,
    log_loss,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from sklearn.inspection import permutation_importance

# ---------- Logging ----------

# Configure logging once: INFO level, records formatted as "time | level | message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
# Named logger used by the cells of this notebook.
logger = logging.getLogger("linear_logit")

# --- Artefakt-Pfade einrichten ---

def find_project_root(start: Path) -> Path:
    """Walk upwards from ``start`` and return the directory that looks like the project root.

    Three criteria are tried in priority order. Each criterion is checked
    against the whole ancestor chain (``start`` first, then its parents)
    before the next criterion is considered:

    1. ``artifacts/data/features_monthly.parquet`` exists
       (avoids settling on a stray ``notebooks/artifacts`` directory),
    2. the directory ``artifacts/data`` exists,
    3. both ``config`` and ``src`` exist.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first ancestor (or ``start`` itself) matching the highest-priority criterion.

    Raises:
        AssertionError: If no directory in the chain satisfies any criterion.
    """
    origin = start.resolve()
    ancestors = (origin, *origin.parents)
    criteria = (
        lambda d: (d / "artifacts" / "data" / "features_monthly.parquet").exists(),
        lambda d: (d / "artifacts" / "data").exists(),
        lambda d: (d / "config").exists() and (d / "src").exists(),
    )
    for looks_like_root in criteria:
        hit = next((d for d in ancestors if looks_like_root(d)), None)
        if hit is not None:
            return hit
    raise AssertionError("Project root not found – expected 'artifacts/data' or 'config'+'src' somewhere above.")

# Resolve the project root from the current working directory and derive all artifact folders from it.
ROOT = find_project_root(Path.cwd())
ARTIFACTS = ROOT / "artifacts"
DATA_DIR = ARTIFACTS / "data"
CONF_DIR = ARTIFACTS / "config"
FORECASTS_DIR = ARTIFACTS / "forecasts"
METRICS_DIR = ARTIFACTS / "metrics"
REPORTS_DIR = ARTIFACTS / "reports"
MODELS_DIR = ARTIFACTS / "models"  # additionally required

# Create every artifact folder up front so later writes cannot fail on a missing directory.
for p in [DATA_DIR, CONF_DIR, FORECASTS_DIR, METRICS_DIR, REPORTS_DIR, MODELS_DIR]:
    p.mkdir(parents=True, exist_ok=True)
logger.info(f"ROOT={ROOT} | DATA_DIR={DATA_DIR}")

# (Keep the alias name so that the remaining code can stay unchanged)
ARTIFACTS_DIR = ARTIFACTS

# ---------- Data paths ----------

FEATURES_PARQUET = DATA_DIR / "features_monthly.parquet"
RAW_PARQUET = DATA_DIR / "raw_data.parquet"
FEATURE_GROUPS_YAML = CONF_DIR / "feature_groups.yaml"  # loaded if present, otherwise created

# ---------- Time windows ----------

TRAIN_START = pd.Timestamp("2009-02-28")
TRAIN_END = pd.Timestamp("2019-12-31")
TEST_START = pd.Timestamp("2020-01-31")
TEST_END = pd.Timestamp("2025-05-31")

# ---------- CV parameters ----------

N_SPLITS = 5
VAL_SIZE = 12  # months
EMBARGO = 1   # gap

# ---------- Grid search ----------

# Candidate values for the regularisation parameter C of the logistic regression.
GRID_C = [0.1, 0.5, 1.0, 2.0, 5.0]

# ---------- Reproducibility ----------

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


2025-08-24 17:53:45,872 | INFO | ROOT=C:\Users\gamer\Desktop\AktienPrognose | DATA_DIR=C:\Users\gamer\Desktop\AktienPrognose\artifacts\data


In [2]:
# %% [step1-daten_und_featuresets]

"""
Daten laden, Spalten prüfen, Feature-Sets laden/erstellen (YAML oder Heuristik)

* Lädt features_monthly & raw_data
* Zielspalten prüfen
* Preisspalte heuristisch bestimmen
* Indexe alignen, Zeitfenster schneiden
* Feature-Sets (TECH/MACRO/INTEGRATED) laden bzw. heuristisch ableiten und speichern
"""
def _ensure_datetime_index(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Sichert, dass ein DatetimeIndex vorliegt (sonst Fehler)."""
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError(f"{name} benötigt einen DatetimeIndex.")
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()
    return df


def _read_parquet_safely(path: Path, name: str) -> pd.DataFrame:
    """Robustes Laden von Parquet mit klaren Fehlermeldungen."""
    if not path.exists():
        raise FileNotFoundError(f"Fehlende Datei: {path} ({name})")
    try:
        df = pd.read_parquet(path)
    except Exception as e:
        raise RuntimeError(f"Fehler beim Laden von {path}: {e}") from e
    return _ensure_datetime_index(df, name)


def _infer_price_column(df_raw: pd.DataFrame) -> str:
    """Heuristik zur Wahl der Preis-Spalte (S&P 500 o.ä.), bevorzugt GSPC/SP500/adjclose."""
    candidates_priority = [
        "SP500",
        "^GSPC_adjclose",
        "^GSPC",
        "GSPC",
        "sp500",
        "S&P500",
        "Adj Close",
        "Adj_Close",
        "Close",
        "close",
    ]
    for c in candidates_priority:
        if c in df_raw.columns:
            return c
    num_cols = [c for c in df_raw.columns if pd.api.types.is_numeric_dtype(df_raw[c])]
    if not num_cols:
        raise RuntimeError("Keine numerischen Spalten in raw_data gefunden.")
    ac_scores: Dict[str, float] = {}
    for c in num_cols:
        s = df_raw[c].dropna()
        if len(s) < 24:
            continue
        ac = s.autocorr(lag=1)
        ac_scores[c] = ac if not np.isnan(ac) else -999.0
    if not ac_scores:
        raise RuntimeError("Konnte keine geeignete Preis-Spalte heuristisch bestimmen.")
    best = max(ac_scores, key=ac_scores.get)
    logger.warning(f"Preisspalte heuristisch gewählt: {best}")
    return best


def _load_or_infer_feature_sets(df_features: pd.DataFrame) -> Dict[str, List[str]]:
    """Return the TECH / MACRO / INTEGRATED feature lists for ``df_features``.

    If ``FEATURE_GROUPS_YAML`` exists, the keys ``TECH_FEATURES`` and
    ``MACRO_FEATURES`` are read from it and filtered to the columns actually
    present (a column listed in both groups stays in TECH). The YAML result is
    used only when both groups are non-empty after filtering.

    Otherwise the groups are derived from the column names: a column whose
    upper-cased name contains one of the technical patterns goes to TECH,
    every other non-target column goes to MACRO. The derived groups are then
    written back to ``FEATURE_GROUPS_YAML`` (best effort; a write failure is
    only logged).

    Args:
        df_features: Feature frame; the target columns ``y_direction_next``
            and ``y_return_next_pct`` are excluded from all groups.

    Returns:
        Dict with keys ``"TECH"``, ``"MACRO"`` and ``"INTEGRATED"``
        (sorted union of the first two).
    """
    target_cols = {"y_direction_next", "y_return_next_pct"}
    available = [c for c in df_features.columns if c not in target_cols]

    # --- 1) Try the YAML definition ---
    if FEATURE_GROUPS_YAML.exists():
        try:
            with open(FEATURE_GROUPS_YAML, "r", encoding="utf-8") as f:
                cfg = yaml.safe_load(f) or {}
            tech = cfg.get("TECH_FEATURES", [])
            macro = cfg.get("MACRO_FEATURES", [])
            # Explicit check instead of `assert`: asserts are stripped under `python -O`,
            # and a plain string would then be iterated character by character below.
            if not (isinstance(tech, list) and isinstance(macro, list)):
                raise TypeError("TECH_FEATURES and MACRO_FEATURES must be lists")
            tech = [c for c in tech if c in available]
            macro = [c for c in macro if c in available and c not in tech]
            logger.info(f"Feature-Sets aus YAML geladen: TECH={len(tech)}, MACRO={len(macro)}")
            if tech and macro:
                return {"TECH": tech, "MACRO": macro, "INTEGRATED": sorted(set(tech + macro))}
            # Make the fall-through visible: the heuristic below will overwrite the YAML file.
            logger.warning("Feature-Gruppen YAML liefert leere Gruppe(n), nutze Heuristik.")
        except Exception as e:
            logger.error(f"Feature-Gruppen YAML ungültig/fehlend, nutze Heuristik: {e}")

    # --- 2) Heuristic based on column names ---
    tech_patterns = [
        "SMA", "EMA", "MA", "Momentum", "Mom", "Volatility", "Vol",
        "Return_Lag", "RSI", "MACD", "Bollinger", "BB", "ATR", "Stoch"
    ]
    tech_markers = [pat.upper() for pat in tech_patterns]

    def _is_tech(col: str) -> bool:
        upper = col.upper()
        return any(marker in upper for marker in tech_markers)

    # Every column that is not technical counts as macro. The former macro pattern
    # list did not influence the outcome (its match branch and its fallback branch
    # were identical), and the former "empty group" fallbacks could never yield a
    # column, so both were dropped without changing the result.
    tech = sorted({c for c in available if _is_tech(c)})
    macro = sorted({c for c in available if not _is_tech(c)})

    integ = sorted(set(tech + macro))
    logger.info(f"Feature-Sets heuristisch bestimmt: TECH={len(tech)}, MACRO={len(macro)}, INTEGRATED={len(integ)}")

    # --- 3) Persist the derived groups (best effort) ---
    try:
        FEATURE_GROUPS_YAML.parent.mkdir(parents=True, exist_ok=True)
        with open(FEATURE_GROUPS_YAML, "w", encoding="utf-8") as f:
            yaml.safe_dump(
                {"TECH_FEATURES": tech, "MACRO_FEATURES": macro},
                f,
                sort_keys=False,
                allow_unicode=True,
            )
        logger.info(f"Feature-Gruppen gespeichert: {FEATURE_GROUPS_YAML}")
    except Exception as e:
        logger.error(f"Konnte Feature-Gruppen nicht speichern: {e}")

    return {"TECH": tech, "MACRO": macro, "INTEGRATED": integ}


# --- Laden der Artefakte ---

# Load both parquet artifacts; each must carry a DatetimeIndex (checked by the loader).
df_features: pd.DataFrame = _read_parquet_safely(FEATURES_PARQUET, "features_monthly")
df_raw: pd.DataFrame = _read_parquet_safely(RAW_PARQUET, "raw_data")

# --- Check targets ---

required_targets = ["y_direction_next", "y_return_next_pct"]
missing_targets = [c for c in required_targets if c not in df_features.columns]
if missing_targets:
    raise KeyError(f"Zielspalten fehlen in features_monthly: {missing_targets}")

# --- Determine the price series ---

price_col = _infer_price_column(df_raw)
s_price = df_raw[price_col].astype(float)

# --- Restrict both objects to their common index ---

common_idx = df_features.index.intersection(s_price.index)
df_features = df_features.loc[common_idx].copy()
s_price = s_price.loc[common_idx].copy()

# --- Load / derive feature sets ---

FEATURE_GROUPS = _load_or_infer_feature_sets(df_features)

# --- Cut the time windows (both bounds inclusive) ---

df_train = df_features.loc[(df_features.index >= TRAIN_START) & (df_features.index <= TRAIN_END)].copy()
df_test = df_features.loc[(df_features.index >= TEST_START) & (df_features.index <= TEST_END)].copy()
s_price_train = s_price.loc[df_train.index]
s_price_test = s_price.loc[df_test.index]

logger.info(f"Train {df_train.index.min().date()} → {df_train.index.max().date()} | n={len(df_train)}")
logger.info(f"Test  {df_test.index.min().date()} → {df_test.index.max().date()} | n={len(df_test)}")



2025-08-24 17:53:45,921 | INFO | Feature-Sets aus YAML geladen: TECH=5, MACRO=9
2025-08-24 17:53:45,922 | INFO | Train 2009-02-28 → 2019-12-31 | n=131
2025-08-24 17:53:45,922 | INFO | Test  2020-01-31 → 2025-05-31 | n=65


In [3]:
# %% [step2-tscv_splitter]

"""
Zeitreihen-CV (Expanding, 5-Fold) mit 12M Validierung und 1M Embargo (gap).
Hilfsfunktionen:

* get_tscv(): TimeSeriesSplit Objekt
* split_Xy(): Feature/Target-Splits
"""
def get_tscv(n_splits: int = N_SPLITS, test_size: int = VAL_SIZE, gap: int = EMBARGO) -> TimeSeriesSplit:
    """Build the ``TimeSeriesSplit`` used for all cross-validation in this notebook.

    Args:
        n_splits: Number of folds.
        test_size: Number of observations in each validation window.
        gap: Number of observations left out between training and validation window (embargo).

    Returns:
        A configured ``TimeSeriesSplit`` instance.
    """
    splitter_kwargs = {"n_splits": n_splits, "test_size": test_size, "gap": gap}
    return TimeSeriesSplit(**splitter_kwargs)


def split_Xy(
    df: pd.DataFrame,
    y_reg: str = "y_return_next_pct",
    y_clf: str = "y_direction_next",
) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    """Separate a frame into features and its two target columns.

    Args:
        df: Frame containing feature columns plus both targets.
        y_reg: Name of the regression target column.
        y_clf: Name of the classification target column.

    Returns:
        ``(X, y_reg_series, y_clf_series)`` where ``X`` is ``df`` without the
        two target columns and the classification target is cast to ``int8``.

    Raises:
        KeyError: If one or both target columns are missing.
    """
    targets = [y_reg, y_clf]
    absent = [col for col in targets if col not in df.columns]
    if absent:
        raise KeyError(f"Missing target column(s): {absent}")
    features = df.drop(columns=targets)
    return features, df[y_reg], df[y_clf].astype("int8")

In [4]:
# %% [step3-grid_oof_kalibrierung_threshold]

"""
Grid-Search (LR) → OOF-Scores → Platt-Kalibrierung → Schwellen-Optimierung (F1).
Optionale OOF-Evaluierung mit SGDClassifier (nur Vergleich, keine Persistenz).
Ergebnis je Feature-Set:

* best_C
* OOF-Entscheidungsscores, kalibrierte OOF-Probas
* globaler Schwellenwert theta (F1-optimal)
* CV-Details (cv_results_) als CSV
"""
from dataclasses import dataclass
from typing import Tuple, List, Dict
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import f1_score, log_loss, roc_auc_score
from sklearn.utils.validation import check_is_fitted

@dataclass
class OOFResult:
    """Bundle of the out-of-fold (OOF) training results for one feature set."""
    best_C: float                   # regularisation parameter C selected by the grid search
    oof_scores: np.ndarray          # decision_function scores (only for valid OOF indices)
    oof_proba_cal: np.ndarray       # calibrated probabilities (positive class), valid indices only
    threshold: float                # global theta (F1-optimal)
    cv_details_path: Path           # path of the CSV holding the grid-search cv_results_
    coef_per_fold: pd.DataFrame     # coefficients per fold (for explainability)
    oof_index: pd.Index             # index of the observations for which OOF scores exist

class PlattCalibrator:
    """Platt (sigmoid) calibration: a one-feature logistic regression fitted on raw scores."""

    def __init__(self) -> None:
        self._lr = LogisticRegression(solver="lbfgs", max_iter=10000)

    def fit(self, scores: ArrayLike, y_true: ArrayLike) -> "PlattCalibrator":
        """Fit the calibrator on raw scores and binary labels; returns ``self``."""
        score_column = np.asarray(scores, dtype=float).reshape(-1, 1)
        labels = np.asarray(y_true, dtype=int)
        self._lr.fit(score_column, labels)
        return self

    def predict_proba(self, scores: ArrayLike) -> np.ndarray:
        """Map raw scores to calibrated probabilities (second column of the inner model's output)."""
        check_is_fitted(self._lr)
        score_column = np.asarray(scores, dtype=float).reshape(-1, 1)
        return self._lr.predict_proba(score_column)[:, 1]

    @property
    def coef_(self) -> np.ndarray:
        """Copy of the fitted slope of the inner logistic regression."""
        check_is_fitted(self._lr)
        slope = self._lr.coef_
        return slope.copy()

    @property
    def intercept_(self) -> np.ndarray:
        """Copy of the fitted intercept of the inner logistic regression."""
        check_is_fitted(self._lr)
        offset = self._lr.intercept_
        return offset.copy()

def _build_lr_pipeline(C: float | None = None) -> Pipeline:
    """Create an unfitted ``StandardScaler -> LogisticRegression`` pipeline.

    Args:
        C: Value passed as ``C`` to the logistic regression; ``None`` means 1.0.

    Returns:
        Pipeline with the steps ``"scaler"`` and ``"clf"``.
    """
    c_value = 1.0 if C is None else C
    return Pipeline([
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(
                class_weight="balanced",
                solver="lbfgs",
                max_iter=10000,
                C=c_value,
                random_state=RANDOM_STATE,
            ),
        ),
    ])

def _grid_search_lr(
    X: pd.DataFrame,
    y: pd.Series,
    feature_names: List[str],
    set_name: str,
) -> Tuple[Pipeline, Path, pd.DataFrame]:
    """Grid-search ``C`` of the logistic-regression pipeline with time-series CV.

    Scoring is ``neg_log_loss``; the best configuration is refitted on all of
    ``X[feature_names]``. The full ``cv_results_`` table is written to
    ``METRICS_DIR`` as ``linear_logit_<set_name>_cv_results.csv``.

    Args:
        X: Feature frame.
        y: Binary target.
        feature_names: Columns of ``X`` to use.
        set_name: Feature-set label used in the file name and log line.

    Returns:
        ``(best_estimator, path_of_cv_csv, cv_results_frame)``.
    """
    search = GridSearchCV(
        estimator=_build_lr_pipeline(),
        param_grid={"clf__C": GRID_C},
        scoring="neg_log_loss",
        cv=get_tscv(),
        n_jobs=-1,
        refit=True,
        return_train_score=True,
    )
    search.fit(X[feature_names], y)

    # Persist the CV details
    cv_table = pd.DataFrame(search.cv_results_)
    csv_path = METRICS_DIR / f"linear_logit_{set_name}_cv_results.csv"
    cv_table.to_csv(csv_path, index=False)

    logger.info(f"[{set_name}] Best C={search.best_params_['clf__C']} (CV neg_log_loss={search.best_score_:.4f})")
    winner: Pipeline = search.best_estimator_
    return winner, csv_path, cv_table

def _oof_scores_and_coefs(
    pipe: Pipeline,
    X: pd.DataFrame,
    y: pd.Series,
    feature_names: List[str],
) -> Tuple[np.ndarray, pd.DataFrame, pd.Index]:
    """Produce out-of-fold ``decision_function`` scores with a manual time-series CV loop.

    For every fold a fresh pipeline (same ``C`` as ``pipe``'s ``"clf"`` step)
    is fitted on the training part and scored on the validation part.
    Observations that never fall into a validation window keep NaN and are
    dropped from the result.

    Args:
        pipe: Pipeline whose ``"clf"`` step provides the ``C`` value to reuse.
        X: Feature frame.
        y: Binary target aligned with ``X``.
        feature_names: Columns of ``X`` to use.

    Returns:
        ``(oof_scores_valid, coef_df, oof_index_valid)``; ``coef_df`` has one
        row per feature and one column ``fold<k>`` per fold.
    """
    c_value = pipe.named_steps["clf"].C
    scores_all = np.full(len(X), np.nan, dtype=float)
    fold_coefs: List[pd.Series] = []

    for k, (train_pos, val_pos) in enumerate(get_tscv().split(X[feature_names], y)):
        fold_model = _build_lr_pipeline(C=c_value)
        fold_model.fit(X.iloc[train_pos][feature_names], y.iloc[train_pos])

        # Decision scores for the validation positions of this fold
        scores_all[val_pos] = fold_model.decision_function(X.iloc[val_pos][feature_names])

        # Keep the fold's coefficients
        weights = fold_model.named_steps["clf"].coef_.ravel()
        fold_coefs.append(pd.Series(weights, index=feature_names, name=f"fold{k + 1}"))

    coef_df = pd.concat(fold_coefs, axis=1)

    has_score = np.isfinite(scores_all)
    return scores_all[has_score], coef_df, X.index[has_score]

def _optimize_threshold_f1(y_true: ArrayLike, proba: ArrayLike) -> Tuple[float, Dict[float, float]]:
    """Find the threshold in [0.01, 0.99] (99 evenly spaced values) that maximises F1.

    Args:
        y_true: Binary ground-truth labels.
        proba: Predicted probabilities of the positive class.

    Returns:
        ``(best_threshold, f1_by_threshold)``. On ties the smallest threshold wins.
    """
    labels = np.asarray(y_true, dtype=int)
    probs = np.asarray(proba, dtype=float)

    f1_by_threshold: Dict[float, float] = {
        float(theta): float(f1_score(labels, (probs >= theta).astype(int), zero_division=0))
        for theta in np.linspace(0.01, 0.99, 99)
    }
    # `max` returns the first maximal key in insertion order, i.e. the lowest threshold on ties.
    winner = max(f1_by_threshold, key=f1_by_threshold.get)
    return float(winner), f1_by_threshold

def run_oof_training_for_set(
    set_name: str,
    feature_names: List[str],
    df_train: pd.DataFrame,
) -> Tuple[OOFResult, PlattCalibrator, Pipeline]:
    """Run grid search, OOF scoring, Platt calibration and threshold search for one feature set.

    Args:
        set_name: Feature-set label (used for file names and logging).
        feature_names: Columns to use as features.
        df_train: Training frame including both target columns.

    Returns:
        ``(OOFResult, fitted PlattCalibrator, best pipeline from the grid search)``.
    """
    features, _, target = split_Xy(df_train)

    # Grid search
    best_pipe, cv_csv, _ = _grid_search_lr(features, target, feature_names, set_name)
    chosen_C = float(best_pipe.named_steps["clf"].C)

    # OOF scores + per-fold coefficients
    scores, fold_coefs, valid_idx = _oof_scores_and_coefs(best_pipe, features, target, feature_names)

    # Labels of exactly those observations that received an OOF score
    y_valid = target.loc[valid_idx].values

    # Platt calibration on the OOF scores
    calibrator = PlattCalibrator().fit(scores, y_valid)
    proba_cal = calibrator.predict_proba(scores)

    # Threshold search (F1) on the same OOF slice
    theta, _ = _optimize_threshold_f1(y_valid, proba_cal)

    # OOF quality metrics (information only)
    auc_oof = roc_auc_score(y_valid, proba_cal)
    f1_oof = f1_score(y_valid, (proba_cal >= theta).astype(int), zero_division=0)
    logger.info(f"[{set_name}] OOF AUC={auc_oof:.3f} | OOF F1@{theta:.2f}={f1_oof:.3f} (valid n={len(valid_idx)})")

    result = OOFResult(
        best_C=chosen_C,
        oof_scores=scores,
        oof_proba_cal=proba_cal,
        threshold=theta,
        cv_details_path=cv_csv,
        coef_per_fold=fold_coefs,
        oof_index=valid_idx,
    )
    return result, calibrator, best_pipe

# Optional: SGDClassifier nur zur Validierung (nicht persistiert)
def sgd_oof_check(
    feature_names: List[str],
    df_train: pd.DataFrame,
    set_name: str,
) -> Dict[str, float]:
    """Out-of-fold sanity check with an ``SGDClassifier`` (log-loss); nothing is persisted.

    Args:
        feature_names: Columns to use as features.
        df_train: Training frame including both target columns.
        set_name: Feature-set label used in the log line.

    Returns:
        Dict with ``sgd_oof_auc``, ``sgd_oof_f1_0p5`` (F1 at threshold 0.5)
        and ``sgd_oof_logloss``, computed on the observations that received
        an out-of-fold prediction.
    """
    features, _, target = split_Xy(df_train)
    oof_proba = np.full(len(features), np.nan, dtype=float)

    for train_pos, val_pos in get_tscv().split(features[feature_names], target):
        fold_model = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", SGDClassifier(
                loss="log_loss", penalty="l2",
                max_iter=2000, random_state=RANDOM_STATE,
                class_weight="balanced",
            )),
        ])
        fold_model.fit(features.iloc[train_pos][feature_names], target.iloc[train_pos])
        oof_proba[val_pos] = fold_model.predict_proba(features.iloc[val_pos][feature_names])[:, 1]

    scored = np.isfinite(oof_proba)
    labels = target.values[scored]
    probs = oof_proba[scored]
    hard_pred = (probs >= 0.5).astype(int)

    # Clip the probabilities by hand instead of passing an eps argument to log_loss.
    probs_clipped = np.clip(probs, 1e-15, 1.0 - 1e-15)

    out = {
        "sgd_oof_auc": float(roc_auc_score(labels, probs)),
        "sgd_oof_f1_0p5": float(f1_score(labels, hard_pred, zero_division=0)),
        "sgd_oof_logloss": float(log_loss(labels, probs_clipped)),
    }
    logger.info(f"[{set_name}] SGD OOF AUC={out['sgd_oof_auc']:.3f} | F1@0.5={out['sgd_oof_f1_0p5']:.3f} (valid n={scored.sum()})")
    return out

In [5]:
# %% [step4-walk_forward_test]

"""
Walk-Forward-Test (Expanding Origin):

* Für jeden Testmonat t: Train = [TRAIN_START .. t-1], Modell neu fitten (C=best_C), Proba(t)
* Kalibrierer aus OOF (fix) anwenden, globalen Threshold anwenden
* Forecast-CSV je Feature-Set ausgeben
* Kumulierte Renditen (Long/Cash) vorbereiten
"""
def monthly_return_pct(price: pd.Series) -> pd.Series:
    """Simple period-over-period return in percent: ``100 * (p_t / p_{t-1} - 1)``.

    The first element is NaN because it has no predecessor.
    """
    previous = price.shift(1)
    relative_change = price / previous - 1.0
    return 100.0 * relative_change


def _compute_strategy_returns(
    idx: pd.DatetimeIndex,
    pred_dir: pd.Series,          # 0/1 class signal for the months in the test window
    s_price: pd.Series,           # price level
) -> pd.Series:
    """Long/cash strategy returns: the period return where the signal is 1, otherwise 0.

    Args:
        idx: Dates for which strategy returns are wanted.
        pred_dir: 0/1 signal; multiplied element-wise (index-aligned) with the returns.
        s_price: Price series from which percentage returns are derived.

    Returns:
        Strategy return in percent units.
    """
    # NOTE(review): the return at date t is the t-1 -> t change, multiplied with the
    # signal stored at t. If the signal is a forecast for t -> t+1 (as the target name
    # "y_direction_next" suggests), the two are offset by one period — confirm alignment.
    period_returns = monthly_return_pct(s_price)
    aligned_returns = period_returns.reindex(idx).astype(float)
    signal = pred_dir.astype(int)
    return aligned_returns * signal  # in % units


def run_walk_forward(
    set_name: str,
    feature_names: List[str],
    df_full: pd.DataFrame,
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    s_price_full: pd.Series,
    best_C: float,
    calibrator: PlattCalibrator,
    threshold: float,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Expanding-origin walk-forward evaluation over the test window.

    For every test date ``t`` a fresh pipeline (``C=best_C``) is fitted on all
    rows of ``df_full`` with index strictly before ``t``. The decision score
    for ``t`` is passed through the fixed ``calibrator`` and compared against
    the fixed ``threshold``. The forecast table is written to
    ``FORECASTS_DIR / linear_logit_<set_name>.csv``.

    Args:
        set_name: Feature-set label (file name, log line, ``featureset`` column).
        feature_names: Columns to use as features.
        df_full: Frame covering training and test period, including targets.
        df_train: Not used by the computation; kept so existing callers stay valid.
        df_test: Test frame including targets; its index defines the forecast dates.
        s_price_full: Price series used for strategy and buy-and-hold returns.
        best_C: Regularisation parameter for every refit.
        calibrator: Fitted calibrator mapping decision scores to probabilities.
        threshold: Probability cut-off for predicting class 1.

    Returns:
        ``(df_forecast, df_perf)``, both indexed by date. ``df_perf`` holds
        ``ret_model_pct`` and ``ret_bh_pct``.
    """
    X_full, _, y_full = split_Xy(df_full)
    X_te0, _, y_te0 = split_Xy(df_test)

    test_idx = X_te0.index
    all_idx = X_full.index
    preds: List[float] = []
    probas_raw: List[float] = []
    probas_cal: List[float] = []

    for t in test_idx:
        # Training window = everything strictly before t (expanding)
        train_mask = all_idx < t
        X_tr_t = X_full.loc[train_mask, feature_names]
        y_tr_t = y_full.loc[train_mask]

        # Refit with the selected C
        pipe = _build_lr_pipeline(C=best_C)
        pipe.fit(X_tr_t, y_tr_t)

        # Decision score and calibrated probability for date t
        score_t = float(pipe.decision_function(X_te0.loc[[t], feature_names])[0])
        proba_t = float(calibrator.predict_proba([score_t])[0])
        yhat_t = 1.0 if proba_t >= threshold else 0.0

        # Uncalibrated sigmoid(score), informational only. Evaluated in the numerically
        # stable form so a very negative score cannot raise OverflowError in math.exp.
        if score_t >= 0.0:
            raw_t = 1.0 / (1.0 + math.exp(-score_t))
        else:
            exp_s = math.exp(score_t)
            raw_t = exp_s / (1.0 + exp_s)

        probas_raw.append(float(raw_t))
        probas_cal.append(proba_t)
        preds.append(yhat_t)

    df_forecast = pd.DataFrame(
        {
            "date": test_idx,
            "y_true": y_te0.values.astype(int),
            "y_pred": np.array(preds, dtype=int),
            "proba_cal": np.array(probas_cal, dtype=float),
            "proba_raw_sigmoid": np.array(probas_raw, dtype=float),
            "featureset": set_name,
            "model_class": "Linear",
        }
    ).set_index("date")

    # Strategy returns (long/cash) and buy-and-hold
    strat_ret = _compute_strategy_returns(df_forecast.index, df_forecast["y_pred"], s_price_full)
    bh_ret = monthly_return_pct(s_price_full).reindex(df_forecast.index).astype(float)

    df_perf = pd.DataFrame(
        {
            "date": df_forecast.index,
            "ret_model_pct": strat_ret.values,
            "ret_bh_pct": bh_ret.values,
        }
    ).set_index("date")

    # Persist the forecast
    out_path = FORECASTS_DIR / f"linear_logit_{set_name}.csv"
    df_forecast.to_csv(out_path, index=True)
    logger.info(f"[{set_name}] Forecast gespeichert: {out_path}")

    return df_forecast, df_perf

In [6]:
# %% [step5-metriken_signifikanz_json]

"""
Metriken (OOF & Test) + Signifikanztests:

* OOF: AUC, F1, LogLoss, Brier
* Test: AUC, F1, Acc, LogLoss, Brier
* DM-Test (0/1-Loss) vs. Always-Up & vs. Persistence
* McNemar vs. Always-Up
* JSON je Feature-Set speichern
"""
def _binary_loss(y_true: ArrayLike, y_pred: ArrayLike) -> np.ndarray:
    """0/1-Loss je Beobachtung."""
    y = np.asarray(y_true, dtype=int)
    yhat = np.asarray(y_pred, dtype=int)
    return (y != yhat).astype(int)

def diebold_mariano_01loss(e1: ArrayLike, e2: ArrayLike, h: int = 1) -> Tuple[float, float]:
    """Simplified Diebold-Mariano test on the difference of two loss series.

    The variance of the loss differential is the plain sample variance
    (``ddof=1``, no autocovariance terms) and the p-value is two-sided under a
    standard normal. ``h`` is accepted for interface compatibility but does
    not enter the computation.

    Args:
        e1: Loss series of the first forecaster.
        e2: Loss series of the second forecaster.
        h: Forecast horizon (unused).

    Returns:
        ``(dm_statistic, p_value)``; ``(nan, nan)`` if fewer than 8 finite
        differences remain or their variance is not positive.
    """
    diff = np.asarray(e1, dtype=float) - np.asarray(e2, dtype=float)
    diff = diff[np.isfinite(diff)]
    n_obs = len(diff)
    if n_obs < 8:
        return np.nan, np.nan

    sample_var = diff.var(ddof=1)
    if sample_var <= 0:
        return np.nan, np.nan

    stat = diff.mean() / math.sqrt(sample_var / n_obs)
    normal_cdf = 0.5 * (1 + math.erf(abs(stat) / math.sqrt(2)))
    p_value = 2.0 * (1.0 - normal_cdf)
    return float(stat), float(p_value)

def mcnemar_test(y_true: ArrayLike, y_pred_a: ArrayLike, y_pred_b: ArrayLike) -> Tuple[int, int, float]:
    """McNemar test (chi-square approximation with continuity correction).

    Compares two classifiers on the same observations using only the
    discordant pairs.

    Args:
        y_true: Ground-truth labels.
        y_pred_a: Predictions of classifier A.
        y_pred_b: Predictions of classifier B.

    Returns:
        ``(b, c, p_value)`` where ``b`` counts cases A got right and B got
        wrong, and ``c`` the reverse. With no discordant pairs (``b + c == 0``)
        the p-value is 1.0: there is no evidence of a difference.
    """
    y = np.asarray(y_true, dtype=int)
    pred_a = np.asarray(y_pred_a, dtype=int)
    pred_b = np.asarray(y_pred_b, dtype=int)
    a_right = pred_a == y
    b_right = pred_b == y
    b_count = int((a_right & ~b_right).sum())
    c_count = int((~a_right & b_right).sum())

    discordant = b_count + c_count
    if discordant == 0:
        # Previously the 1e-12 guard in the denominator turned this case into
        # chi2 ~ 1e12 and p ~ 0, i.e. a spurious "highly significant" result.
        return b_count, c_count, 1.0

    chi2 = (abs(b_count - c_count) - 1) ** 2 / discordant
    # For 1 degree of freedom, chi2 = z^2, so the p-value is the two-sided normal tail of sqrt(chi2).
    z = math.sqrt(chi2)
    p = 2.0 * (1.0 - 0.5 * (1 + math.erf(z / math.sqrt(2))))
    return b_count, c_count, float(p)

def evaluate_and_write_metrics(
    set_name: str,
    oof: OOFResult,
    calibrator: PlattCalibrator,
    df_forecast: pd.DataFrame,
    coef_path: Path,
    permimp_path: Path,
    cv_details_path: Path,
) -> Path:
    """Compute OOF and test metrics plus significance tests and write them as JSON.

    Note: besides its arguments, this function reads the notebook-level
    frames ``df_train`` and ``df_test`` (for the OOF labels and the
    persistence baseline), so those must exist when it is called.

    Args:
        set_name: Feature-set label (file name, log line, ``featureset`` field).
        oof: OOF results (calibrated probabilities, threshold, valid index).
        calibrator: Not used by the computation; kept for interface compatibility.
        df_forecast: Forecast table with ``y_true``, ``y_pred`` and ``proba_cal``.
        coef_path: Path recorded in the JSON under ``coef_path``.
        permimp_path: Path recorded in the JSON under ``permimp_path``.
        cv_details_path: Path recorded in the JSON under ``cv_details_path``.

    Returns:
        Path of the written file ``METRICS_DIR / linear_logit_<set_name>.json``.
    """
    # --- OOF (valid OOF slice only) ---
    y_oof = df_train.loc[oof.oof_index, "y_direction_next"].values.astype(int)
    p_oof = oof.oof_proba_cal
    yhat_oof = (p_oof >= oof.threshold).astype(int)
    # Clip by hand instead of passing an eps argument to log_loss
    p_oof_clipped = np.clip(p_oof, 1e-15, 1.0 - 1e-15)
    oof_auc = float(roc_auc_score(y_oof, p_oof))
    oof_f1 = float(f1_score(y_oof, yhat_oof, zero_division=0))
    oof_ll = float(log_loss(y_oof, p_oof_clipped))
    oof_brier = float(brier_score_loss(y_oof, p_oof))

    # --- Test ---
    y_te = df_forecast["y_true"].values.astype(int)
    p_te = df_forecast["proba_cal"].values.astype(float)
    yhat_te = df_forecast["y_pred"].values.astype(int)
    # Clip by hand instead of passing an eps argument to log_loss
    p_te_clipped = np.clip(p_te, 1e-15, 1.0 - 1e-15)
    test_auc = float(roc_auc_score(y_te, p_te))
    test_f1 = float(f1_score(y_te, yhat_te, zero_division=0))
    test_acc = float(accuracy_score(y_te, yhat_te))
    test_ll = float(log_loss(y_te, p_te_clipped))
    test_brier = float(brier_score_loss(y_te, p_te))

    # --- Baselines for the tests ---
    yhat_up = np.ones_like(y_te, dtype=int)
    y_all = pd.concat([df_train["y_direction_next"], df_test["y_direction_next"]])
    # Persistence baseline: previous row's label. `.ffill()` replaces the
    # deprecated `fillna(method="ffill")` and yields the same values.
    y_persist = y_all.shift(1).reindex(df_forecast.index).ffill().astype(int).values

    # --- DM test (0/1 loss) ---
    loss_model = _binary_loss(y_te, yhat_te)
    loss_up = _binary_loss(y_te, yhat_up)
    loss_persist = _binary_loss(y_te, y_persist)
    dm_up, p_up = diebold_mariano_01loss(loss_model, loss_up, h=1)
    dm_pers, p_pers = diebold_mariano_01loss(loss_model, loss_persist, h=1)

    # --- McNemar vs. always-up ---
    b_cnt, c_cnt, p_mcnemar = mcnemar_test(y_te, yhat_te, yhat_up)

    # --- Write JSON ---
    metrics = {
        "featureset": set_name,
        "model_class": "Linear",
        "cv_details_path": str(cv_details_path),
        "coef_path": str(coef_path),
        "permimp_path": str(permimp_path),
        "threshold": float(oof.threshold),
        "oof_auc": oof_auc,
        "oof_f1": oof_f1,
        "oof_logloss": oof_ll,
        "oof_brier": oof_brier,
        "test_auc": test_auc,
        "test_f1": test_f1,
        "test_acc": test_acc,
        "test_logloss": test_ll,
        "test_brier": test_brier,
        "dm_vs_always_up": {"stat": dm_up, "pvalue": p_up},
        "dm_vs_persistence": {"stat": dm_pers, "pvalue": p_pers},
        "mcnemar_vs_always_up": {"b": b_cnt, "c": c_cnt, "pvalue": p_mcnemar},
        "oof_valid_n": int(len(oof.oof_index)),
    }
    out_path = METRICS_DIR / f"linear_logit_{set_name}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    logger.info(f"[{set_name}] Metrics gespeichert: {out_path}")
    return out_path


In [7]:
# %% [step6-explainability]

"""
Explainability:

* Koeffizienten je Fold + Median -> CSV
* Permutation Importance (log-loss) im letzten Train-Fenster -> Top-20 -> CSV
"""
def save_coef_profiles(
    set_name: str,
    coef_folds: pd.DataFrame,
) -> Path:
    """Write the per-fold coefficients plus their row-wise median to CSV.

    Args:
        set_name: Feature-set label used in the file name and log line.
        coef_folds: One row per feature, one column per fold; not modified.

    Returns:
        Path of ``METRICS_DIR / linear_logit_<set_name>_coefs.csv``.
    """
    with_median = coef_folds.assign(median=coef_folds.median(axis=1))
    target = METRICS_DIR / f"linear_logit_{set_name}_coefs.csv"
    with_median.to_csv(target)
    logger.info(f"[{set_name}] Koeffizienten gespeichert: {target}")
    return target


def compute_perm_importance_topk(
    set_name: str,
    feature_names: List[str],
    df_train: pd.DataFrame,
    best_C: float,
    k: int = 20,
) -> Path:
    """Permutation importance (``neg_log_loss``, 30 repeats) of a model fitted on ``df_train``.

    The pipeline is fitted on ``df_train`` and the importance is evaluated on
    that same data. The ``k`` features with the largest mean importance are
    written to CSV.

    Args:
        set_name: Feature-set label used in the file name and log line.
        feature_names: Columns to use as features.
        df_train: Training frame including both target columns.
        best_C: Regularisation parameter of the logistic regression.
        k: Number of top features to keep.

    Returns:
        Path of ``METRICS_DIR / linear_logit_<set_name>_permimp_top<k>.csv``.
    """
    features, _, target = split_Xy(df_train)
    model = _build_lr_pipeline(C=best_C)
    model.fit(features[feature_names], target)

    result = permutation_importance(
        estimator=model,
        X=features[feature_names],
        y=target,
        scoring="neg_log_loss",
        n_repeats=30,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    table = pd.DataFrame(
        {
            "feature": feature_names,
            "importance_mean": result.importances_mean,
            "importance_std": result.importances_std,
        }
    )
    top_k = table.sort_values("importance_mean", ascending=False).head(k)

    target_path = METRICS_DIR / f"linear_logit_{set_name}_permimp_top{k}.csv"
    top_k.to_csv(target_path, index=False)
    logger.info(f"[{set_name}] Permutation Importance Top-{k} gespeichert: {target_path}")
    return target_path

In [8]:
# %% [step7-plots]

"""
Plots (matplotlib-only):

* Kumulierte Renditen (Modell vs. Buy&Hold)
* Balken: Test-AUC & Test-F1 über Feature-Sets
* Top-20 Koef (Median) & Perm-Imp (Top-20)
  Dateien werden unter artifacts/reports/20_* gespeichert.
"""
def _to_growth(ret_pct: pd.Series) -> pd.Series:
    """Wandelt Monatsrendite in % in Wachstumsmultiplikator um und kumuliert."""
    g = (1.0 + ret_pct.fillna(0.0) / 100.0).cumprod()
    return g


def plot_cum_returns(set_name: str, df_perf: pd.DataFrame) -> Path:
    """Plot cumulative growth of the model strategy against buy-and-hold and save it as PNG.

    Args:
        set_name: Feature-set label used in title and file name.
        df_perf: Frame with the columns ``ret_model_pct`` and ``ret_bh_pct``.

    Returns:
        Path of ``REPORTS_DIR / 20_cumret_<set_name>.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    curves = (
        ("ret_model_pct", "Modell (Long/Cash)"),
        ("ret_bh_pct", "Buy&Hold"),
    )
    for column, label in curves:
        growth = _to_growth(df_perf[column])
        ax.plot(growth.index, growth.values, label=label)

    ax.set_title(f"Kumulierte Rendite – {set_name}")
    ax.set_xlabel("Datum")
    ax.set_ylabel("Wachstum (Start=1.0)")
    ax.grid(True, alpha=0.3)
    ax.legend()

    png_path = REPORTS_DIR / f"20_cumret_{set_name}.png"
    fig.tight_layout()
    fig.savefig(png_path, dpi=150)
    plt.close(fig)
    return png_path


def plot_bar_metrics_overview(summary: pd.DataFrame) -> Path:
    """Grouped bar chart of test AUC and test F1 per feature set, saved as PNG.

    Args:
        summary: One row per feature set (index = label) with the columns
            ``test_auc`` and ``test_f1``.

    Returns:
        Path of ``REPORTS_DIR / 20_metrics_overview.png``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    positions = np.arange(len(summary))
    bar_width = 0.35
    half = bar_width / 2

    # AUC bars sit left of each tick, F1 bars right of it.
    ax.bar(positions - half, summary["test_auc"], bar_width, label="AUC")
    ax.bar(positions + half, summary["test_f1"], bar_width, label="F1")

    ax.set_xticks(positions)
    ax.set_xticklabels(summary.index, rotation=0)
    ax.set_ylim(0, 1)
    ax.set_title("Test-Metriken je Feature-Set")
    ax.grid(True, axis="y", alpha=0.3)
    ax.legend()

    png_path = REPORTS_DIR / "20_metrics_overview.png"
    fig.tight_layout()
    fig.savefig(png_path, dpi=150)
    plt.close(fig)
    return png_path


def plot_top20_coefficients(set_name: str, coef_csv: Path) -> Path:
    """Horizontal bar chart of the 20 median coefficients with the largest absolute value.

    Args:
        set_name: Feature-set label used in title and file name.
        coef_csv: CSV with features as index and a ``median`` column.

    Returns:
        Path of ``REPORTS_DIR / 20_coef_top20_<set_name>.png``.
    """
    medians = pd.read_csv(coef_csv, index_col=0)["median"].copy()
    # Rank by magnitude, but plot the signed values.
    strongest = medians.abs().sort_values(ascending=False).head(20).index
    top = medians.reindex(strongest)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(top.index, top.values)
    ax.set_title(f"Top-20 |Median-Koeffizienten| – {set_name}")
    ax.set_xlabel("Koeffizient")
    ax.invert_yaxis()
    ax.grid(True, axis="x", alpha=0.3)

    png_path = REPORTS_DIR / f"20_coef_top20_{set_name}.png"
    fig.tight_layout()
    fig.savefig(png_path, dpi=150)
    plt.close(fig)
    return png_path


def plot_top20_permimp(set_name: str, permimp_csv: Path) -> Path:
    """Horizontal bar chart of permutation importances read from a CSV.

    Every row of the CSV is plotted in file order; no sorting or truncation
    happens here, so the "Top-20" selection is left to whatever produced the
    file.

    Args:
        set_name: Feature-set label; used in the plot title and the file name.
        permimp_csv: CSV with the columns ``feature`` and ``importance_mean``.

    Returns:
        Path of the PNG written below ``REPORTS_DIR``.
    """
    importance_table = pd.read_csv(permimp_csv)

    figure, axes = plt.subplots(figsize=(8, 6))
    axes.barh(importance_table["feature"], importance_table["importance_mean"])
    axes.set(
        title=f"Permutation Importance Top-20 – {set_name}",
        xlabel="Δ(neg_log_loss)",
    )
    # Flip the y-axis so the first CSV row is drawn at the top.
    axes.invert_yaxis()
    axes.grid(True, axis="x", alpha=0.3)

    target = REPORTS_DIR / f"20_permimp_top20_{set_name}.png"
    figure.tight_layout()
    figure.savefig(target, dpi=150)
    plt.close(figure)
    return target

In [9]:
# %% [step8-persistenz]

"""
Persistenz:

* Speichert finales LR-Modell (auf komplettem Trainingsfenster) und Platt-Kalibrator
"""
def persist_model_and_calibrator(
    set_name: str,
    feature_names: List[str],
    df_train: pd.DataFrame,
    best_C: float,
    calibrator: PlattCalibrator,
) -> Tuple[Path, Path]:
    """Refit the LR pipeline on the training frame and dump it with the calibrator.

    Args:
        set_name: Feature-set label; used in both file names and log lines.
        feature_names: Columns of the training features the pipeline is fit on.
        df_train: Training frame; passed to ``split_Xy`` to obtain X and y.
        best_C: Value forwarded as ``C`` to ``_build_lr_pipeline``.
        calibrator: Calibrator object; dumped as given, not refit here.

    Returns:
        ``(model_path, cal_path)``: the two pickle paths below ``MODELS_DIR``.
    """
    model_path = MODELS_DIR / f"linear_logit_{set_name}.pkl"
    cal_path = MODELS_DIR / f"linear_logit_{set_name}_calibrator.pkl"

    # split_Xy returns three values; the middle one is not needed here.
    features_train, _, target_train = split_Xy(df_train)
    final_pipeline = _build_lr_pipeline(C=best_C)
    final_pipeline.fit(features_train[feature_names], target_train)

    # Write both artefacts first, then log.
    for artefact, destination in ((final_pipeline, model_path), (calibrator, cal_path)):
        joblib.dump(artefact, destination)

    logger.info(f"[{set_name}] Modell gespeichert: {model_path}")
    logger.info(f"[{set_name}] Kalibrator gespeichert: {cal_path}")
    return model_path, cal_path

In [10]:
# %% [step9-uebersicht20-driver]

"""
DRIVER: Führt alles je Feature-Set (TECH/MACRO/INTEGRATED) aus und erstellt Übersicht (CSV+PNG).

* OOF/GS/Kalibrierung/Threshold
* Optionaler SGD-Check
* Walk-Forward + Forecast-CSV
* Explainability CSVs + Plots
* Metriken/Signifikanz + JSON
* Persistenz
* Gesamttabelle
"""
# Driver cell: runs the full pipeline once per entry of FEATURE_GROUPS and
# then writes one overview table (CSV) and one overview bar chart (PNG).
# Relies on notebook globals defined in earlier cells: FEATURE_GROUPS,
# df_features, df_train, df_test, s_price, METRICS_DIR and the step functions.

# set_name -> selected headline metrics (filled at the end of each iteration)
results_summary: Dict[str, Dict[str, float]] = {}
# set_name -> path of the metrics JSON written for that feature set
metrics_paths: Dict[str, Path] = {}

for set_name, feature_names in FEATURE_GROUPS.items():
    logger.info(f"=== Processing Feature-Set: {set_name} (d={len(feature_names)}) ===")

    # 1) OOF training + calibration + threshold.
    #    The third return value (`_best_pipe`) is not used further in this cell.
    oof_res, platt_cal, _best_pipe = run_oof_training_for_set(set_name, feature_names, df_train)

    # 2) Optional: SGD OOF check (not persisted); the return value is discarded.
    _ = sgd_oof_check(feature_names, df_train, set_name)

    # 3) Walk-forward test, using C and threshold from the OOF result and the
    #    calibrator from step 1.
    df_forecast, df_perf = run_walk_forward(
        set_name=set_name,
        feature_names=feature_names,
        df_full=df_features,
        df_train=df_train,
        df_test=df_test,
        s_price_full=s_price,
        best_C=oof_res.best_C,
        calibrator=platt_cal,
        threshold=oof_res.threshold,
    )

    # 4) Explainability: both calls return CSV paths that steps 5 and 7 consume.
    coef_csv = save_coef_profiles(set_name, oof_res.coef_per_fold)
    permimp_csv = compute_perm_importance_topk(set_name, feature_names, df_train, best_C=oof_res.best_C, k=20)

    # 5) Metrics + tests + JSON; the returned JSON path is re-read in step 8.
    metrics_json_path = evaluate_and_write_metrics(
        set_name=set_name,
        oof=oof_res,
        calibrator=platt_cal,
        df_forecast=df_forecast,
        coef_path=coef_csv,
        permimp_path=permimp_csv,
        cv_details_path=oof_res.cv_details_path,
    )
    metrics_paths[set_name] = metrics_json_path

    # 6) Persistence (model + calibrator)
    model_path, cal_path = persist_model_and_calibrator(
        set_name=set_name,
        feature_names=feature_names,
        df_train=df_train,
        best_C=oof_res.best_C,
        calibrator=platt_cal,
    )

    # 7) Plots (each call returns the path of the PNG it wrote)
    cumret_png = plot_cum_returns(set_name, df_perf)
    coef_png = plot_top20_coefficients(set_name, coef_csv)
    permimp_png = plot_top20_permimp(set_name, permimp_csv)

    # 8) Collect the summary row (test AUC/F1/accuracy plus OOF AUC/F1) by
    #    reading back the metrics JSON written in step 5.
    with open(metrics_json_path, "r", encoding="utf-8") as f:
        m = json.load(f)
    results_summary[set_name] = {
        "oof_auc": m["oof_auc"],
        "oof_f1": m["oof_f1"],
        "test_auc": m["test_auc"],
        "test_f1": m["test_f1"],
        "test_acc": m["test_acc"],
    }

# Overview as CSV + bar chart.
# The transpose puts one row per feature set; rows are sorted by set name.

summary_df = pd.DataFrame(results_summary).T.sort_index()
summary_csv = METRICS_DIR / "linear_logit_overview.csv"
summary_df.to_csv(summary_csv)
logger.info(f"Übersicht gespeichert: {summary_csv}")

overview_png = plot_bar_metrics_overview(summary_df)

# Done

logger.info("=== Done: Lineare Klassifikatoren (Logit) für TECH/MACRO/INTEGRATED ===")

2025-08-24 17:53:46,035 | INFO | === Processing Feature-Set: TECH (d=5) ===
2025-08-24 17:53:49,773 | INFO | [TECH] Best C=0.1 (CV neg_log_loss=-0.7116)
2025-08-24 17:53:49,926 | INFO | [TECH] OOF AUC=0.511 | OOF F1@0.01=0.824 (valid n=60)
2025-08-24 17:53:49,952 | INFO | [TECH] SGD OOF AUC=0.388 | F1@0.5=0.479 (valid n=60)
2025-08-24 17:53:50,282 | INFO | [TECH] Forecast gespeichert: C:\Users\gamer\Desktop\AktienPrognose\artifacts\forecasts\linear_logit_TECH.csv
2025-08-24 17:53:50,286 | INFO | [TECH] Koeffizienten gespeichert: C:\Users\gamer\Desktop\AktienPrognose\artifacts\metrics\linear_logit_TECH_coefs.csv
2025-08-24 17:53:50,449 | INFO | [TECH] Permutation Importance Top-20 gespeichert: C:\Users\gamer\Desktop\AktienPrognose\artifacts\metrics\linear_logit_TECH_permimp_top20.csv
  y_persist = y_all.shift(1).reindex(df_forecast.index).fillna(method="ffill").astype(int).values
2025-08-24 17:53:50,460 | INFO | [TECH] Metrics gespeichert: C:\Users\gamer\Desktop\AktienPrognose\artifacts