# 2 – Training (Final)

Dieses Notebook trainiert für eine gegebene `EXP_ID` das Zwei-Stufen-
XGBoost-Modell (Signal + Richtung). Es liest die Config und den
Trainingsdatensatz aus `data/processed/...` und erzeugt nur den
Modell-Fit und einfache Metriken.


In [7]:
import sys
from pathlib import Path
import os

cwd = Path.cwd()
project_root = cwd
while not (project_root / 'src').is_dir():
    if project_root.parent == project_root:
        raise RuntimeError("Projektwurzel mit 'src' nicht gefunden.")
    project_root = project_root.parent

print('Erkannte Projektwurzel:', project_root)
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

os.chdir(project_root)
print('Arbeitsverzeichnis gesetzt auf:', Path.cwd())


Erkannte Projektwurzel: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project
Arbeitsverzeichnis gesetzt auf: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project


In [8]:
# Bitte EXP_ID explizit setzen, passend zur Data-Prep.
EXP_ID = 'hv_flex_0_7_result'  # z.B. 'v3_h4_thr0p3pct_relaxed'
assert EXP_ID != 'CHANGE_ME', 'Bitte EXP_ID oben setzen.'

# True = mit News-Merge, False = nur Preise
# Hinweis: Wenn eine Config-Datei aus Data-Prep existiert, überschreibt sie FEATURE_MODE automatisch.
USE_NEWS = True
FEATURE_MODE = 'news+price' if USE_NEWS else 'price_only'



In [9]:
import json
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from src.utils.io import DATA_PROCESSED
from src.models.train_xgboost_two_stage import (
    split_train_val_test,
    build_signal_targets,
    build_direction_targets,
    train_xgb_binary,
    get_feature_cols,
)

plt.style.use('seaborn-v0_8')

# Datensatz laden
# Config aus Data-Prep lesen (falls vorhanden)
exp_meta_dir = DATA_PROCESSED / 'experiments'
exp_config_path = exp_meta_dir / f'{EXP_ID}_config.json'
if exp_config_path.is_file():
    with exp_config_path.open('r', encoding='utf-8') as f:
        exp_config = json.load(f)
    cfg_feature_mode = exp_config.get('feature_mode')
    if cfg_feature_mode and cfg_feature_mode != FEATURE_MODE:
        print(f'[info] FEATURE_MODE überschrieben durch Config: {FEATURE_MODE} -> {cfg_feature_mode}')
        FEATURE_MODE = cfg_feature_mode

ds_kind = 'news' if FEATURE_MODE == 'news+price' else 'price'
ds_path = DATA_PROCESSED / 'datasets' / f'eurusd_{ds_kind}_training__{EXP_ID}.csv'
print('Verwende Datensatz:', ds_path)
if not ds_path.exists():
    raise FileNotFoundError(
        f"Datensatz nicht gefunden: {ds_path}\n"
        "→ Bitte zuerst das Data-Prep-Notebook (1_data_prep_final.ipynb) mit genau derselben EXP_ID ausführen."
    )
df = pd.read_csv(ds_path, parse_dates=['date']).sort_values('date').reset_index(drop=True)
print(df.shape)
feature_cols = get_feature_cols(df)

# Price-only: entferne News-Spalten aus feature_cols
if FEATURE_MODE == 'price_only':
    drop_prefixes = ['news_']
    drop_exact = [
        'article_count',
        'avg_polarity',
        'avg_neg',
        'avg_neu',
        'avg_pos',
        'pos_share',
        'neg_share',
    ]
    feature_cols = [
        c
        for c in feature_cols
        if not any(c.startswith(p) for p in drop_prefixes) and c not in drop_exact
    ]
    print('[info] Price-only aktiv – News-Features entfernt.')

print('Feature-Spalten:', len(feature_cols))


Verwende Datensatz: data/processed/datasets/eurusd_news_training__hv_flex_0_7_result.csv
(1168, 48)
Feature-Spalten: 35


In [10]:
# Zeitliche Splits
test_start = '2025-01-01'
train_frac_pretest = 0.7

splits = split_train_val_test(
    df, pd.to_datetime(test_start), train_frac_within_pretest=train_frac_pretest
)
for name, split_df in splits.items():
    print(name, split_df['date'].min().date(), '->', split_df['date'].max().date(), 'n=', len(split_df))


train 2020-04-14 -> 2023-11-28 n= 661
val 2023-11-29 -> 2024-12-31 n= 284
test 2025-01-02 -> 2025-11-12 n= 223


In [11]:
# Optional: Debug-Zelle (nur nötig, wenn du etwas prüfen willst).
# Hinweis: Diese Zelle funktioniert erst, nachdem oben Datensatz+Splits geladen wurden.
if 'splits' not in globals() or 'feature_cols' not in globals():
    raise RuntimeError(
        "Bitte zuerst die Zellen oben ausführen (Datensatz laden + Splits + feature_cols)."
    )
if 'X_train_dir' not in globals() or 'y_train_dir' not in globals():
    X_train_dir, y_train_dir = build_direction_targets(splits['train'], feature_cols=feature_cols)

print('X_train_dir', getattr(X_train_dir, 'shape', None))
print('y_train_dir unique:', np.unique(y_train_dir, return_counts=True))
print('train signal counts:', splits['train']['signal'].value_counts().to_dict())
print('train direction counts (signal==1):', splits['train'].query('signal==1')['direction'].value_counts().to_dict())


X_train_dir (503, 21)
y_train_dir unique: (array([0, 1]), array([238, 265]))
train signal counts: {0: 444, 1: 217}
train direction counts (signal==1): {0.0: 122, 1.0: 95}


In [12]:
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix
import json

# Schwelle für das Signal-Modell (Stufe 1).
# Höhere Werte -> höhere Precision, geringerer Recall.
SIGNAL_THRESHOLD = 0.5

# --- Signal-Modell trainieren ---
y_train_signal = build_signal_targets(splits['train'])
y_val_signal   = build_signal_targets(splits['val'])
y_test_signal  = build_signal_targets(splits['test'])

X_train_signal = splits['train'][feature_cols]
X_val_signal   = splits['val'][feature_cols]
X_test_signal  = splits['test'][feature_cols]

# Class-Imbalance für das Signal-Modell explizit berücksichtigen
n_pos_signal = int((y_train_signal == 1).sum())
n_neg_signal = int((y_train_signal == 0).sum())
scale_pos_weight_signal = n_neg_signal / max(n_pos_signal, 1)
print('Signal scale_pos_weight:', scale_pos_weight_signal)

model_signal = train_xgb_binary(
    X_train_signal,
    y_train_signal,
    X_val_signal,
    y_val_signal,
    scale_pos_weight=scale_pos_weight_signal,
)
print('[ok] Signal-Modell trainiert.')

# --- Richtungs-Modell trainieren ---
X_train_dir, y_train_dir = build_direction_targets(splits['train'], feature_cols=feature_cols)
X_val_dir,   y_val_dir   = build_direction_targets(splits['val'],   feature_cols=feature_cols)
X_test_dir,  y_test_dir  = build_direction_targets(splits['test'],  feature_cols=feature_cols)

def _count01(y):
    if y is None or len(y) == 0:
        return {}
    u, c = np.unique(y, return_counts=True)
    return {int(uu): int(cc) for uu, cc in zip(u, c)}

print('[debug] Signal train counts:', _count01(y_train_signal))
print('[debug] Signal val counts  :', _count01(y_val_signal))
print('[debug] Signal test counts :', _count01(y_test_signal))
print('[debug] Direction train X/y:', getattr(X_train_dir, 'shape', None), _count01(y_train_dir))
print('[debug] Direction val   X/y:', getattr(X_val_dir, 'shape', None), _count01(y_val_dir))
print('[debug] Direction test  X/y:', getattr(X_test_dir, 'shape', None), _count01(y_test_dir))
if len(X_val_dir) == 0:
    print("[warn] Val-Split hat 0 Bewegungstage (signal==1) → DIR_THRESHOLD kann nicht val-basiert optimiert werden.")
if len(X_test_dir) == 0:
    print("[warn] Test-Split hat 0 Bewegungstage (signal==1) → Direction-Metriken sind leer.")

model_dir = train_xgb_binary(
    X_train_dir,
    y_train_dir,
    X_val_dir,
    y_val_dir,
    scale_pos_weight=1.0,
)
print('[ok] Richtungs-Modell trainiert.')

# --- Metriken berechnen und speichern ---

def binary_metrics_dict(y_true, y_prob, threshold, target_names):
    if y_true is None or len(y_true) == 0:
        return {
            'threshold': float(threshold),
            'report': {},
            'confusion_matrix': [],
        }
    y_pred = (y_prob >= threshold).astype(int)
    report = classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        output_dict=True,
        digits=3,
    )
    cm = confusion_matrix(y_true, y_pred).tolist()
    return {
        'threshold': float(threshold),
        'report': report,
        'confusion_matrix': cm,
    }

def proba_pos(model, X):
    """P(positive Klasse) als 1D-Array; liefert [] wenn X leer ist."""
    if X is None or len(X) == 0:
        return np.array([])
    proba = model.predict_proba(X)
    if getattr(proba, 'ndim', 0) != 2 or proba.shape[1] < 2:
        raise ValueError(
            f"predict_proba lieferte unerwartete Form {getattr(proba, 'shape', None)}. "
            "Das Modell ist evtl. degeneriert (z.B. Training hatte nur 1 Klasse oder leere Daten)."
        )
    return proba[:, 1]

# Wahrscheinlichkeiten
p_train_signal = proba_pos(model_signal, X_train_signal)
p_val_signal   = proba_pos(model_signal, X_val_signal)
p_test_signal  = proba_pos(model_signal, X_test_signal)

signal_metrics = {
    'train': binary_metrics_dict(y_train_signal, p_train_signal, SIGNAL_THRESHOLD, ['neutral', 'move']),
    'val':   binary_metrics_dict(y_val_signal,   p_val_signal,   SIGNAL_THRESHOLD, ['neutral', 'move']),
    'test':  binary_metrics_dict(y_test_signal,  p_test_signal,  SIGNAL_THRESHOLD, ['neutral', 'move']),
}

p_train_dir = proba_pos(model_dir, X_train_dir)
p_val_dir   = proba_pos(model_dir, X_val_dir)
p_test_dir  = proba_pos(model_dir, X_test_dir)

# Threshold für das Richtungs-Modell (down vs up) anhand des Val-Splits optimieren
DIR_THRESHOLD = 0.5
best_f1_up = None
if y_val_dir is not None and len(y_val_dir) > 0 and len(np.unique(y_val_dir)) > 1:
    thr_grid = np.linspace(0.4, 0.6, 11)
    best_thr = 0.5
    best_f1_up = -1.0
    for thr in thr_grid:
        y_val_pred = (p_val_dir >= thr).astype(int)
        rep = classification_report(
            y_val_dir,
            y_val_pred,
            target_names=['down', 'up'],
            output_dict=True,
            digits=3,
            zero_division=0,
        )
        f1_up = rep.get('up', {}).get('f1-score', -1.0)
        if f1_up > best_f1_up:
            best_f1_up = f1_up
            best_thr = thr
    DIR_THRESHOLD = float(best_thr)
    print('Richtungs-Schwelle (val-basiert):', DIR_THRESHOLD, 'F1_up(val):', best_f1_up)
else:
    print('[warn] Val-Split hat zu wenig Direction-Samples (0 oder nur 1 Klasse) → DIR_THRESHOLD=0.5')

direction_metrics = {
    'train': binary_metrics_dict(y_train_dir, p_train_dir, DIR_THRESHOLD, ['down', 'up']),
    'val':   binary_metrics_dict(y_val_dir,   p_val_dir,   DIR_THRESHOLD, ['down', 'up']),
    'test':  binary_metrics_dict(y_test_dir,  p_test_dir,  DIR_THRESHOLD, ['down', 'up']),
}

# --- Kostenbasierte Schwellen für das Richtungs-Modell bestimmen ---
from src.utils.io import DATA_PROCESSED  # für Zugriff auf Experiment-Config
exp_meta_dir = DATA_PROCESSED / 'experiments'
exp_config_path = exp_meta_dir / f'{EXP_ID}_config.json'
with exp_config_path.open('r', encoding='utf-8') as f:
    _cfg = json.load(f)
label_params = _cfg.get('label_params', {})
up_thr_label = float(label_params.get('up_threshold', 0.0))
down_thr_label = float(label_params.get('down_threshold', 0.0))
max_adv_label = label_params.get('max_adverse_move_pct', 0.01) or 0.01

# Einsatz-Größen für die Kostenfunktion (müssen zu Strategie A im Report passen)
stake_up = 100.0
stake_down = 100.0

def cost_per_trade(true_label: str, pred_label: str) -> float:
    """Approx. Trade-Kosten in CHF für Strategie A.

    Vereinfachte Annahme:
    - Korrekte Trades verdienen ca. Schwelle * Einsatz.
    - Falsche Trades bzw. Trades auf neutralen Tagen verlieren ca.
      max_adverse_move_pct * Einsatz.
    """
    true_label = str(true_label)
    pred_label = str(pred_label)

    if pred_label == 'neutral':
        return 0.0
    if true_label == 'neutral':
        # konservativ: immer Stop-Loss
        return -stake_up * max_adv_label if pred_label == 'up' else -stake_down * max_adv_label
    if pred_label == 'up':
        if true_label == 'up':
            return stake_up * up_thr_label
        else:  # true_label == 'down'
            return -stake_up * max_adv_label
    if pred_label == 'down':
        if true_label == 'down':
            return stake_down * (-down_thr_label)
        else:  # true_label == 'up'
            return -stake_down * max_adv_label
    return 0.0

labels_val = splits['val']['label'].to_numpy()
signal_pred_val = (p_val_signal >= SIGNAL_THRESHOLD).astype(int)
p_val_dir_all = proba_pos(model_dir, splits['val'][feature_cols])

thr_candidates = np.linspace(0.3, 0.7, 17)
best_pnl = -1e18
best_thr_down = 0.4
best_thr_up = 0.6

for thr_down in thr_candidates:
    for thr_up in thr_candidates:
        if thr_down >= thr_up:
            continue
        pnl = 0.0
        for prob, sig, true_lab in zip(p_val_dir_all, signal_pred_val, labels_val):
            if sig == 0:
                continue  # kein Trade, wenn Stufe 1 schon neutral ist
            if prob >= thr_up:
                pred_label = 'up'
            elif prob <= thr_down:
                pred_label = 'down'
            else:
                pred_label = 'neutral'
            pnl += cost_per_trade(true_lab, pred_label)
        if pnl > best_pnl:
            best_pnl = pnl
            best_thr_down = thr_down
            best_thr_up = thr_up

DIR_THR_DOWN = float(best_thr_down)
DIR_THR_UP = float(best_thr_up)
print('Richtungs-Schwellen (kostenbasiert, Val):', DIR_THR_DOWN, DIR_THR_UP, 'P&L(val):', best_pnl)

# --- Kostenbasierte Schwelle für das Signal-Modell bestimmen ---
thr_sig_candidates = np.linspace(0.3, 0.7, 17)
best_sig_thr = SIGNAL_THRESHOLD
best_sig_pnl = -1e18
for thr_sig in thr_sig_candidates:
    pnl = 0.0
    for sig_prob, dir_prob, true_lab in zip(p_val_signal, p_val_dir_all, labels_val):
        if sig_prob < thr_sig:
            pred_label = 'neutral'
        else:
            if dir_prob >= DIR_THR_UP:
                pred_label = 'up'
            elif dir_prob <= DIR_THR_DOWN:
                pred_label = 'down'
            else:
                pred_label = 'neutral'
        pnl += cost_per_trade(true_lab, pred_label)
    if pnl > best_sig_pnl:
        best_sig_pnl = pnl
        best_sig_thr = thr_sig

SIG_THR_TRADE = float(best_sig_thr)
print('Signal-Schwelle (kostenbasiert, Val):', SIG_THR_TRADE, 'P&L(val):', best_sig_pnl)

# Kombinierte 3-Klassen-Auswertung auf Test
X_test_all = splits['test'][feature_cols]
signal_prob_test = proba_pos(model_signal, X_test_all)
signal_pred_test = (signal_prob_test >= SIGNAL_THRESHOLD).astype(int)
dir_prob_test = proba_pos(model_dir, X_test_all)

combined_pred = np.full(len(signal_prob_test), 'neutral', dtype=object)
mask_signal_trade = signal_prob_test >= SIG_THR_TRADE
combined_pred[mask_signal_trade & (dir_prob_test >= DIR_THR_UP)] = 'up'
combined_pred[mask_signal_trade & (dir_prob_test <= DIR_THR_DOWN)] = 'down'

combined_true = splits['test']['label'].to_numpy()

combined_report = classification_report(
    combined_true,
    combined_pred,
    labels=['neutral', 'up', 'down'],
    output_dict=True,
    digits=3,
)
combined_cm = confusion_matrix(
    combined_true,
    combined_pred,
    labels=['neutral', 'up', 'down'],
).tolist()

# Modell-Parameter + Feature-Importances
signal_params = model_signal.get_xgb_params()
direction_params = model_dir.get_xgb_params()
signal_params['feature_importances_'] = model_signal.feature_importances_.tolist()
direction_params['feature_importances_'] = model_dir.feature_importances_.tolist()

# Config laden
from src.utils.io import DATA_PROCESSED
exp_meta_dir = DATA_PROCESSED / 'experiments'
exp_config_path = exp_meta_dir / f'{EXP_ID}_config.json'
with exp_config_path.open('r', encoding='utf-8') as f:
    exp_config = json.load(f)

feature_mode = FEATURE_MODE

config_block = {
    'exp_id': exp_config.get('exp_id', EXP_ID),
    'price_source': exp_config.get('label_params', {}).get('price_source'),
    'drop_weekends': exp_config.get('label_params', {}).get('drop_weekends'),
    'horizon_days': exp_config.get('label_params', {}).get('horizon_days'),
    'up_threshold': exp_config.get('label_params', {}).get('up_threshold'),
    'down_threshold': exp_config.get('label_params', {}).get('down_threshold'),
    'strict_monotonic': exp_config.get('label_params', {}).get('strict_monotonic'),
    'max_adverse_move_pct': exp_config.get('label_params', {}).get('max_adverse_move_pct'),
    'hit_within_horizon': exp_config.get('label_params', {}).get('hit_within_horizon'),
    'first_hit_wins': exp_config.get('label_params', {}).get('first_hit_wins'),
    'dataset_path': str(ds_path),
    'feature_cols': feature_cols,
    'test_start': test_start,
    'train_frac_within_pretest': train_frac_pretest,
    'signal_threshold': SIGNAL_THRESHOLD,
    'signal_threshold_trade': SIG_THR_TRADE,
    'direction_threshold': DIR_THRESHOLD,
    'direction_threshold_down': DIR_THR_DOWN,
    'direction_threshold_up': DIR_THR_UP,
    'feature_mode': feature_mode,
}

results = {
    'config': config_block,
    'model_params': {
        'signal': signal_params,
        'direction': direction_params,
    },
    'signal': signal_metrics,
    'direction': direction_metrics,
    'combined_test': {
        'report': combined_report,
        'confusion_matrix': combined_cm,
        'labels': ['neutral', 'up', 'down'],
    },
}

# Ergebnisse in Standard- und Final-Ordner schreiben
base_results_dir = Path('notebooks') / 'results'
final_results_dir = base_results_dir / 'final_two_stage'
base_results_dir.mkdir(parents=True, exist_ok=True)
final_results_dir.mkdir(parents=True, exist_ok=True)

json_base = base_results_dir / f'two_stage__{EXP_ID}.json'
json_final = final_results_dir / f'two_stage_final__{EXP_ID}.json'

with json_base.open('w') as f:
    json.dump(results, f, indent=2)
with json_final.open('w') as f:
    json.dump(results, f, indent=2)

# einfache Metrik-Tabelle (F1 der positiven Klasse)
rows = []
for model_key, model_name, pos_label in [
    ('signal', 'signal', 'move'),
    ('direction', 'direction', 'up'),
]:
    metrics = results[model_key]
    for split, m in metrics.items():
        rep = m['report']
        cls = rep.get(pos_label, {})
        rows.append({
            'model': model_name,
            'split': split,
            'precision_pos': cls.get('precision'),
            'recall_pos': cls.get('recall'),
            'f1_pos': cls.get('f1-score'),
        })

metrics_df = pd.DataFrame(rows)
csv_final = final_results_dir / f'two_stage_final__{EXP_ID}_metrics.csv'
metrics_df.to_csv(csv_final, index=False)

# Test-Predictions als CSV für Fehlklassifikations-Analysen speichern
test_dates = splits['test']['date'].to_numpy()
test_labels = splits['test']['label'].to_numpy()

pred_rows = []
for dt, y_true, sig_p, sig_hat, dir_p, comb_hat in zip(
    test_dates,
    test_labels,
    signal_prob_test,
    signal_pred_test,
    dir_prob_test,
    combined_pred,
):
    pred_rows.append({
        'date': dt,
        'label_true': y_true,
        'signal_prob': float(sig_p),
        'signal_pred': int(sig_hat),
        'direction_prob_up': float(dir_p),
        'direction_pred_up': 1 if comb_hat == 'up' else (0 if comb_hat == 'down' else -1),
        'combined_pred': comb_hat,
    })

pred_df = pd.DataFrame(pred_rows)
pred_path = final_results_dir / f'two_stage_final__{EXP_ID}_predictions.csv'
pred_df.to_csv(pred_path, index=False)

print('[ok] Ergebnisse gespeichert unter:')
print('   JSON base :', json_base)
print('   JSON final:', json_final)
print('   CSV final :', csv_final)
print('   Predictions:', pred_path)


Signal scale_pos_weight: 2.046082949308756
[ok] Signal-Modell trainiert.
[debug] Signal train counts: {0: 444, 1: 217}
[debug] Signal val counts  : {0: 236, 1: 48}
[debug] Signal test counts : {0: 168, 1: 55}
[debug] Direction train X/y: (217, 35) {0: 122, 1: 95}
[debug] Direction val   X/y: (48, 35) {0: 27, 1: 21}
[debug] Direction test  X/y: (55, 35) {0: 14, 1: 41}
[ok] Richtungs-Modell trainiert.




Richtungs-Schwelle (val-basiert): 0.4 F1_up(val): 0.45714285714285713
Richtungs-Schwellen (kostenbasiert, Val): 0.39999999999999997 0.575 P&L(val): 5.199999999999999
Signal-Schwelle (kostenbasiert, Val): 0.5 P&L(val): 5.199999999999999
[ok] Ergebnisse gespeichert unter:
   JSON base : notebooks/results/two_stage__hv_flex_0_7_result.json
   JSON final: notebooks/results/final_two_stage/two_stage_final__hv_flex_0_7_result.json
   CSV final : notebooks/results/final_two_stage/two_stage_final__hv_flex_0_7_result_metrics.csv
   Predictions: notebooks/results/final_two_stage/two_stage_final__hv_flex_0_7_result_predictions.csv
