# 2 – Training (Final)

Dieses Notebook trainiert für eine gegebene `EXP_ID` das Zwei-Stufen-
XGBoost-Modell (Signal + Richtung). Es liest die Config und den
Trainingsdatensatz aus `data/processed/...` und erzeugt nur den
Modell-Fit und einfache Metriken. Die Ergebnisse (JSON, Metrik-CSV und
Test-Predictions-CSV) werden unter `notebooks/results/` gespeichert.


In [1]:
import sys
from pathlib import Path
import os

# Search the current directory and then each of its ancestors for a 'src' folder;
# the first match is treated as the project root.
cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / 'src').is_dir()),
    None,
)
if project_root is None:
    raise RuntimeError("Projektwurzel mit 'src' nicht gefunden.")

print('Erkannte Projektwurzel:', project_root)

# Register the root on sys.path only once so re-running the cell adds no duplicates.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Switch the working directory so relative paths resolve against the project root.
os.chdir(project_root)
print('Arbeitsverzeichnis gesetzt auf:', Path.cwd())


Erkannte Projektwurzel: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project
Arbeitsverzeichnis gesetzt auf: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project


In [2]:
# Set EXP_ID explicitly; it has to match the id used in data prep.
EXP_ID = 'hv5_h4_thr0p4pct_hit_5'  # e.g. 'v3_h4_thr0p3pct_relaxed'

# Refuse to continue while the placeholder value is still in place.
if EXP_ID == 'CHANGE_ME':
    raise AssertionError('Bitte EXP_ID oben setzen.')


In [3]:
import json
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from src.utils.io import DATA_PROCESSED
from src.models.train_xgboost_two_stage import (
    split_train_val_test,
    build_signal_targets,
    build_direction_targets,
    train_xgb_binary,
    get_feature_cols,
)

plt.style.use('seaborn-v0_8')

# Load the training dataset that belongs to this experiment id.
dataset_name = f'eurusd_news_training__{EXP_ID}.csv'
ds_path = DATA_PROCESSED / 'datasets' / dataset_name
print('Verwende Datensatz:', ds_path)

# Parse the 'date' column, order rows chronologically and renumber the index from 0.
raw_df = pd.read_csv(ds_path, parse_dates=['date'])
df = raw_df.sort_values('date').reset_index(drop=True)
print(df.shape)

feature_cols = get_feature_cols(df)
print('Feature-Spalten:', len(feature_cols))


Verwende Datensatz: data/processed/datasets/eurusd_news_training__hv5_h4_thr0p4pct_hit_5.csv
(1163, 44)
Feature-Spalten: 35


In [4]:
# Parameters of the time-based train/val/test split.
test_start = '2025-01-01'
train_frac_pretest = 0.8

splits = split_train_val_test(
    df,
    pd.to_datetime(test_start),
    train_frac_within_pretest=train_frac_pretest,
)

# Sanity check: print the covered date range and the row count of every split.
for name in splits:
    split_df = splits[name]
    first_day = split_df['date'].min().date()
    last_day = split_df['date'].max().date()
    print(name, first_day, '->', last_day, 'n=', len(split_df))


train 2020-04-14 -> 2024-04-09 n= 756
val 2024-04-10 -> 2024-12-31 n= 189
test 2025-01-02 -> 2025-11-05 n= 218


In [5]:
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix
import json

# Decision threshold for the signal model (stage 1).
# Higher values -> higher precision, lower recall.
SIGNAL_THRESHOLD = 0.5

# --- Train the signal model ---
split_names = ('train', 'val', 'test')

# Targets and feature matrices are built per split, in train/val/test order.
y_train_signal, y_val_signal, y_test_signal = (
    build_signal_targets(splits[split_name]) for split_name in split_names
)
X_train_signal, X_val_signal, X_test_signal = (
    splits[split_name][feature_cols] for split_name in split_names
)

# Account for class imbalance explicitly: weight = #negatives / #positives.
# The denominator is clamped to 1 so a train split without positives cannot divide by zero.
n_pos_signal = int((y_train_signal == 1).sum())
n_neg_signal = int((y_train_signal == 0).sum())
scale_pos_weight_signal = n_neg_signal / max(n_pos_signal, 1)
print('Signal scale_pos_weight:', scale_pos_weight_signal)

model_signal = train_xgb_binary(
    X_train_signal, y_train_signal,
    X_val_signal, y_val_signal,
    scale_pos_weight=scale_pos_weight_signal,
)
print('[ok] Signal-Modell trainiert.')

# --- Train the direction model ---
# build_direction_targets returns an (X, y) pair per split; calls run in train/val/test order.
direction_sets = {
    split_name: build_direction_targets(splits[split_name], feature_cols=feature_cols)
    for split_name in ('train', 'val', 'test')
}
X_train_dir, y_train_dir = direction_sets['train']
X_val_dir, y_val_dir = direction_sets['val']
X_test_dir, y_test_dir = direction_sets['test']

# scale_pos_weight is fixed at 1.0 for the direction model.
model_dir = train_xgb_binary(
    X_train_dir, y_train_dir,
    X_val_dir, y_val_dir,
    scale_pos_weight=1.0,
)
print('[ok] Richtungs-Modell trainiert.')

# --- Compute and store metrics ---

def binary_metrics_dict(y_true, y_prob, threshold, target_names):
    """Summarise a binary classifier at a fixed probability threshold.

    Parameters
    ----------
    y_true : array-like
        True labels encoded as 0 / 1.
    y_prob : array-like
        Predicted probability of class 1 for every sample.
    threshold : float
        Samples with ``y_prob >= threshold`` are predicted as class 1.
    target_names : sequence of two str
        Display names for class 0 and class 1 (in that order); they become
        the per-class keys of the report.

    Returns
    -------
    dict
        ``'threshold'`` (float), ``'report'`` (the ``classification_report``
        dict) and ``'confusion_matrix'`` (nested list, rows = true class,
        columns = predicted class, both in 0/1 order).
    """
    class_ids = [0, 1]
    y_pred = (np.asarray(y_prob) >= threshold).astype(int)

    # Pin the label set: without it sklearn infers the classes from the data,
    # so a split/threshold where only one class occurs would either raise
    # (two target_names for one label) or return a 1x1 confusion matrix.
    # zero_division=0 keeps the value sklearn already uses for undefined
    # precision/recall but without emitting a warning for every call.
    # `digits` is not passed: it is ignored when output_dict=True.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=target_names,
        output_dict=True,
        zero_division=0,
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids).tolist()
    return {
        'threshold': float(threshold),
        'report': report,
        'confusion_matrix': cm,
    }

# Probability of class 1 ('move') from the signal model, for every split.
p_train_signal = model_signal.predict_proba(X_train_signal)[:, 1]
p_val_signal = model_signal.predict_proba(X_val_signal)[:, 1]
p_test_signal = model_signal.predict_proba(X_test_signal)[:, 1]

# One metrics dict per split, all evaluated at the same fixed signal threshold.
signal_class_names = ['neutral', 'move']
signal_metrics = {
    split_name: binary_metrics_dict(y_split, p_split, SIGNAL_THRESHOLD, signal_class_names)
    for split_name, y_split, p_split in (
        ('train', y_train_signal, p_train_signal),
        ('val', y_val_signal, p_val_signal),
        ('test', y_test_signal, p_test_signal),
    )
}

# Probability of class 1 ('up') from the direction model, for every split.
p_train_dir = model_dir.predict_proba(X_train_dir)[:, 1]
p_val_dir   = model_dir.predict_proba(X_val_dir)[:, 1]
p_test_dir  = model_dir.predict_proba(X_test_dir)[:, 1]

# Tune the direction threshold (down vs up) on the validation split only:
# pick the grid value with the highest F1 of the 'up' class.
# NOTE(review): F1 of 'up' alone likely favours low thresholds (more 'up'
# predictions raise its recall) - consider macro-F1 or a wider grid if the
# optimum keeps landing on the grid edge.
thr_grid = np.linspace(0.4, 0.6, 11)
best_thr = 0.5
best_f1_up = -1.0
for thr in thr_grid:
    y_val_pred = (p_val_dir >= thr).astype(int)
    # labels=[0, 1] keeps the report well-defined even if only one class
    # shows up in y_val_dir and y_val_pred together (otherwise sklearn raises
    # because two target_names are given for a single inferred label);
    # zero_division=0 yields F1 = 0 for an undefined score without a warning.
    rep = classification_report(
        y_val_dir,
        y_val_pred,
        labels=[0, 1],
        target_names=['down', 'up'],
        output_dict=True,
        zero_division=0,
    )
    f1_up = rep['up']['f1-score']
    # Strict '>' keeps the lowest threshold when several grid values tie.
    if f1_up > best_f1_up:
        best_f1_up = f1_up
        best_thr = thr

DIR_THRESHOLD = float(best_thr)
print('Richtungs-Schwelle (val-basiert):', DIR_THRESHOLD, 'F1_up(val):', best_f1_up)

# An optimum on the edge of the grid means the search was limited by the grid,
# not by the metric - make that visible instead of silently accepting it.
if DIR_THRESHOLD in (float(thr_grid[0]), float(thr_grid[-1])):
    print('[warn] Richtungs-Schwelle liegt am Rand des Suchgitters - Gitter bzw. Auswahlmetrik pruefen.')

direction_metrics = {
    'train': binary_metrics_dict(y_train_dir, p_train_dir, DIR_THRESHOLD, ['down', 'up']),
    'val':   binary_metrics_dict(y_val_dir,   p_val_dir,   DIR_THRESHOLD, ['down', 'up']),
    'test':  binary_metrics_dict(y_test_dir,  p_test_dir,  DIR_THRESHOLD, ['down', 'up']),
}

# Combined 3-class evaluation on the test split: the signal model decides
# neutral vs move, the direction model names the move as up or down.
combined_labels = ['neutral', 'up', 'down']

# Both models score every test row, not only the rows labelled as moves.
X_test_all = splits['test'][feature_cols]
signal_prob_test = model_signal.predict_proba(X_test_all)[:, 1]
dir_prob_test = model_dir.predict_proba(X_test_all)[:, 1]

signal_pred_test = (signal_prob_test >= SIGNAL_THRESHOLD).astype(int)
dir_pred_test = (dir_prob_test >= DIR_THRESHOLD).astype(int)

# A row is 'neutral' whenever the signal model predicts 0; otherwise it takes
# the direction model's up/down decision.
direction_name = np.where(dir_pred_test == 1, 'up', 'down')
combined_pred = np.where(signal_pred_test == 0, 'neutral', direction_name)
combined_true = splits['test']['label'].to_numpy()

combined_report = classification_report(
    combined_true,
    combined_pred,
    labels=combined_labels,
    output_dict=True,
    digits=3,
)
combined_cm = confusion_matrix(combined_true, combined_pred, labels=combined_labels).tolist()

# Model parameters plus feature importances, kept together per model.
signal_params = {
    **model_signal.get_xgb_params(),
    'feature_importances_': model_signal.feature_importances_.tolist(),
}
direction_params = {
    **model_dir.get_xgb_params(),
    'feature_importances_': model_dir.feature_importances_.tolist(),
}

# Load the experiment config
from src.utils.io import DATA_PROCESSED
exp_meta_dir = DATA_PROCESSED / 'experiments'
exp_config_path = exp_meta_dir / f'{EXP_ID}_config.json'
exp_config = json.loads(exp_config_path.read_text(encoding='utf-8'))

# Label parameters are optional in the config; missing entries are stored as None.
label_params = exp_config.get('label_params', {})
label_param_keys = ('horizon_days', 'up_threshold', 'down_threshold', 'strict_monotonic')

config_block = {
    'exp_id': exp_config.get('exp_id', EXP_ID),
    **{key: label_params.get(key) for key in label_param_keys},
    'dataset_path': str(ds_path),
    'feature_cols': feature_cols,
    'test_start': test_start,
    'train_frac_within_pretest': train_frac_pretest,
    'signal_threshold': SIGNAL_THRESHOLD,
    'direction_threshold': DIR_THRESHOLD,
}

# Everything that gets persisted for this run, in one nested dict.
model_params_block = {'signal': signal_params, 'direction': direction_params}
combined_block = {
    'report': combined_report,
    'confusion_matrix': combined_cm,
    'labels': ['neutral', 'up', 'down'],
}
results = {
    'config': config_block,
    'model_params': model_params_block,
    'signal': signal_metrics,
    'direction': direction_metrics,
    'combined_test': combined_block,
}

# Write the results to both the standard and the final results folder.
base_results_dir = Path('notebooks') / 'results'
final_results_dir = base_results_dir / 'final_two_stage'
for results_dir in (base_results_dir, final_results_dir):
    results_dir.mkdir(parents=True, exist_ok=True)

json_base = base_results_dir / f'two_stage__{EXP_ID}.json'
json_final = final_results_dir / f'two_stage_final__{EXP_ID}.json'

# The same results dict is dumped to both locations.
for json_path in (json_base, json_final):
    with json_path.open('w') as f:
        json.dump(results, f, indent=2)

# Simple metrics table: precision / recall / F1 of the positive class per model and split.
positive_class = {'signal': 'move', 'direction': 'up'}
rows = [
    {
        'model': model_key,
        'split': split,
        'precision_pos': m['report'].get(pos_label, {}).get('precision'),
        'recall_pos': m['report'].get(pos_label, {}).get('recall'),
        'f1_pos': m['report'].get(pos_label, {}).get('f1-score'),
    }
    for model_key, pos_label in positive_class.items()
    for split, m in results[model_key].items()
]

metrics_df = pd.DataFrame(rows)
csv_final = final_results_dir / f'two_stage_final__{EXP_ID}_metrics.csv'
metrics_df.to_csv(csv_final, index=False)

# Store the test predictions as CSV for later misclassification analysis.
test_dates = splits['test']['date'].to_numpy()
test_labels = splits['test']['label'].to_numpy()

# One record per test row; numpy scalars are converted to plain Python numbers.
pred_rows = [
    {
        'date': dt,
        'label_true': y_true,
        'signal_prob': float(sig_p),
        'signal_pred': int(sig_hat),
        'direction_prob_up': float(dir_p),
        'direction_pred_up': int(dir_hat),
        'combined_pred': comb_hat,
    }
    for dt, y_true, sig_p, sig_hat, dir_p, dir_hat, comb_hat in zip(
        test_dates,
        test_labels,
        signal_prob_test,
        signal_pred_test,
        dir_prob_test,
        dir_pred_test,
        combined_pred,
    )
]

pred_df = pd.DataFrame(pred_rows)
pred_path = final_results_dir / f'two_stage_final__{EXP_ID}_predictions.csv'
pred_df.to_csv(pred_path, index=False)

# Summary of everything this cell wrote to disk.
print('[ok] Ergebnisse gespeichert unter:')
for caption, written_path in (
    ('   JSON base :', json_base),
    ('   JSON final:', json_final),
    ('   CSV final :', csv_final),
    ('   Predictions:', pred_path),
):
    print(caption, written_path)


Signal scale_pos_weight: 0.18309859154929578




[ok] Signal-Modell trainiert.
[ok] Richtungs-Modell trainiert.
Richtungs-Schwelle (val-basiert): 0.4 F1_up(val): 0.7341772151898734
[ok] Ergebnisse gespeichert unter:
   JSON base : notebooks/results/two_stage__hv5_h4_thr0p4pct_hit_5.json
   JSON final: notebooks/results/final_two_stage/two_stage_final__hv5_h4_thr0p4pct_hit_5.json
   CSV final : notebooks/results/final_two_stage/two_stage_final__hv5_h4_thr0p4pct_hit_5_metrics.csv
   Predictions: notebooks/results/final_two_stage/two_stage_final__hv5_h4_thr0p4pct_hit_5_predictions.csv


