# 1 – Data Prep (Final)

Dieses Notebook erzeugt für eine frei wählbare `EXP_ID` die Labels
und den Trainingsdatensatz für das Zwei-Stufen-XGBoost-Modell.

Es ruft nur die Python-Pipelines auf und enthält bewusst **kein**
Modell-Training und keine umfangreichen Visualisierungen.


In [109]:
import sys
from pathlib import Path
import os

# Locate the project root: the closest directory, starting at the current
# working directory and walking upwards, that contains a 'src' folder.
cwd = Path.cwd()
project_root = cwd
for project_root in (cwd, *cwd.parents):
    if (project_root / 'src').is_dir():
        break
else:
    # Reached the filesystem root without ever seeing a 'src' directory.
    raise RuntimeError("Projektwurzel mit 'src' nicht gefunden.")

print('Erkannte Projektwurzel:', project_root)

# Make the root importable so that `src.*` modules resolve from this notebook.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Run the rest of the notebook from the project root.
os.chdir(project_root)
print('Arbeitsverzeichnis gesetzt auf:', Path.cwd())


Erkannte Projektwurzel: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project
Arbeitsverzeichnis gesetzt auf: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project


In [110]:
# Experiment configuration – set EXP_ID explicitly before running the notebook.
#
# Typical examples:
#   EXP_ID = 'v3_h4_thr0p3pct_relaxed'
#   EXP_ID = 'v9_h4_thr0p5pct_tol0p3_30dfeat'
#   EXP_ID = 's1_h4_thr0p5pct_tol0p3'

EXP_ID = 'nv28_h6_thr1p_5pct_tolerant_p_pct_5p'  # <- set here

# Guard against running with the untouched placeholder value.
assert EXP_ID != 'CHANGE_ME', (
    'Bitte EXP_ID oben setzen, bevor du weiterläufst.'
)

# Label parameters – adjust as needed.
# They are forwarded unchanged as keyword arguments to `label_eurusd`.
LABEL_PARAMS = {
    'horizon_days': 6,
    'up_threshold': 0.015,
    'down_threshold': -0.015,
    'strict_monotonic': False,
    'max_adverse_move_pct': None,
}


In [111]:
import json
from pathlib import Path

from src.utils.io import DATA_PROCESSED
from src.data.label_eurusd import label_eurusd
from src.data.build_training_set import build_training_dataframe

# Echo the active configuration so it is captured in the cell output.
print('EXP_ID:', EXP_ID)
print('LABEL_PARAMS:', LABEL_PARAMS)

# 1) Persist the experiment configuration as JSON
exp_meta_dir = DATA_PROCESSED / 'experiments'
exp_meta_dir.mkdir(parents=True, exist_ok=True)
exp_config = dict(exp_id=EXP_ID, label_params=LABEL_PARAMS)
exp_config_path = exp_meta_dir / f'{EXP_ID}_config.json'
with exp_config_path.open('w', encoding='utf-8') as config_file:
    config_file.write(json.dumps(exp_config, indent=2))
print('[ok] Experiment-Konfiguration gespeichert unter:', exp_config_path)

# 2) Compute the labels and store them twice: once under an
#    experiment-specific file name and once under the generic name.
labels = label_eurusd(**LABEL_PARAMS)
fx_dir = DATA_PROCESSED / 'fx'
fx_dir.mkdir(parents=True, exist_ok=True)
labels_path_exp = fx_dir / f'eurusd_labels__{EXP_ID}.csv'
labels_path_latest = fx_dir / 'eurusd_labels.csv'
for _labels_target in (labels_path_exp, labels_path_latest):
    labels.to_csv(_labels_target)
print('[ok] Labels gespeichert als:')
for _labels_target in (labels_path_exp, labels_path_latest):
    print('   ', _labels_target)

# 3) Build the training dataset and store it twice in the same way.
# NOTE(review): presumably build_training_dataframe reads the label file
# written in step 2, so this step should stay after it – confirm against
# src.data.build_training_set.
merged = build_training_dataframe(exp_id=EXP_ID)
ds_dir = DATA_PROCESSED / 'datasets'
ds_dir.mkdir(parents=True, exist_ok=True)
train_path_exp = ds_dir / f'eurusd_news_training__{EXP_ID}.csv'
train_path_latest = ds_dir / 'eurusd_news_training.csv'
for _train_target in (train_path_exp, train_path_latest):
    merged.to_csv(_train_target, index=False)
print('[ok] Trainingsdatensatz gespeichert als:')
for _train_target in (train_path_exp, train_path_latest):
    print('   ', _train_target)


EXP_ID: nv28_h6_thr1p_5pct_tolerant_p_pct_5p
LABEL_PARAMS: {'horizon_days': 6, 'up_threshold': 0.015, 'down_threshold': -0.015, 'strict_monotonic': False, 'max_adverse_move_pct': None}
[ok] Experiment-Konfiguration gespeichert unter: data/processed/experiments/nv28_h6_thr1p_5pct_tolerant_p_pct_5p_config.json
[ok] Labels gespeichert als:
    data/processed/fx/eurusd_labels__nv28_h6_thr1p_5pct_tolerant_p_pct_5p.csv
    data/processed/fx/eurusd_labels.csv
[ok] Trainingsdatensatz gespeichert als:
    data/processed/datasets/eurusd_news_training__nv28_h6_thr1p_5pct_tolerant_p_pct_5p.csv
    data/processed/datasets/eurusd_news_training.csv
