# 1 – Data Prep (Final, MT5 H1)

Dieses Notebook erzeugt für eine frei wählbare `EXP_ID` die Labels
und den Trainingsdatensatz für das Zwei-Stufen-XGBoost-Modell –
allerdings **aus MT5-H1-Daten** (stündliche Bars), die zu Tagesdaten (Daily) aggregiert werden.

Wichtig: Entscheidungen (News ja/nein, Feature-Mode, etc.) werden
über Variablen gesteuert – nicht über Namenskonventionen der EXP_ID.


In [10]:
import sys
from pathlib import Path
import os

# Locate the project root: the nearest directory, starting at the current
# working directory and moving upward, that contains a 'src' folder.
cwd = Path.cwd()
for project_root in (cwd, *cwd.parents):
    if (project_root / 'src').is_dir():
        break
else:
    # Every ancestor up to the filesystem root was checked without success.
    raise RuntimeError("Projektwurzel mit 'src' nicht gefunden.")

print('Erkannte Projektwurzel:', project_root)

# Put the project root on the import path (once) so `src.*` can be imported.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Run the rest of the notebook from the project root so that relative paths
# resolve against it.
os.chdir(project_root)
print('Arbeitsverzeichnis gesetzt auf:', Path.cwd())


Erkannte Projektwurzel: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project
Arbeitsverzeichnis gesetzt auf: /Users/jeremynathan/Documents/GitHub/hs2025_ml_project/hs2025_ml_project


In [11]:
# ==============================
# EXPERIMENT SETTINGS (H1 -> Daily)
# ==============================

# Identifier of this experiment run.
EXP_ID = 'hp_mt5_flex_result'  # <- set to a unique value
assert EXP_ID != 'CHANGE_ME', 'Bitte EXP_ID oben setzen, bevor du weiterläufst.'

# True = merge news features, False = prices only.
USE_NEWS = False
if USE_NEWS:
    FEATURE_MODE = 'news+price'
else:
    FEATURE_MODE = 'price_only'

# If True: also overwrites the "latest" files (as the old final notebooks did).
WRITE_LATEST = False

# MT5 H1 export (View -> Symbols -> Bars -> Export Bars).
H1_CSV_PATH = Path('data/raw/fx/EURUSD_mt5_H1_2015_2025.csv')

# Daily cut (in hours, server time of the MT5 export).
# 0 = calendar day 00:00..23:00, 22 = typical FX cut (broker-dependent).
CUT_HOUR = 0

DROP_WEEKENDS = False

# Keyword arguments that are forwarded to the labelling function.
LABEL_PARAMS = {
    'horizon_days': 15,
    'up_threshold': 0.02,
    'down_threshold': -0.02,
    'strict_monotonic': False,
    'max_adverse_move_pct': 0.004,
    'hit_within_horizon': True,
    'first_hit_wins': True,
    # 'close' | 'hl' | 'h1' (h1 = order-aware via hourly bars)
    'hit_source': 'h1',
    'intraday_tie_breaker': 'down',
}


In [12]:
import json
from pathlib import Path

import pandas as pd

from src.utils.io import DATA_PROCESSED
from src.data.mt5_h1 import load_mt5_export_bars, h1_to_daily_ohlc, h1_daily_intraday_features, Mt5H1DailyFeatureConfig
from src.data.label_eurusd import label_eurusd_from_daily_prices, label_eurusd_from_daily_and_h1
from src.data.build_training_set import build_training_dataframe, build_price_only_training_dataframe_from_labels

# Echo the active settings so the cell output documents this run.
print('EXP_ID:', EXP_ID)
print('FEATURE_MODE:', FEATURE_MODE)
print('H1_CSV_PATH:', H1_CSV_PATH)
print('CUT_HOUR:', CUT_HOUR)
print('DROP_WEEKENDS:', DROP_WEEKENDS)
print('LABEL_PARAMS:', LABEL_PARAMS)

# 1) Load the H1 bars and derive daily OHLC from them.
h1 = load_mt5_export_bars(H1_CSV_PATH)
if DROP_WEEKENDS:
    # pandas dayofweek: Monday=0 ... Sunday=6, so `< 5` keeps Monday-Friday.
    weekday_mask = h1.index.dayofweek < 5
    h1 = h1[weekday_mask]

daily = h1_to_daily_ohlc(h1, cut_hour=CUT_HOUR, drop_weekends=DROP_WEEKENDS)

# 2) Additional per-day intraday features (from H1), configured with the same
#    cut hour and weekend setting as the daily aggregation above.
h1_feat_cfg = Mt5H1DailyFeatureConfig(cut_hour=CUT_HOUR, drop_weekends=DROP_WEEKENDS)
intraday_feat = h1_daily_intraday_features(h1, cfg=h1_feat_cfg)

# Quick sanity output: row counts and the date span covered by the daily bars.
n_h1, n_daily, n_intraday = len(h1), len(daily), len(intraday_feat)
print('[info] H1 rows:', n_h1, 'Daily days:', n_daily, 'Intraday days:', n_intraday)
first_day = daily.index.min().date()
last_day = daily.index.max().date()
print('[info] Daily range:', first_day, '->', last_day)

# 3) Compute the labels.
# - close/hl: daily-based (hl uses each day's High/Low; the order within the day is unknown)
# - h1: hourly-based (order within the day is approximated; a tie-break only remains
#   when both thresholds are hit within the same hour)
use_hourly_hits = LABEL_PARAMS.get('hit_source') == 'h1'
if not use_hourly_hits:
    labels = label_eurusd_from_daily_prices(daily, drop_weekends=DROP_WEEKENDS, **LABEL_PARAMS)
else:
    labels = label_eurusd_from_daily_and_h1(
        daily,
        h1,
        cut_hour=CUT_HOUR,
        drop_weekends=DROP_WEEKENDS,
        **LABEL_PARAMS
    )

# 4) Save the experiment configuration as JSON (including source / mode).
exp_meta_dir = DATA_PROCESSED / 'experiments'
exp_meta_dir.mkdir(parents=True, exist_ok=True)

# Where the price data came from and how it was aggregated to daily bars.
data_section = {
    'source': 'mt5_h1',
    'h1_csv_path': str(H1_CSV_PATH),
    'cut_hour': int(CUT_HOUR),
    'drop_weekends': bool(DROP_WEEKENDS),
}

# Label parameters plus two bookkeeping entries; a copy is used so that
# LABEL_PARAMS itself stays unmodified.
label_section = dict(LABEL_PARAMS)
label_section['price_source'] = 'mt5_h1'
label_section['drop_weekends'] = bool(DROP_WEEKENDS)

exp_config = {
    'exp_id': EXP_ID,
    'feature_mode': FEATURE_MODE,
    'data_params': data_section,
    'label_params': label_section,
}

exp_config_path = exp_meta_dir / f'{EXP_ID}_config.json'
with exp_config_path.open('w', encoding='utf-8') as f:
    f.write(json.dumps(exp_config, indent=2))
print('[ok] Experiment-Konfiguration gespeichert unter:', exp_config_path)

# 5) Save the labels under an experiment-specific file name.
fx_dir = DATA_PROCESSED / 'fx'
fx_dir.mkdir(parents=True, exist_ok=True)

labels_path_exp = fx_dir.joinpath(f'eurusd_labels__{EXP_ID}.csv')
labels.to_csv(labels_path_exp)
print('[ok] Labels gespeichert:', labels_path_exp)

# Optionally also overwrite the fixed-name "latest" labels file.
if WRITE_LATEST:
    labels_path_latest = fx_dir.joinpath('eurusd_labels.csv')
    labels.to_csv(labels_path_latest)
    print('[ok] Labels (latest) gespeichert:', labels_path_latest)

# 6) Build the dataset (news+price or price_only); the output file name
#    reflects the chosen mode.
if FEATURE_MODE != 'news+price':
    merged = build_price_only_training_dataframe_from_labels(exp_id=EXP_ID)
    ds_name = f'eurusd_price_training__{EXP_ID}.csv'
else:
    merged = build_training_dataframe(exp_id=EXP_ID)
    ds_name = f'eurusd_news_training__{EXP_ID}.csv'

# 7) Merge the intraday features onto the dataset.
# NOTE(review): the rename only takes effect if the index of `intraday_feat`
# is named 'Date' - confirm against h1_daily_intraday_features.
intraday_df = intraday_feat.reset_index()
intraday_df = intraday_df.rename(columns={'Date': 'date'})
# Left join: every dataset row is kept; dates without a match in
# `intraday_df` get NaN in the intraday columns.
merged = pd.merge(merged, intraday_df, on='date', how='left')

# 8) Save the training dataset under its experiment-specific name.
out_dir = DATA_PROCESSED / 'datasets'
out_dir.mkdir(parents=True, exist_ok=True)

train_path_exp = out_dir / ds_name
merged.to_csv(train_path_exp, index=False)
print('[ok] Trainingsdatensatz gespeichert:', train_path_exp)

# Optionally also overwrite the fixed-name "latest" dataset for the active mode.
if WRITE_LATEST:
    if FEATURE_MODE == 'news+price':
        latest_name = 'eurusd_news_training.csv'
    else:
        latest_name = 'eurusd_price_training.csv'
    train_path_latest = out_dir / latest_name
    merged.to_csv(train_path_latest, index=False)
    print('[ok] Dataset (latest) gespeichert:', train_path_latest)

print('[info] Dataset shape:', merged.shape)


EXP_ID: hp_mt5_flex_result
FEATURE_MODE: price_only
H1_CSV_PATH: data/raw/fx/EURUSD_mt5_H1_2015_2025.csv
CUT_HOUR: 0
DROP_WEEKENDS: False
LABEL_PARAMS: {'horizon_days': 15, 'up_threshold': 0.02, 'down_threshold': -0.02, 'strict_monotonic': False, 'max_adverse_move_pct': 0.004, 'hit_within_horizon': True, 'first_hit_wins': True, 'hit_source': 'h1', 'intraday_tie_breaker': 'down'}
[info] H1 rows: 68247 Daily days: 2852 Intraday days: 2852
[info] Daily range: 2015-01-02 -> 2025-12-24
[ok] Experiment-Konfiguration gespeichert unter: data/processed/experiments/hp_mt5_flex_result_config.json
[ok] Labels gespeichert: data/processed/fx/eurusd_labels__hp_mt5_flex_result.csv
[ok] Trainingsdatensatz gespeichert: data/processed/datasets/eurusd_price_training__hp_mt5_flex_result.csv
[info] Dataset shape: (2837, 60)
