## Datenaufbereitung

### stocks_import.py

In [None]:
import datetime
import yfinance as yf
import time

# Download window: the 60 calendar days leading up to today.
end_date = today = datetime.date.today()
start_date = end_date - datetime.timedelta(days=60)

# Fetch daily, auto-adjusted prices per ticker and store each result as a raw CSV.
tickers = ["NVDA", "GOOG", "MSFT"]
for ticker in tickers:
    target_csv = f"../../03_Daten/raw_data/historical_stock_data_daily_{ticker}_last60d.csv"
    df = yf.download(
        ticker,
        start=start_date,
        end=end_date,
        interval="1d",
        auto_adjust=True,
    )
    df.to_csv(target_csv)
    # Pause 10 seconds before the next request.
    time.sleep(10)

### data_format.py

In [None]:
import pandas as pd

ticker = "GOOG"
csv_path = f"../../03_Daten/raw_data/historical_stock_data_daily_{ticker}_last60d.csv"
output_path = f"../../03_Daten/processed_data/historical_stock_data_daily_{ticker}_last60d_flat.csv"

# 1) Read the raw CSV: two header rows form a column MultiIndex, the first
#    column becomes the index.
df = pd.read_csv(csv_path, header=[0, 1], index_col=0)
print("Vor Flatten:\n", df.head())
print("\nMultiIndex Columns:\n", df.columns)

# 2) Drop header level 1 (the ticker level) so only the field names remain.
df.columns = df.columns.droplevel(1)
print("\nSpalten nach droplevel(1):\n", df.columns)

# 3) Move the index back into a regular column and make sure it is called "Date".
df.reset_index(inplace=True)
if df.columns[0] != "Date":
    df.rename(columns={df.columns[0]: "Date"}, inplace=True)

# 4) Select the wanted columns in a fixed order. The explicit copy makes the
#    result an independent frame, so the assignment in step 5 does not raise
#    pandas' SettingWithCopyWarning.
df = df[["Date", "Close", "High", "Low", "Open", "Volume"]].copy()

# 5) Parse the dates. errors="coerce" turns unparseable values into NaT, so
#    report how many rows were affected instead of hiding them.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
unparsed_dates = int(df["Date"].isna().sum())
if unparsed_dates:
    print(f"\nWarnung: {unparsed_dates} Datumswerte konnten nicht geparst werden (NaT).")

# 6) Store the flattened frame as a new CSV (without the positional index).
df.to_csv(output_path, index=False)
print("\nNach Flatten:\n", df.head())
print(f"\nFlattened CSV gespeichert unter: {output_path}")


### gtd_import.py

In [None]:
import time
from pytrends.request import TrendReq

start_date = "2015-01-01"
end_date = "2025-05-14"
timeframe_str = f"{start_date} {end_date}"

ticker_keywords = {
    "NVDA": ["NVIDIA stock", "sell NVIDIA stock", "buy NVIDIA stock"],
    "GOOG": ["Google stock", "sell Google stock", "buy Google stock"],
    "MSFT": ["Microsoft stock", "sell Microsoft stock", "buy Microsoft stock"]
}

# Retry policy for one Google Trends request: up to max_retries attempts,
# waiting 60 s after the first failure and doubling the wait each time.
max_retries = 3

# Create the pytrends client.
pytrends = TrendReq(hl="en-US", tz=360)

# For every ticker: fetch each keyword separately and combine the series
# into one frame indexed by date.
for ticker, keywords in ticker_keywords.items():
    print(f"\nStarte Abruf der Google Trends Daten für {ticker}...")

    df_all = None  # combined time series of all keywords for this ticker
    for keyword in keywords:
        print(f"  Abrufe Keyword: {keyword}")
        kw_list = [keyword]
        pytrends.build_payload(kw_list, cat=0, timeframe=timeframe_str, geo='', gprop='')

        wait_time = 60  # initial back-off in seconds
        for retry_count in range(1, max_retries + 1):
            try:
                trends_data = pytrends.interest_over_time()
                break
            # Broad on purpose: the concrete pytrends/HTTP error types are not
            # imported in this script. NOTE(review): narrow if they are known.
            except Exception as e:
                if retry_count == max_retries:
                    # Last attempt failed: raise immediately (no pointless
                    # final sleep) and keep the original error as the cause.
                    raise RuntimeError(
                        f"Mehrere Versuche für '{keyword}' fehlgeschlagen. Bitte überprüfen Sie Ihre Anfrage."
                    ) from e
                print(
                    f"    Fehler beim Abruf von '{keyword}': {e}. Warte {wait_time} Sekunden, Versuch {retry_count} von {max_retries}..."
                )
                time.sleep(wait_time)
                wait_time *= 2  # exponential back-off

        # Guard against a result that lacks the keyword column (e.g. an empty
        # frame); selecting the column would otherwise raise a KeyError.
        if trends_data.empty or keyword not in trends_data.columns:
            print(f"    Keine Daten für '{keyword}' erhalten, Keyword wird übersprungen.")
            continue

        # Keep only the keyword column; this also leaves out 'isPartial'.
        keyword_series = trends_data[[keyword]]

        if df_all is None:
            df_all = keyword_series.copy()
        else:
            # Outer merge on the date index so no date of either side is lost.
            df_all = df_all.merge(keyword_series, left_index=True, right_index=True, how="outer")

    if df_all is None:
        print(f"Keine Google Trends Daten für {ticker}, überspringe.")
        continue

    # Sort chronologically.
    df_all = df_all.sort_index()
    print(f"\nErste Zeilen der kombinierten Google Trends Daten für {ticker}:")
    print(df_all.head())

    # Store the combined frame as CSV.
    # NOTE(review): the file name says "daily ... last30d" although the
    # requested timeframe is start_date..end_date above; the path is left
    # unchanged because other scripts may rely on it — confirm and rename.
    output_filename = f"../../03_Daten/raw_data/google_trends_daily_{ticker}_last30d.csv"
    df_all.to_csv(output_filename)
    print(f"Google Trends Daten für {ticker} wurden in '{output_filename}' gespeichert.")


### gtd_merger.py

In [None]:
import os
import pandas as pd

# Settings
tickers = ["NVDA", "GOOG", "MSFT"]
periods = ["2015-2020", "2020-2023", "2023-2025"]
input_pattern = "../../03_Daten/raw_data/google_trends_weekly_{ticker}_{period}.csv"
output_dir = "../../03_Daten/processed_data"
os.makedirs(output_dir, exist_ok=True)

for ticker in tickers:
    # Frames of all periods that could be read for this ticker.
    dfs = []

    # 1) Read the CSV of each period; a missing file is reported and skipped.
    for period in periods:
        fn = input_pattern.format(ticker=ticker, period=period)
        try:
            df = pd.read_csv(
                fn,
                parse_dates=["date"],
                index_col="date"
            )
        except FileNotFoundError:
            print(f"Datei nicht gefunden: {fn}")
            continue
        dfs.append(df)
        print(f"eingelesen: {fn} ({len(df)} Zeilen)")

    if not dfs:
        print(f"Keine Daten für {ticker}, überspringe.")
        continue

    # 2) Concatenate in period order.
    merged = pd.concat(dfs)

    # 3) Sort by date with a STABLE sort: rows sharing a date keep their
    #    concatenation order, so step 4 deterministically keeps the row of
    #    the earlier period (the default quicksort gives no such guarantee).
    merged = merged.sort_index(kind="mergesort")

    # 4) Drop rows with a duplicated date, keeping the first occurrence.
    merged = merged[~merged.index.duplicated(keep="first")]

    # 5) Store the merged series.
    out_fn = os.path.join(
        output_dir,
        f"google_trends_weekly_{ticker}_2015-2025.csv"
    )
    merged.to_csv(out_fn)
    print(f"gespeichert: {out_fn} ({len(merged)} Zeilen)\n")


### data_merger.py

In [None]:
import pandas as pd

# Tickers whose price and Google Trends data are joined.
tickers = ["NVDA", "GOOG", "MSFT"]

for ticker in tickers:
    stock_path = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat_with_RSI.csv"
    trends_path = f"../../03_Daten/processed_data/google_trends_weekly_{ticker}_2015-2025.csv"

    # 1) Price + RSI data, ordered by date.
    stock_df = pd.read_csv(stock_path, parse_dates=["Date"], index_col="Date")
    stock_df = stock_df.sort_index()

    # 2) Google Trends data, ordered by date.
    trends_df = pd.read_csv(trends_path, parse_dates=["date"], index_col="date")
    trends_df = trends_df.sort_index()

    # 3) Restrict both frames to the date range they have in common.
    start_date = max(stock_df.index.min(), trends_df.index.min())
    end_date = min(stock_df.index.max(), trends_df.index.max())
    overlap = slice(start_date, end_date)
    stock_df = stock_df.loc[overlap]
    trends_df = trends_df.loc[overlap]

    # 4) Replace the timestamps by weekly periods so rows can be matched per week.
    # NOTE(review): which calendar week a row lands in depends on the weekday
    # each source uses for its dates — confirm both sources align as intended.
    for frame in (stock_df, trends_df):
        frame.index = frame.index.to_period("W")

    # 5) Inner join per week; clashing trend columns get the "_trend" suffix.
    merged = stock_df.join(trends_df, how="inner", lsuffix="", rsuffix="_trend")

    # 6) Convert the weekly periods back to timestamps.
    merged.index = merged.index.to_timestamp()

    # 7) Store the result.
    out_path = f"../../03_Daten/processed_data/merged_weekly_{ticker}_2015-2025.csv"
    merged.to_csv(out_path)
    print(f"{ticker}: Merged über {merged.shape[0]} Wochen → {out_path}")


## Featureengineering

### RSI_feature.py

In [None]:
import pandas as pd

def compute_rsi(df: pd.DataFrame, period: int = 14) -> pd.Series:
    """
    Return the RSI of ``df["Close"]`` as a Series on the same index.

    Gains and losses of consecutive closes are smoothed with an exponentially
    weighted mean (``com=period-1``, ``adjust=False``). Because
    ``min_periods=period`` is set, the first ``period`` entries are NaN and
    values start at positional index ``period``.
    """
    price_change = df["Close"].diff()
    # Shared smoothing settings for the gain and the loss series.
    smoothing = {"com": period - 1, "adjust": False, "min_periods": period}
    avg_gain = price_change.clip(lower=0).ewm(**smoothing).mean()
    avg_loss = (-price_change.clip(upper=0)).ewm(**smoothing).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

# Input and output locations of the flattened price CSV.
ticker = "MSFT"
infile = f"../../03_Daten/processed_data/historical_stock_data_daily_{ticker}_last60d_flat.csv"
outfile = f"../../03_Daten/processed_data/historical_stock_data_daily_{ticker}_last60d_flat_with_RSI.csv"

# Load the prices with the parsed "Date" column as index.
df = pd.read_csv(infile, parse_dates=["Date"], index_col="Date")

# Append the 14-period RSI as a new column; the first 14 rows stay NaN.
df = df.assign(RSI_14=compute_rsi(df, period=14))

# Quick look at the leading rows, including the NaN warm-up rows.
print(df.loc[:, ["Close", "RSI_14"]].head(20))

# Write the extended frame (index included) to the new CSV.
df.to_csv(outfile, index=True)
print(f"RSI hinzugefügt und gespeichert in {outfile}")


## Implementierung der Vergleichsmodelle

### ARIMA.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

tickers = ["NVDA", "GOOG", "MSFT"]
results_arima = []

for ticker in tickers:
    # Weekly price file of this ticker, ordered by date.
    weekly_csv = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat.csv"
    df = pd.read_csv(weekly_csv, parse_dates=['Date'], index_col='Date')
    df = df.sort_index()

    tscv = TimeSeriesSplit(n_splits=3)
    plt.figure(figsize=(12, 8))

    for fold, (train_index, test_index) in enumerate(tscv.split(df), start=1):
        train = df.iloc[train_index]
        test = df.iloc[test_index]

        # Fit ARIMA(1,1,1) on the training closes and forecast the whole test
        # span; the forecast is re-labelled with the test dates.
        model = ARIMA(train['Close'], order=(1, 1, 1)).fit()
        forecast = model.forecast(steps=len(test))
        forecast.index = test.index

        # Error metrics of this fold.
        y_true = test['Close']
        y_pred = forecast
        abs_pct_error = ((y_true - y_pred) / y_true).abs()
        fold_metrics = {
            'Ticker': ticker,
            'Fold': fold,
            'RMSE': math.sqrt(mean_squared_error(y_true, y_pred)),
            'MAE': mean_absolute_error(y_true, y_pred),
            'MAPE': abs_pct_error.mean() * 100,
            'R2': r2_score(y_true, y_pred),
        }
        results_arima.append(fold_metrics)

        # Actual (solid) versus forecast (dashed) for this fold.
        plt.plot(test.index, test['Close'], label=f'Fold {fold} Actual', linewidth=2)
        plt.plot(forecast.index, forecast, linestyle='--', label=f'Fold {fold} Forecast', linewidth=2)

    plt.title(f"{ticker} ARIMA(1,1,1) Forecasts across 3 Folds")
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Table with one row per ticker and fold.
summary_df = pd.DataFrame(results_arima)
print("Einzel-Fold Metriken:")
print(summary_df)

# Mean of every metric over the folds of each ticker.
metric_columns = ['RMSE', 'MAE', 'MAPE', 'R2']
avg_metrics = summary_df.groupby('Ticker')[metric_columns].mean().reset_index()

print("\nDurchschnittliche Metriken je Ticker über 3 Folds:")
print(avg_metrics)


### SARIMAX.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

# Tickers to evaluate and the list collecting one metrics record per fold.
tickers = ["NVDA", "GOOG", "MSFT"]
results_sarimax = []

# Model configuration shared by all tickers and folds.
sarimax_options = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 52),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

for ticker in tickers:
    # Load the weekly prices ordered by date.
    weekly_csv = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat.csv"
    df = pd.read_csv(weekly_csv, parse_dates=["Date"], index_col="Date")
    df = df.sort_index()

    # Time-series cross-validation with 3 splits.
    tscv = TimeSeriesSplit(n_splits=3)
    plt.figure(figsize=(12, 8))

    for fold, (train_idx, test_idx) in enumerate(tscv.split(df), start=1):
        train, test = df.iloc[train_idx], df.iloc[test_idx]

        model = SARIMAX(train['Close'], **sarimax_options)
        fit = model.fit(disp=False)

        # Forecast the full test span and label it with the test dates.
        forecast = fit.forecast(steps=len(test))
        forecast.index = test.index

        # Error metrics of this fold.
        y_true = test['Close']
        y_pred = forecast
        relative_error = ((y_true - y_pred) / y_true).abs()
        results_sarimax.append({
            'Ticker': ticker,
            'Fold': fold,
            'RMSE': math.sqrt(mean_squared_error(y_true, y_pred)),
            'MAE': mean_absolute_error(y_true, y_pred),
            'MAPE': relative_error.mean() * 100,
            'R2': r2_score(y_true, y_pred),
        })

        # Actual (solid) versus forecast (dashed).
        plt.plot(test.index, y_true, label=f'Fold {fold} Actual', linewidth=2)
        plt.plot(forecast.index, forecast, linestyle='--', label=f'Fold {fold} Forecast', linewidth=2)

    plt.title(f"{ticker} SARIMAX(1,1,1)x(1,1,1,52) Forecasts across 3 Folds")
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Table with one row per ticker and fold.
summary_df = pd.DataFrame(results_sarimax)
print("Einzel-Fold SARIMAX Metriken:")
print(summary_df)

# Mean of every metric over the folds of each ticker.
avg_metrics = (
    summary_df.groupby('Ticker')[['RMSE', 'MAE', 'MAPE', 'R2']]
    .mean()
    .reset_index()
)

print("\nDurchschnittliche SARIMAX Metriken je Ticker über 3 Folds:")
print(avg_metrics)


### lstm.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout

# 1) Load the weekly prices (ordered by date) and keep the "Close" column
#    as a one-column frame.
ticker = "MSFT"
weekly_csv = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat.csv"
df = pd.read_csv(weekly_csv, parse_dates=["Date"], index_col="Date")
df = df.sort_index()
data = df.loc[:, ["Close"]].copy()

# 2) Scale the closes into [0, 1].
# NOTE(review): the scaler is fitted on every row of `data` — confirm whether
# rows that later serve as test data should influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(data)
scaled_data = scaler.transform(data)

# 3) Helper that turns a series into sliding-window samples
def create_sequences(dataset, window_size=10):
    """
    Build (X, y) samples from column 0 of a 2-D array.

    Each X row holds ``window_size`` consecutive values and the matching y
    entry is the value that directly follows that window. Both results are
    empty arrays when ``dataset`` has at most ``window_size`` rows.
    """
    starts = range(len(dataset) - window_size)
    windows = [dataset[start:start + window_size, 0] for start in starts]
    next_values = [dataset[start + window_size, 0] for start in starts]
    return np.array(windows), np.array(next_values)

window_size = 10

# 4) Time-series cross-validation
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Storage for per-fold metrics and series (hit_rate_list / sharpe_list are
# initialised here but not filled by this script).
rmse_list, mae_list, mape_list, r2_list, hit_rate_list, sharpe_list = ([] for _ in range(6))
all_y_true, all_y_pred, all_dates = [], [], []

# Unscaled closes as a 2-D array; every fold scales them with its own scaler.
raw_prices = data.to_numpy()

# 5) Loop over the folds
fold_num = 1
for train_idx, test_idx in tscv.split(raw_prices):
    # Fit the scaler on the TRAINING rows of this fold only and apply it to
    # the test rows. Fitting on the full series would leak the min/max of the
    # test period into training (look-ahead bias in the evaluation).
    fold_scaler = MinMaxScaler(feature_range=(0, 1))
    train_data = fold_scaler.fit_transform(raw_prices[train_idx])
    test_data = fold_scaler.transform(raw_prices[test_idx])

    # Sliding-window samples
    X_train, y_train = create_sequences(train_data, window_size)
    X_test, y_test = create_sequences(test_data, window_size)

    # Reshape to (samples, window_size, 1) as expected by the LSTM input
    X_train = X_train.reshape((-1, window_size, 1))
    X_test = X_test.reshape((-1, window_size, 1))

    # Two stacked LSTM layers with dropout, followed by a single output unit
    model = Sequential([
        Input(shape=(window_size, 1)),
        LSTM(50, return_sequences=True),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer="adam", loss="mse")
    model.fit(X_train, y_train, epochs=20, batch_size=16, verbose=0)

    # Inference wrapped in tf.function; training=False disables dropout
    @tf.function
    def predict_fn(x):
        return model(x, training=False)

    # Predict and map predictions and targets back to price scale with the
    # same scaler that produced this fold's inputs
    y_pred_scaled = predict_fn(tf.constant(X_test)).numpy().reshape(-1, 1)
    y_test_scaled = y_test.reshape(-1, 1)
    y_pred = fold_scaler.inverse_transform(y_pred_scaled).flatten()
    y_true = fold_scaler.inverse_transform(y_test_scaled).flatten()

    # Dates of the predicted points: the first window_size test rows are only
    # used as model input and therefore have no prediction
    dates = df.index[test_idx][window_size:]
    all_dates.append(dates)

    # Metrics
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)

    # Store
    rmse_list.append(rmse)
    mae_list.append(mae)
    mape_list.append(mape)
    r2_list.append(r2)
    all_y_true.append(y_true)
    all_y_pred.append(y_pred)

    # Console output
    print(f"Fold {fold_num} – RMSE={rmse:.4f}, MAE={mae:.4f}, "
          f"MAPE={mape:.2f}%, R²={r2:.4f}")
    fold_num += 1

# 6) Mean over all folds
print("\n=== Durchschnitt aller Folds ===")
print(f"RMSE:     {np.mean(rmse_list):.4f}")
print(f"MAE:      {np.mean(mae_list):.4f}")
print(f"MAPE:     {np.mean(mape_list):.4f}%")
print(f"R²:       {np.mean(r2_list):.4f}")

# 7) One figure with the real (solid) and predicted (dashed) prices of all folds
plt.figure(figsize=(12, 6))
for i, (fold_dates, fold_true, fold_pred) in enumerate(zip(all_dates, all_y_true, all_y_pred)):
    plt.plot(fold_dates, fold_true, label=f"Real Prc Fold {i+1}")
    plt.plot(fold_dates, fold_pred, linestyle="--", label=f"Pred Prc Fold {i+1}")
plt.title(f"{ticker} – Aktienkurse alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Preis (Close)")
plt.legend()
plt.tight_layout()
plt.show()


### XGBoost.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
from xgboost import XGBRegressor

# 1) Load the weekly prices ordered by date; `data` keeps the "Close" column
#    as a one-column frame and `values` is its 2-D array form.
ticker = "NVDA"
weekly_csv = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat.csv"
df = pd.read_csv(weekly_csv, parse_dates=["Date"], index_col="Date")
df = df.sort_index()
data = df.loc[:, ["Close"]].copy()
values = data.values

# 2) Windowing helper
def create_sequences(dataset, window_size=10):
    """
    Slice column 0 of a 2-D array into overlapping windows.

    Returns ``(X, y)`` where row ``k`` of X is ``dataset[k:k+window_size, 0]``
    and ``y[k]`` is the value right after that window. With at most
    ``window_size`` rows both arrays are empty.
    """
    n_samples = len(dataset) - window_size
    inputs, targets = [], []
    for first in range(n_samples):
        last = first + window_size
        inputs.append(dataset[first:last, 0])
        targets.append(dataset[last, 0])
    return np.array(inputs), np.array(targets)

window_size = 10

# 3) Time-series cross-validation
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# 4) Storage for metrics and price series (hit_rate_list / sharpe_list are
#    initialised here but not filled by this script)
rmse_list, mae_list, mape_list, r2_list = [], [], [], []
hit_rate_list, sharpe_list = [], []
all_y_true, all_y_pred, all_dates = [], [], []

# Model settings used for every fold
xgb_settings = dict(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)

# 5) Loop over the folds
fold_num = 1
for train_idx, test_idx in tscv.split(values):
    # a) + b) Split the raw closes and build sliding-window samples
    train_data, test_data = values[train_idx], values[test_idx]
    X_train, y_train = create_sequences(train_data, window_size)
    X_test, y_test = create_sequences(test_data, window_size)

    print(f"Fold {fold_num}: X_train={X_train.shape}, y_train={y_train.shape}, "
          f"X_test={X_test.shape}, y_test={y_test.shape}")

    # c) Train a fresh regressor on this fold
    xgb_model = XGBRegressor(**xgb_settings)
    xgb_model.fit(X_train, y_train)

    # d) Predict the test targets
    y_pred = xgb_model.predict(X_test)

    # Dates of the predicted points (the first window_size test rows only
    # serve as input), plus the real and predicted prices for the plot
    dates = df.index[test_idx][window_size:]
    all_dates.append(dates)
    all_y_true.append(y_test)
    all_y_pred.append(y_pred)

    # e) Metrics
    rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    r2 = r2_score(y_test, y_pred)

    for store, value in ((rmse_list, rmse), (mae_list, mae), (mape_list, mape), (r2_list, r2)):
        store.append(value)

    # Console output per fold
    print(f"Fold {fold_num} – "
          f"RMSE={rmse:.4f}, "
          f"MAE={mae:.4f}, "
          f"MAPE={mape:.2f}%, "
          f"R²={r2:.4f}")
    fold_num += 1

# 6) Mean over all folds
print("=== Durchschnitt über alle Folds ===")
print(f"RMSE:       {np.mean(rmse_list):.4f}")
print(f"MAE:        {np.mean(mae_list):.4f}")
print(f"MAPE:       {np.mean(mape_list):.4f}%")
print(f"R²:         {np.mean(r2_list):.4f}")

# 7) One figure with the real (solid) and predicted (dashed) prices of all folds
plt.figure(figsize=(12, 6))
for i in range(n_splits):
    fold_dates = all_dates[i]
    plt.plot(fold_dates, all_y_true[i], label=f"Real Prc Fold {i+1}")
    plt.plot(fold_dates, all_y_pred[i], linestyle="--", label=f"Pred Prc Fold {i+1}")
plt.title(f"{ticker} – Aktienkurse alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Close Price")
plt.legend()
plt.tight_layout()
plt.show()


## Entwicklung und Training der Hybridmodelle

### GARCH_LSTM2.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from arch import arch_model
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense

# 1) Load the weekly prices ordered by date and add the log return of the
#    close; dropna then removes every row containing a NaN (at least the
#    first row, which has no previous close).
ticker = "GOOG"
weekly_csv = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat.csv"
df = pd.read_csv(weekly_csv, parse_dates=["Date"], index_col="Date")
df = df.sort_index()
close = df["Close"]
df["Return"] = np.log(close / close.shift(1))
df.dropna(inplace=True)

# 2) Cross-validation setup
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)
window_size = 10

# 3) Metric storage: "*_ret" lists are for log returns, "*_pr" lists for prices
rmse_ret, mae_ret, mape_ret, r2_ret, hit_ret, sharpe_ret = ([] for _ in range(6))
rmse_pr, mae_pr, mape_pr, r2_pr, hit_pr, sharpe_pr = ([] for _ in range(6))

# 4) Series storage for the plots
all_ret_true, all_ret_pred, dates_ret = ([] for _ in range(3))
all_pr_true, all_pr_pred, dates_pr = ([] for _ in range(3))

# Feature-engineering helper
def create_features_and_target(df, window_size=10):
    """
    Build model inputs and targets from the "Return" and "GARCH_vol" columns.

    For every row position ``t >= window_size`` the feature vector consists
    of the ``window_size`` returns before ``t`` followed by the GARCH_vol
    value at ``t``; the target is the return at ``t``.
    """
    returns = df["Return"].values
    garch_vol = df["GARCH_vol"].values
    positions = range(window_size, len(df))
    features = [
        np.append(returns[t - window_size:t], garch_vol[t])
        for t in positions
    ]
    targets = [returns[t] for t in positions]
    return np.array(features), np.array(targets)

# 5) Loop over the cross-validation folds: per fold a GARCH model supplies a
#    volatility feature, an LSTM predicts log returns, and metrics are
#    computed on returns and on prices rebuilt from those returns.
fold = 1
for train_idx, test_idx in tscv.split(df):
    # Copies, so adding the GARCH_vol column does not touch df
    train_df = df.iloc[train_idx].copy()
    test_df  = df.iloc[test_idx].copy()

    # a) Fit a zero-mean GARCH(1,1) with normal innovations on the training
    #    returns. Returns are multiplied by 10 for the fit and the resulting
    #    conditional volatility is divided by 10 again.
    scaled = train_df["Return"] * 10
    gm = arch_model(scaled, mean="Zero", vol="GARCH", p=1, q=1, dist="normal", rescale=False)
    res = gm.fit(disp="off")
    train_df["GARCH_vol"] = res.conditional_volatility / 10

    # b) Variance forecast from the last training date with one step per test
    #    row; the last forecast row is square-rooted and rescaled to obtain
    #    the volatility feature of the test rows.
    horizon = len(test_df)
    fc = res.forecast(start=train_df.index[-1], horizon=horizon, reindex=False)
    test_df["GARCH_vol"] = np.sqrt(fc.variance.values[-1, :]) / 10

    # c) Features and targets
    X_tr, y_tr = create_features_and_target(train_df, window_size)
    X_te, y_te = create_features_and_target(test_df,  window_size)

    # d) Define and train the LSTM; inputs are reshaped to
    #    (samples, window_size+1, 1)
    X_tr_l = X_tr.reshape(-1, window_size+1, 1)
    X_te_l = X_te.reshape(-1, window_size+1, 1)
    model = Sequential([
        Input(shape=(window_size+1, 1)),
        LSTM(50, return_sequences=True), Dropout(0.2),
        LSTM(50),                     Dropout(0.2),
        Dense(1)
    ])
    model.compile("adam", "mse")
    model.fit(X_tr_l, y_tr, epochs=20, batch_size=16, verbose=0)

    # e) Predict the log returns of the test samples
    y_pred = model.predict(X_te_l).flatten()

    # Dates of the predicted points (the first window_size test rows have no
    # prediction)
    dates = test_df.index[window_size:]
    dates_ret.append(dates)
    all_ret_true.append(y_te)
    all_ret_pred.append(y_pred)

    # f) Metrics on returns. MAPE skips targets that are exactly zero.
    rm = math.sqrt(mean_squared_error(y_te, y_pred))
    ma = mean_absolute_error(y_te, y_pred)
    denom_r = np.where(y_te == 0, np.nan, y_te)
    mp = np.nanmean(np.abs((y_te - y_pred) / denom_r)) * 100
    r2v = r2_score(y_te, y_pred)
    # Hit rate: share of steps where true and predicted series change in the
    # same direction from one step to the next.
    # NOTE(review): this compares the sign of the CHANGE in return, not the
    # sign of the return itself — confirm this is the intended definition.
    dir_t = np.sign(np.diff(y_te))
    dir_p = np.sign(np.diff(y_pred))
    hr  = (dir_t == dir_p).mean() * 100
    # NOTE(review): the "Sharpe" input is the relative change between
    # consecutive predicted log returns; the denominator is a predicted
    # return and can be close to zero — confirm this is intended.
    rets_pred = np.diff(y_pred) / y_pred[:-1]
    sr  = rets_pred.mean() / (rets_pred.std() if rets_pred.std() != 0 else np.nan)

    rmse_ret.append(rm); mae_ret.append(ma); mape_ret.append(mp)
    r2_ret.append(r2v); hit_ret.append(hr); sharpe_ret.append(sr)

    # g) Convert returns back to prices: each predicted price is the ACTUAL
    #    close of the previous row times exp(predicted log return).
    preds, actuals = [], []
    for i, r in enumerate(y_pred):
        prev_price = test_df["Close"].iloc[i + window_size - 1]
        preds.append(prev_price * np.exp(r))
        actuals.append(test_df["Close"].iloc[i + window_size])
    preds   = np.array(preds)
    actuals = np.array(actuals)

    dates_pr.append(dates)
    all_pr_true.append(actuals)
    all_pr_pred.append(preds)

    # h) Metrics on prices, computed the same way as for the returns
    rm_p = math.sqrt(mean_squared_error(actuals, preds))
    ma_p = mean_absolute_error(actuals, preds)
    denom_p = np.where(actuals == 0, np.nan, actuals)
    mp_p = np.nanmean(np.abs((actuals - preds) / denom_p)) * 100
    r2p  = r2_score(actuals, preds)
    dt   = np.sign(np.diff(actuals))
    dp   = np.sign(np.diff(preds))
    hr_p = (dt == dp).mean() * 100
    rets_p = np.diff(preds) / preds[:-1]
    sr_p    = rets_p.mean() / (rets_p.std() if rets_p.std() != 0 else np.nan)

    rmse_pr.append(rm_p); mae_pr.append(ma_p); mape_pr.append(mp_p)
    r2_pr.append(r2p);  hit_pr.append(hr_p);  sharpe_pr.append(sr_p)

    print(f"Fold {fold} fertig.")
    fold += 1

# 6) Print the fold averages; the Sharpe mean ignores folds where it is NaN
print(f"=== Log-Returns (Ø über {n_splits} Folds) ===")
print(f"RMSE: {np.mean(rmse_ret):.4f}, MAE: {np.mean(mae_ret):.4f}, "
      f"MAPE: {np.mean(mape_ret):.4f}%, R²: {np.mean(r2_ret):.4f}, "
      f"Hit: {np.mean(hit_ret):.4f}%, Sharpe: {np.nanmean(sharpe_ret):.4f}")
print(f"=== Close-Preise (Ø über {n_splits} Folds) ===")
print(f"RMSE: {np.mean(rmse_pr):.4f}, MAE: {np.mean(mae_pr):.4f}, "
      f"MAPE: {np.mean(mape_pr):.4f}%, R²: {np.mean(r2_pr):.4f}, "
      f"Hit: {np.mean(hit_pr):.4f}%, Sharpe: {np.nanmean(sharpe_pr):.4f}")

# 7) Log returns of all folds: real (solid) versus predicted (dashed)
plt.figure(figsize=(12,6))
for i in range(n_splits):
    plt.plot(dates_ret[i], all_ret_true[i],  label=f"Real Ret Fold {i+1}")
    plt.plot(dates_ret[i], all_ret_pred[i], linestyle="--", label=f"Pred Ret Fold {i+1}")
plt.title(f"{ticker} – Log-Renditen alle {n_splits} Folds")
plt.xlabel("Datum"); plt.ylabel("Log-Return")
plt.legend()
plt.tight_layout()
plt.show()

# 8) Close prices of all folds: real (solid) versus predicted (dashed)
plt.figure(figsize=(12,6))
for i in range(n_splits):
    plt.plot(dates_pr[i], all_pr_true[i],  label=f"Real Prc Fold {i+1}")
    plt.plot(dates_pr[i], all_pr_pred[i], linestyle="--", label=f"Pred Prc Fold {i+1}")
plt.title(f"{ticker} – Close-Preise alle {n_splits} Folds")
plt.xlabel("Datum"); plt.ylabel("Preis (Close)")
plt.legend()
plt.tight_layout()
plt.show()


### GARCH_LSTM_RSI2.py

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from arch import arch_model
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Settings
ticker = "MSFT"
window_size = 10
n_splits = 3
csv_path = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat_with_RSI.csv"

# 1) Load the data ordered by date and add the log return of the close;
#    dropna then removes every row that has a NaN in any column.
df = pd.read_csv(csv_path, parse_dates=["Date"], index_col="Date")
df = df.sort_index()
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
df.dropna(inplace=True)

# Fit one zero-mean GARCH(1,1) with normal innovations on the complete return
# series (returns x10 for the fit, volatility /10 afterwards).
# NOTE(review): the fit uses all rows, including those that later serve as
# cross-validation test data — confirm this look-ahead is acceptable.
scaled_all = df["Return"] * 10
garch_spec = arch_model(
    scaled_all,
    mean="Zero",
    vol="GARCH",
    p=1,
    q=1,
    dist="normal",
    rescale=False,
)
garch_all = garch_spec.fit(disp="off")
df["GARCH_vol"] = garch_all.conditional_volatility / 10

# 2) Standardise the per-row features (GARCH volatility and RSI).
# NOTE(review): the scaler is likewise fitted on all rows — see note above.
static_cols = ["GARCH_vol", "RSI_14"]
scaler = StandardScaler()
df[static_cols] = scaler.fit_transform(df[static_cols])

# 3) X/y Erzeuger
def make_xy(df):
    """Turn a prepared frame into feature rows and targets.

    For every row position ``p >= window_size`` one sample is produced:
    the ``window_size`` values of ``Return`` that precede row ``p``,
    followed by row ``p``'s values of the ``static_cols`` columns.
    The target is ``Return`` at row ``p``.

    Reads the module-level ``window_size`` and ``static_cols``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the collected lists.
    """
    returns = df["Return"]
    statics = df[static_cols]
    positions = range(window_size, len(df))
    feature_rows = [
        returns.iloc[pos - window_size:pos].tolist() + statics.iloc[pos].tolist()
        for pos in positions
    ]
    targets = [returns.iat[pos] for pos in positions]
    return np.array(feature_rows), np.array(targets)

# 4) Hyperparameter grid and CV setup
param_grid = {"units":[50], "dropout":[0.2], "lr":[1e-3, 5e-4], "batch_size":[16]}
tscv = TimeSeriesSplit(n_splits=n_splits)
best_rmse, best_cfg = np.inf, None

# 5) Grid search: pick the configuration with the lowest mean RMSE on log returns across the CV folds
for units in param_grid["units"]:
    for drop in param_grid["dropout"]:
        for lr in param_grid["lr"]:
            for bs in param_grid["batch_size"]:
                cv_rmses = []
                for tr_idx, te_idx in tscv.split(df):
                    # A new model is built for every fold of every configuration.
                    # Dropping the global Keras state of the previous model keeps
                    # memory from growing over the course of the search.
                    tf.keras.backend.clear_session()

                    train_df = df.iloc[tr_idx]
                    test_df  = df.iloc[te_idx]
                    X_tr, y_tr = make_xy(train_df)
                    X_ts, y_ts = make_xy(test_df)

                    # The last 10 % of the training samples (in time order) serve as
                    # validation data for early stopping.
                    cut = int(len(X_tr)*0.9)
                    X_train, X_val = X_tr[:cut], X_tr[cut:]
                    y_train, y_val = y_tr[:cut], y_tr[cut:]

                    # Every feature vector is fed to the LSTM as a sequence with one channel.
                    X_train = X_train.reshape((-1, X_train.shape[1], 1))
                    X_val   = X_val.reshape((-1, X_val.shape[1], 1))
                    X_test  = X_ts.reshape((-1, X_ts.shape[1], 1))

                    model = Sequential([
                        Input(shape=(X_train.shape[1], 1)),
                        LSTM(units, return_sequences=True),
                        Dropout(drop),
                        LSTM(units),
                        Dropout(drop),
                        Dense(1)
                    ])
                    opt = tf.keras.optimizers.Adam(learning_rate=lr)
                    model.compile(optimizer=opt, loss="mse")
                    es = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
                    model.fit(X_train, y_train, validation_data=(X_val, y_val),
                              epochs=20, batch_size=bs, callbacks=[es], verbose=0)
                    y_pred = model.predict(X_test).flatten()
                    cv_rmses.append(math.sqrt(mean_squared_error(y_ts, y_pred)))
                avg_rmse = np.mean(cv_rmses)
                if avg_rmse < best_rmse:
                    best_rmse, best_cfg = avg_rmse, {"units":units,"dropout":drop,"lr":lr,"batch_size":bs}
print(f"\nBest CV‑RMSE (Log‑Renditen): {best_rmse:.4f}")
print("Best Config:", best_cfg)

# Final model: train on the complete data set with the best configuration found above
X_all, y_all = make_xy(df)
X_all = X_all.reshape((-1, X_all.shape[1], 1))

model = Sequential([
    Input(shape=(X_all.shape[1], 1)),
    LSTM(best_cfg["units"], return_sequences=True),
    Dropout(best_cfg["dropout"]),
    LSTM(best_cfg["units"]),
    Dropout(best_cfg["dropout"]),
    Dense(1)
])
model.compile(optimizer=tf.keras.optimizers.Adam(best_cfg["lr"]), loss="mse")
# No validation data and no early stopping here: always the full 20 epochs.
model.fit(X_all, y_all, epochs=20, batch_size=best_cfg["batch_size"], verbose=1)

# Performance on the training data itself (in-sample)
y_pred_all = model.predict(X_all).flatten()

# RMSE on return level (training data)
rmse_ret_all = math.sqrt(mean_squared_error(y_all, y_pred_all))
print(f"\n📈 RMSE (Renditeebene) auf Trainingsdaten: {rmse_ret_all:.4f}")

# RMSE on price level
# Step 1: one-step-ahead prices = previous actual close * exp(predicted log return)
start_prices = df["Close"].iloc[window_size - 1 : -1].values
true_prices = df["Close"].iloc[window_size:].values
pred_prices = start_prices * np.exp(y_pred_all)

# Step 2: RMSE between actual and reconstructed prices
rmse_prc_all = math.sqrt(mean_squared_error(true_prices, pred_prices))
print(f"💰 RMSE (Preisebene) auf Trainingsdaten:  {rmse_prc_all:.4f}")

# Save the model
model.save(f"../../05_Modelle/garch_lstm_{ticker.lower()}_final_model.keras")
print("✅ Finales Modell gespeichert")


# 6) Re-train with the best configuration on every CV fold and report the metrics per fold
metrics = {"rmse_ret":[],"mae_ret":[],"mape_ret":[],"r2_ret":[],"hit_ret":[],"sharpe_ret":[],
           "rmse_prc":[],"mae_prc":[],"mape_prc":[],"r2_prc":[],"hit_prc":[],"sharpe_prc":[]}
fold_results = []

for fold, (tr_idx, te_idx) in enumerate(tscv.split(df), 1):
    # Drop the global Keras state of the previously built model before creating a new one.
    tf.keras.backend.clear_session()

    train_df = df.iloc[tr_idx]
    test_df  = df.iloc[te_idx]
    X_tr, y_tr = make_xy(train_df)
    X_ts, y_ts = make_xy(test_df)
    X_tr = X_tr.reshape((-1, X_tr.shape[1], 1))
    X_ts = X_ts.reshape((-1, X_ts.shape[1], 1))
    model = Sequential([
        Input(shape=(X_tr.shape[1], 1)),
        LSTM(best_cfg["units"], return_sequences=True),
        Dropout(best_cfg["dropout"]),
        LSTM(best_cfg["units"]),
        Dropout(best_cfg["dropout"]),
        Dense(1)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(best_cfg["lr"]), loss="mse")
    # No validation split here, so early stopping watches the training loss.
    es = EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)
    model.fit(X_tr, y_tr, epochs=20, batch_size=best_cfg["batch_size"], callbacks=[es], verbose=0)
    y_pred = model.predict(X_ts).flatten()

    # --- Metrics on log returns ---
    rm_ret = math.sqrt(mean_squared_error(y_ts, y_pred))
    mae_ret = mean_absolute_error(y_ts, y_pred)
    # Zero returns are excluded from the MAPE (division by zero).
    denom_r = np.where(y_ts==0, np.nan, y_ts)
    mape_ret = np.nanmean(np.abs((y_ts - y_pred)/denom_r))*100
    r2_ret = r2_score(y_ts, y_pred)
    # Hit rate: share of samples whose predicted return has the same sign as the
    # realised return (same definition as in the XGBoost scripts of this project).
    # The former version compared the sign of return *differences*.
    hit_ret = (np.sign(y_ts) == np.sign(y_pred)).mean()*100
    # Sharpe-like ratio of the predicted returns: mean / std. The predictions already
    # are returns, so no further relative change is taken (the former
    # np.diff(y_pred)/y_pred[:-1] divided by values close to zero).
    std_pred = y_pred.std()
    sharpe_ret = y_pred.mean()/(std_pred if std_pred != 0 else np.nan)
    metrics["rmse_ret"].append(rm_ret)
    metrics["mae_ret"].append(mae_ret)
    metrics["mape_ret"].append(mape_ret)
    metrics["r2_ret"].append(r2_ret)
    metrics["hit_ret"].append(hit_ret)
    metrics["sharpe_ret"].append(sharpe_ret)

    # --- Metrics on prices ---
    # One-step-ahead price = previous actual close * exp(predicted log return);
    # prediction k belongs to test row k + window_size.
    closes = test_df["Close"].to_numpy()
    preds = closes[window_size - 1:-1] * np.exp(y_pred)
    actuals = closes[window_size:]
    rm_prc = math.sqrt(mean_squared_error(actuals, preds))
    mae_prc = mean_absolute_error(actuals, preds)
    denom_p = np.where(actuals==0, np.nan, actuals)
    mape_prc = np.nanmean(np.abs((actuals - preds)/denom_p))*100
    r2_prc = r2_score(actuals, preds)
    dir_t = np.sign(np.diff(actuals))
    dir_p = np.sign(np.diff(preds))
    hit_prc = (dir_t==dir_p).mean()*100
    ret_pr = np.diff(preds)/preds[:-1]
    sharpe_prc = ret_pr.mean()/(ret_pr.std() if ret_pr.std()!=0 else np.nan)
    metrics["rmse_prc"].append(rm_prc)
    metrics["mae_prc"].append(mae_prc)
    metrics["mape_prc"].append(mape_prc)
    metrics["r2_prc"].append(r2_prc)
    metrics["hit_prc"].append(hit_prc)
    metrics["sharpe_prc"].append(sharpe_prc)
    print(f"Fold {fold}:\n"
          f"  Returns → RMSE={rm_ret:.4f}, MAE={mae_ret:.4f}, MAPE={mape_ret:.4f}%, R2={r2_ret:.4f}, Hit-Rate={hit_ret:.4f}%, Sharpe={sharpe_ret:.4f}\n"
          f"  Prices  → RMSE={rm_prc:.4f}, MAE={mae_prc:.4f}, MAPE={mape_prc:.4f}%, R2={r2_prc:.4f}, Hit-Rate={hit_prc:.4f}%, Sharpe={sharpe_prc:.4f}")
    # Kept for the plots below.
    fold_results.append({"idx":test_df.index[window_size:], "y_test":y_ts, "y_pred":y_pred,
                         "actuals":actuals, "preds":preds})

# 7) Metrics averaged over all folds (NaN entries are ignored by nanmean)
(avg_rmse_ret, avg_mae_ret, avg_mape_ret,
 avg_r2_ret, avg_hit_ret, avg_sharpe_ret) = (
    np.nanmean(metrics[key])
    for key in ("rmse_ret", "mae_ret", "mape_ret", "r2_ret", "hit_ret", "sharpe_ret")
)

(avg_rmse_prc, avg_mae_prc, avg_mape_prc,
 avg_r2_prc, avg_hit_prc, avg_sharpe_prc) = (
    np.nanmean(metrics[key])
    for key in ("rmse_prc", "mae_prc", "mape_prc", "r2_prc", "hit_prc", "sharpe_prc")
)

# Report: one block for the log returns, one for the prices
print(
    "\n=== Durchschnittliche Metriken: Log-Renditen ===",
    f"RMSE      = {avg_rmse_ret:.4f}",
    f"MAE       = {avg_mae_ret:.4f}",
    f"MAPE      = {avg_mape_ret:,.4f}%",
    f"R²        = {avg_r2_ret:.4f}",
    f"HitRate   = {avg_hit_ret:.4f}%",
    f"Sharpe    = {avg_sharpe_ret:.4f}",
    sep="\n",
)

print(
    "\n=== Durchschnittliche Metriken: Preise ===",
    f"RMSE      = {avg_rmse_prc:.4f}",
    f"MAE       = {avg_mae_prc:.4f}",
    f"MAPE      = {avg_mape_prc:.4f}%",
    f"R²        = {avg_r2_prc:.4f}",
    f"HitRate   = {avg_hit_prc:.4f}%",
    f"Sharpe    = {avg_sharpe_prc:.4f}",
    sep="\n",
)

# 8) Plots over all folds: first the log returns, then the prices
plt.figure(figsize=(12,5))
for fold_no, result in enumerate(fold_results, start=1):
    plt.plot(result["idx"], result["y_test"], label=f"Real Ret Fold {fold_no}", alpha=0.8)
    plt.plot(result["idx"], result["y_pred"], "--", label=f"Pred Ret Fold {fold_no}", alpha=0.8)
plt.title(f"{ticker} – Log‑Renditen alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Log‑Rendite")
plt.legend()
plt.tight_layout()
plt.show()

plt.figure(figsize=(12,5))
for fold_no, result in enumerate(fold_results, start=1):
    plt.plot(result["idx"], result["actuals"], label=f"Real Prc Fold {fold_no}", alpha=0.8)
    plt.plot(result["idx"], result["preds"], "--", label=f"Pred Prc Fold {fold_no}", alpha=0.8)
plt.title(f"{ticker} – Aktienkurse alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Preis (Close)")
plt.legend()
plt.tight_layout()
plt.show()


### GARCH_LSTM_RSI_RMSE_Test.py

In [None]:
import numpy as np
import pandas as pd
import math
from arch import arch_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import load_model

# 1) Load the data and restrict it to the last 30 calendar days
ticker = "MSFT"
df = pd.read_csv(f"../../03_Daten/processed_data/historical_stock_data_daily_{ticker}_last60d_flat_with_RSI.csv", parse_dates=["Date"])
df = df.sort_values("Date").set_index("Date")
# DataFrame.last("30D") is deprecated since pandas 2.1. The explicit mask keeps the
# same rows: everything strictly later than (latest timestamp - 30 days).
cutoff = df.index.max() - pd.Timedelta(days=30)
df = df.loc[df.index > cutoff].copy()

# 2) Compute log returns and the GARCH volatility
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
df.dropna(subset=["Return", "RSI_14"], inplace=True)

# Zero-mean GARCH(1,1) on the returns; they are multiplied by 10 for the fit and the
# volatility is divided by 10 afterwards.
scaled_ret = df["Return"] * 10
garch = arch_model(scaled_ret, mean="Zero", vol="GARCH", p=1, q=1, dist="normal", rescale=False).fit(disp="off")
df["GARCH_vol"] = garch.conditional_volatility / 10

# 3) Feature scaling with a StandardScaler.
# NOTE(review): this scaler is fitted on the evaluation window itself, so its mean/std
# are not the ones of the scaler used when the model was trained — confirm whether the
# training scaler should be persisted and reused here.
scaler = StandardScaler()
df[["GARCH_vol", "RSI_14"]] = scaler.fit_transform(df[["GARCH_vol", "RSI_14"]])

# 4) Sequenzen erzeugen (wie im Training)
window_size = 10
def make_xy(df):
    """Build feature rows and targets for the saved LSTM model.

    Sample ``p`` (for every row position ``p >= window_size``) is the list of
    the ``window_size`` preceding ``Return`` values followed by row ``p``'s
    ``GARCH_vol`` and ``RSI_14``; its target is ``Return`` at row ``p``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the collected lists.
    """
    returns = df["Return"]
    extras = df[["GARCH_vol", "RSI_14"]]
    positions = range(window_size, len(df))
    feature_rows = [
        returns.iloc[pos - window_size:pos].tolist() + extras.iloc[pos].tolist()
        for pos in positions
    ]
    targets = [returns.iloc[pos] for pos in positions]
    return np.array(feature_rows), np.array(targets)

# Build the samples; every feature vector becomes a sequence with one channel.
X_test, y_test = make_xy(df)
X_test = X_test.reshape((-1, X_test.shape[1], 1))

# 5) Load the saved model and predict
model = load_model(f"../../05_Modelle/garch_lstm_{ticker.lower()}_final_model.keras")
y_pred = model.predict(X_test).flatten()

# 6) RMSE on log returns: last sample, last 7 samples, all samples
# NOTE(review): "week" means the last 7 rows here; with trading-day data that is
# more than one calendar week — confirm the intended window.
rmse_day  = math.sqrt(mean_squared_error(y_test[-1:], y_pred[-1:]))
rmse_week = math.sqrt(mean_squared_error(y_test[-7:], y_pred[-7:]))
rmse_full = math.sqrt(mean_squared_error(y_test, y_pred))

# 7) Show the result
print("\nRMSE auf den letzten 30 Tagen:")
print(f"Letzter Tag    : {rmse_day:.4f}")
print(f"Letzte Woche   : {rmse_week:.4f}")
print(f"Letzte 30 Tage : {rmse_full:.4f}")

# 8) RMSE on actual prices
# Prediction k uses the close at position start_idx + k as the last known price and
# is compared with the close one row later.
start_idx = window_size - 1
pred_prices = np.array([
    df["Close"].iloc[start_idx + k] * np.exp(log_ret)
    for k, log_ret in enumerate(y_pred)
])
real_prices = np.array([
    df["Close"].iloc[start_idx + k + 1]
    for k in range(len(y_pred))
])

# Price RMSE for the last sample, the last 7 samples and all samples
rmse_price_day, rmse_price_week, rmse_price_full = (
    math.sqrt(mean_squared_error(real_prices[span], pred_prices[span]))
    for span in (slice(-1, None), slice(-7, None), slice(None))
)

# Output
print(
    "\nRMSE auf den tatsächlichen Aktienkursen:",
    f"Letzter Tag    : {rmse_price_day:.4f}",
    f"Letzte Woche   : {rmse_price_week:.4f}",
    f"Letzte 30 Tage : {rmse_price_full:.4f}",
    sep="\n",
)


### GARCH_XGBoost2.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from arch import arch_model
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import math

# 1) Load the data and compute log returns
ticker = "NVDA"
df = pd.read_csv(
    f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat.csv",
    parse_dates=["Date"], index_col="Date"
)
df.sort_index(inplace=True)
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
# Removes every row with a NaN in any column (always the first row, whose return is undefined).
df.dropna(inplace=True)

# 2) Configure the time-series cross-validation and the length of the return window
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)
window_size = 10

# 3) Helper: Features und Target aus Returns + GARCH-Vol erstellen
def create_features_and_target(df, window_size=10):
    """Build XGBoost features and targets from returns and GARCH volatility.

    Args:
        df: Frame with the columns ``Return`` and ``GARCH_vol``.
        window_size: Number of preceding returns used as features.

    Returns:
        Tuple ``(X, y)``: row ``k`` of ``X`` holds the ``window_size`` returns
        before position ``k + window_size`` followed by ``GARCH_vol`` at that
        position; ``y[k]`` is the return at that position.
    """
    returns = df["Return"].values
    garch_vol = df["GARCH_vol"].values
    ends = range(window_size, len(df))
    feature_rows = [
        np.concatenate([returns[end - window_size:end], [garch_vol[end]]])
        for end in ends
    ]
    targets = [returns[end] for end in ends]
    return np.array(feature_rows), np.array(targets)

# 4) Containers for the per-fold metrics and for the data plotted afterwards
metrics_returns = []
metrics_prices  = []

dates_ret_all = []
y_test_all    = []
y_pred_all    = []

dates_pr_all  = []
actuals_all   = []
preds_all     = []

fold = 1
for train_idx, test_idx in tscv.split(df):
    train_df = df.iloc[train_idx].copy()
    test_df  = df.iloc[test_idx].copy()

    # a) Fit a zero-mean GARCH(1,1) on the training returns only
    #    (returns multiplied by 10 for the fit, volatility divided by 10 afterwards)
    scaled = train_df["Return"] * 10
    g = arch_model(scaled, mean="Zero", vol="GARCH", p=1, q=1,
                   dist="normal", rescale=False)
    res = g.fit(disp="off")
    train_df["GARCH_vol"] = res.conditional_volatility / 10

    # b) Volatility forecast for the test period: one multi-step forecast made at the
    #    last training observation, with one horizon step per test row
    fc = res.forecast(start=train_df.index[-1],
                      horizon=len(test_df),
                      reindex=False)
    test_df["GARCH_vol"] = np.sqrt(fc.variance.values[-1, :]) / 10

    # c) Features & targets for XGBoost
    #    (the first window_size test rows only serve as history and get no prediction)
    X_train, y_train = create_features_and_target(train_df, window_size)
    X_test,  y_test  = create_features_and_target(test_df,  window_size)

    # d) Train XGBoost and predict
    model = XGBRegressor(n_estimators=100, max_depth=3,
                         learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Dates belonging to the predictions
    dates = test_df.index[window_size:]
    # kept for the return plot
    dates_ret_all.append(dates)
    y_test_all.append(y_test)
    y_pred_all.append(y_pred)

    # e) Metrics on log returns
    rmse_ret    = math.sqrt(mean_squared_error(y_test, y_pred))
    mae_ret     = mean_absolute_error(y_test, y_pred)
    # NOTE(review): 1e-8 is added to a signed denominator; a return of exactly -1e-8
    # would still divide by zero — confirm this guard is sufficient.
    mape_ret    = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-8))) * 100
    r2_ret      = r2_score(y_test, y_pred)
    # Share of samples whose predicted return has the same sign as the realised one
    hitrate_ret = np.mean(np.sign(y_test) == np.sign(y_pred)) * 100
    # Mean / std of the predicted returns
    sharpe_ret  = np.mean(y_pred) / (np.std(y_pred) + 1e-8)

    metrics_returns.append([
        rmse_ret, mae_ret, mape_ret,
        r2_ret, hitrate_ret, sharpe_ret
    ])

    # f) Convert back to close prices:
    #    predicted price = previous actual close * exp(predicted log return)
    preds, actuals = [], []
    for i, r in enumerate(y_pred):
        idx = i + window_size
        prev_p = test_df["Close"].iloc[idx-1]
        p_pred = prev_p * np.exp(r)
        preds.append(p_pred)
        actuals.append(test_df["Close"].iloc[idx])
    preds   = np.array(preds)
    actuals = np.array(actuals)

    # kept for the price plot
    dates_pr_all.append(dates)
    actuals_all.append(actuals)
    preds_all.append(preds)

    # g) Metrics on price level
    rmse_price   = math.sqrt(mean_squared_error(actuals, preds))
    mae_price    = mean_absolute_error(actuals, preds)
    mape_price   = np.mean(np.abs((actuals - preds) / (actuals + 1e-8))) * 100
    r2_price     = r2_score(actuals, preds)
    # Compares the sign of consecutive differences of actual and predicted prices
    hitrate_price= np.mean(np.sign(np.diff(actuals)) ==
                           np.sign(np.diff(preds))) * 100
    # Mean / std of the absolute differences between consecutive predicted prices
    sharpe_price = np.mean(np.diff(preds)) / (np.std(np.diff(preds)) + 1e-8)

    metrics_prices.append([
        rmse_price, mae_price, mape_price,
        r2_price, hitrate_price, sharpe_price
    ])

    print(f"Fold {fold}: RMSE_Returns={rmse_ret:.4f}, RMSE_Prices={rmse_price:.4f}")
    fold += 1

# 5) Durchschnittliche Metriken ausgeben
def print_avg(name, arr):
    """Print the fold-averaged metrics under a heading.

    Args:
        name: Label shown in the heading line.
        arr: One row per fold with the six values
            RMSE, MAE, MAPE, R², hit rate, Sharpe (in this column order).
    """
    m = np.array(arr)
    # (label incl. padding, column, format spec, unit suffix)
    layout = (
        ("RMSE   ", 0, ".4f", ""),
        ("MAE    ", 1, ".4f", ""),
        ("MAPE   ", 2, ".2f", "%"),
        ("R²     ", 3, ".4f", ""),
        ("HitRate", 4, ".2f", "%"),
        ("Sharpe ", 5, ".4f", ""),
    )
    print(f"\n=== Ø Metriken: {name} ===")
    for label, column, spec, unit in layout:
        print(f"{label}= {format(m[:, column].mean(), spec)}{unit}")

print_avg("Log-Renditen", metrics_returns)
print_avg("Close-Preise", metrics_prices)

# 6) Log returns of all folds in one figure (solid = actual, dashed = predicted)
plt.figure(figsize=(12, 6))
for fold_pos in range(n_splits):
    fold_no = fold_pos + 1
    plt.plot(dates_ret_all[fold_pos], y_test_all[fold_pos], label=f"Real Ret Fold {fold_no}")
    plt.plot(dates_ret_all[fold_pos], y_pred_all[fold_pos], linestyle="--", label=f"Pred Ret Fold {fold_no}")
plt.title(f"{ticker} – Log-Renditen alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Log-Return")
plt.legend()
plt.tight_layout()
plt.show()

# 7) Close prices of all folds in one figure (solid = actual, dashed = predicted)
plt.figure(figsize=(12, 6))
for fold_pos in range(n_splits):
    fold_no = fold_pos + 1
    plt.plot(dates_pr_all[fold_pos], actuals_all[fold_pos], label=f"Real Prc Fold {fold_no}")
    plt.plot(dates_pr_all[fold_pos], preds_all[fold_pos], linestyle="--", label=f"Pred Prc Fold {fold_no}")
plt.title(f"{ticker} – Close-Preise alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Preis (Close)")
plt.legend()
plt.tight_layout()
plt.show()


### GARCH_XGBoost_RSI.py

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from arch import arch_model
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Settings
ticker      = "MSFT"
window_size = 10
n_splits    = 3
csv_path    = f"../../03_Daten/processed_data/historical_stock_data_weekly_{ticker}_flat_with_RSI.csv"

# 1) Load the data and compute log returns
df = (
    pd.read_csv(csv_path, parse_dates=["Date"], index_col="Date")
      .sort_index()
)
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
# Removes every row with a NaN in any column (always the first row, whose return is undefined).
df.dropna(inplace=True)

# 2) Fit a zero-mean GARCH(1,1) once on the complete data set and store the volatility.
# Returns are multiplied by 10 for the fit; the volatility is divided by 10 afterwards.
# NOTE(review): the fit sees every row, including rows used later as CV test folds —
# confirm that this look-ahead is acceptable for the evaluation.
res_all = arch_model(df["Return"] * 10,
                     mean="Zero", vol="GARCH", p=1, q=1,
                     dist="normal", rescale=False
                    ).fit(disp="off")
df["GARCH_vol"] = res_all.conditional_volatility / 10

# 3) Helper zum Erzeugen der Features/Targets
def make_xy(df, window_size=10):
    """Build XGBoost features and targets from returns, GARCH volatility and RSI.

    Args:
        df: Frame with the columns ``Return``, ``GARCH_vol`` and ``RSI_14``.
        window_size: Number of preceding returns used as features.

    Returns:
        Tuple ``(X, y)``: row ``k`` of ``X`` holds the ``window_size`` returns
        before position ``k + window_size`` followed by ``GARCH_vol`` and
        ``RSI_14`` at that position; ``y[k]`` is the return at that position.
    """
    returns = df["Return"].values
    garch_vol = df["GARCH_vol"].values
    rsi = df["RSI_14"].values
    ends = range(window_size, len(df))
    feature_rows = [
        np.concatenate([returns[end - window_size:end], [garch_vol[end], rsi[end]]])
        for end in ends
    ]
    targets = [returns[end] for end in ends]
    return np.array(feature_rows), np.array(targets)

# 4) Hyper-parameter search: 20 random draws from the grid below, scored by
#    (negative) RMSE under a time-series split
param_dist = {
    "n_estimators":    [50, 100, 200],
    "max_depth":       [3, 5, 7],
    "learning_rate":   [0.01, 0.05, 0.1],
    "subsample":       [0.8, 1.0],
    "colsample_bytree":[0.8, 1.0],
    "gamma":           [0, 0.1, 0.5]
}
xgb_base = XGBRegressor(random_state=42, tree_method="hist")
tscv_search = TimeSeriesSplit(n_splits=n_splits)
search = RandomizedSearchCV(
    xgb_base, param_dist,
    n_iter=20,
    cv=tscv_search,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=42,
    verbose=0
)
# The search runs on samples built from the complete frame; the splitter therefore
# splits the sample matrix, not the raw rows of df.
X_full, y_full = make_xy(df, window_size)
search.fit(X_full, y_full)
best_params = search.best_params_
print(">>> Best XGBoost Params:", best_params)

# 5) CV: train/test per fold, compute the metrics and collect the data for the plots
tscv = TimeSeriesSplit(n_splits=n_splits)

# Per-fold metrics (averaged afterwards)
metrics_returns = []
metrics_prices  = []

# Plot data of all folds
dates_ret_all    = []
ret_true_all     = []
ret_pred_all     = []
dates_price_all  = []
price_true_all   = []
price_pred_all   = []

fold = 1
for tr_idx, te_idx in tscv.split(df):
    train_df = df.iloc[tr_idx].copy()
    test_df  = df.iloc[te_idx].copy()

    # GARCH forecast for the test set.
    # res_all was fitted on the complete sample, so forecast(start=...) returns one row
    # per forecast origin from `forecast_origin` up to the end of the data. The row
    # that belongs to this fold is the one made at the last training observation.
    # Taking the last row (values[-1, :]) would use the forecast made at the final
    # observation of the whole data set, i.e. the same vector for every fold.
    forecast_origin = train_df.index[-1]
    fc = res_all.forecast(
        start=forecast_origin,
        horizon=len(test_df),
        reindex=False
    )
    test_df["GARCH_vol"] = np.sqrt(fc.variance.loc[forecast_origin].to_numpy()) / 10

    # Features/targets (the first window_size test rows only serve as history)
    X_train, y_train = make_xy(train_df, window_size)
    X_test,  y_test  = make_xy(test_df,  window_size)

    # Train the model with the best parameters of the search
    model = XGBRegressor(**best_params, random_state=42, tree_method="hist")
    model.fit(X_train, y_train)

    # Predict the log returns
    y_pred = model.predict(X_test)

    # Dates belonging to the predictions
    dates = test_df.index[window_size:]
    dates_ret_all.append(dates)
    ret_true_all.append(y_test)
    ret_pred_all.append(y_pred)

    # Metrics on log returns; zero returns are excluded from the MAPE
    rmse_ret    = math.sqrt(mean_squared_error(y_test, y_pred))
    mae_ret     = mean_absolute_error(y_test, y_pred)
    mask        = (y_test != 0)
    mape_ret    = np.mean(np.abs((y_test[mask] - y_pred[mask]) / y_test[mask])) * 100
    r2_ret      = r2_score(y_test, y_pred)
    hitrate_ret = np.mean(np.sign(y_test) == np.sign(y_pred)) * 100
    sharpe_ret  = np.mean(y_pred) / (np.std(y_pred) + 1e-8)
    metrics_returns.append([rmse_ret, mae_ret, mape_ret,
                             r2_ret, hitrate_ret, sharpe_ret])

    # Convert back to close prices:
    # predicted price = previous actual close * exp(predicted log return)
    preds, actuals = [], []
    for i, r in enumerate(y_pred):
        p0 = test_df["Close"].iat[i + window_size - 1]
        p_pred = p0 * np.exp(r)
        preds.append(p_pred)
        actuals.append(test_df["Close"].iat[i + window_size])
    preds   = np.array(preds)
    actuals = np.array(actuals)

    dates_price_all.append(dates)
    price_true_all.append(actuals)
    price_pred_all.append(preds)

    # Metrics on close prices
    rmse_p    = math.sqrt(mean_squared_error(actuals, preds))
    mae_p     = mean_absolute_error(actuals, preds)
    mape_p    = np.mean(np.abs((actuals - preds) / (actuals + 1e-8))) * 100
    r2_p      = r2_score(actuals, preds)
    hitrate_p = np.mean(np.sign(np.diff(actuals)) == np.sign(np.diff(preds))) * 100
    sharpe_p  = np.mean(np.diff(preds)) / (np.std(np.diff(preds)) + 1e-8)
    metrics_prices.append([rmse_p, mae_p, mape_p,
                           r2_p, hitrate_p, sharpe_p])

    print(f"Fold {fold}: RMSE_Returns={rmse_ret:.4f}, RMSE_Prices={rmse_p:.4f}")
    fold += 1

# 6) Durchschnittliche Metriken ausgeben
def print_avg(name, arr):
    """Print the fold-averaged metrics under a heading.

    Args:
        name: Label shown in the heading line.
        arr: One row per fold with the six values
            RMSE, MAE, MAPE, R², hit rate, Sharpe (in this column order).
    """
    m = np.array(arr)
    # (label incl. padding, column, format spec, unit suffix)
    layout = (
        ("RMSE    ", 0, ".4f", ""),
        ("MAE     ", 1, ".4f", ""),
        ("MAPE    ", 2, ".2f", "%"),
        ("R²      ", 3, ".4f", ""),
        ("HitRate ", 4, ".2f", "%"),
        ("Sharpe  ", 5, ".4f", ""),
    )
    print(f"\n=== Ø Metriken: {name} ===")
    for label, column, spec, unit in layout:
        print(f"{label}= {format(m[:, column].mean(), spec)}{unit}")

print_avg("Log-Renditen", metrics_returns)
print_avg("Close-Preise", metrics_prices)

# 7) Log returns of all folds in one figure (solid = actual, dashed = predicted)
plt.figure(figsize=(12,6))
for fold_pos in range(n_splits):
    fold_no = fold_pos + 1
    plt.plot(dates_ret_all[fold_pos], ret_true_all[fold_pos], label=f"Real Ret Fold {fold_no}")
    plt.plot(dates_ret_all[fold_pos], ret_pred_all[fold_pos], linestyle="--", label=f"Pred Ret Fold {fold_no}")
plt.title(f"{ticker} – Log-Renditen alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Log-Return")
plt.legend()
plt.tight_layout()
plt.show()

# 8) Close prices of all folds in one figure (solid = actual, dashed = predicted)
plt.figure(figsize=(12,6))
for fold_pos in range(n_splits):
    fold_no = fold_pos + 1
    plt.plot(dates_price_all[fold_pos], price_true_all[fold_pos], label=f"Real Prc Fold {fold_no}")
    plt.plot(dates_price_all[fold_pos], price_pred_all[fold_pos], linestyle="--", label=f"Pred Prc Fold {fold_no}")
plt.title(f"{ticker} – Close-Preise alle {n_splits} Folds")
plt.xlabel("Datum")
plt.ylabel("Preis (Close)")
plt.legend()
plt.tight_layout()
plt.show()

from joblib import dump

# 9) Train the final model on all historical samples with the best parameters
model_final = XGBRegressor(**best_params, random_state=42, tree_method="hist")
model_final.fit(X_full, y_full)

# 10) Predict on the training data itself (in-sample)
y_full_pred = model_final.predict(X_full)

# RMSE on log returns (training data)
rmse_ret_full = math.sqrt(mean_squared_error(y_full, y_full_pred))
print(f"\n📈 RMSE (Renditeebene) auf Trainingsdaten: {rmse_ret_full:.4f}")

# Convert back to prices: previous actual close * exp(predicted log return)
start_prices = df["Close"].iloc[window_size - 1 : -1].values
true_prices  = df["Close"].iloc[window_size:].values
pred_prices  = start_prices * np.exp(y_full_pred)

# RMSE on prices (training data)
rmse_prc_full = math.sqrt(mean_squared_error(true_prices, pred_prices))
print(f"💰 RMSE (Preisebene) auf Trainingsdaten:  {rmse_prc_full:.4f}")

# 11) Save the model
model_path = f"../../05_Modelle/garch_xgboost_{ticker.lower()}_final_model.joblib"
dump(model_final, model_path)
print(f"✅ Modell gespeichert: {model_path}")


### GARCH_XGBoost_RSI_RMSE_Test.py

In [None]:
from joblib import load
from arch import arch_model
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import math

# Parameters
ticker = "MSFT"
window_size = 10
model_path = f"../../05_Modelle/garch_xgboost_{ticker.lower()}_final_model.joblib"
csv_path = f"../../03_Daten/processed_data/historical_stock_data_daily_{ticker}_last60d_flat_with_RSI.csv"

# Load the model
model = load(model_path)
print(f"✅ Modell geladen: {model_path}")

# Load and prepare the data: keep the rows of the last 30 calendar days
# (counted back from the latest timestamp, bounds included), then compute log returns
df = pd.read_csv(csv_path, parse_dates=["Date"])
df = df.sort_values("Date").set_index("Date")
df = df.loc[df.index >= df.index.max() - pd.Timedelta(days=30)].copy()
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
df.dropna(subset=["Return", "RSI_14"], inplace=True)

# GARCH volatility: zero-mean GARCH(1,1) fitted on the evaluation window itself
# (returns multiplied by 10 for the fit, volatility divided by 10 afterwards)
ret_scaled = df["Return"] * 10
garch = arch_model(ret_scaled, mean="Zero", vol="GARCH", p=1, q=1, dist="normal", rescale=False).fit(disp="off")
df["GARCH_vol"] = garch.conditional_volatility / 10

# Feature-Vektoren erstellen
def make_xy_outsample(df, window_size):
    """Build feature vectors and targets for the out-of-sample check.

    Args:
        df: Frame with the columns ``Return``, ``GARCH_vol`` and ``RSI_14``.
        window_size: Number of preceding returns used as features.

    Returns:
        Tuple ``(X, y)``: row ``k`` of ``X`` holds the ``window_size`` returns
        before position ``k + window_size`` followed by ``GARCH_vol`` and
        ``RSI_14`` at that position; ``y[k]`` is the return at that position.
    """
    returns = df["Return"]
    garch_vol = df["GARCH_vol"]
    rsi = df["RSI_14"]
    ends = range(window_size, len(df))
    feature_rows = [
        np.concatenate([
            returns.iloc[end - window_size:end].values,
            [garch_vol.iloc[end], rsi.iloc[end]],
        ])
        for end in ends
    ]
    targets = [returns.iloc[end] for end in ends]
    return np.array(feature_rows), np.array(targets)

X_30, y_30 = make_xy_outsample(df, window_size)
y_pred = model.predict(X_30)

# RMSE on log returns for the last sample, the last 7 samples and all samples
rmse_ret_day, rmse_ret_week, rmse_ret_full = (
    math.sqrt(mean_squared_error(y_30[span], y_pred[span]))
    for span in (slice(-1, None), slice(-7, None), slice(None))
)
print(
    "\n📈 RMSE auf Log-Renditen (Out-of-sample)",
    f"Letzter Tag    : {rmse_ret_day:.4f}",
    f"Letzte Woche   : {rmse_ret_week:.4f}",
    f"Letzte 30 Tage : {rmse_ret_full:.4f}",
    sep="\n",
)

# RMSE of the price forecast: prediction k uses the close at position start_idx + k
# as the last known price and is compared with the close one row later
start_idx = window_size - 1
pred_prices = [
    df["Close"].iloc[start_idx + k] * np.exp(log_ret)
    for k, log_ret in enumerate(y_pred)
]
real_prices = [
    df["Close"].iloc[start_idx + k + 1]
    for k in range(len(y_pred))
]

rmse_price_day, rmse_price_week, rmse_price_full = (
    math.sqrt(mean_squared_error(real_prices[span], pred_prices[span]))
    for span in (slice(-1, None), slice(-7, None), slice(None))
)
print(
    "\n💰 RMSE auf Preisen (Out-of-sample)",
    f"Letzter Tag    : {rmse_price_day:.4f}",
    f"Letzte Woche   : {rmse_price_week:.4f}",
    f"Letzte 30 Tage : {rmse_price_full:.4f}",
    sep="\n",
)


## Integration und Evaluation von Google Trends-Daten

### EDA.py

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Global plot style and default figure size
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 5)

# Input files: one merged weekly CSV per ticker
paths = {
    "NVDA": "../../03_Daten/processed_data/merged_weekly_NVDA_2015-2025.csv",
    "GOOG": "../../03_Daten/processed_data/merged_weekly_GOOG_2015-2025.csv",
    "MSFT": "../../03_Daten/processed_data/merged_weekly_MSFT_2015-2025.csv"
}

# Google Trends (GTD) column names per ticker, used for the correlation matrix
gtd_cols = {
    "NVDA": ["NVIDIA stock", "buy NVIDIA stock", "sell NVIDIA stock"],
    "GOOG": ["Google stock", "buy Google stock", "sell Google stock"],
    "MSFT": ["Microsoft stock", "buy Microsoft stock", "sell Microsoft stock"]
}

# Per ticker: summary statistics, missing values, four plots of the close price /
# returns, and the correlation between close price and the Google Trends columns
for ticker, path in paths.items():
    df = pd.read_csv(path, parse_dates=["Date"], index_col="Date").sort_index()
    # Simple (percentage) returns, not log returns
    df["Return"] = df["Close"].pct_change()

    print(f"\n=== {ticker}: Beschreibung ===")
    print(df.describe())

    print("Fehlende Werte:")
    print(df.isnull().sum())

    # Plot: close price over time
    plt.plot(df["Close"])
    plt.title(f"{ticker} – Wöchentlicher Schlusskurs")
    plt.xlabel("Datum"); plt.ylabel("Close")
    plt.tight_layout(); plt.show()

    # Plot: histogram of the close price with a KDE curve
    sns.histplot(df["Close"].dropna(), kde=True)
    plt.title(f"{ticker} – Verteilung Schlusskurs")
    plt.xlabel("Close")
    plt.tight_layout(); plt.show()

    # Plot: box plot of the close price
    sns.boxplot(x=df["Close"].dropna())
    plt.title(f"{ticker} – Boxplot Schlusskurs")
    plt.xlabel("Close")
    plt.tight_layout(); plt.show()

    # Plot: returns over time
    plt.plot(df["Return"])
    plt.title(f"{ticker} – Wöchentliche Rendite")
    plt.xlabel("Datum"); plt.ylabel("Return")
    plt.tight_layout(); plt.show()

    # Correlation matrix between close price and GTD columns
    # (rows with a NaN in any of these columns are dropped first)
    corr_cols = ["Close"] + gtd_cols[ticker]
    corr_df = df[corr_cols].dropna()
    corr_matrix = corr_df.corr()

    print("Korrelationsmatrix:")
    print(corr_matrix)

    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title(f"{ticker} – Korrelation: Schlusskurs und Google Trends")
    plt.tight_layout(); plt.show()


### gtd_stock_corr.py

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Ticker configuration: merged weekly CSV ("file") and the Google Trends
# column names ("gtd") per ticker
ticker_files = {
    "NVDA": {
        "file": "../../03_Daten/processed_data/merged_weekly_NVDA_2015-2025.csv",
        "gtd": ["NVIDIA stock", "buy NVIDIA stock", "sell NVIDIA stock"]
    },
    "GOOG": {
        "file": "../../03_Daten/processed_data/merged_weekly_GOOG_2015-2025.csv",
        "gtd": ["Google stock", "buy Google stock", "sell Google stock"]
    },
    "MSFT": {
        "file": "../../03_Daten/processed_data/merged_weekly_MSFT_2015-2025.csv",
        "gtd": ["Microsoft stock", "buy Microsoft stock", "sell Microsoft stock"]
    },
}

# Per ticker: close price on the left y-axis, smoothed Google Trends series on a
# second y-axis sharing the same x-axis
for ticker, cfg in ticker_files.items():
    df = pd.read_csv(cfg["file"], parse_dates=["Date"], index_col="Date").sort_index()

    # Rolling mean over 4 rows of the GTD columns; min_periods=1 also yields values
    # for the first rows, where fewer than 4 observations are available
    df_roll = df[cfg["gtd"]].rolling(window=4, min_periods=1).mean()

    # Plot: close price on the primary axis
    fig, ax1 = plt.subplots(figsize=(12, 5))
    ax1.plot(df.index, df["Close"], color="tab:blue", linewidth=2, label="Close Price")
    ax1.set_xlabel("Datum")
    ax1.set_ylabel("Close Price", color="tab:blue")
    ax1.tick_params(axis="y", labelcolor="tab:blue")

    # Secondary axis: one line style per GTD column
    ax2 = ax1.twinx()
    styles = ["--", "-.", ":"]
    for col, ls in zip(cfg["gtd"], styles):
        ax2.plot(df_roll.index, df_roll[col],
                 linestyle=ls, alpha=0.8, label=f"{col} (4-Wochen MA)")
    ax2.set_ylabel("Google Trends (4-Wochen gleitend)")

    # Separate legends in opposite corners, one per axis
    h1, l1 = ax1.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax1.legend(h1, l1, loc="upper left")
    ax2.legend(h2, l2, loc="upper right")

    plt.title(f"{ticker} – Kurs vs. Google Trends (4-Wochen-MA)")
    fig.tight_layout()
    plt.grid(alpha=0.3)
    plt.show()


### GARCH_LSTM_RSI_GTD.py

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from arch import arch_model
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Settings
ticker = "NVDA"
window_size = 10  # number of lagged log returns per sample (see make_xy)
n_splits = 3      # number of TimeSeriesSplit folds
csv_path = f"../../03_Daten/processed_data/merged_weekly_{ticker}_2015-2025_with_trends.csv"

# 1) Load the data in chronological order and derive log returns
df = pd.read_csv(csv_path, parse_dates=["Date"], index_col="Date")
df = df.sort_index()
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
df.dropna(inplace=True)  # drops the first row (no previous close) and any other incomplete row

# Fit a zero-mean GARCH(1,1) once on the complete return series.
# The returns are multiplied by 10 before fitting and the volatility is divided
# by 10 afterwards (presumably to help the optimiser numerically).
# NOTE(review): fitted on all rows, i.e. before the cross-validation split
# further down, so the volatility feature uses parameters estimated with
# future data.
scaled_all = df["Return"] * 10
garch_spec = arch_model(
    scaled_all,
    mean="Zero",
    vol="GARCH",
    p=1,
    q=1,
    dist="normal",
    rescale=False,
)
garch_all = garch_spec.fit(disp="off")
df["GARCH_vol"] = garch_all.conditional_volatility / 10

# 2) Standardise the static features (the column list is fixed, nothing is detected dynamically)
static_cols = ["GARCH_vol", "RSI_14", "Trend_Average", "Trend_Smoothed"]
print("Verwendete statische Features:", static_cols)

# NOTE(review): the scaler is likewise fitted on the whole frame, not per training fold.
scaler = StandardScaler()
scaled_static = scaler.fit_transform(df[static_cols])
df[static_cols] = scaled_static

# 3) X/y Erzeuger
def make_xy(df, window=None, stat_cols=None):
    """Build the supervised-learning matrices for one (sub-)frame.

    Each sample consists of the ``window`` log returns preceding row ``i``
    followed by the static feature values of row ``i`` itself; the target is
    the log return of row ``i``.

    Args:
        df: Frame with a ``"Return"`` column and all static feature columns.
        window: Number of lagged returns per sample. Defaults to the
            module-level ``window_size``.
        stat_cols: Names of the static feature columns. Defaults to the
            module-level ``static_cols``.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n, window + len(stat_cols))`` and
        ``y`` has shape ``(n,)`` with ``n = max(len(df) - window, 0)``.
        ``X`` is 2-D even when ``n`` is 0, so ``X.shape[1]`` is always valid.
    """
    # Fall back to the script-wide settings only when nothing was passed in.
    window = window_size if window is None else window
    stat_cols = list(static_cols if stat_cols is None else stat_cols)

    # Extract the columns once; selecting df[static_cols] inside the loop
    # would build a new DataFrame for every single sample.
    returns = df["Return"].to_numpy(dtype=float)
    statics = df[stat_cols].to_numpy(dtype=float)

    n_samples = max(len(returns) - window, 0)
    X = np.empty((n_samples, window + len(stat_cols)), dtype=float)
    for k in range(n_samples):
        target_row = k + window
        X[k, :window] = returns[k:target_row]
        # NOTE(review): the static features come from the target row itself.
        # If any of them is derived from that row's close, it leaks target
        # information into the input - confirm against the data preparation.
        X[k, window:] = statics[target_row]
    y = returns[window:].copy()
    return X, y

# 4) Hyperparameter grid (only the learning rate has more than one candidate)
param_grid = {
    "units":      [50],
    "dropout":    [0.2],
    "lr":         [1e-3, 5e-4],
    "batch_size": [16]
}

# Chronological splitter shared by the grid search and the final evaluation below
tscv = TimeSeriesSplit(n_splits=n_splits)
# best_cfg stays None if no configuration produces an average RMSE below infinity
# (e.g. when every average is NaN)
best_rmse, best_cfg = np.inf, None

# 5) Grid search: pick the configuration with the lowest mean log-return RMSE across folds
for units in param_grid["units"]:
    for drop in param_grid["dropout"]:
        for lr in param_grid["lr"]:
            for bs in param_grid["batch_size"]:
                cv_rmses = []
                for tr_idx, te_idx in tscv.split(df):
                    train_df = df.iloc[tr_idx]
                    test_df  = df.iloc[te_idx]
                    # make_xy only emits samples from row window_size onwards, so the
                    # first window_size rows of each part serve as history only
                    X_tr, y_tr = make_xy(train_df)
                    X_ts, y_ts = make_xy(test_df)

                    # Chronological train/validation split: the last 10 % of the
                    # training samples are held out for early stopping
                    cut = int(len(X_tr) * 0.9)
                    X_train, X_val = X_tr[:cut], X_tr[cut:]
                    y_train, y_val = y_tr[:cut], y_tr[cut:]

                    # Add a trailing axis of size 1: every feature column becomes one
                    # time step with a single feature for the LSTM
                    X_train = X_train.reshape((-1, X_train.shape[1], 1))
                    X_val   = X_val.reshape((-1, X_val.shape[1], 1))
                    X_test  = X_ts .reshape((-1, X_ts .shape[1], 1))

                    # Model: two stacked LSTM layers, each followed by dropout, and a
                    # single linear output unit
                    model = Sequential([
                        Input(shape=(X_train.shape[1],1)),
                        LSTM(units, return_sequences=True),
                        Dropout(drop),
                        LSTM(units),
                        Dropout(drop),
                        Dense(1)
                    ])
                    opt = tf.keras.optimizers.Adam(learning_rate=lr)
                    model.compile(optimizer=opt, loss="mse")
                    # Stop after 3 epochs without val_loss improvement and roll back
                    # to the best weights seen
                    es = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
                    model.fit(X_train, y_train,
                              validation_data=(X_val, y_val),
                              epochs=20, batch_size=bs,
                              callbacks=[es], verbose=0)

                    # Evaluation: RMSE of the predicted log returns on this fold's test part
                    y_pred = model.predict(X_test).flatten()
                    cv_rmses.append(math.sqrt(mean_squared_error(y_ts, y_pred)))

                # Keep the configuration with the lowest fold-averaged RMSE so far
                avg_rmse = np.mean(cv_rmses)
                if avg_rmse < best_rmse:
                    best_rmse, best_cfg = avg_rmse, {
                        "units":units, "dropout":drop,
                        "lr":lr, "batch_size":bs
                    }

print("\nBest CV‑RMSE (Log‑Renditen):", best_rmse)
print("Best Config:", best_cfg)

# 6) Final training with the best configuration; report RMSE for every fold
fold_rmse_ret, fold_rmse_prc, fold_results = [], [], []

for fold, (tr_idx, te_idx) in enumerate(tscv.split(df), start=1):
    train_df, test_df = df.iloc[tr_idx], df.iloc[te_idx]
    X_tr, y_tr = make_xy(train_df)
    X_ts, y_ts = make_xy(test_df)

    # Append a trailing axis of size 1 so each feature column is one LSTM time step
    X_tr = X_tr.reshape(-1, X_tr.shape[1], 1)
    X_ts = X_ts.reshape(-1, X_ts.shape[1], 1)

    # Two stacked LSTM layers with dropout and a single linear output unit
    n_units, drop_rate = best_cfg["units"], best_cfg["dropout"]
    model = Sequential()
    model.add(Input(shape=(X_tr.shape[1], 1)))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(Dropout(drop_rate))
    model.add(LSTM(n_units))
    model.add(Dropout(drop_rate))
    model.add(Dense(1))

    optimizer = tf.keras.optimizers.Adam(learning_rate=best_cfg["lr"])
    model.compile(optimizer=optimizer, loss="mse")

    # No validation split here: early stopping watches the training loss
    es = EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)
    model.fit(
        X_tr,
        y_tr,
        epochs=20,
        batch_size=best_cfg["batch_size"],
        callbacks=[es],
        verbose=0,
    )

    # Predicted log returns and their RMSE
    y_pred = model.predict(X_ts).flatten()
    rm_ret = math.sqrt(mean_squared_error(y_ts, y_pred))
    fold_rmse_ret.append(rm_ret)

    # Convert each predicted log return back into a price, starting from the
    # actual close of the row before the target row
    closes = test_df["Close"]
    preds = [
        closes.iloc[k + window_size - 1] * np.exp(r)
        for k, r in enumerate(y_pred)
    ]
    actuals = [closes.iloc[k + window_size] for k in range(len(y_pred))]
    rm_prc = math.sqrt(mean_squared_error(actuals, preds))
    fold_rmse_prc.append(rm_prc)

    print(f"Fold {fold}: RMSE Log‐Renditen = {rm_ret:.4f}, RMSE Preise = {rm_prc:.4f}")

    # Keep everything the plots further down need
    fold_results.append({
        "idx": test_df.index[window_size:],
        "y_test": y_ts,
        "y_pred": y_pred,
        "actuals": actuals,
        "preds": preds,
    })

# Averages over all folds
mean_rmse_ret = np.mean(fold_rmse_ret)
mean_rmse_prc = np.mean(fold_rmse_prc)
print(f"\nDurchschn. RMSE Log‑Renditen: {mean_rmse_ret:.4f}")
print(f"Durchschn. RMSE Aktienkurse: {mean_rmse_prc:.4f}")

# 7) Plots: real vs. predicted series of all folds in one figure each
def _plot_lstm_folds(real_key, pred_key, title, y_label):
    """Plot the real (solid) and predicted (dashed) series of every fold."""
    plt.figure(figsize=(12, 5))
    for fold_no, fold_data in enumerate(fold_results, 1):
        dates = fold_data["idx"]
        plt.plot(dates, fold_data[real_key], label=f"Real Fold {fold_no}", alpha=0.8)
        plt.plot(dates, fold_data[pred_key], "--", label=f"Pred Fold {fold_no}", alpha=0.8)
    plt.title(title)
    plt.xlabel("Datum")
    plt.ylabel(y_label)
    plt.legend()
    plt.tight_layout()
    plt.show()


# Log returns
_plot_lstm_folds(
    "y_test",
    "y_pred",
    f"{ticker} – Log‑Renditen alle {n_splits} Folds",
    "Log‑Rendite",
)

# Reconstructed closing prices
_plot_lstm_folds(
    "actuals",
    "preds",
    f"{ticker} – Aktienkurse alle {n_splits} Folds",
    "Preis (Close)",
)


### GARCH_XGBoost_RSI_GTD.py

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from arch import arch_model
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Settings
ticker = "NVDA"
window_size = 10  # number of lagged log returns per sample (see make_xy)
n_splits = 3      # number of TimeSeriesSplit folds
csv_path = f"../../03_Daten/processed_data/merged_weekly_{ticker}_2015-2025.csv"

# 1) Load the data in chronological order and compute log returns
df = pd.read_csv(csv_path, parse_dates=["Date"], index_col="Date")
df = df.sort_index()
df["Return"] = np.log(df["Close"] / df["Close"].shift(1))
df.dropna(inplace=True)

# 2) Fit a zero-mean GARCH(1,1) once on the whole data set and store its
#    conditional volatility. The returns are multiplied by 10 for the fit and
#    the volatility is divided by 10 afterwards.
g = arch_model(
    df["Return"] * 10,
    mean="Zero",
    vol="GARCH",
    p=1,
    q=1,
    dist="normal",
    rescale=False,
)
res_all = g.fit(disp="off")
df["GARCH_vol"] = res_all.conditional_volatility / 10

# 3) Collect the static feature columns and standardise them.
#    Google Trends columns are recognised by "stock" in their (lower-cased) name.
gtd_cols = [name for name in df.columns if "stock" in name.lower()]
static_cols = ["GARCH_vol", "RSI_14", *gtd_cols]
print("Verwendete statische Features (GTD + RSI + GARCH):", static_cols)

# The scaler is fitted on the full frame; df holds standardised values from here on.
scaler = StandardScaler()
scaled_static = scaler.fit_transform(df[static_cols])
df[static_cols] = scaled_static

# 4) Funktion zum Erzeugen von X, y
def make_xy(subdf, window=None, stat_cols=None):
    """Turn a (sub-)frame into the feature matrix ``X`` and target vector ``y``.

    A sample for row ``i`` is the ``window`` log returns before ``i`` followed
    by the static feature values of row ``i``; its target is the log return
    of row ``i``.

    Args:
        subdf: Frame with a ``"Return"`` column and all static feature columns.
        window: Number of lagged returns per sample. Defaults to the
            module-level ``window_size``.
        stat_cols: Names of the static feature columns. Defaults to the
            module-level ``static_cols``.

    Returns:
        ``(X, y)`` with ``X`` of shape ``(n, window + len(stat_cols))`` and
        ``y`` of shape ``(n,)``, where ``n = max(len(subdf) - window, 0)``.
        ``X`` keeps its two dimensions even when no sample can be built.
    """
    # Use the script-wide settings unless the caller overrides them.
    window = window_size if window is None else window
    stat_cols = list(static_cols if stat_cols is None else stat_cols)

    # Pull both inputs out as arrays once instead of re-selecting
    # subdf[static_cols] (a fresh DataFrame) for every sample.
    rets = subdf["Return"].to_numpy(dtype=float)
    stats = subdf[stat_cols].to_numpy(dtype=float)

    n_samples = max(len(rets) - window, 0)
    X = np.empty((n_samples, window + len(stat_cols)), dtype=float)
    for k in range(n_samples):
        target_row = k + window
        X[k, :window] = rets[k:target_row]   # preceding log returns
        X[k, window:] = stats[target_row]    # static features of the target row
    y = rets[window:].copy()
    return X, y

# 5) Hyperparameter search for XGBoost on the samples of the full frame
X_full, y_full = make_xy(df)

tscv_search = TimeSeriesSplit(n_splits=n_splits)

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1, 0.5],
)

# Histogram tree method with a single thread. According to the original author's
# note, this is meant to keep training on the CPU and rule out GPU parallelism.
xgb = XGBRegressor(tree_method="hist", n_jobs=1, random_state=42)

# 20 random configurations, scored by negative RMSE on chronological folds.
# error_score='raise' surfaces fitting errors instead of scoring them as NaN;
# the search itself also runs as a single job (author's note: avoids CUDA conflicts).
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=tscv_search,
    n_jobs=1,
    verbose=1,
    random_state=42,
    error_score='raise',
)
search.fit(X_full, y_full)

best_params = search.best_params_
print(">>> Best XGBoost Params:", best_params)

# 6) Out-of-sample evaluation & metrics
def _fold_metrics(y_true, y_hat):
    """Return (RMSE, MAE, MAPE %, R², hit rate %, Sharpe-like ratio) for one fold.

    MAPE ignores entries whose true value is exactly 0. The hit rate compares
    the sign of consecutive differences of both series. The last value is the
    mean/std ratio of the relative step-to-step changes of ``y_hat`` (NaN when
    their standard deviation is 0).
    """
    rmse = math.sqrt(mean_squared_error(y_true, y_hat))
    mae = mean_absolute_error(y_true, y_hat)
    nonzero = y_true != 0
    mape = np.mean(np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])) * 100
    r2 = r2_score(y_true, y_hat)
    hit = (np.sign(np.diff(y_true)) == np.sign(np.diff(y_hat))).mean() * 100
    # NOTE(review): for log returns this divides by predictions that can be
    # close to 0, so the ratio is unstable there - confirm this is the intended
    # definition.
    rel_change = np.diff(y_hat) / y_hat[:-1]
    spread = rel_change.std()
    sharpe = rel_change.mean() / (spread if spread != 0 else np.nan)
    return rmse, mae, mape, r2, hit, sharpe


tscv = TimeSeriesSplit(n_splits=n_splits)
rmse_ret, mae_ret, mape_ret, r2_ret = [], [], [], []
hit_ret, sharpe_ret               = [], []
rmse_prc, mae_prc, mape_prc, r2_prc = [], [], [], []
hit_prc, sharpe_prc               = [], []

fold_results = []  # collected for the plots over all folds

# The scaler was fitted on df[static_cols], so its per-column statistics are
# ordered like static_cols.
vol_pos = static_cols.index("GARCH_vol")

for fold, (tr_idx, te_idx) in enumerate(tscv.split(df), start=1):
    train_df = df.iloc[tr_idx].copy()
    test_df  = df.iloc[te_idx].copy()

    # a) GARCH forecast for the test window.
    #    With an explicit start and reindex=False the FIRST row of fc.variance
    #    holds the multi-step forecast made at the last training observation.
    #    The last row would be the forecast made at the end of the whole
    #    sample, which is identical for every fold.
    fc = res_all.forecast(start=train_df.index[-1], horizon=len(test_df), reindex=False)
    raw_vol = np.sqrt(fc.variance.values[0, :]) / 10

    # b) Scaling of the static features.
    #    df[static_cols] was already standardised once when the scaler was fitted
    #    on the full frame earlier in this script, so train_df and test_df hold
    #    scaled values. Only the freshly forecast volatility is still on the raw
    #    scale; standardise it with the fitted GARCH_vol statistics. Transforming
    #    the whole frames again would scale them twice.
    test_df["GARCH_vol"] = (raw_vol - scaler.mean_[vol_pos]) / scaler.scale_[vol_pos]

    # c) Features/targets
    X_train, y_train = make_xy(train_df)
    X_test,  y_test  = make_xy(test_df)
    print(f"Fold {fold}: X_train={X_train.shape}, X_test={X_test.shape}")

    # d) Train with early stopping.
    #    Early stopping monitors the chronologically last 10 % of the training
    #    samples rather than the test fold, so the test data never influence
    #    the fitted model.
    cut = int(len(X_train) * 0.9)
    X_fit, y_fit = X_train[:cut], y_train[:cut]
    X_val, y_val = X_train[cut:], y_train[cut:]

    model = XGBRegressor(
        **best_params,
        random_state=42,
        tree_method="hist",
        n_jobs=1,
        early_stopping_rounds=10,
        verbosity=0
    )
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # e) Log-return metrics
    y_pred = model.predict(X_test)
    rm, ma, mp, r2, hr, sr = _fold_metrics(y_test, y_pred)

    rmse_ret.append(rm); mae_ret.append(ma)
    mape_ret.append(mp); r2_ret.append(r2)
    hit_ret.append(hr); sharpe_ret.append(sr)

    # f) Price metrics: each predicted log return is applied to the actual
    #    close of the row before its target row.
    closes  = test_df["Close"].to_numpy()
    start   = window_size - 1
    preds   = closes[start:start + len(y_pred)] * np.exp(y_pred)
    actuals = closes[window_size:window_size + len(y_pred)]

    rm_p, ma_p, mp_p, r2_p, hr_p, sr_p = _fold_metrics(actuals, preds)

    rmse_prc.append(rm_p); mae_prc.append(ma_p)
    mape_prc.append(mp_p); r2_prc.append(r2_p)
    hit_prc.append(hr_p); sharpe_prc.append(sr_p)

    print(f"Fold {fold}:")
    print(f"  Returns → RMSE={rm:.4f}, MAE={ma:.4f}, MAPE={mp:.2f}%, "
          f"R²={r2:.4f}, Hit-Rate={hr:.1f}%, Sharpe={sr:.4f}")
    print(f"  Prices  → RMSE={rm_p:.4f}, MAE={ma_p:.4f}, MAPE={mp_p:.2f}%, "
          f"R²={r2_p:.4f}, Hit-Rate={hr_p:.1f}%, Sharpe={sr_p:.4f}\n")

    # Store the results for the plots further down
    fold_results.append({
        "idx":    test_df.index[window_size:],
        "y_test": y_test,
        "y_pred": y_pred,
        "actual": actuals,
        "preds":  preds
    })

# 7) Print the metrics averaged over all folds
#    (Sharpe uses nanmean because a fold's ratio can be NaN)
returns_summary = [
    "\n=== Durchschnittliche Metriken: Log-Renditen ===",
    f"RMSE    = {np.mean(rmse_ret):.4f}",
    f"MAE     = {np.mean(mae_ret):.4f}",
    f"MAPE    = {np.mean(mape_ret):.2f}%",
    f"R²      = {np.mean(r2_ret):.4f}",
    f"HitRate = {np.mean(hit_ret):.2f}%",
    f"Sharpe  = {np.nanmean(sharpe_ret):.4f}",
]
print("\n".join(returns_summary))

price_summary = [
    "\n=== Durchschnittliche Metriken: Preise ===",
    f"RMSE    = {np.mean(rmse_prc):.4f}",
    f"MAE     = {np.mean(mae_prc):.4f}",
    f"MAPE    = {np.mean(mape_prc):.2f}%",
    f"R²      = {np.mean(r2_prc):.4f}",
    f"HitRate = {np.mean(hit_prc):.2f}%",
    f"Sharpe  = {np.nanmean(sharpe_prc):.4f}",
]
print("\n".join(price_summary))

# 8) Plots over all folds

# a) Log returns: real (solid) vs. predicted (dashed) for every fold
fig_ret, ax_ret = plt.subplots(figsize=(12, 5))
for i, fr in enumerate(fold_results, start=1):
    dates = fr["idx"]
    ax_ret.plot(dates, fr["y_test"], label=f"Real Ret Fold {i}", alpha=0.7)
    ax_ret.plot(dates, fr["y_pred"], "--", label=f"Pred Ret Fold {i}", alpha=0.7)
ax_ret.set_title(f"{ticker} – Log-Renditen über alle {n_splits} Folds")
ax_ret.set_xlabel("Datum")
ax_ret.set_ylabel("Log-Rendite")
ax_ret.legend()
fig_ret.tight_layout()
plt.show()

# b) Closing prices: real (solid) vs. predicted (dashed) for every fold
fig_prc, ax_prc = plt.subplots(figsize=(12, 5))
for i, fr in enumerate(fold_results, start=1):
    dates = fr["idx"]
    ax_prc.plot(dates, fr["actual"], label=f"Real Price Fold {i}", alpha=0.7)
    ax_prc.plot(dates, fr["preds"], "--", label=f"Pred Price Fold {i}", alpha=0.7)
ax_prc.set_title(f"{ticker} – Close-Preise über alle {n_splits} Folds")
ax_prc.set_xlabel("Datum")
ax_prc.set_ylabel("Preis (Close)")
ax_prc.legend()
fig_prc.tight_layout()
plt.show()


## Visualisierungen

### vis.py

In [None]:
import matplotlib.pyplot as plt

def _scatter_rmse(positions, tick_labels, nvda, goog, msft, title, figsize):
    """Draw one RMSE scatter chart with a marker series per ticker."""
    plt.figure(figsize=figsize)
    series = (
        (nvda, 'green', "NVDA", 'o'),
        (goog, 'blue', "GOOG", 's'),
        (msft, 'orange', "MSFT", '^'),
    )
    for values, color, label, marker in series:
        plt.scatter(positions, values, color=color, label=label, marker=marker)
    plt.title(title)
    plt.xlabel("Modellvariante")
    plt.ylabel("RMSE")
    plt.xticks(positions, tick_labels, rotation=30)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    plt.tight_layout()
    plt.show()


# Model variants: two pure ML models followed by six GARCH hybrids
models = ["LSTM", "XGBoost", "G+L", "G+L+RSI", "G+L+RSI+GTD", "G+X", "G+X+RSI", "G+X+RSI+GTD"]
x = range(len(models))

# RMSE of the price forecast (closing prices), one value per model variant
rmse_nvda = [6.6339, 27.0426, 2.7375, 2.3375, 2.8136, 2.8578, 2.4853, 2.4625]
rmse_goog = [9.4500, 30.0505, 3.9578, 3.5819, 6.2158, 4.4258, 3.7953, 3.6881]
rmse_msft = [18.9792, 80.4302, 7.6404, 6.9672, 14.3669, 8.0835, 7.2955, 7.1738]

# Chart 1: pure ML models vs. GARCH hybrids
_scatter_rmse(
    x,
    models,
    rmse_nvda,
    rmse_goog,
    rmse_msft,
    "RMSE – Reine ML-Modelle vs. GARCH-Hybride (Preisprognose)",
    (12, 6),
)

# GARCH hybrids only: everything after the two pure ML models.
# Slicing yields exactly the six values per ticker listed above.
models_hybrid = models[2:]
x_hybrid = range(len(models_hybrid))

rmse_nvda = rmse_nvda[2:]
rmse_goog = rmse_goog[2:]
rmse_msft = rmse_msft[2:]

# Chart 2: influence of the added features on the GARCH hybrid models
_scatter_rmse(
    x_hybrid,
    models_hybrid,
    rmse_nvda,
    rmse_goog,
    rmse_msft,
    "RMSE – Preisprognose: Einfluss von Features auf GARCH-Hybridmodelle",
    (10, 6),
)
