# StockVision - Phase 2 Model Training

Dieses Notebook trainiert **zwei Modelle** auf dem vorbereiteten AAPL-Feature-Datensatz:

- **TensorFlow LSTM** für Preis-Regression (`predicted_price`)
- **XGBoost Classifier** für Trend-Klassifikation (`SELL/HOLD/BUY`)

Outputs:
- `models/AAPL_price/`
- `models/AAPL_trend/`


In [None]:
# ==========================================
# Imports
# ==========================================
import json
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from xgboost import XGBClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

# Reproducibility: seed NumPy first, then TensorFlow, with the same fixed value.
for _seed_fn in (np.random.seed, tf.random.set_seed):
    _seed_fn(42)
sns.set_style("whitegrid")

# Report the runtime so results can be related to the environment that produced them.
print("TensorFlow:", tf.__version__)
print("GPU available:", bool(tf.config.list_physical_devices("GPU")))

# ==========================================
# Paths and data loading
# ==========================================
# When the working directory is named "notebooks" the project root is its parent;
# in every other case the working directory itself is used as the root.
BASE_DIR = Path("..").resolve() if Path.cwd().name == "notebooks" else Path.cwd()

DATA_PROCESSED = BASE_DIR / "data" / "processed"
MODELS_DIR = BASE_DIR / "models"
PRICE_DIR = MODELS_DIR / "AAPL_price"
TREND_DIR = MODELS_DIR / "AAPL_trend"
# Create both artifact folders up front so the later save steps cannot fail on a missing directory.
for _artifact_dir in (PRICE_DIR, TREND_DIR):
    _artifact_dir.mkdir(parents=True, exist_ok=True)

TICKER = "AAPL"
SEQUENCE_LENGTH = 90

# Candidates in priority order: the direction dataset wins over the plain feature dataset.
candidate_paths = [
    DATA_PROCESSED / f"{TICKER}_{stem}.csv"
    for stem in ("features_direction", "features")
]
data_path = next((path for path in candidate_paths if path.exists()), None)
if data_path is None:
    raise FileNotFoundError(f"No processed feature file found in {DATA_PROCESSED}")

print("Loading:", data_path)
# The first CSV column becomes the (date-parsed) index; rows are ordered by that index.
df = pd.read_csv(data_path, index_col=0, parse_dates=True)
df = df.sort_index()
print("Shape before cleaning:", df.shape)

# Ensure labels exist (in case notebook 01 wasn't rerun yet).
# Each label is only derived when the loaded file does not already provide it.
if "Next_Close" not in df.columns:
    # Close of the following row; the final row has no successor and becomes NaN.
    df["Next_Close"] = df["Close"].shift(-1)
if "Direction_Binary" not in df.columns:
    # A comparison against NaN evaluates to False, so rows whose next close is
    # unknown would silently be labelled 0.0. Keep those rows as NaN instead so
    # they are treated as "no label" (same convention as Direction_3Class below).
    has_both_prices = df["Next_Close"].notna() & df["Close"].notna()
    df["Direction_Binary"] = (
        (df["Next_Close"] > df["Close"]).astype(float).where(has_both_prices)
    )
if "Return_Next" not in df.columns:
    # Percentage change from the current row's close to the next row's close.
    df["Return_Next"] = df["Close"].pct_change().shift(-1)
if "Direction_3Class" not in df.columns:
    # Returns at or below -threshold -> 0, within (-threshold, threshold] -> 1,
    # above threshold -> 2. NaN returns stay NaN after the float cast.
    threshold = 0.005
    df["Direction_3Class"] = pd.cut(
        df["Return_Next"],
        bins=[-np.inf, -threshold, threshold, np.inf],
        labels=[0, 1, 2],
    ).astype(float)

# Every numeric column is a model input except the label/target columns themselves.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
exclude_cols = {"Direction_Binary", "Direction_3Class", "Next_Close", "Return_Next"}
feature_cols = [c for c in numeric_cols if c not in exclude_cols]

if "Close" not in feature_cols:
    raise ValueError("Close must be present in feature columns for LSTM target regression.")

print("Feature count:", len(feature_cols))
print("Sample features:", feature_cols[:10])
print("trained_until:", df.index.max().date())


def temporal_split_indices(n, train_ratio=0.70, val_ratio=0.15):
    """Return the two cut positions for a chronological train/val/test split.

    Args:
        n: Number of rows in the ordered dataset.
        train_ratio: Fraction of rows assigned to the training split.
        val_ratio: Fraction of rows assigned to the validation split.

    Returns:
        A ``(train_end, val_end)`` tuple of ints: rows ``[:train_end]`` are
        training, ``[train_end:val_end]`` validation and ``[val_end:]`` test.
        Both values are truncated towards zero.
    """
    cumulative_ratios = (train_ratio, train_ratio + val_ratio)
    first_cut, second_cut = (int(n * ratio) for ratio in cumulative_ratios)
    return first_cut, second_cut


def safe_mape(y_true, y_pred, eps=1e-8):
    """Compute the mean absolute percentage error with a guarded denominator.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.
        eps: Lower bound applied to ``|y_true|`` so that zero (or near-zero)
            reference values do not cause a division by zero.

    Returns:
        The MAPE as a Python float, expressed in percent.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    guarded_scale = np.maximum(np.abs(actual), eps)
    relative_error = np.abs((actual - forecast) / guarded_scale)
    return float(np.mean(relative_error) * 100.0)


In [None]:
# ==========================================
# MODEL 1: TensorFlow LSTM (Price Regression)
# ==========================================

# Select the inputs plus the regression target. "Close" is also a feature, so the
# selection contains it twice; only the first occurrence of each column name is kept.
lstm_columns = feature_cols + ["Close"]
lstm_df = df[lstm_columns].copy()
first_occurrence = ~lstm_df.columns.duplicated()
lstm_df = lstm_df.loc[:, first_occurrence].dropna().copy()

# Chronological split: no shuffling, later rows never end up in an earlier split.
n_total = len(lstm_df)
train_end, val_end = temporal_split_indices(n_total)
train_df, val_df, test_df = (
    lstm_df.iloc[start:stop].copy()
    for start, stop in ((None, train_end), (train_end, val_end), (val_end, None))
)

print("LSTM splits:", *(len(part) for part in (train_df, val_df, test_df)))

# The scaler sees training rows only, so validation/test statistics do not leak into it.
feature_scaler = MinMaxScaler().fit(train_df[feature_cols])


def make_sequences(split_df, feature_scaler, feature_cols, sequence_length):
    """Turn one chronological split into sliding windows for sequence models.

    For every row position ``i >= sequence_length`` a sample is built from the
    scaled features of the ``sequence_length`` rows *before* ``i``; the target
    is the unscaled ``Close`` of row ``i`` and the sample date is row ``i``'s
    index value.

    Args:
        split_df: Frame holding ``feature_cols`` and a ``Close`` column.
        feature_scaler: Fitted object exposing ``transform``.
        feature_cols: Column names fed to the scaler, in model input order.
        sequence_length: Number of past rows per window.

    Returns:
        ``(X, y, dates)`` where ``X`` and ``y`` are float32 arrays and
        ``dates`` holds the index value of each target row. All three are
        empty when the split has no row at position ``sequence_length``.
    """
    scaled_features = feature_scaler.transform(split_df[feature_cols])
    close_values = split_df["Close"].to_numpy(dtype=np.float32)
    index_values = split_df.index.to_numpy()

    target_positions = range(sequence_length, len(split_df))
    windows = [scaled_features[pos - sequence_length:pos] for pos in target_positions]
    targets = [close_values[pos] for pos in target_positions]
    target_dates = [index_values[pos] for pos in target_positions]

    return (
        np.asarray(windows, dtype=np.float32),
        np.asarray(targets, dtype=np.float32),
        np.asarray(target_dates),
    )

# Build the sliding-window samples for each split independently, so a window
# never spans two splits.
X_train, y_train, d_train = make_sequences(train_df, feature_scaler, feature_cols, SEQUENCE_LENGTH)
X_val, y_val, d_val = make_sequences(val_df, feature_scaler, feature_cols, SEQUENCE_LENGTH)
X_test, y_test, d_test = make_sequences(test_df, feature_scaler, feature_cols, SEQUENCE_LENGTH)

print("LSTM sequence shapes:")
print("  X_train:", X_train.shape, "y_train:", y_train.shape)
print("  X_val:  ", X_val.shape, "y_val:  ", y_val.shape)
print("  X_test: ", X_test.shape, "y_test: ", y_test.shape)

# A split that yields no window cannot be trained or evaluated on. The message
# reports the configured window length rather than a hard-coded number, so it
# stays correct when SEQUENCE_LENGTH is changed.
if min(len(X_train), len(X_val), len(X_test)) == 0:
    raise ValueError(
        f"Not enough rows after cleaning for {SEQUENCE_LENGTH}-day sequences in all splits. "
        "Reduce SEQUENCE_LENGTH or provide more data."
    )

# Two stacked LSTM layers with dropout, followed by a small dense head that
# emits a single regression value per window.
n_features = len(feature_cols)
lstm_model = keras.Sequential(
    [
        layers.Input(shape=(SEQUENCE_LENGTH, n_features)),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(32, activation="relu"),
        layers.Dense(1),
    ]
)

# MSE is the optimised loss; RMSE and MAPE are tracked for reporting only.
adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3)
tracked_metrics = [
    keras.metrics.RootMeanSquaredError(name="rmse"),
    keras.metrics.MeanAbsolutePercentageError(name="mape"),
]
lstm_model.compile(optimizer=adam_optimizer, loss="mse", metrics=tracked_metrics)

# Both callbacks watch validation loss: the learning rate is halved after 5
# stagnant epochs, and training stops after 10, restoring the best weights.
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
lr_reducer = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5)
callbacks = [early_stopping, lr_reducer]

history = lstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=60,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)

# Score the trained model on the test windows; predictions are flattened to 1-D
# so they line up element-wise with y_test.
raw_test_output = lstm_model.predict(X_test, verbose=0)
y_pred_test = raw_test_output.reshape(-1)

test_mse = mean_squared_error(y_test, y_pred_test)
rmse = float(np.sqrt(test_mse))
mape = safe_mape(y_test, y_pred_test)

print(f"LSTM Test RMSE: {rmse:.4f}")
print(f"LSTM Test MAPE: {mape:.2f}%")

# Save artifacts: model weights, the fitted scaler with its column order, and a
# JSON summary of the run.
lstm_model.save(PRICE_DIR / "model.keras")

scaler_bundle = {"feature_scaler": feature_scaler, "features": feature_cols}
with (PRICE_DIR / "scaler.pkl").open("wb") as f:
    pickle.dump(scaler_bundle, f)

# NOTE(review): "trained_until" is the last date of lstm_df (all splits), not the
# last date of the training split.
price_metadata = {
    "type": "lstm_regression",
    "ticker": TICKER,
    "sequence_length": SEQUENCE_LENGTH,
    "features": feature_cols,
    "rmse": rmse,
    "mape": mape,
    "trained_until": str(lstm_df.index.max().date()),
}
price_metadata_json = json.dumps(price_metadata, indent=2)
(PRICE_DIR / "metadata.json").write_text(price_metadata_json, encoding="utf-8")
print("Saved LSTM artifacts to:", PRICE_DIR)

# Quick plot: actual vs. predicted close over the test windows.
# The x-axis uses the target dates returned by make_sequences (d_test) instead of
# bare sample positions, so the curves can be read against the calendar.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(d_test, y_test, label="Actual", linewidth=1.5)
ax.plot(d_test, y_pred_test, label="Predicted", linewidth=1.5)
ax.set_title("LSTM Price Regression (Test Split)")
ax.set_xlabel("Date")
ax.set_ylabel("Close")
ax.legend()
fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
fig.tight_layout()
plt.show()


In [None]:
# ==========================================
# MODEL 2: XGBoost Classifier (Trend)
# ==========================================

# Keep only rows where every feature and the 3-class label are present, then
# store the label as integer class ids.
trend_columns = feature_cols + ["Direction_3Class"]
trend_df = df[trend_columns].dropna().copy()
trend_df["Direction_3Class"] = trend_df["Direction_3Class"].astype(int)

# Same chronological split ratios as for the LSTM, applied to this frame's row count.
n_total = len(trend_df)
train_end, val_end = temporal_split_indices(n_total)
train_cls, val_cls, test_cls = (
    trend_df.iloc[start:stop].copy()
    for start, stop in ((None, train_end), (train_end, val_end), (val_end, None))
)

X_train_cls, y_train_cls = train_cls[feature_cols], train_cls["Direction_3Class"]
X_val_cls, y_val_cls = val_cls[feature_cols], val_cls["Direction_3Class"]
X_test_cls, y_test_cls = test_cls[feature_cols], test_cls["Direction_3Class"]

print("XGB splits:", *(len(part) for part in (X_train_cls, X_val_cls, X_test_cls)))

# Hyperparameters are collected in one mapping so the configuration is visible
# at a glance before the estimator is built.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "objective": "multi:softprob",
    "num_class": 3,
    "eval_metric": "mlogloss",
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_params)

# The validation split is passed as the evaluation set; per-round logging is off.
validation_sets = [(X_val_cls, y_val_cls)]
xgb_model.fit(X_train_cls, y_train_cls, eval_set=validation_sets, verbose=False)

# Evaluate on the test split. The confusion matrix is forced to the fixed label
# order 0, 1, 2 so its layout does not depend on which classes occur.
class_ids = [0, 1, 2]
class_names = ["SELL", "HOLD", "BUY"]

y_pred_cls = xgb_model.predict(X_test_cls)
accuracy = float(accuracy_score(y_test_cls, y_pred_cls))
cm = confusion_matrix(y_test_cls, y_pred_cls, labels=class_ids)

print(f"XGB Test Accuracy: {accuracy:.4f}")
print("Confusion matrix [rows=true, cols=pred] for labels [0,1,2]:")
print(cm)

# Visualization: annotated heatmap with the class names on both axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("XGBoost Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()

# Save artifacts: the booster as JSON plus a metadata file describing inputs,
# the test score and how class ids map to trading signals.
xgb_model_path = TREND_DIR / "xgb_model.json"
xgb_model.save_model(str(xgb_model_path))

# NOTE(review): "trained_until" is the last date of trend_df (all splits), not the
# last date of the training split.
trend_metadata = {
    "type": "xgboost_classifier",
    "ticker": TICKER,
    "features": feature_cols,
    "accuracy": accuracy,
    "label_mapping": {"0": "SELL", "1": "HOLD", "2": "BUY"},
    "trained_until": str(trend_df.index.max().date()),
}
trend_metadata_json = json.dumps(trend_metadata, indent=2)
(TREND_DIR / "metadata.json").write_text(trend_metadata_json, encoding="utf-8")
print("Saved XGBoost artifacts to:", TREND_DIR)
