In [4]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# Repository root is taken to be two directory levels above the current working
# directory (assumes the notebook is launched from its own folder — TODO confirm).
REPO_ROOT = Path.cwd().parent.parent
BACKEND_DIR = REPO_ROOT / "backend"
# Prepend the backend folder and the working directory to sys.path, presumably so the
# `analytics.forecasting.prophet` and `_pool_common` imports that follow can be resolved.
# The working directory is inserted last, so it ends up at index 0 (searched first).
sys.path.insert(0, str(BACKEND_DIR))
sys.path.insert(0, str(Path.cwd()))

from analytics.forecasting.prophet import ProphetForecaster
from _pool_common import (
    load_pool_data,
    backtest_one_step,
    compute_metrics,
    metrics_to_parquet,
    TEST_SIZE,
    MIN_TRAIN_PROPHET,
    ARTIFACTS_DIR,
)

# Number of lagged Prophet residuals / lagged closes used as XGBoost features.
RESIDUAL_LAGS = 3
PRICE_LAGS = 3

# Keyword arguments passed verbatim to XGBRegressor for the residual model.
XGB_PARAMS = {
    "n_estimators": 200,
    "learning_rate": 0.03,
    "max_depth": 3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,  # fixed seed so row/column subsampling is repeatable
}

def walk_forward_prophet_xgb(model_df, feature_cols, test_size, min_train=5):
    """Walk-forward evaluation of an XGBoost residual corrector on top of Prophet.

    For each of the last ``test_size`` rows of ``model_df`` a fresh XGBRegressor is
    fit on all earlier rows to predict the standardized Prophet residual
    (``y_true - y_pred``). The prediction is mapped back to the original scale and
    added to that row's Prophet forecast.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Must contain ``timestamp``, ``y_true``, ``y_pred`` (the Prophet forecast)
        and every column named in ``feature_cols``. Rows are consumed in positional
        order, so the frame is expected to be sorted chronologically.
    feature_cols : list of str
        Feature columns fed to XGBoost. Columns whose name starts with
        ``price_lag_`` are standardized with a scaler fit on the training rows only.
    test_size : int
        Number of trailing rows to predict.
    min_train : int, default 5
        Steps with fewer than this many preceding rows are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``y_true``, ``y_pred`` (Prophet forecast plus the
        predicted residual), one row per evaluated step. The columns are present
        even when no step could be evaluated.
    """
    output_cols = ["timestamp", "y_true", "y_pred"]
    price_lag_cols = [c for c in feature_cols if c.startswith("price_lag_")]
    n = len(model_df)
    split_idx = max(0, n - test_size)
    preds = []
    for i in range(split_idx, n):
        # iloc[:i] holds exactly i rows, so test the size before paying for the copies.
        if i < min_train:
            continue
        train = model_df.iloc[:i].copy()
        test_row = model_df.iloc[i : i + 1].copy()
        if price_lag_cols:
            # Fit the scaler on past rows only so the held-out row cannot leak into it.
            scaler_price = StandardScaler()
            train[price_lag_cols] = scaler_price.fit_transform(train[price_lag_cols])
            test_row[price_lag_cols] = scaler_price.transform(test_row[price_lag_cols])
        X_train = train[feature_cols]
        X_test = test_row[feature_cols]
        # Target is the Prophet residual, standardized on the training window.
        y_train_residual = (train["y_true"] - train["y_pred"]).values.reshape(-1, 1)
        scaler_residual = StandardScaler()
        y_train_scaled = scaler_residual.fit_transform(y_train_residual).ravel()
        model = XGBRegressor(**XGB_PARAMS)
        model.fit(X_train, y_train_scaled)
        prophet_forecast = float(test_row["y_pred"].iloc[0])
        pred_scaled = model.predict(X_test)[0]
        # Undo the residual standardization before adding it to the Prophet forecast.
        predicted_residual = float(scaler_residual.inverse_transform([[pred_scaled]])[0, 0])
        y_final = prophet_forecast + predicted_residual
        preds.append({
            "timestamp": test_row["timestamp"].iloc[0],
            "y_true": float(test_row["y_true"].iloc[0]),
            "y_pred": y_final,
        })
    # Passing the columns explicitly keeps the schema stable when `preds` is empty.
    return pd.DataFrame(preds, columns=output_cols)

In [5]:
# Load the pooled data set with the VIX series attached.
stacked = load_pool_data(with_vix=True)

# Sanity check: how many rows each symbol contributes.
rows_per_symbol = stacked.groupby("symbol").size()
print(rows_per_symbol)

# Preview the first ten rows.
stacked.head(10)

symbol
AAPL       262
BTC-USD    262
ETH-USD    262
MSFT       262
NVDA       262
QQQ        262
SPY        262
dtype: int64


Unnamed: 0,timestamp,symbol,close,vix
0,2021-02-22,AAPL,121.260002,27.950001
1,2021-03-01,AAPL,121.419998,24.66
2,2021-03-08,AAPL,121.029999,20.690001
3,2021-03-15,AAPL,119.989998,20.950001
4,2021-03-22,AAPL,121.209999,18.860001
5,2021-03-29,AAPL,123.0,17.33
6,2021-04-05,AAPL,133.0,16.690001
7,2021-04-12,AAPL,134.160004,16.25
8,2021-04-19,AAPL,134.320007,17.33
9,2021-04-26,AAPL,131.460007,18.610001


In [6]:
# Save Prophet + XGBoost artifact into backend/analytics/forecasting for backend to load
artifact_dir = REPO_ROOT / "backend" / "analytics" / "forecasting"
artifact_path = artifact_dir / "prophet_xgb_artifact.joblib"

# Build one residual-feature frame per symbol, then pool them to train a single model.
pool_dfs = []
skipped_symbols = []  # (symbol, reason) pairs, reported at the end instead of dropped silently
for sym, grp in stacked.groupby("symbol"):
    grp = grp.sort_values("timestamp")
    prices = grp.set_index("timestamp")["close"].astype(float).dropna()
    # Check the usable history AFTER dropping missing closes (same rule as the evaluation
    # cell), so a symbol with NaN closes cannot pass with too few points.
    if len(prices) < TEST_SIZE + MIN_TRAIN_PROPHET:
        skipped_symbols.append((sym, "not enough price history"))
        continue
    vix = grp.set_index("timestamp")["vix"].astype(float).reindex(prices.index).ffill().bfill()
    if vix.isna().all():
        skipped_symbols.append((sym, "no VIX values"))
        continue
    # Given the guard above this equals len(prices) - MIN_TRAIN_PROPHET: request as many
    # one-step Prophet backtest rows as the history allows, to maximise training rows.
    artifact_test_size = max(TEST_SIZE, len(prices) - MIN_TRAIN_PROPHET)
    pred_prophet = backtest_one_step(
        prices, artifact_test_size,
        model_factory=lambda: ProphetForecaster(confidence_level=0.95),
        min_train=MIN_TRAIN_PROPHET,
    )
    model_df = pred_prophet.copy()
    model_df["residual"] = model_df["y_true"] - model_df["y_pred"]
    model_df["close"] = model_df["y_true"].values
    model_df["vix"] = vix.reindex(model_df["timestamp"]).values
    # Lag features: past Prophet residuals, past closes, and last period's VIX.
    for lag in range(1, RESIDUAL_LAGS + 1):
        model_df[f"residual_lag_{lag}"] = model_df["residual"].shift(lag)
    for lag in range(1, PRICE_LAGS + 1):
        model_df[f"price_lag_{lag}"] = model_df["close"].shift(lag)
    model_df["vix_lag_1"] = model_df["vix"].shift(1)
    # The shifts leave NaNs in the leading rows; drop them before pooling.
    model_df = model_df.dropna()
    if len(model_df) < 5:
        skipped_symbols.append((sym, "fewer than 5 feature rows"))
        continue
    pool_dfs.append(model_df)

if skipped_symbols:
    print("Skipped symbols:", skipped_symbols)
if not pool_dfs:
    raise ValueError("No symbol had enough data to build the artifact.")
pool_df = pd.concat(pool_dfs, ignore_index=True)
feature_cols = [c for c in pool_df.columns if c.startswith("residual_lag_") or c.startswith("price_lag_")] + ["vix_lag_1"]
price_lag_cols = [c for c in feature_cols if c.startswith("price_lag_")]

# Fit the scalers and the residual model on the pooled rows of all symbols.
train = pool_df.copy()
if price_lag_cols:
    scaler_price = StandardScaler()
    train[price_lag_cols] = scaler_price.fit_transform(train[price_lag_cols])
else:
    scaler_price = None
y_train_residual = (train["y_true"] - train["y_pred"]).values.reshape(-1, 1)
scaler_residual = StandardScaler()
y_train_scaled = scaler_residual.fit_transform(y_train_residual).ravel()
xgb_model = XGBRegressor(**XGB_PARAMS)
xgb_model.fit(train[feature_cols], y_train_scaled)

# Bundle the model with the scalers, feature order and settings used to train it.
artifact = {
    "xgb_model": xgb_model,
    "scaler_price": scaler_price,
    "scaler_residual": scaler_residual,
    "feature_cols": feature_cols,
    "residual_lags": RESIDUAL_LAGS,
    "price_lags": PRICE_LAGS,
    "xgb_params": XGB_PARAMS,
}
artifact_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(artifact, artifact_path)
print("Saved:", artifact_path, "| trained on", len(pool_df), "rows from", len(pool_dfs), "symbols")

20:59:35 - cmdstanpy - INFO - Chain [1] start processing
20:59:35 - cmdstanpy - INFO - Chain [1] done processing
20:59:36 - cmdstanpy - INFO - Chain [1] start processing
20:59:36 - cmdstanpy - INFO - Chain [1] done processing
20:59:36 - cmdstanpy - INFO - Chain [1] start processing
20:59:36 - cmdstanpy - INFO - Chain [1] done processing
20:59:36 - cmdstanpy - INFO - Chain [1] start processing
20:59:36 - cmdstanpy - INFO - Chain [1] done processing
20:59:36 - cmdstanpy - INFO - Chain [1] start processing
20:59:37 - cmdstanpy - INFO - Chain [1] done processing
20:59:37 - cmdstanpy - INFO - Chain [1] start processing
20:59:37 - cmdstanpy - INFO - Chain [1] done processing
20:59:37 - cmdstanpy - INFO - Chain [1] start processing
20:59:37 - cmdstanpy - INFO - Chain [1] done processing
20:59:37 - cmdstanpy - INFO - Chain [1] start processing
20:59:38 - cmdstanpy - INFO - Chain [1] done processing
20:59:38 - cmdstanpy - INFO - Chain [1] start processing
20:59:38 - cmdstanpy - INFO - Chain [1]

Saved: C:\capstone_project_unfc\model\experiments-pool\artifacts\prophet_xgb_artifact.joblib | trained on 1743 rows from 7 symbols


In [7]:
model_name = "prophet_xgb"

# Per-symbol evaluation: Prophet one-step backtest, then walk-forward residual correction.
all_preds = []
for sym, grp in stacked.groupby("symbol"):
    grp = grp.sort_values("timestamp")
    prices = grp.set_index("timestamp")["close"].astype(float).dropna()
    vix = grp.set_index("timestamp")["vix"].astype(float).reindex(prices.index).ffill().bfill()
    if len(prices) < TEST_SIZE + MIN_TRAIN_PROPHET or vix.isna().all():
        continue
    pred_prophet = backtest_one_step(
        prices, TEST_SIZE,
        model_factory=lambda: ProphetForecaster(confidence_level=0.95),
        min_train=MIN_TRAIN_PROPHET,
    )
    model_df = pred_prophet.copy()
    model_df["residual"] = model_df["y_true"] - model_df["y_pred"]
    model_df["close"] = model_df["y_true"].values
    model_df["vix"] = vix.reindex(model_df["timestamp"]).values
    # Lag features: past Prophet residuals, past closes, and last period's VIX.
    for lag in range(1, RESIDUAL_LAGS + 1):
        model_df[f"residual_lag_{lag}"] = model_df["residual"].shift(lag)
    for lag in range(1, PRICE_LAGS + 1):
        model_df[f"price_lag_{lag}"] = model_df["close"].shift(lag)
    model_df["vix_lag_1"] = model_df["vix"].shift(1)
    # The shifts leave NaNs in the leading rows; drop them before modelling.
    model_df = model_df.dropna()
    feature_cols = [c for c in model_df.columns if c.startswith("residual_lag_") or c.startswith("price_lag_")] + ["vix_lag_1"]
    if len(model_df) < 10:
        continue
    pred = walk_forward_prophet_xgb(model_df, feature_cols, test_size=min(TEST_SIZE, len(model_df)))
    if pred.empty:
        # Nothing was predicted for this symbol; do not pollute the concat with an empty frame.
        continue
    pred["symbol"] = sym
    all_preds.append(pred)

# Fail with a clear message instead of pandas' generic "No objects to concatenate".
if not all_preds:
    raise ValueError("No symbol produced Prophet + XGBoost predictions to evaluate.")
pred_prophet_xgb = pd.concat(all_preds, ignore_index=True)
print(pred_prophet_xgb.groupby("symbol").size())
pred_prophet_xgb.head()

21:14:41 - cmdstanpy - INFO - Chain [1] start processing
21:14:41 - cmdstanpy - INFO - Chain [1] done processing
21:14:41 - cmdstanpy - INFO - Chain [1] start processing
21:14:42 - cmdstanpy - INFO - Chain [1] done processing
21:14:42 - cmdstanpy - INFO - Chain [1] start processing
21:14:42 - cmdstanpy - INFO - Chain [1] done processing
21:14:42 - cmdstanpy - INFO - Chain [1] start processing
21:14:42 - cmdstanpy - INFO - Chain [1] done processing
21:14:43 - cmdstanpy - INFO - Chain [1] start processing
21:14:43 - cmdstanpy - INFO - Chain [1] done processing
21:14:43 - cmdstanpy - INFO - Chain [1] start processing
21:14:43 - cmdstanpy - INFO - Chain [1] done processing
21:14:43 - cmdstanpy - INFO - Chain [1] start processing
21:14:43 - cmdstanpy - INFO - Chain [1] done processing
21:14:44 - cmdstanpy - INFO - Chain [1] start processing
21:14:44 - cmdstanpy - INFO - Chain [1] done processing
21:14:44 - cmdstanpy - INFO - Chain [1] start processing
21:14:44 - cmdstanpy - INFO - Chain [1]

symbol
AAPL       22
BTC-USD    22
ETH-USD    22
MSFT       22
NVDA       22
QQQ        22
SPY        22
dtype: int64


Unnamed: 0,timestamp,y_true,y_pred,symbol
0,2025-09-29,258.019989,253.875402,AAPL
1,2025-10-06,245.270004,260.202551,AAPL
2,2025-10-13,252.289993,256.328662,AAPL
3,2025-10-20,262.820007,262.018875,AAPL
4,2025-10-27,270.369995,266.74975,AAPL


In [8]:
# Destination of the metrics table, computed once and reused for saving and logging.
metrics_path = ARTIFACTS_DIR / "metrics_prophet_xgb_pool.parquet"

# One metrics row per symbol, in order of first appearance in the predictions.
metrics_rows = []
for sym in pred_prophet_xgb["symbol"].unique():
    symbol_mask = pred_prophet_xgb["symbol"] == sym
    sub = pred_prophet_xgb[symbol_mask]
    m = compute_metrics(sub)
    row = {"model": model_name, "symbol": sym}
    row.update(m)
    metrics_rows.append(row)

# Final row: metrics over the predictions of all symbols together.
m_overall = compute_metrics(pred_prophet_xgb)
overall_row = {"model": model_name, "symbol": "overall"}
overall_row.update(m_overall)
metrics_rows.append(overall_row)

metrics_df = pd.DataFrame(metrics_rows)
print(metrics_df.to_string())
metrics_to_parquet(metrics_rows, metrics_path)
print("Saved:", metrics_path)

         model   symbol          MAE         RMSE    MAPE_%
0  prophet_xgb     AAPL     6.916656     8.844215  2.639959
1  prophet_xgb  BTC-USD  5213.805438  6728.529359  5.622486
2  prophet_xgb  ETH-USD   239.607595   298.585664  8.137582
3  prophet_xgb     MSFT    14.993584    21.564739  3.277882
4  prophet_xgb     NVDA     6.805341     8.543884  3.670462
5  prophet_xgb      QQQ    11.673870    14.548404  1.914327
6  prophet_xgb      SPY     9.103990    11.438129  1.345667
7  prophet_xgb  overall   786.129496  2545.674748  3.801195
Saved: C:\capstone_project_unfc\model\experiments-pool\artifacts\metrics_prophet_xgb_pool.parquet
