### Backtesting StockLLM Alpha on US Mega-cap Tech (Daily)

Pipeline:
- Fetch 5 years (2020-01 to 2024-12) of daily bars from Alpaca for 10 large-cap US tech stocks
- Build a FAISS candidate index using the first 4 years (2020-01 to 2023-12) with paper-aligned indicators (OHLCV + RSI + MACD_hist)
- Run the StockLLM alpha on the last 1 year (2024-01 to 2024-12) only
- Backtest with vectorbt and run analytics (performance, hit rate, per-symbol breakdown, calibration/Brier)

Notes:
- Requires `alpaca-py`, `transformers`, `torch`, `vectorbt`, and `faiss-cpu` (the index is created with `use_gpu=True`; a CPU fallback for FAISS is implemented)
- StockLLM and FinSeer models will be downloaded on first run

In [None]:
import os
from datetime import datetime
import numpy as np
import pandas as pd

from src.utils.logging import configure_logging
configure_logging(level="INFO", use_rich=False)

from src.schemas import TimeFrame
from src.adapters.alpaca_data import AlpacaMarketData, AlpacaMarketDataConfig
from src.retrieval.finseer_client import FinSeerEmbedder, FinSeerConfig
from src.retrieval.faiss_index import FaissCandidateIndex, default_indicator_builder
from src.llm.stockllm_client import StockLLMGenerator, StockLLMConfig
from src.alphas.stockllm_alpha import stockllm_alpha
from src.engines.alpha_engine import AlphaEngine
from src.engines.vbt_engine import run_backtest_vbt


In [None]:

# 0) Symbols and date ranges
# The ten tickers used both to build the candidate index and to run the backtest.
symbols = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA",
    "META", "TSLA", "AVGO", "ORCL", "CRM",
]
# A single fetch window; the train and test periods are sliced out of it later.
fetch_start = datetime(2020, 1, 1)
fetch_end = datetime(2024, 12, 31)
train_start = datetime(2020, 1, 1)
train_end = datetime(2023, 12, 31)
test_start = datetime(2024, 1, 1)
test_end = datetime(2024, 12, 31)
# NOTE(review): these are naive midnight datetimes. If the returned bar index carries
# a time-of-day or a timezone, the bar for an end date may fall outside the range —
# confirm against AlpacaMarketData.

# 1) Fetch daily OHLCV from Alpaca
# Credentials are read from the environment (ALPACA_API_KEY, ALPACA_SECRET_KEY).
# If either one is unset or empty, fall back to env.py when it can be imported.
api_key = os.getenv("ALPACA_API_KEY")
secret_key = os.getenv("ALPACA_SECRET_KEY")
if not (api_key and secret_key):
    try:
        from env import paper_api_key as api_key, paper_secret_key as secret_key  # type: ignore
    except ImportError:
        # env.py (or one of the two names) is not available: continue without
        # credentials. Only ImportError is caught so that a genuinely broken env.py
        # surfaces instead of being silently ignored.
        api_key = None
        secret_key = None

md = AlpacaMarketData(AlpacaMarketDataConfig(api_key=api_key, secret_key=secret_key, use_paper=True))
# Print only whether each credential is present, never the values themselves.
print(f"Using Alpaca API key={bool(api_key)} secret={bool(secret_key)}")
sym_to_df = md.get_stock_bars(
    symbols=symbols,
    timeframe=TimeFrame.day,
    start=fetch_start,
    end=fetch_end,
)
print(f"Fetched {len(sym_to_df)} symbols: {list(sym_to_df.keys())}")
# Bare expression so the notebook displays the fetched mapping as the cell output.
sym_to_df

In [None]:

# Keep only the symbols for which the fetch returned at least one row
available_symbols = [
    sym
    for sym in symbols
    if sym in sym_to_df and len(sym_to_df[sym]) > 0
]
if len(available_symbols) < len(symbols):
    missing = sorted(set(symbols).difference(available_symbols))
    print(f"Warning: missing data for symbols: {missing}")

# Stack the per-symbol frames into one frame indexed by (symbol, timestamp),
# then sort the index before the label-based range slices below.
frames_by_symbol = {sym: sym_to_df[sym] for sym in available_symbols}
all_df = pd.concat(frames_by_symbol, names=["symbol", "timestamp"])
all_df = all_df.sort_index()

# 2) Split periods
# Every symbol is kept (first level); only the timestamp level is range-sliced.
# NOTE(review): the bounds are naive midnight datetimes — confirm they cover the
# final bar of each period for the index type returned by the data adapter.
idx = pd.IndexSlice
train_df = all_df.loc[idx[:, train_start:train_end], :]
test_df = all_df.loc[idx[:, test_start:test_end], :]

# Calendar-day spans (inclusive) next to the actual row counts of each split.
split_summary = {
    "train_span_days": (train_end - train_start).days + 1,
    "test_span_days": (test_end - test_start).days + 1,
    "train_rows": len(train_df),
    "test_rows": len(test_df),
}
print(split_summary)


In [None]:

# 3) Build FinSeer+FAISS index on 2020-01..2023-12
# This cell only constructs the embedder and the index object; candidates are
# added by the build_from_symbol_dfs call in the following cell.
finseer_config = FinSeerConfig()
embedder = FinSeerEmbedder(finseer_config)

# Index options gathered in one place and forwarded as keyword arguments.
index_options = dict(
    normalize=True,
    use_gpu=True,
    persist_dir="./.alpacium_index",
    auto_persist=True,
)
index = FaissCandidateIndex(embedder, **index_options)

In [None]:

# Feed the index one training-period frame per available symbol.
training_frames = {sym: train_df.loc[sym] for sym in available_symbols}
added = index.build_from_symbol_dfs(
    training_frames,
    lookback=5,
    indicator_builder=default_indicator_builder,
    timeframe=TimeFrame.day,
    show_progress=True,
)

# Report what was indexed; only the first five symbol/date pairs are shown.
print(f"Indexed {added} candidates from training period")
print(f"Index size (ids) = {len(index.id_to_meta)}; last dates per symbol = {list(index.symbol_last_date.items())[:5]} ...")


In [None]:

# 4) Initialize StockLLM generator
stockllm_config = StockLLMConfig(temperature=0.0, max_new_tokens=64)
stockllm = StockLLMGenerator(stockllm_config)

# 5) Generate signals on 2024 only using the trained index
# All alpha arguments except the data frame are fixed here, so the callable handed
# to the engine takes the data as its single argument.
alpha_kwargs = dict(
    index=index,
    generator=stockllm,
    lookback=5,
    top_k=5,
    timeframe=TimeFrame.day,
    confidence_threshold=0.0,
)
alpha_engine = AlphaEngine()
alpha_signals = alpha_engine.generate_signals(
    historical_data=test_df,
    alpha_function=lambda data: stockllm_alpha(historical_data=data, **alpha_kwargs),
    parameters=None,
    show_progress=False,
)

# Preview the first few fully populated signal rows.
preview_columns = ["movement", "prob_rise", "prob_fall", "prob_freeze", "confidence", "signal"]
print(alpha_signals.signals[preview_columns].dropna().head())


In [None]:

# 6) Backtest with vectorbt on 2024
backtest_options = dict(
    initial_capital=1_000_000.0,
    transaction_cost=0.001,
)
bt_results = run_backtest_vbt(
    historical_data=test_df,
    alpha_signals=alpha_signals,
    **backtest_options,
)

# Headline metrics, read off the result object by attribute name.
headline_metrics = ("total_return", "annual_return", "sharpe_ratio")
print({metric: getattr(bt_results, metric) for metric in headline_metrics})


In [None]:

# 7) Analytics: hit-rate, Brier score, per-symbol breakdown
sigs = alpha_signals.signals.copy()
# Realized next-day movement from adjusted_close (wide frame: one column per symbol)
adj = test_df["adjusted_close"].unstack(level=0)
# fill_method=None keeps gaps as NaN instead of forward-filling prices, which would
# otherwise manufacture 0% returns. shift(-1) puts the NEXT bar's return on each row.
next_ret = adj.pct_change(fill_method=None).shift(-1)
# Label by the sign of the next return. An unknown return (last bar of a symbol, or a
# gap) must stay missing: comparing NaN with 0 is False both ways, so a plain
# rise/fall/else rule would mislabel it as "freeze".
move_labels = np.where(next_ret > 0, "rise", np.where(next_ret < 0, "fall", "freeze"))
realized_move = pd.DataFrame(move_labels, index=next_ret.index, columns=next_ret.columns)
realized_move = realized_move.where(next_ret.notna())
# Long format; rows without a realized move are dropped explicitly so the result does
# not depend on the pandas version's stack() NaN handling.
realized_move = realized_move.stack().dropna().rename("realized_movement")

# Align with signals index
# NOTE(review): assumes the signals index uses the same level names as the stacked
# series so that join can align them — confirm against AlphaEngine output.
eval_df = sigs[["movement", "prob_rise", "prob_fall", "prob_freeze", "confidence", "signal"]].join(realized_move, how="left")

def brier_row(row: pd.Series) -> float:
    """Return the three-class Brier score for a single evaluation row.

    The score is the sum, over the classes rise / fall / freeze, of the squared
    difference between the 0/1 outcome indicator and the predicted probability.

    Args:
        row: A row that may carry "realized_movement", "prob_rise", "prob_fall"
            and "prob_freeze". A missing, None, or zero probability counts as 0.0.
            A NaN probability is truthy, so it is kept and makes the score NaN.

    Returns:
        The summed squared error. When "realized_movement" is not one of the
        three class strings, every outcome indicator is 0.0.
    """
    realized = row.get("realized_movement")
    class_to_column = (
        ("rise", "prob_rise"),
        ("fall", "prob_fall"),
        ("freeze", "prob_freeze"),
    )
    score = 0.0
    for label, column in class_to_column:
        predicted = float(row.get(column, 0.0) or 0.0)
        # The indicator is 1.0 only for the class that actually occurred.
        if isinstance(realized, str) and realized == label:
            outcome = 1.0
        else:
            outcome = 0.0
        score += (outcome - predicted) ** 2
    return score

# Score only rows that have BOTH a realized next-day move and a predicted movement.
# A row without a prediction would otherwise count as a miss in accuracy while being
# skipped (NaN) by the Brier mean, so the two metrics would cover different rows.
mask = eval_df["realized_movement"].notna() & eval_df["movement"].notna()

# Confidence calibration buckets
# The upper edge is 1.01 so that a confidence of exactly 1.0 lands in the last bin.
bins = [0.0, 0.55, 0.7, 0.85, 1.01]
labels = ["<=0.55", "0.55-0.7", "0.7-0.85", ">0.85"]
eval_df["conf_bin"] = pd.cut(eval_df["confidence"].astype(float), bins=bins, labels=labels, include_lowest=True)

# Per-row scores are computed once and then reused by every aggregate below.
scored = eval_df.loc[mask].copy()
scored["correct"] = scored["movement"] == scored["realized_movement"]
scored["confidence"] = scored["confidence"].astype(float)
# Built from a plain list so that an empty selection yields an empty float column.
scored["brier"] = np.asarray([brier_row(row) for _, row in scored.iterrows()], dtype=float)

# Classification accuracy and mean Brier score over the scored rows
overall_acc = scored["correct"].mean()
brier = scored["brier"].mean()

# Per-symbol accuracy
# NOTE(review): relies on the index having a level named "symbol" — confirm against
# the signals frame produced by AlphaEngine.
per_symbol = (
    scored.reset_index()
    .groupby("symbol")
    .agg(
        n=("correct", "size"),
        accuracy=("correct", "mean"),
        avg_confidence=("confidence", "mean"),
    )
    .sort_values("accuracy", ascending=False)
)

# Calibration table: within each confidence bucket, compare the average stated
# confidence with the realized accuracy. observed=False keeps empty buckets (n = 0).
calib = scored.groupby("conf_bin", observed=False).agg(
    n=("correct", "size"),
    accuracy=("correct", "mean"),
    avg_confidence=("confidence", "mean"),
    avg_brier=("brier", "mean"),
)

print("\nAnalytics summary (2024):")
print({"accuracy": float(overall_acc), "brier": float(brier)})
print("\nPer-symbol accuracy (top 10):")
print(per_symbol.head(10))
print("\nConfidence calibration:")
print(calib)
