### Backtesting StockLLM Alpha on US Mega-cap Tech (Daily)

Pipeline:
- Fetch last 5 years of daily bars from Alpaca for 10 large-cap US tech stocks
- Build a FAISS candidate index using the first 4 years (2020-01 to 2023-12) with paper-aligned indicators (OHLCV + RSI + MACD_hist)
- Run the StockLLM alpha on the last 1 year (2024-01 to 2024-12) only
- Backtest with vectorbt and run analytics (performance, hit rate, per-symbol breakdown, calibration/Brier)

Notes:
- Requires `alpaca-py`, `transformers`, `torch`, and `faiss-cpu` (CPU fallback for FAISS implemented)
- StockLLM and FinSeer models will be downloaded on first run

In [1]:
import os
from datetime import datetime
import numpy as np
import pandas as pd

from src.utils.logging import configure_logging
configure_logging(level="INFO", use_rich=False)

from src.schemas import TimeFrame
from src.adapters.alpaca_data import AlpacaMarketData, AlpacaMarketDataConfig
from src.retrieval.finseer_client import FinSeerEmbedder, FinSeerConfig
from src.retrieval.faiss_index import FaissCandidateIndex, default_indicator_builder
from src.llm.stockllm_client import StockLLMGenerator, StockLLMConfig
from src.alphas.stockllm_alpha import stockllm_alpha
from src.engines.alpha_engine import AlphaEngine
from src.engines.vbt_engine import run_backtest_vbt

# Prefer cuVS-like GPU index when available; fallback to FAISS
try:
    from src.retrieval.cuvs_index import CuVSLikeCandidateIndex as _CuIndex
    _HAS_CUVS = True
except Exception:
    _HAS_CUVS = False


2025-08-10 02:05:27,601 | INFO | src.utils.logging: Logging configured: level=INFO rich=False


In [3]:

# 0) Symbols and date ranges
symbols = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA",
    "META", "TSLA", "AVGO", "ORCL", "CRM",
]
fetch_start = datetime(2020, 1, 1)
fetch_end = datetime(2024, 12, 31)
train_start = datetime(2020, 1, 1)
train_end = datetime(2023, 12, 31)
test_start = datetime(2024, 1, 1)
test_end = datetime(2024, 12, 31)

# 1) Fetch daily OHLCV from Alpaca
# Reads keys from env (ALPACA_API_KEY, ALPACA_SECRET_KEY) or env.py if available
api_key = os.getenv("ALPACA_API_KEY")
secret_key = os.getenv("ALPACA_SECRET_KEY")
if (api_key is None or secret_key is None):
    try:
        from env import paper_api_key as api_key, paper_secret_key as secret_key  # type: ignore
    except Exception:
        api_key = None
        secret_key = None

md = AlpacaMarketData(AlpacaMarketDataConfig(api_key=api_key, secret_key=secret_key, use_paper=True))
print(f"Using Alpaca API key={bool(api_key)} secret={bool(secret_key)}")
sym_to_df = md.get_stock_bars(
    symbols=symbols,
    timeframe=TimeFrame.day,
    start=fetch_start,
    end=fetch_end,
)
print(f"Fetched {len(sym_to_df)} symbols: {[k for k in sym_to_df.keys()]}")
sym_to_df

2025-08-10 02:06:23,106 | INFO | src.adapters.alpaca_data: Fetching bars: symbols=['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'AVGO', 'ORCL', 'CRM'] timeframe=TimeFrame.day start=2020-01-01 00:00:00 end=2024-12-31 00:00:00 limit=None adjustment=None


Using Alpaca API key=True secret=True


2025-08-10 02:06:33,579 | INFO | src.adapters.alpaca_data: Timezone normalized for bars: intraday=False tz=None tz_naive=True
2025-08-10 02:06:33,586 | INFO | src.adapters.alpaca_data: Prepared symbol=NVDA rows=1257 cols=6
2025-08-10 02:06:33,588 | INFO | src.adapters.alpaca_data: Prepared symbol=AAPL rows=1257 cols=6
2025-08-10 02:06:33,591 | INFO | src.adapters.alpaca_data: Prepared symbol=AMZN rows=1257 cols=6
2025-08-10 02:06:33,597 | INFO | src.adapters.alpaca_data: Prepared symbol=AVGO rows=1257 cols=6
2025-08-10 02:06:33,600 | INFO | src.adapters.alpaca_data: Prepared symbol=CRM rows=1257 cols=6
2025-08-10 02:06:33,604 | INFO | src.adapters.alpaca_data: Prepared symbol=GOOGL rows=1257 cols=6
2025-08-10 02:06:33,610 | INFO | src.adapters.alpaca_data: Prepared symbol=META rows=1257 cols=6
2025-08-10 02:06:33,615 | INFO | src.adapters.alpaca_data: Prepared symbol=MSFT rows=1257 cols=6
2025-08-10 02:06:33,619 | INFO | src.adapters.alpaca_data: Prepared symbol=ORCL rows=1257 cols=6
2

Fetched 10 symbols: ['NVDA', 'AAPL', 'AMZN', 'AVGO', 'CRM', 'GOOGL', 'META', 'MSFT', 'ORCL', 'TSLA']


{'NVDA':                        open    high       low   close  adjusted_close  \
 timestamp                                                               
 2020-01-02 05:00:00  238.75  239.91  236.7200  239.91          239.91   
 2020-01-03 05:00:00  235.10  237.83  234.1000  236.07          236.07   
 2020-01-06 05:00:00  232.32  237.27  231.2700  237.06          237.06   
 2020-01-07 05:00:00  238.20  241.77  236.3900  239.93          239.93   
 2020-01-08 05:00:00  239.76  242.04  238.1490  240.38          240.38   
 ...                     ...     ...       ...     ...             ...   
 2024-12-23 05:00:00  136.28  139.79  135.1201  139.67          139.67   
 2024-12-24 05:00:00  140.00  141.90  138.6500  140.22          140.22   
 2024-12-26 05:00:00  139.70  140.85  137.7300  139.93          139.93   
 2024-12-27 05:00:00  138.55  139.02  134.7100  137.01          137.01   
 2024-12-30 05:00:00  134.83  140.27  134.0200  137.49          137.49   
 
                           v

In [4]:

# Filter to only symbols that returned data
available_symbols = [s for s in symbols if s in sym_to_df and len(sym_to_df[s]) > 0]
if len(available_symbols) < len(symbols):
    missing = sorted(set(symbols) - set(available_symbols))
    print(f"Warning: missing data for symbols: {missing}")

# Combine to one MultiIndex DataFrame
all_df = pd.concat({s: sym_to_df[s] for s in available_symbols}, names=["symbol", "timestamp"]).sort_index()

# 2) Split periods
train_df = all_df.loc[(slice(None), slice(train_start, train_end)), :]
test_df = all_df.loc[(slice(None), slice(test_start, test_end)), :]
print({
    "train_span_days": (train_end - train_start).days + 1,
    "test_span_days": (test_end - test_start).days + 1,
    "train_rows": len(train_df),
    "test_rows": len(test_df),
})


{'train_span_days': 1461, 'test_span_days': 366, 'train_rows': 10060, 'test_rows': 2510}


In [5]:

# 3) Build FinSeer index (prefer GPU CuPy index if available)
embedder = FinSeerEmbedder(FinSeerConfig())
if _HAS_CUVS:
    index = _CuIndex(
        embedder,
        normalize=True,
        use_gpu=True,
        persist_dir="./.alpacium_index",
        auto_persist=True,
    )
else:
    index = FaissCandidateIndex(
        embedder,
        normalize=True,
        use_gpu=True,
        persist_dir="./.alpacium_index",
        auto_persist=True,
    )

Some weights of BertModel were not initialized from the model checkpoint at TheFinAI/FinSeer and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-08-10 02:06:44,402 | INFO | src.retrieval.cuvs_index: Using NumPy inner-product index (CPU) dim=768
2025-08-10 02:06:44,404 | INFO | src.retrieval.cuvs_index: Loaded cuVS-like index from ./.alpacium_index (ids=8016 symbols=1)


In [8]:

added = index.build_from_symbol_dfs(
    {s: train_df.loc[s] for s in available_symbols},
    lookback=5,
    indicator_builder=default_indicator_builder,
    timeframe=TimeFrame.day,
    show_progress=True,
)
print(f"Indexed {added} candidates from training period")
print(f"Index size (ids) = {len(index.id_to_meta)}; last dates per symbol = {list(index.symbol_last_date.items())[:5]} ...")


Indexing symbols:   0%|          | 0/10 [00:00<?, ?it/s]

2025-08-10 02:07:09,587 | INFO | src.retrieval.faiss_index: Building candidates: symbol=AAPL rows=1006 indicators=['open', 'high', 'low', 'close', 'adjusted_close', 'volume', 'RSI', 'MACD_hist'] lookback=5
2025-08-10 02:07:14,861 | INFO | src.retrieval.faiss_index: Built 8016 candidates for symbol=AAPL


Embedding batches:   0%|          | 0/251 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
# FAISS index: load from .alpacium_index if available; else build and save once
from pathlib import Path
import pandas as pd

from src.retrieval.finseer_client import FinSeerEmbedder, FinSeerConfig
from src.retrieval.faiss_index import FaissCandidateIndex, default_indicator_builder
from src.schemas.timeseries import TimeFrame

persist_dir = ".alpacium_index"

# Configure embedder (adjust batch_size/device if desired)
embedder = FinSeerEmbedder(FinSeerConfig(batch_size=32))

# If id_to_meta.json exists, constructor will auto-load the index and metadata
index = FaissCandidateIndex(embedder, persist_dir=persist_dir, auto_persist=False)

# Ensure we have symbol->DataFrame if we need to build
if not Path(persist_dir, "id_to_meta.json").exists():
    # Try to derive a symbol->df map from a MultiIndex OHLCV frame if available
    symbol_to_df = None
    if "train_df" in locals() and isinstance(train_df.index, pd.MultiIndex):
        symbol_to_df = {sym: df.droplevel(0) if isinstance(df.index, pd.MultiIndex) else df
                        for sym, df in train_df.groupby(level="symbol")}
    elif "full_df" in locals() and isinstance(full_df.index, pd.MultiIndex):
        symbol_to_df = {sym: df.droplevel(0) if isinstance(df.index, pd.MultiIndex) else df
                        for sym, df in full_df.groupby(level="symbol")}
    elif "symbol_to_df" in locals():
        symbol_to_df = symbol_to_df  # already provided

    if symbol_to_df is None:
        raise RuntimeError("Please provide a symbol->DataFrame mapping as `symbol_to_df` or a MultiIndex `train_df`/`full_df`.")

    lookback = (lookback if "lookback" in locals() else 5)
    timeframe = (timeframe if "timeframe" in locals() else TimeFrame.day)

    total = index.build_from_symbol_dfs(
        symbol_to_df,
        lookback=lookback,
        indicator_builder=default_indicator_builder,
        timeframe=timeframe,
        show_progress=True,
    )
    index.save(persist_dir)

# `index` is ready; proceed to alpha generation cells that use `index`

Some weights of BertModel were not initialized from the model checkpoint at TheFinAI/FinSeer and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-08-10 02:07:42,822 | INFO | src.retrieval.faiss_index: Loaded index from .alpacium_index (ids=8016 symbols=1)


In [7]:

# 4) Initialize StockLLM generator
stockllm = StockLLMGenerator(StockLLMConfig(temperature=0.0, max_new_tokens=64))


2025-08-10 02:06:52,330 | INFO | accelerate.utils.modeling: We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

# Restrict alpha generation to symbols that exist in the retrieval index
indexed_symbols = sorted([s for s, ids in getattr(index, "symbol_to_ids", {}).items() if ids])
if len(indexed_symbols) == 0:
    print("Warning: No indexed symbols found; skipping alpha generation.")
    test_df_for_alpha = test_df.iloc[0:0]
else:
    # Filter 2024 test data to indexed symbols only
    test_df_for_alpha = test_df.loc[(indexed_symbols, slice(None)), :]
    missing = sorted(set(test_df.index.get_level_values(0).unique()) - set(indexed_symbols))
    if missing:
        print(f"Skipping non-indexed symbols (no candidates in retrieval index): {missing}")

# 5) Generate signals on 2024 only using the trained index
alpha_engine = AlphaEngine()
alpha_signals = alpha_engine.generate_signals(
    historical_data=test_df_for_alpha,
    alpha_function=lambda data: stockllm_alpha(
        historical_data=data,
        index=index,
        generator=stockllm,
        lookback=5,
        top_k=5,
        timeframe=TimeFrame.day,
        confidence_threshold=0.0,
        filter_symbols=indexed_symbols,
    ),
    parameters=None,
    show_progress=False,
)
print(alpha_signals.signals[["movement", "prob_rise", "prob_fall", "prob_freeze", "confidence", "signal"]].dropna().head())


Skipping non-indexed symbols (no candidates in retrieval index): ['AMZN', 'AVGO', 'CRM', 'GOOGL', 'META', 'MSFT', 'NVDA', 'ORCL', 'TSLA']


Generating StockLLM signals:   0%|          | 0/246 [00:00<?, ?it/s]



In [None]:

# 6) Backtest with vectorbt on 2024
bt_results = run_backtest_vbt(
    historical_data=test_df,
    alpha_signals=alpha_signals,
    initial_capital=1_000_000.0,
    transaction_cost=0.001,
)
print({
    "total_return": bt_results.total_return,
    "annual_return": bt_results.annual_return,
    "sharpe_ratio": bt_results.sharpe_ratio,
})


In [None]:

# 7) Analytics: hit-rate, Brier score, per-symbol breakdown
sigs = alpha_signals.signals.copy()
# Realized next-day movement from adjusted_close
adj = test_df["adjusted_close"].unstack(level=0)
next_ret = adj.pct_change().shift(-1)
# Long/short correctness based on sign of next day's return
realized_move = next_ret.applymap(lambda x: "rise" if x > 0 else ("fall" if x < 0 else "freeze"))
realized_move = realized_move.stack().rename("realized_movement")

# Align with signals index
eval_df = sigs[["movement", "prob_rise", "prob_fall", "prob_freeze", "confidence", "signal"]].join(realized_move, how="left")

def brier_row(row: pd.Series) -> float:
    y = {"rise": 0.0, "fall": 0.0, "freeze": 0.0}
    if isinstance(row.get("realized_movement"), str) and row["realized_movement"] in y:
        y[row["realized_movement"]] = 1.0
    p_r = float(row.get("prob_rise", 0.0) or 0.0)
    p_f = float(row.get("prob_fall", 0.0) or 0.0)
    p_z = float(row.get("prob_freeze", 0.0) or 0.0)
    return (y["rise"] - p_r) ** 2 + (y["fall"] - p_f) ** 2 + (y["freeze"] - p_z) ** 2

# Classification accuracy on non-null realized
mask = eval_df["realized_movement"].notna()
overall_acc = (eval_df.loc[mask, "movement"] == eval_df.loc[mask, "realized_movement"]).mean()
brier = eval_df.loc[mask].apply(brier_row, axis=1).mean()

# Per-symbol accuracy
per_symbol = (
    eval_df.loc[mask]
    .reset_index()
    .groupby("symbol")
    .apply(lambda g: pd.Series({
        "n": len(g),
        "accuracy": (g["movement"] == g["realized_movement"]).mean(),
        "avg_confidence": g["confidence"].astype(float).mean(),
    }))
    .sort_values("accuracy", ascending=False)
)

# Confidence calibration buckets
bins = [0.0, 0.55, 0.7, 0.85, 1.01]
labels = ["<=0.55", "0.55-0.7", "0.7-0.85", ">0.85"]
eval_df["conf_bin"] = pd.cut(eval_df["confidence"].astype(float), bins=bins, labels=labels, include_lowest=True)
calib = (
    eval_df.loc[mask]
    .groupby("conf_bin")
    .apply(lambda g: pd.Series({
        "n": len(g),
        "accuracy": (g["movement"] == g["realized_movement"]).mean(),
        "avg_confidence": g["confidence"].astype(float).mean(),
        "avg_brier": g.apply(brier_row, axis=1).mean(),
    }))
)

print("\nAnalytics summary (2024):")
print({"accuracy": float(overall_acc), "brier": float(brier)})
print("\nPer-symbol accuracy (top 10):")
print(per_symbol.head(10))
print("\nConfidence calibration:")
print(calib)


In [None]:
# 8) VectorBT: rich stats and core plots
import vectorbt as vbt  # noqa: F401
import plotly.graph_objects as go

pf = bt_results.portfolio

# Full stats table
stats = pf.stats()
display(stats)

# Equity curve with positions
fig = pf.plot()
fig.update_layout(title='Strategy equity and positions')
fig.show()

# Drawdowns plot
fig = pf.drawdowns.plot()
fig.update_layout(title='Strategy drawdowns')
fig.show()


In [None]:
# 9) Trade analysis (per-symbol, expectancy) and turnover
trades = pf.trades

# Overall trade stats
trade_stats = trades.stats()
display(trade_stats)

# Per-symbol trade stats (win rate, PF, expectancy)
per_symbol_trades = trades.stats(group_by='symbol')
display(per_symbol_trades.sort_values('Win Rate [%]', ascending=False).head(10))

# Expectancy distribution
exp_series = trades.expectancy()
fig = exp_series.vbt.histplot(xaxis_title='Expectancy', title='Trade expectancy distribution')
fig.show()

# Turnover and exposure
turnover = pf.turnover()
exposure = pf.gross_exposure()
fig = vbt.make_subplots(rows=2, cols=1, shared_xaxes=True)
turnover.vbt.plot(add_trace_kwargs=dict(row=1, col=1), fig=fig)
exposure.vbt.plot(add_trace_kwargs=dict(row=2, col=1), fig=fig)
fig.update_layout(title='Turnover and Gross Exposure')
fig.show()


In [None]:
# 10) Rolling performance and factor exposure proxy
# Rolling Sharpe (60d) on aggregated portfolio returns and rolling max drawdown (252d)
strat_ret = pf.returns().sum(axis=1)
rolling_sharpe = (strat_ret.rolling(60).mean() / strat_ret.rolling(60).std()).mul(np.sqrt(252))

# Rolling max drawdown from equity curve
equity = pf.value()
roll_peak = equity.rolling(252).max()
rolling_maxdd = 1.0 - (equity / roll_peak)

fig = vbt.make_subplots(rows=2, cols=1, shared_xaxes=True)
rolling_sharpe.vbt.plot(add_trace_kwargs=dict(row=1, col=1), fig=fig)
fig.update_yaxes(title_text='Rolling Sharpe (60d)', row=1, col=1)
rolling_maxdd.vbt.plot(add_trace_kwargs=dict(row=2, col=1), fig=fig)
fig.update_yaxes(title_text='Rolling Max DD (252d)', row=2, col=1)
fig.update_layout(title='Rolling performance metrics')
fig.show()

# Beta to market (proxy): regress daily returns vs SPY if available via YF
try:
    spy = vbt.YFData.download('SPY', period='5y').get('Close').pct_change().rename('mkt')
    strat_ret = strat_ret.rename('strat')
    joined = pd.concat([strat_ret, spy], axis=1).dropna()
    beta = joined.cov().iloc[0,1] / joined.var().iloc[1,0]
    alpha = joined['strat'].mean() - beta * joined['mkt'].mean()
    print({'market_beta': float(beta), 'alpha_daily_mean': float(alpha)})
except Exception as e:
    print('Beta calc skipped:', e)


In [None]:
# 11) Confidence-threshold sweep vs performance
# Evaluate how confidence gating impacts returns and accuracy
conf_grid = np.linspace(0.0, 0.9, 10)
records = []
close_w = test_df['close'].unstack(level=0)
sig = alpha_signals.signals['signal'].unstack(level=0).reindex(close_w.index).fillna(0.0)
probs = alpha_signals.signals['confidence'].unstack(level=0).reindex(close_w.index).fillna(0.0)

for thr in conf_grid:
    gated = sig.where(probs >= thr, other=0.0)
    long_now = gated > 0
    long_prev = long_now.shift(1, fill_value=False)
    entries = long_now & ~long_prev
    exits = ~long_now & long_prev
    short_now = gated < 0
    short_prev = short_now.shift(1, fill_value=False)
    short_entries = short_now & ~short_prev
    short_exits = ~short_now & short_prev
    n_symbols = close_w.shape[1]
    alloc_per_symbol = 1_000_000.0 / max(n_symbols, 1)
    pf_thr = vbt.Portfolio.from_signals(
        close=close_w,
        entries=entries,
        exits=exits,
        short_entries=short_entries,
        short_exits=short_exits,
        init_cash=1_000_000.0,
        fees=0.001,
        size=(alloc_per_symbol / close_w).where(entries, other=0.0),
        short_size=(alloc_per_symbol / close_w).where(short_entries, other=0.0),
        freq='1D',
    )
    stats_thr = pf_thr.stats()
    tr = float(stats_thr.get('Total Return [%]', np.nan))
    sr = float(stats_thr.get('Sharpe Ratio', np.nan))
    dd = float(stats_thr.get('Max Drawdown [%]', np.nan))
    records.append({'threshold': thr, 'total_return_%': tr, 'sharpe': sr, 'max_dd_%': dd})

thr_df = pd.DataFrame.from_records(records).set_index('threshold')
display(thr_df)
fig = thr_df[['total_return_%', 'sharpe']].vbt.plot(title='Performance vs confidence threshold')
fig.show()


In [None]:
# 12) Per-symbol contribution and heatmaps
# Total return per symbol and turnover heatmap
ret_ps = pf.total_return()
fig = ret_ps.vbt.barplot(title='Total Return per Symbol')
fig.show()

heat_perf = pf.total_return().vbt.heatmap(xaxis_title='symbol', title='Total return heatmap')
heat_perf.show()

# Turnover heatmap
turnover_ps = pf.turnover()
fig = turnover_ps.vbt.heatmap(title='Turnover heatmap')
fig.show()

# Correlation of symbol returns
sym_returns = pf.assets_returns()
fig = sym_returns.corr().vbt.heatmap(title='Correlation of asset returns')
fig.show()


In [None]:
# 13) Animated chart of signals vs price per symbol
# Create a simple animation across symbols showing price and position state
from itertools import cycle

close_w = test_df['close'].unstack(level=0)
position = pf.positions.values  # same shape as close_w
symbols_list = list(close_w.columns)

# Save a GIF cycling over symbols

def plot_symbol(idx_sym):
    sym = symbols_list[idx_sym]
    s_close = close_w[sym]
    s_pos = pd.Series(position[:, idx_sym], index=close_w.index, name='pos')
    fig = vbt.make_subplots(rows=2, cols=1, shared_xaxes=True)
    s_close.vbt.plot(add_trace_kwargs=dict(row=1, col=1, name='Close'), fig=fig)
    s_pos.vbt.plot(add_trace_kwargs=dict(row=2, col=1, name='Position'), fig=fig)
    fig.update_layout(template='plotly_dark', width=900, height=500, title=f'{sym}: Close & Position')
    return fig

vbt.save_animation('symbol_positions.gif', range(len(symbols_list)), plot_symbol, delta=1, fps=2)


### Paper-aligned analysis additions

From the StockLLM + FinSeer paper in `docs/stockLLM/paper-large.md` and `docs/stockLLM/paper-small.md`, we add:

- Calibration and Brier score analysis to match focus on probabilistic movement prediction.
- Confidence-threshold gating performance curves to evaluate trade-off of precision vs coverage.
- Per-indicator/per-symbol breakdowns to inspect where the alpha works best.
- Rolling performance to assess regime sensitivity, esp. during volatile periods.

These are implemented in cells 8–13 using vectorbt’s stats, trades, heatmaps, and animation helpers.


In [None]:
# 14) Retrieved indicator occurrences (paper-style)
# Sample retrievals across test period to see which indicators are most used
from src.schemas import QueryBasic

counts = {}
sample_dates = (
    test_df.index.get_level_values('timestamp')
    .unique()
    .sort_values()
)
# downsample dates for speed (every 5th trading day)
sample_dates = sample_dates[::5]

for sym in available_symbols:
    sdf = test_df.loc[sym].sort_index()
    dates = [d for d in sample_dates if d in sdf.index]
    for as_of in dates:
        qb = QueryBasic.from_dataframe(sym, test_df, as_of=as_of.date(), lookback=5, timeframe=TimeFrame.day)
        hits = index.query(qb, top_k=5)
        for h in hits:
            ind = h.get('indicator', 'unknown')
            counts[ind] = counts.get(ind, 0) + 1

ind_df = pd.Series(counts, name='occurrences').sort_values(ascending=False)
display(ind_df.head(20))

fig = ind_df.head(30).vbt.barplot(title='Top retrieved indicators (sampled)')
fig.show()


In [None]:
# 15) Accuracy vs confidence threshold and reliability curve
# Uses eval_df from cell 8
thr_records = []
for thr in conf_grid:
    sel = eval_df['confidence'].astype(float) >= thr
    sub = eval_df.loc[sel & eval_df['realized_movement'].notna()]
    acc = (sub['movement'] == sub['realized_movement']).mean() if len(sub) else np.nan
    thr_records.append({'threshold': thr, 'accuracy': acc, 'coverage': len(sub)})
acc_df = pd.DataFrame(thr_records).set_index('threshold')
display(acc_df)
fig = acc_df[['accuracy']].vbt.plot(title='Classification accuracy vs confidence threshold')
fig.show()

# Reliability (calibration) curve
calib_points = (
    eval_df.loc[eval_df['realized_movement'].notna()]
    .groupby('conf_bin')
    .apply(lambda g: pd.Series({'avg_conf': g['confidence'].astype(float).mean(),
                                'emp_acc': (g['movement'] == g['realized_movement']).mean()}))
    .dropna()
)
fig = go.Figure()
fig.add_trace(go.Scatter(x=calib_points['avg_conf'], y=calib_points['emp_acc'], mode='lines+markers', name='Empirical'))
fig.add_trace(go.Scatter(x=[0,1], y=[0,1], mode='lines', name='Perfect', line=dict(dash='dash')))
fig.update_layout(title='Reliability curve (confidence vs empirical accuracy)', xaxis_title='Average confidence', yaxis_title='Empirical accuracy')
fig.show()


In [None]:
# Save train and test DataFrames to disk
import os

out_dir = "artifacts"
os.makedirs(out_dir, exist_ok=True)

train_df.to_parquet(os.path.join(out_dir, "train_df.parquet"))
test_df.to_parquet(os.path.join(out_dir, "test_df.parquet"))

print("Saved:", os.path.join(out_dir, "train_df.parquet"), "and", os.path.join(out_dir, "test_df.parquet"))
