# Backfill Price Gaps

Scans all `data/<year>/PRICES_*.csv` for **tickers you specify**, detects missing trading days, and backfills from the same data source.

Trading days are derived from the data itself (dates where any ticker has a row), so **holidays and weekends are never treated as gaps**. Per-ticker ranges are bounded by each ticker's first and last existing row, so **pre-IPO dates are never flagged**; for the same reason, dates after a ticker's most recent row are not flagged either.

In [7]:
import sys
from datetime import date, timedelta
from pathlib import Path

import pandas as pd

# Bootstrap: locate the nearest ancestor of the working directory that
# contains a `.git` entry and put it first on sys.path, so the `research`
# package imports that follow can be resolved. If no ancestor has `.git`,
# the filesystem root is used.
_here = Path.cwd().resolve()
_root = next(
    (p for p in (_here, *_here.parents) if (p / ".git").exists()),
    Path(_here.anchor),
)
sys.path.insert(0, str(_root))

from research.functions.download_helper import (
    find_project_root,
    normalize_dates,
    split_into_contiguous_ranges,
)
from research.config.constants import get_universe
from research.functions.fetch_and_store import fetch_and_store

# Project root as resolved by the shared download helper, starting from the
# current working directory.
# NOTE(review): presumably agrees with the `.git` bootstrap search earlier in
# this cell — confirm against find_project_root.
PROJECT_ROOT = find_project_root(Path.cwd())

In [8]:
# Directory that the later cells scan recursively for PRICES_*.csv files.
DATA_DIR = PROJECT_ROOT.joinpath("data")
# Date on which the notebook is run.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# before relying on or removing it.
END_DATE = date.today()

# Tickers to check for gaps and backfill.
# NOTE(review): presumably an ordered collection of ticker symbols — confirm
# against research.config.constants.get_universe.
TICKERS_TO_BACKFILL = get_universe()

## 1. Load all existing data (vectorised)

Build two things from the CSVs:
- **`trading_dates`** — set of dates where *any* ticker has a row (= actual market open days).
- **`ticker_dates`** — `{ticker: set of dates}` for the tickers we care about.

In [9]:
# Accumulators filled from every PRICES_*.csv under DATA_DIR:
#   all_dates    - every date on which any ticker has a row
#   ticker_dates - the dates on which each requested ticker has a row
all_dates: set[date] = set()
ticker_dates: dict[str, set[date]] = {t: set() for t in TICKERS_TO_BACKFILL}
ticker_set = set(TICKERS_TO_BACKFILL)

for path in sorted(DATA_DIR.rglob("PRICES_*.csv")):
    # Deliberately best-effort: an unreadable file is reported and skipped so
    # that one bad CSV does not abort the whole scan.
    try:
        df = pd.read_csv(path, usecols=["date", "ticker"], parse_dates=["date"])
        df = normalize_dates(df)
        # Rows without a usable date cannot be attributed to a trading day.
        df = df.dropna(subset=["date"])
        # Convert the whole column to plain datetime.date in one pass. This
        # works whether the column holds datetime64 values, Timestamps or
        # date objects, and never leaves numpy.datetime64 scalars behind.
        df = df.assign(date=pd.to_datetime(df["date"]).dt.date)
        all_dates.update(df["date"].unique())
        # One grouped pass over the requested tickers instead of re-filtering
        # the frame once per ticker.
        wanted = df[df["ticker"].isin(ticker_set)]
        for t, col in wanted.groupby("ticker", sort=False)["date"]:
            ticker_dates[t].update(col.unique())
    except Exception as e:
        print(f"Skip {path.name}: {e}")

trading_dates = sorted(all_dates)
if trading_dates:
    print(f"Trading dates in data: {len(trading_dates)} ({trading_dates[0]} → {trading_dates[-1]})")
else:
    # Nothing loaded: report it instead of failing on trading_dates[0].
    print(f"Trading dates in data: 0 (no readable PRICES_*.csv under {DATA_DIR})")
for t in TICKERS_TO_BACKFILL:
    print(f"  {t}: {len(ticker_dates[t])} dates")

Trading dates in data: 1495 (2020-01-02 → 2026-02-09)
  SPY: 1495 dates
  IVV: 1495 dates
  VOO: 1495 dates
  SPLG: 1494 dates
  RSP: 1495 dates
  VTI: 1495 dates
  ITOT: 1495 dates
  SCHB: 1495 dates
  IWV: 1495 dates
  IWD: 1495 dates
  IWF: 1495 dates
  SCHX: 1495 dates
  VV: 1495 dates
  VTV: 1495 dates
  VUG: 1495 dates
  MDY: 1495 dates
  IJH: 1495 dates
  IWR: 1495 dates
  IWM: 1495 dates
  VB: 1495 dates
  SCHA: 1495 dates
  VXF: 1495 dates
  DIA: 1495 dates
  QQQ: 1495 dates
  QQQM: 1305 dates
  XLK: 1495 dates
  VGT: 1495 dates
  IYW: 1495 dates
  IGV: 1495 dates
  SMH: 1495 dates
  SOXX: 1495 dates
  XLV: 1495 dates
  VHT: 1495 dates
  IYH: 1495 dates
  XBI: 1495 dates
  XLF: 1495 dates
  VFH: 1495 dates
  IYF: 1495 dates
  KBE: 1495 dates
  KRE: 1495 dates
  XLE: 1495 dates
  VDE: 1495 dates
  IYE: 1495 dates
  XLI: 1495 dates
  VIS: 1495 dates
  IYJ: 1495 dates
  PAVE: 1495 dates
  XLY: 1495 dates
  VCR: 1495 dates
  IYC: 1495 dates
  XHB: 1495 dates
  XLP: 1495 dates
  VD

## 2. Find gaps per ticker

For each ticker, gaps = trading dates between the ticker's **first and last** existing date on which the ticker has no row. This avoids pre-IPO false positives and holiday false positives in one shot. Tickers with no existing rows at all are reported and skipped.

In [10]:
# Gap detection: for each ticker, a gap is a known trading date inside the
# ticker's own [first row, last row] window on which the ticker has no row.
trading_dates_set = set(trading_dates)
gaps_by_ticker: dict[str, list[date]] = {}

for t in TICKERS_TO_BACKFILL:
    if len(ticker_dates[t]) == 0:
        # No rows at all means there is no window to bound the search.
        print(f"  {t}: no existing data, skipping")
        continue
    first, last = min(ticker_dates[t]), max(ticker_dates[t])
    # Trading dates that fall inside this ticker's window.
    expected = {d for d in trading_dates_set if not (d < first or d > last)}
    missing = sorted(expected.difference(ticker_dates[t]))
    if len(missing) > 0:
        gaps_by_ticker[t] = missing

total_gaps = sum(map(len, gaps_by_ticker.values()))
print(f"Total gaps: {total_gaps} across {len(gaps_by_ticker)} tickers")
for ticker, gap_days in gaps_by_ticker.items():
    print(f"  {ticker}: {len(gap_days)} missing days ({gap_days[0]} → {gap_days[-1]})")

  IEU: no existing data, skipping
Total gaps: 1 across 1 tickers
  SPLG: 1 missing days (2025-10-24 → 2025-10-24)


## 3. Fetch and merge into monthly CSVs

Gap dates are split into **tight contiguous ranges** (new range when consecutive gaps are >30 days apart), then tickers sharing the same date range are **batched into a single yfinance call** (up to 20 tickers per batch). Only gap dates are kept before merging.

In [11]:
# Per ticker, build (a) the date ranges to request, as produced by
# split_into_contiguous_ranges with max_gap_days=30, and (b) the exact set of
# gap dates, passed to fetch_and_store as filter_dates.
ticker_ranges = {}
filter_dates = {}
for t, gaps in gaps_by_ticker.items():
    ticker_ranges[t] = split_into_contiguous_ranges(gaps, max_gap_days=30)
    filter_dates[t] = set(gaps)

total_ranges = sum(map(len, ticker_ranges.values()))
print(f"Fetching {total_ranges} ranges across {len(ticker_ranges)} tickers (batched)")
for t, ranges in ticker_ranges.items():
    print(f"  {t}: {len(ranges)} range(s)")


def _report_backfilled(t, n):
    """Progress callback handed to fetch_and_store as on_ticker."""
    print(f"  {t}: {n} rows backfilled")


result = fetch_and_store(
    ticker_ranges,
    DATA_DIR,
    filter_dates=filter_dates,
    on_ticker=_report_backfilled,
)
total_stored = sum(result.stored.values())
print(f"\nDone. Total rows backfilled: {total_stored}")
if result.failed:
    print(f"WARNING: {len(result.failed)} tickers returned no data: {result.failed}")

Fetching 1 ranges across 1 tickers (batched)
  SPLG: 1 range(s)

Done. Total rows backfilled: 0


In [12]:
## Re-check gaps after backfill
# Reload every PRICES_*.csv from disk and repeat the gap computation, so the
# result reflects what is actually stored after the fetch step.
all_dates_v: set[date] = set()
ticker_dates_v: dict[str, set[date]] = {t: set() for t in TICKERS_TO_BACKFILL}
ticker_set_v = set(TICKERS_TO_BACKFILL)

for path in sorted(DATA_DIR.rglob("PRICES_*.csv")):
    try:
        df = pd.read_csv(path, usecols=["date", "ticker"], parse_dates=["date"])
        df = normalize_dates(df)
        # Rows without a usable date cannot be attributed to a trading day.
        df = df.dropna(subset=["date"])
        # Convert the whole column to plain datetime.date in one pass. This
        # works whether the column holds datetime64 values, Timestamps or
        # date objects, and never leaves numpy.datetime64 scalars behind.
        df = df.assign(date=pd.to_datetime(df["date"]).dt.date)
        all_dates_v.update(df["date"].unique())
        # One grouped pass over the requested tickers instead of re-filtering
        # the frame once per ticker.
        wanted = df[df["ticker"].isin(ticker_set_v)]
        for t, col in wanted.groupby("ticker", sort=False)["date"]:
            ticker_dates_v[t].update(col.unique())
    except Exception as e:
        # Still best-effort, but say so: a silently skipped file would make
        # the remaining-gap count below misleading.
        print(f"Skip {path.name}: {e}")

trading_dates_v = set(all_dates_v)
remaining_gaps = 0
for t in TICKERS_TO_BACKFILL:
    if not ticker_dates_v[t]:
        # No rows for this ticker, so there is no window to check.
        continue
    first = min(ticker_dates_v[t])
    last = max(ticker_dates_v[t])
    # Trading dates inside this ticker's own [first, last] window.
    expected = {d for d in trading_dates_v if first <= d <= last}
    missing = expected - ticker_dates_v[t]
    if missing:
        remaining_gaps += len(missing)
        print(f"  {t}: {len(missing)} gaps remain")

print(f"\nRemaining gaps after backfill: {remaining_gaps}")
if remaining_gaps == 0:
    print("All gaps resolved.")
else:
    print(f"WARNING: {remaining_gaps} gaps remain. Check failed tickers above.")

  SPLG: 1 gaps remain

Remaining gaps after backfill: 1
