# Download Price Data

Fetches price data in **batches** (up to 20 tickers per yfinance call), then merges the results into monthly CSVs under `data/<year>/PRICES_<year>-M<month>.csv`. Tickers sharing the same date range are grouped automatically. The start date is `START_DATE` in `research/config/constants.py`. Set `FORCE_REDOWNLOAD = True` to delete the existing files and re-download everything from the start date.

In [None]:
import sys
from pathlib import Path
from datetime import date, timedelta

# Bootstrap: find the nearest ancestor of the working directory that contains
# a `.git` entry and put it first on sys.path so the `research` package below
# can be imported. If no ancestor qualifies, the loop simply ends on the
# filesystem root, which is then used as the fallback.
_here = Path.cwd().resolve()
for _root in (_here, *_here.parents):
    if (_root / ".git").exists():
        break
sys.path.insert(0, str(_root))

from research.functions.download_helper import (
    find_project_root,
    cleanup_existing_files,
    get_last_dates_per_ticker,
)
from research.functions.fetch_and_store import fetch_and_store
from research.config.constants import get_universe, START_DATE as START_DATE_STR

# Project root as returned by find_project_root for the current working
# directory; the data directory used by the following cells is derived from it.
PROJECT_ROOT = find_project_root(Path.cwd())

In [None]:
# --- Run configuration -------------------------------------------------------
# Flip to True to delete the existing files and fetch every ticker again from
# START_DATE instead of resuming from the last stored date.
FORCE_REDOWNLOAD = False

DATA_DIR = PROJECT_ROOT.joinpath("data")          # where the price files live
START_DATE = date.fromisoformat(START_DATE_STR)   # ISO string from constants -> date
TICKERS = get_universe()                          # tickers to download

# Ensure the output directory exists before anything reads or writes it.
DATA_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Today's local date, captured once: it is the end of every fetch range and of
# the verification load in the cells below.
today = date.today()

In [None]:
# On a forced re-download, remove the existing files first so every ticker
# starts again from START_DATE.
if FORCE_REDOWNLOAD:
    n = cleanup_existing_files(DATA_DIR)
    if n:
        print(f"Deleted {n} files.")

# Last stored date per ticker; left empty on a forced re-download so that no
# ticker resumes from a previous run.
last_dates = get_last_dates_per_ticker(DATA_DIR, TICKERS) if not FORCE_REDOWNLOAD else {}
resuming = any(v is not None for v in last_dates.values())
print(f"Fetching by ticker: {START_DATE} → {today}", end="")
if resuming:
    print(" (resume per ticker from last date)", end="")
print()

# Build per-ticker date ranges (yfinance end is exclusive → today = T-1 data)
ticker_ranges: dict[str, tuple[date, date]] = {}
skipped = 0
for ticker in TICKERS:
    last = last_dates.get(ticker)
    # Resume the day after the last stored date; otherwise start from scratch.
    start = START_DATE if (FORCE_REDOWNLOAD or last is None) else last + timedelta(days=1)
    # Because the end date is exclusive, start == today describes the empty
    # range [today, today): the ticker already has everything up to T-1.
    # Treat it as up to date instead of issuing a request that cannot return
    # rows and would be reported as "no new data" or as a failure.
    if start >= today:
        skipped += 1
        continue
    ticker_ranges[ticker] = (start, today)

# Fetch all pending ranges and store them; the callback reports each ticker's
# row count as it is processed.
result = fetch_and_store(
    ticker_ranges, DATA_DIR,
    on_ticker=lambda t, n: print(f"  {t}: {n} rows"),
)
print(f"\nUpdated: {len(result.stored)} | Up to date: {skipped} | "
      f"No new data: {len(ticker_ranges) - len(result.stored)}")
if result.failed:
    print(f"WARNING: {len(result.failed)} tickers returned no data: {result.failed}")
print("Done.")

In [None]:
## Verification
# Reload the requested window and sanity-check how much data each ticker has.
from research.functions.load_data import load_prices

df = load_prices(tickers=TICKERS, start_date=START_DATE, end_date=today)
print(f"Total rows loaded: {len(df):,}")

if df.empty:
    # With no rows the per-column statistics below are meaningless (NaN
    # min/max/median) and the coverage check would wrongly report success.
    print(f"\nWARNING: no price data loaded for any of the {len(TICKERS)} tickers.")
else:
    print(f"Tickers with data: {df['ticker'].nunique()} / {len(TICKERS)}")
    print(f"Date range: {df['date'].min()} → {df['date'].max()}")

    counts = df.groupby("ticker").size()
    median_rows = counts.median()
    sparse = counts[counts < median_rows * 0.5]

    # Tickers without a single row never show up in `counts`, so the sparse
    # check alone cannot see them; list them explicitly.
    # NOTE(review): assumes the values in df["ticker"] compare equal to the
    # entries of TICKERS — confirm against load_prices.
    loaded = set(counts.index)
    missing = [t for t in TICKERS if t not in loaded]
    if missing:
        print(f"\nWARNING: {len(missing)} tickers have no data at all: {missing}")

    if not sparse.empty:
        print(f"\nWARNING: {len(sparse)} tickers have < 50% of median row count:")
        print(sparse.to_string())
    elif not missing:
        print("\nAll tickers have reasonable data coverage.")