# Download Price Data

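Downloads price data into one CSV per quarter at `data/<year>/PRICES_<year>-Q<q>.csv`. The start date is `START_DATE` in `constants.py`. Existing files are extended from the day after their last stored date; set `FORCE_REDOWNLOAD = True` to delete all existing files and re-download everything from the start date.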

In [9]:
import sys
from pathlib import Path
from datetime import date, timedelta

import pandas as pd

# Resolve the project root: if the current directory itself holds constants.py,
# the root is one level up; otherwise the current directory is the root.
PROJECT_ROOT = Path.cwd()
if (PROJECT_ROOT / "constants.py").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
# Put the root first on sys.path so the project imports below resolve against it.
sys.path.insert(0, str(PROJECT_ROOT))

from research.constants import UNIVERSE, START_DATE as START_DATE_STR
from research.data_source import fetch_prices

In [10]:
# Output location: quarterly CSVs are written beneath this directory.
DATA_DIR = PROJECT_ROOT.joinpath("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Download parameters.
START_DATE = date.fromisoformat(START_DATE_STR)  # configured ISO string -> datetime.date
TICKERS = UNIVERSE

# When True, previously downloaded PRICES_*.csv files are deleted before fetching.
FORCE_REDOWNLOAD = False

In [11]:
def quarter_range(year: int, quarter: int) -> tuple[date, date]:
    """Return the first and last calendar day of a quarter.

    Args:
        year: Calendar year of the quarter.
        quarter: Quarter number, 1 through 4.

    Returns:
        A ``(first_day, last_day)`` pair; both dates are inclusive.

    Raises:
        ValueError: If ``quarter`` is not one of 1, 2, 3, 4.
    """
    if quarter not in (1, 2, 3, 4):
        # Without this guard, 0 or a negative quarter would index the lookup
        # tables from the end and silently return a different quarter.
        raise ValueError(f"quarter must be between 1 and 4, got {quarter!r}")
    i = quarter - 1
    # No quarter ends in February, so the last-day table needs no leap-year logic.
    return date(year, (1, 4, 7, 10)[i], 1), date(year, (3, 6, 9, 12)[i], (31, 30, 30, 31)[i])


def quarters_from(start: date, end: date):
    """Yield ``(year, quarter)`` pairs from the quarter containing ``start`` onward.

    Iteration continues for as long as the first day of the current quarter
    is on or before ``end``; quarters roll over from Q4 to Q1 of the next year.
    """
    year = start.year
    quarter = (start.month - 1) // 3 + 1
    while True:
        first_day = date(year, 3 * (quarter - 1) + 1, 1)
        if first_day > end:
            return
        yield year, quarter
        if quarter == 4:
            year, quarter = year + 1, 1
        else:
            quarter += 1


# Current date, captured once so every quarter processed in this run is capped at the same day.
today = date.today()

In [12]:
def _date_series(d):
    return pd.to_datetime(d).dt.date if pd.api.types.is_datetime64_any_dtype(d) else d

# Download or incrementally update one CSV per quarter from START_DATE through today.

if FORCE_REDOWNLOAD:
    # Remove every previously downloaded quarterly file so the loop below rebuilds them all.
    for f in DATA_DIR.rglob("PRICES_*.csv"):
        f.unlink()
        print(f"Deleted {f.relative_to(DATA_DIR)}")

for year, q in quarters_from(START_DATE, today):
    qstart, qend = quarter_range(year, q)
    end_cap = min(qend, today)  # never request dates past today
    if qstart > end_cap:
        continue
    # NOTE(review): a quarter with no file is fetched from its first day, which
    # precedes START_DATE when START_DATE falls mid-quarter — confirm this is intended.
    path = DATA_DIR / str(year) / f"PRICES_{year}-Q{q}.csv"
    path.parent.mkdir(parents=True, exist_ok=True)

    if path.exists():
        existing = pd.read_csv(path, parse_dates=["date"])
        last = existing["date"].max()
        if pd.isna(last):
            # Header-only or dateless file: max() is NaT/NaN and NaT has no
            # .date(), so restart this quarter from its first day instead of crashing.
            fetch_start = qstart
        else:
            # Resume from the day after the last stored date.
            fetch_start = pd.Timestamp(last).date() + timedelta(days=1)
        if fetch_start > end_cap:
            print(f"{path.name}: up to date")
            continue
        new_df = fetch_prices(TICKERS, fetch_start, end_cap)
        if new_df.empty:
            continue
        # Normalize both sides to plain dates so de-duplication and sorting compare like with like.
        frames = [new_df.assign(date=_date_series(new_df["date"]))]
        if not existing.empty:
            frames.insert(0, existing.assign(date=_date_series(existing["date"])))
        combined = pd.concat(frames, ignore_index=True)
    else:
        combined = fetch_prices(TICKERS, qstart, end_cap)
        if combined.empty:
            continue
        combined["date"] = _date_series(combined["date"])

    # Keep the first occurrence of each (date, ticker) pair and store rows in a stable order.
    combined = combined.drop_duplicates(subset=["date", "ticker"]).sort_values(["date", "ticker"]).reset_index(drop=True)
    combined.to_csv(path, index=False)
    print(f"{path.name}: ok")

PRICES_2020-Q1.csv: ok
PRICES_2020-Q2.csv: ok
PRICES_2020-Q3.csv: ok
PRICES_2020-Q4.csv: ok
PRICES_2021-Q1.csv: ok
PRICES_2021-Q2.csv: ok
PRICES_2021-Q3.csv: ok
PRICES_2021-Q4.csv: ok
PRICES_2022-Q1.csv: ok
PRICES_2022-Q2.csv: ok
PRICES_2022-Q3.csv: ok
PRICES_2022-Q4.csv: ok
PRICES_2023-Q1.csv: ok
PRICES_2023-Q2.csv: ok
PRICES_2023-Q3.csv: ok
PRICES_2023-Q4.csv: ok
PRICES_2024-Q1.csv: ok
PRICES_2024-Q2.csv: ok
PRICES_2024-Q3.csv: ok
PRICES_2024-Q4.csv: ok
PRICES_2025-Q1.csv: ok
PRICES_2025-Q2.csv: ok
PRICES_2025-Q3.csv: ok
PRICES_2025-Q4.csv: ok
PRICES_2026-Q1.csv: ok
