# TripScore — TDX Bulk Prefetch (Stageable Full Dataset Download)

This notebook demonstrates how to **gradually** fetch full TDX datasets (paged OData) and **resume** later.

Key ideas:
- Each run fetches only a small number of pages (or uses a time budget), so you can run it repeatedly.
- Progress is persisted under the cache directory (default: `.cache/tripscore/tdx_bulk/`).
- This is designed to be gentle with TDX rate limits (global request spacing + retries + stale-if-error).


In [None]:
import os
import sys
from pathlib import Path

# Make the in-repo package importable: if a "src" directory exists under the
# current working directory, put it at the front of sys.path.
repo_root = Path.cwd()
src_dir = repo_root / "src"
# Notebook cells are re-run often; add the entry only once so sys.path does
# not accumulate duplicate copies of the same directory.
if src_dir.exists() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Loading a .env file is best-effort: a failure must not stop the notebook,
# but the reason is printed so missing credentials are easier to diagnose.
try:
    from tripscore.core.env import load_dotenv_if_present

    load_dotenv_if_present()
except Exception as exc:
    print("Skipped .env loading:", type(exc).__name__, str(exc))

# Report only whether each credential is set; the values are never printed.
print("TDX_CLIENT_ID configured:", bool(os.getenv("TDX_CLIENT_ID")))
print("TDX_CLIENT_SECRET configured:", bool(os.getenv("TDX_CLIENT_SECRET")))


In [None]:
from tripscore.config.settings import get_settings
from tripscore.core.env import resolve_project_path
from tripscore.ingestion.tdx_bulk import bulk_prefetch_all
from tripscore.ingestion.tdx_client import TdxClient
from tripscore.recommender.recommend import build_cache

# Build the shared objects used by the cells below: settings, cache, TDX client.
settings = get_settings()
cache = build_cache(settings)
tdx = TdxClient(settings, cache)

# Resolve the configured cache directory once; later cells reuse `cache_dir`.
cache_dir = resolve_project_path(settings.cache.dir)

# Print the configured city and the cache / bulk directories.
for label, value in (
    ("TDX city:", settings.ingestion.tdx.city),
    ("Cache dir:", cache_dir),
    ("Bulk dir:", cache_dir / "tdx_bulk"),
):
    print(label, value)


## Run one "stage" (repeatable)

Re-run this cell as many times as you want. Each run fetches only a small amount.

Tips:
- If you keep hitting `429`, increase `ingestion.tdx.request_spacing_seconds` in `src/tripscore/config/defaults.yaml`.
- You can also lower `ingestion.tdx.retry.max_attempts` to avoid long notebook waits.


In [None]:
# Dataset names handed to bulk_prefetch_all for this stage.
datasets = [
    "bus_stops",
    "bike_stations",
    "bike_availability",
    "metro_stations",
    "parking_lots",
    "parking_availability",
]

# The whole stage (fetch + reporting) sits inside one try so that any failure
# is reported as a single line instead of a traceback in the notebook.
try:
    # One page per dataset and a 20-second overall budget keep each run small;
    # reset=False leaves existing data/progress in place so runs can resume.
    results = bulk_prefetch_all(
        tdx_client=tdx,
        cache=cache,
        city=settings.ingestion.tdx.city,
        datasets=datasets,
        max_pages_per_dataset=1,
        max_seconds_total=20,
        reset=False,
    )
    for result in results:
        # Finished datasets show "done"; unfinished ones show where to resume.
        if result.done:
            state = "done"
        else:
            state = f"next_skip={result.next_skip}"
        summary = (
            f"{result.dataset}/{result.scope}: pages={result.pages_fetched} "
            f"added={result.items_added} total={result.total_items} {state}"
        )
        print(summary)
        print(f"  data: {result.data_path}")
        print(f"  progress: {result.progress_path}")
except Exception as err:
    print("Bulk prefetch failed:", type(err).__name__, str(err))


## Inspect progress

Progress files track `next_skip` and `done` so the next run can resume.


In [None]:
import json


def read_progress_files(cache_dir: Path) -> list[dict]:
    """Collect the contents of every bulk-prefetch progress file.

    Searches ``cache_dir / "tdx_bulk"`` recursively for files matching
    ``*.progress.json`` and visits them in sorted path order.

    Args:
        cache_dir: Directory that contains the ``tdx_bulk`` folder.

    Returns:
        One dict per readable progress file: a ``"path"`` entry holding the
        file path as a string, merged with the keys of the file's JSON
        content (keys from the file are applied after ``"path"``). The list
        is empty when the ``tdx_bulk`` folder does not exist.
    """
    bulk_root = cache_dir / "tdx_bulk"
    records: list[dict] = []
    if bulk_root.exists():
        for progress_file in sorted(bulk_root.rglob("*.progress.json")):
            try:
                payload = json.loads(progress_file.read_text(encoding="utf-8"))
                record = {"path": str(progress_file), **payload}
            except Exception:
                # Best-effort listing: a file that cannot be read, parsed, or
                # merged as a mapping is skipped rather than aborting the scan.
                continue
            records.append(record)
    return records


# Print one line per progress file. Every field taken from the file is read
# with .get() so a progress file that lacks a key prints None instead of
# aborting the whole listing with KeyError; "path" is always set by
# read_progress_files, so it is indexed directly.
for row in read_progress_files(cache_dir):
    print(
        row.get("dataset"),
        row.get("scope"),
        "done=",
        row.get("done"),
        "next_skip=",
        row.get("next_skip"),
        "path=",
        row["path"],
    )


## Reset (optional)

If you want to start over for selected datasets, run with `reset=True`.


In [None]:
# WARNING: This deletes only the bulk-prefetch data/progress files for the selected datasets.
#
# results = bulk_prefetch_all(
#     tdx_client=tdx,
#     cache=cache,
#     city=settings.ingestion.tdx.city,
#     datasets=["bus_stops"],
#     max_pages_per_dataset=1,
#     max_seconds_total=10,
#     reset=True,
# )
# print(results)
