In [17]:
# ============================================================
# 02_update_adp_snapshot.ipynb
# Refresh current-season ADP time series (time = draft timestamps)
# ============================================================
import os
import time
from typing import Any, List, Tuple, Optional, Set
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import pandas as pd
import numpy as np
from tqdm import tqdm

# ----------------------------
# CONFIG
# ----------------------------
# Season whose leagues, drafts and picks are refreshed by this cell; it is
# interpolated into the API URLs and into every output file name.
CURRENT_SEASON = 2023

# (display_name, user_id) pairs used as the starting frontier of the crawl.
# Only the user_id element is read by discover_leagues; the name is a label.
SEED_USERS = [
    ("camsnotsober", "567994319854673920"),
    ("dynastybuck", "332066581859282944"),
    ("curtistodd", "568256222760906752"),
    ("elnostrathomas", "387839476958965760"),
    ("coombesie9", "386648007942254592"),
]

# Crawl limits read by discover_leagues:
#   MAX_EXPANSION_STEPS - discovery runs steps 0..MAX_EXPANSION_STEPS inclusive.
#   MAX_USERS_PER_STEP  - the user frontier is truncated to this many per step.
#   MAX_LEAGUES_TOTAL   - expansion stops once this many leagues have been seen.
MAX_EXPANSION_STEPS = 2
MAX_USERS_PER_STEP = 2500
MAX_LEAGUES_TOTAL = 20000

# Request batching:
#   MAX_WORKERS              - thread-pool size used by parallel_fetch.
#   CHUNK_SIZE               - number of URLs submitted per batch.
#   SLEEP_BETWEEN_CHUNKS_SEC - pause after each batch, applied only when the
#                              URL list is longer than one chunk.
MAX_WORKERS = 40
CHUNK_SIZE = 400
SLEEP_BETWEEN_CHUNKS_SEC = 8

# Output layout (relative to the current working directory). The directories
# are created eagerly so the later to_parquet calls have a destination.
ROOT_DIR = "sleeper_dynasty_adp"
RAW_DIR  = os.path.join(ROOT_DIR, "data", "raw")
SNAP_DIR = os.path.join(ROOT_DIR, "data", "snapshots")
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(SNAP_DIR, exist_ok=True)

# One sub-directory per raw dataset.
# NOTE(review): nothing in this cell writes to "players" - presumably another
# notebook uses it; confirm.
for sub in ["leagues", "league_users", "drafts", "picks", "players"]:
    os.makedirs(os.path.join(RAW_DIR, sub), exist_ok=True)

# ----------------------------
# HTTP
# ----------------------------
# Root of every endpoint URL built by the url_* helpers.
BASE = "https://api.sleeper.app/v1"
# Single Session reused by get_json for all requests.
# NOTE(review): get_json is invoked from ThreadPoolExecutor workers, so this
# Session is shared across threads - confirm that is acceptable for requests.
session = requests.Session()
session.headers.update({"User-Agent": "Sleeper-Dynasty-ADP/1.0"})

def get_json(url: str, timeout: int = 30, retries: int = 4, backoff: float = 1.8) -> Any:
    """GET *url* through the shared session and return the decoded JSON body.

    The request is attempted up to *retries* times. An HTTP 429 response or
    any exception raised while requesting/decoding triggers a retry after an
    exponential delay capped at 30 seconds.

    Args:
        url: Absolute URL to fetch.
        timeout: Per-request timeout in seconds, passed to ``session.get``.
        retries: Maximum number of attempts.
        backoff: Base of the exponential delay (``backoff ** attempt``).

    Returns:
        The parsed JSON payload (whatever ``Response.json()`` returns).

    Raises:
        RuntimeError: When every attempt failed; the message carries the URL
            and the last error, and the last error is chained as the cause.
    """
    last_err: Optional[Exception] = None
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=timeout)
            if r.status_code == 429:
                # Record the throttling so the final error is not "None"
                # when every attempt was rate limited.
                last_err = RuntimeError("HTTP 429 Too Many Requests")
                delay = min(30, (backoff ** attempt) + 1)
            else:
                r.raise_for_status()
                return r.json()
        except Exception as e:  # retry boundary: any failure is retried
            last_err = e
            delay = min(30, (backoff ** attempt) + 0.5)
        # No point in waiting after the last attempt: nothing follows it.
        if attempt < retries - 1:
            time.sleep(delay)
    raise RuntimeError(f"GET failed: {url}\nLast error: {last_err}") from last_err

def chunked(lst: List[Any], n: int):
    """Yield consecutive slices of *lst*, each holding at most *n* items."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def parallel_fetch(urls: List[str], desc: str) -> List[Tuple[str, Any, Optional[str]]]:
    """Fetch every URL concurrently and report one result tuple per URL.

    Args:
        urls: URLs handed to ``get_json`` on a pool of ``MAX_WORKERS`` threads.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A list of ``(url, payload, error)`` tuples in completion order:
        ``error`` is ``None`` on success, otherwise ``payload`` is ``None``
        and ``error`` is the stringified exception.
    """
    results = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        url_by_future = {}
        for url in urls:
            url_by_future[pool.submit(get_json, url)] = url

        progress = tqdm(as_completed(url_by_future), total=len(url_by_future), desc=desc)
        for finished in progress:
            url = url_by_future[finished]
            try:
                payload = finished.result()
            except Exception as exc:
                results.append((url, None, str(exc)))
            else:
                results.append((url, payload, None))
    return results

# ----------------------------
# URL helpers
# ----------------------------
def url_user_leagues(user_id: str, season: int) -> str:
    """Return the endpoint URL listing a user's NFL leagues for *season*."""
    endpoint = f"user/{user_id}/leagues/nfl/{season}"
    return f"{BASE}/{endpoint}"

def url_league_users(league_id: str) -> str:
    """Return the endpoint URL listing the users of *league_id*."""
    endpoint = f"league/{league_id}/users"
    return f"{BASE}/{endpoint}"

def url_league_drafts(league_id: str) -> str:
    """Return the endpoint URL listing the drafts of *league_id*."""
    endpoint = f"league/{league_id}/drafts"
    return f"{BASE}/{endpoint}"

def url_draft_picks(draft_id: str) -> str:
    """Return the endpoint URL listing the picks of *draft_id*."""
    endpoint = f"draft/{draft_id}/picks"
    return f"{BASE}/{endpoint}"

# ----------------------------
# Discovery
# ----------------------------
def fetch_leagues_for_users(user_ids: List[str], season: int) -> pd.DataFrame:
    """Fetch the leagues of every user and return them as one flat frame.

    URLs are requested in batches of ``CHUNK_SIZE``; when more than one batch
    is needed the function sleeps ``SLEEP_BETWEEN_CHUNKS_SEC`` after each
    batch. Requests that failed or returned no payload are skipped.

    Args:
        user_ids: User ids whose leagues are requested.
        season: Season used in the URL and stored on each league as ``_season``.

    Returns:
        The normalized leagues de-duplicated on ``league_id``, or an empty
        DataFrame when nothing was collected.
    """
    urls = [url_user_leagues(uid, season) for uid in user_ids]
    pause_after_chunk = len(urls) > CHUNK_SIZE
    collected = []

    for chunk_no, batch in enumerate(chunked(urls, CHUNK_SIZE), start=1):
        label = f"[{season}] leagues chunk {chunk_no} ({len(batch)})"
        for _url, payload, error in parallel_fetch(batch, desc=label):
            if error or payload is None:
                continue
            for league in payload:
                # Tag the raw league dict with the season it was requested for.
                league["_season"] = season
                collected.append(league)
        if pause_after_chunk:
            time.sleep(SLEEP_BETWEEN_CHUNKS_SEC)

    if collected:
        return pd.json_normalize(collected).drop_duplicates(subset=["league_id"])
    return pd.DataFrame()

def fetch_users_for_leagues(league_ids: List[str], season: int) -> pd.DataFrame:
    """Fetch the members of every league and return them as one flat frame.

    URLs are requested in batches of ``CHUNK_SIZE``; when more than one batch
    is needed the function sleeps ``SLEEP_BETWEEN_CHUNKS_SEC`` after each
    batch. Requests that failed or returned no payload are skipped.

    Args:
        league_ids: League ids whose user lists are requested.
        season: Only used in the progress-bar label.

    Returns:
        The normalized user records, each tagged with ``_league_id`` (parsed
        back out of the request URL), or an empty DataFrame when nothing was
        collected.
    """
    urls = list(map(url_league_users, league_ids))
    pause_after_chunk = len(urls) > CHUNK_SIZE
    members = []

    for chunk_no, batch in enumerate(chunked(urls, CHUNK_SIZE), start=1):
        label = f"[{season}] league users chunk {chunk_no} ({len(batch)})"
        for url, payload, error in parallel_fetch(batch, desc=label):
            if error or payload is None:
                continue
            # Results arrive in completion order, so recover the league from the URL.
            after_prefix = url.split("/league/")[1]
            league_id = after_prefix.split("/users")[0]
            for member in payload:
                member["_league_id"] = league_id
                members.append(member)
        if pause_after_chunk:
            time.sleep(SLEEP_BETWEEN_CHUNKS_SEC)

    if members:
        return pd.json_normalize(members)
    return pd.DataFrame()

def discover_leagues(season: int, seed_users: List[Tuple[str, str]]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Crawl outward from the seed users to collect leagues and memberships.

    Each step fetches the leagues of the current user frontier, keeps the
    leagues not seen in earlier steps, fetches the members of those leagues
    and uses the not-yet-seen members as the next frontier. The crawl runs
    steps ``0..MAX_EXPANSION_STEPS``, truncates every frontier to
    ``MAX_USERS_PER_STEP`` users, and never returns more than
    ``MAX_LEAGUES_TOTAL`` leagues.

    Args:
        season: Season passed to the league/user fetchers.
        seed_users: ``(name, user_id)`` pairs; only ``user_id`` is used.

    Returns:
        ``(leagues, memberships)``: the concatenated per-step frames of new
        leagues and of league members. Either is an empty DataFrame when no
        step contributed rows.
    """
    frontier_users = [uid for _name, uid in seed_users]
    seen_users: Set[str] = set(frontier_users)
    seen_leagues: Set[str] = set()
    leagues_parts, memberships_parts = [], []

    for step in range(MAX_EXPANSION_STEPS + 1):
        frontier_users = frontier_users[:MAX_USERS_PER_STEP]
        print(f"\n=== [{season}] DISCOVERY STEP {step} | users={len(frontier_users)} ===")

        leagues_df = fetch_leagues_for_users(frontier_users, season)
        if leagues_df.empty:
            break

        is_new = ~leagues_df["league_id"].astype(str).isin(seen_leagues)
        new_leagues_df = leagues_df[is_new].copy()
        print(f"[{season}] Leagues fetched={len(leagues_df)} | new={len(new_leagues_df)}")
        if new_leagues_df.empty:
            break

        # Enforce the cap *before* accepting this step's leagues: a single
        # step can discover far more leagues than the remaining budget, and
        # every accepted league costs downstream requests and memory.
        budget = max(MAX_LEAGUES_TOTAL - len(seen_leagues), 0)
        if len(new_leagues_df) > budget:
            new_leagues_df = new_leagues_df.head(budget)
        if new_leagues_df.empty:
            print(f"[{season}] Hit MAX_LEAGUES_TOTAL cap.")
            break

        leagues_parts.append(new_leagues_df)
        new_league_ids = new_leagues_df["league_id"].astype(str).tolist()
        seen_leagues.update(new_league_ids)

        if len(seen_leagues) >= MAX_LEAGUES_TOTAL:
            print(f"[{season}] Hit MAX_LEAGUES_TOTAL cap.")
            break

        mem_df = fetch_users_for_leagues(new_league_ids, season)
        if not mem_df.empty:
            memberships_parts.append(mem_df)

        # Stop when this was the last allowed step or there is nobody to expand to.
        if step == MAX_EXPANSION_STEPS or mem_df.empty or "user_id" not in mem_df.columns:
            break

        discovered_users = mem_df["user_id"].dropna().astype(str).unique().tolist()
        frontier_users = [u for u in discovered_users if u not in seen_users]
        seen_users.update(frontier_users)
        print(f"[{season}] Next frontier users={len(frontier_users)} | total users seen={len(seen_users)}")

    leagues_out = pd.concat(leagues_parts, ignore_index=True) if leagues_parts else pd.DataFrame()
    memberships_out = pd.concat(memberships_parts, ignore_index=True) if memberships_parts else pd.DataFrame()
    return leagues_out, memberships_out

# ----------------------------
# Drafts + Picks
# ----------------------------
def draft_to_row(d: dict, league_id: str, season: int) -> dict:
    """Flatten one draft payload into the flat row used for the drafts table.

    Args:
        d: Draft dict; ``metadata`` and ``settings`` may be missing or null.
        league_id: League the draft was requested for (stored as ``str``).
        season: Season the draft was requested for (stored as ``int``).

    Returns:
        A dict with the draft id (``""`` when absent or falsy), league,
        season, status, type, start time, and the selected metadata
        (``md_*``) and settings (``st_*``) fields.
    """
    metadata = d.get("metadata") or {}
    settings = d.get("settings") or {}
    raw_draft_id = d.get("draft_id")

    row = {
        "draft_id": str(raw_draft_id) if raw_draft_id else "",
        "league_id": str(league_id),
        "season": int(season),
    }
    for field in ("status", "type", "start_time"):
        row[field] = d.get(field)
    row["md_scoring_type"] = metadata.get("scoring_type")
    for field in ("teams", "rounds", "slots_super_flex"):
        row[f"st_{field}"] = settings.get(field)
    return row

def fetch_drafts_for_leagues(league_ids: List[str], season: int) -> pd.DataFrame:
    """Fetch the drafts of every league and return one row per draft.

    URLs are requested in batches of ``CHUNK_SIZE``; when more than one batch
    is needed the function sleeps ``SLEEP_BETWEEN_CHUNKS_SEC`` after each
    batch. Requests that failed or returned no payload are skipped, as are
    drafts without a draft id.

    Args:
        league_ids: League ids whose drafts are requested.
        season: Season stored on every row and shown in the progress label.

    Returns:
        The concatenation of the per-batch frames (each de-duplicated on
        ``draft_id``), or an empty DataFrame when no draft was collected.
    """
    urls = [url_league_drafts(lid) for lid in league_ids]
    pause_after_chunk = len(urls) > CHUNK_SIZE
    frames = []

    for chunk_no, batch in enumerate(chunked(urls, CHUNK_SIZE), start=1):
        label = f"[{season}] drafts chunk {chunk_no} ({len(batch)})"
        batch_rows = []
        for url, payload, error in parallel_fetch(batch, desc=label):
            if error or payload is None:
                continue
            # Results arrive in completion order, so recover the league from the URL.
            league_id = url.split("/league/")[1].split("/drafts")[0]
            for draft in payload:
                row = draft_to_row(draft, league_id, season)
                if not row["draft_id"]:
                    continue
                batch_rows.append(row)
        if batch_rows:
            batch_frame = pd.DataFrame(batch_rows)
            frames.append(batch_frame.drop_duplicates(subset=["draft_id"]))
        if pause_after_chunk:
            time.sleep(SLEEP_BETWEEN_CHUNKS_SEC)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def pick_to_row(p: dict, draft_id: str) -> dict:
    """Reduce one pick payload to ``draft_id``, ``player_id`` and ``pick_no``.

    ``player_id`` is stringified when present and kept as ``None`` otherwise.
    """
    raw_player = p.get("player_id")
    player_id = None if raw_player is None else str(raw_player)
    return {"draft_id": str(draft_id), "player_id": player_id, "pick_no": p.get("pick_no")}

def fetch_picks_for_completed_drafts(drafts_df: pd.DataFrame, season: int) -> pd.DataFrame:
    """Fetch the picks of every draft whose status is ``"complete"``.

    URLs are requested in batches of ``CHUNK_SIZE``; when more than one batch
    is needed the function sleeps ``SLEEP_BETWEEN_CHUNKS_SEC`` after each
    batch. Requests that failed or returned no payload are skipped.

    Args:
        drafts_df: Drafts table with at least ``draft_id`` and ``status``.
            An empty or column-less frame is accepted and yields no picks.
        season: Only used in the progress-bar label.

    Returns:
        One row per pick (``draft_id``, ``player_id``, ``pick_no``), or an
        empty DataFrame when there is nothing to fetch or nothing came back.
    """
    # The drafts fetcher returns a column-less frame when it found nothing;
    # indexing "status" on it would raise KeyError, so bail out early.
    if drafts_df.empty or not {"status", "draft_id"}.issubset(drafts_df.columns):
        return pd.DataFrame()

    is_complete = drafts_df["status"].astype(str).str.lower() == "complete"
    completed_ids = drafts_df.loc[is_complete, "draft_id"].astype(str).unique().tolist()
    if not completed_ids:
        return pd.DataFrame()

    urls = [url_draft_picks(did) for did in completed_ids]
    parts, buf = [], []
    for i, chunk in enumerate(chunked(urls, CHUNK_SIZE), start=1):
        res = parallel_fetch(chunk, desc=f"[{season}] picks chunk {i} ({len(chunk)})")
        for u, data, err in res:
            if err or data is None:
                continue
            # Results arrive in completion order, so recover the draft from the URL.
            draft_id = u.split("/draft/")[1].split("/picks")[0]
            for p in data:
                buf.append(pick_to_row(p, draft_id))
        if buf:
            # Convert per batch so the list of row dicts does not grow unbounded.
            parts.append(pd.DataFrame(buf))
            buf = []
        if len(urls) > CHUNK_SIZE:
            time.sleep(SLEEP_BETWEEN_CHUNKS_SEC)
    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

def build_draft_catalog(drafts_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the drafts table with typed and derived columns.

    Added columns:
        start_dt:      ``start_time`` (epoch milliseconds) as a UTC timestamp.
        start_month:   ``start_dt`` formatted as ``YYYY-MM``.
        is_dynasty:    ``md_scoring_type`` starts with ``"dynasty_"``.
        is_superflex:  ``st_slots_super_flex`` (missing treated as 0) is >= 1.
        dynasty_class: ``"non_dynasty"`` when not dynasty; otherwise
                       ``"rookie"`` for <= 6 rounds, ``"startup"`` for >= 14
                       rounds and ``"other"`` for anything else, including an
                       unknown round count.

    Args:
        drafts_df: Drafts table; it is not modified. Source columns that are
            absent are treated as entirely missing values.

    Returns:
        The enriched copy.
    """
    df = drafts_df.copy()

    # The casts below tolerate absent columns; make the derived columns
    # equally tolerant instead of failing with KeyError on a sparse frame.
    for c in ("start_time", "md_scoring_type", "st_rounds", "st_slots_super_flex"):
        if c not in df.columns:
            df[c] = np.nan

    for c in ("draft_id", "league_id"):
        if c in df.columns:
            df[c] = df[c].astype(str)

    for c in ("start_time", "st_teams", "st_rounds", "st_slots_super_flex"):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    df["start_dt"] = pd.to_datetime(df["start_time"], unit="ms", utc=True, errors="coerce")
    df["start_month"] = df["start_dt"].dt.strftime("%Y-%m")

    df["is_dynasty"] = df["md_scoring_type"].astype(str).str.startswith("dynasty_", na=False)
    df["is_superflex"] = df["st_slots_super_flex"].fillna(0) >= 1

    # Vectorized classification. np.select picks the first matching
    # condition, which reproduces the if/elif priority; comparisons against
    # NaN are False, so an unknown round count falls through to "other".
    rounds = df["st_rounds"]
    not_dynasty = ~df["is_dynasty"].to_numpy(dtype=bool)
    labels = np.select(
        [not_dynasty, (rounds <= 6).to_numpy(), (rounds >= 14).to_numpy()],
        ["non_dynasty", "rookie", "startup"],
        default="other",
    )
    df["dynasty_class"] = pd.Series(labels, index=df.index, dtype=object)
    return df

def _aggregate_adp(picks: pd.DataFrame, keys: List[str]) -> pd.DataFrame:
    """Aggregate pick numbers per *keys* group into draft-position statistics."""
    stats = (
        picks.groupby(keys, dropna=False)
        .agg(
            drafts=("draft_id", "nunique"),
            picks=("pick_no", "size"),
            adp=("pick_no", "mean"),
            min_pick=("pick_no", "min"),
            max_pick=("pick_no", "max"),
        )
        .reset_index()
    )
    stats["adp"] = stats["adp"].round(2)
    return stats


def compute_adp_time_series(picks_df: pd.DataFrame, draft_catalog: pd.DataFrame) -> pd.DataFrame:
    """Compute per-player ADP by month and for the whole season.

    Picks are joined to their draft's catalog row, restricted to drafts
    classified as ``"startup"`` or ``"rookie"``, and aggregated per player
    within each combination of season, dynasty class, draft type, scoring
    type, team count, round count and superflex flag. The aggregation is
    done once per ``start_month`` and once across all months, the latter
    labelled ``start_month == "ALL"``.

    Args:
        picks_df: Picks with ``draft_id``, ``player_id`` and ``pick_no``.
            Rows lacking a player id or a numeric pick number are ignored.
        draft_catalog: Catalog with ``draft_id`` plus the grouping columns.

    Returns:
        Monthly rows followed by season rows, each with ``drafts``,
        ``picks``, ``adp`` (rounded to 2 decimals), ``min_pick``, ``max_pick``.
    """
    p = picks_df.reindex(columns=["draft_id", "player_id", "pick_no"])
    p["pick_no"] = pd.to_numeric(p["pick_no"], errors="coerce")
    # Drop incomplete picks BEFORE casting player_id to str: the cast turns a
    # missing id into the literal "None"/"nan", which a later notna() check
    # can no longer detect and which would be averaged as a real player.
    p = p[p["pick_no"].notna() & p["player_id"].notna()].copy()
    p["draft_id"] = p["draft_id"].astype(str)
    p["player_id"] = p["player_id"].astype(str)

    d = draft_catalog[[
        "draft_id", "season", "type", "md_scoring_type", "st_teams", "st_rounds", "is_superflex",
        "dynasty_class", "start_month"
    ]].copy()
    d["draft_id"] = d["draft_id"].astype(str)

    m = p.merge(d, on="draft_id", how="left")
    m = m[m["dynasty_class"].isin(["startup", "rookie"])].copy()

    season_keys = [
        "season", "dynasty_class", "type", "md_scoring_type",
        "st_teams", "st_rounds", "is_superflex", "player_id",
    ]
    month_keys = ["season", "start_month"] + season_keys[1:]

    adp_month = _aggregate_adp(m, month_keys)
    adp_season = _aggregate_adp(m, season_keys)
    adp_season["start_month"] = "ALL"

    return pd.concat([adp_month, adp_season], ignore_index=True)

# ----------------------------
# RUN UPDATE
# ----------------------------
# End-to-end refresh for one season: discover leagues, pull their drafts and
# the picks of completed drafts, then write raw tables and derived snapshots.
season = CURRENT_SEASON
leagues_df, league_users_df = discover_leagues(season, SEED_USERS)

# Raw discovery results are written before the emptiness check below, so the
# files are (re)written even when nothing was discovered.
leagues_df.to_parquet(os.path.join(RAW_DIR, "leagues", f"leagues_{season}.parquet"), index=False)
league_users_df.to_parquet(os.path.join(RAW_DIR, "league_users", f"league_users_{season}.parquet"), index=False)

if leagues_df.empty:
    raise RuntimeError("No leagues discovered; cannot update.")

# One drafts request per distinct league.
league_ids = leagues_df["league_id"].astype(str).unique().tolist()
drafts_df = fetch_drafts_for_leagues(league_ids, season)
drafts_df.to_parquet(os.path.join(RAW_DIR, "drafts", f"drafts_{season}.parquet"), index=False)

# Typed/derived view of the drafts (dynasty class, superflex flag, start month).
draft_catalog = build_draft_catalog(drafts_df)

# Picks are fetched only for drafts whose status is "complete".
picks_df = fetch_picks_for_completed_drafts(drafts_df, season)
picks_df.to_parquet(os.path.join(RAW_DIR, "picks", f"picks_{season}.parquet"), index=False)

adp_ts = compute_adp_time_series(picks_df, draft_catalog)

# Snapshots are stored under a "season=<year>" sub-directory per dataset.
out_dir = os.path.join(SNAP_DIR, "adp_time_series", f"season={season}")
os.makedirs(out_dir, exist_ok=True)
adp_ts.to_parquet(os.path.join(out_dir, "adp_time_series.parquet"), index=False)

cat_dir = os.path.join(SNAP_DIR, "draft_catalog", f"season={season}")
os.makedirs(cat_dir, exist_ok=True)
draft_catalog.to_parquet(os.path.join(cat_dir, "draft_catalog.parquet"), index=False)

print("[OK] Updated ADP time-series:", os.path.join(out_dir, "adp_time_series.parquet"), adp_ts.shape)



=== [2023] DISCOVERY STEP 0 | users=5 ===


[2023] leagues chunk 1 (5): 100%|████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 31.96it/s]


[2023] Leagues fetched=172 | new=172


[2023] league users chunk 1 (172): 100%|█████████████████████████████████████████████| 172/172 [00:15<00:00, 11.34it/s]


[2023] Next frontier users=1280 | total users seen=1285

=== [2023] DISCOVERY STEP 1 | users=1280 ===


[2023] leagues chunk 1 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:01<00:00, 325.44it/s]
[2023] leagues chunk 2 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:00<00:00, 403.41it/s]
[2023] leagues chunk 3 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:01<00:00, 341.59it/s]
[2023] leagues chunk 4 (80): 100%|████████████████████████████████████████████████████| 80/80 [00:00<00:00, 512.39it/s]


[2023] Leagues fetched=17407 | new=17237


[2023] league users chunk 1 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 485.94it/s]
[2023] league users chunk 2 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 439.40it/s]
[2023] league users chunk 3 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 452.03it/s]
[2023] league users chunk 4 (400): 100%|████████████████████████████████████████████| 400/400 [00:01<00:00, 392.46it/s]
[2023] league users chunk 5 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 427.35it/s]
[2023] league users chunk 6 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 464.36it/s]
[2023] league users chunk 7 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 422.08it/s]
[2023] league users chunk 8 (400): 100%|████████████████████████████████████████████| 400/400 [00:00<00:00, 404.52it/s]
[2023] league users chunk 9 (400): 100%|

[2023] Next frontier users=61270 | total users seen=62555

=== [2023] DISCOVERY STEP 2 | users=2500 ===


[2023] leagues chunk 1 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:01<00:00, 235.86it/s]
[2023] leagues chunk 2 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:03<00:00, 123.55it/s]
[2023] leagues chunk 3 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:01<00:00, 334.75it/s]
[2023] leagues chunk 4 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:03<00:00, 130.97it/s]
[2023] leagues chunk 5 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:01<00:00, 243.97it/s]
[2023] leagues chunk 6 (400): 100%|█████████████████████████████████████████████████| 400/400 [00:01<00:00, 211.73it/s]
[2023] leagues chunk 7 (100): 100%|█████████████████████████████████████████████████| 100/100 [00:00<00:00, 333.10it/s]


[2023] Leagues fetched=39224 | new=29372
[2023] Hit MAX_LEAGUES_TOTAL cap.


[2023] drafts chunk 1 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 490.20it/s]
[2023] drafts chunk 2 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 496.46it/s]
[2023] drafts chunk 3 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 491.12it/s]
[2023] drafts chunk 4 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 481.29it/s]
[2023] drafts chunk 5 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 497.93it/s]
[2023] drafts chunk 6 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 512.17it/s]
[2023] drafts chunk 7 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 498.81it/s]
[2023] drafts chunk 8 (400): 100%|██████████████████████████████████████████████████| 400/400 [00:00<00:00, 504.37it/s]
[2023] drafts chunk 9 (400): 100%|██████

MemoryError: Unable to allocate 1.38 MiB for an array with shape (60275, 3) and data type object