In [1]:
from __future__ import annotations

import json
import os
from pathlib import Path

# Crawl settings for the whole notebook. The RIOT_* entries can be overridden
# through environment variables; everything else is edited here.
CONFIG = {
    # Secret: taken from the environment only, so the key is never stored in the
    # notebook. An unset variable yields "", which the check right after this
    # dict rejects.
    # NOTE(review): a key used to be hard-coded as the default here; treat it as
    # leaked and regenerate it in the Riot developer portal.
    "riot_api_key": os.getenv("RIOT_API_KEY", "").strip(),
    "region_routing": os.getenv("RIOT_REGION_ROUTING", "asia"),
    "platform_routing": os.getenv("RIOT_PLATFORM_ROUTING", "kr"),
    # An empty RIOT_QUEUE_FILTER falls back to 420 instead of failing int("").
    "queue_id": int(os.getenv("RIOT_QUEUE_FILTER", "420") or 420),
    "target_match_count": 10000,
    "seeds_path": Path("seeds.txt"),
    "raw_output_path": Path("raw.csv"),
    "timeline_dir": Path("timeline"),
    "match_ids_per_page": 100,
    "max_player_queue": 400,
    # Client-side rate limit: at most `request_window_limit` requests per
    # `request_window_seconds` seconds.
    "request_window_seconds": 120.0,
    "request_window_limit": 95,
    "sleep_between_matches": 0.0,
}

# Fail fast when no key is configured; every API request needs it.
if not CONFIG["riot_api_key"]:
    raise RuntimeError("Set the RIOT_API_KEY environment variable before running this notebook.")

# Show the effective settings with the API key masked, so the secret never ends
# up in the saved cell output.
{**CONFIG, "riot_api_key": "***redacted***"}

{'riot_api_key': '<redacted>',
 'region_routing': 'asia',
 'platform_routing': 'kr',
 'queue_id': 420,
 'target_match_count': 10000,
 'seeds_path': WindowsPath('seeds.txt'),
 'raw_output_path': WindowsPath('raw.csv'),
 'timeline_dir': WindowsPath('timeline'),
 'match_ids_per_page': 100,
 'max_player_queue': 400,
 'request_window_seconds': 120.0,
 'request_window_limit': 95,
 'sleep_between_matches': 0.0}

In [2]:
import csv
import os
import time
from collections import deque
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
from urllib.parse import quote

import pandas as pd
import requests

# --- Credentials and routing -------------------------------------------------
API_KEY = CONFIG["riot_api_key"]
REGION_ROUTING = CONFIG["region_routing"]

# --- Paging and crawl limits -------------------------------------------------
MATCH_IDS_PER_PAGE = min(CONFIG["match_ids_per_page"], 100)  # page size is capped at 100 here
MAX_PLAYER_QUEUE = CONFIG["max_player_queue"]
SLEEP_BETWEEN_MATCHES = CONFIG["sleep_between_matches"]

# --- Retry and client-side rate limiting --------------------------------------
REQUEST_RETRIES = 5
REQUEST_WINDOW_SECONDS = CONFIG["request_window_seconds"]
REQUEST_WINDOW_LIMIT = CONFIG["request_window_limit"]

# --- Queue filter: an empty string means "no filter" ---------------------------
QUEUE_FILTER = CONFIG.get("queue_id")
if isinstance(QUEUE_FILTER, str) and len(QUEUE_FILTER) == 0:
    QUEUE_FILTER = None

# The auth header is only built when a key is present.
HEADERS = {}
if API_KEY:
    HEADERS = {"X-Riot-Token": API_KEY}

# Timestamps of recent requests, oldest first; read by throttle().
REQUEST_TIMES: deque[float] = deque()

# CSV column order for the per-participant output, grouped by theme.
FIELDNAMES = [
    # match-level metadata (repeated on every participant row)
    "match_id", "data_version", "queue_id", "map_id",
    "game_mode", "game_type", "game_version",
    "game_creation", "game_start_timestamp", "game_end_timestamp",
    "game_duration", "tournament_code",
    # participant identity
    "participant_puuid", "participant_riotIdGameName", "participant_riotIdTagline",
    "participant_summonerId", "participant_summonerName", "profile_icon",
    # team, position and outcome
    "team_id", "team_side", "team_position", "individual_position",
    "role", "lane", "win",
    # champion
    "champion_id", "champion_name", "champ_level",
    # combat score
    "kills", "deaths", "assists", "kda",
    # economy
    "gold_earned", "gold_spent",
    # damage
    "total_damage_dealt", "total_damage_dealt_to_champions", "total_damage_taken",
    "damage_dealt_to_objectives", "damage_dealt_to_turrets",
    # healing and shielding
    "total_heal", "total_heals_on_teammates", "total_damage_shielded_on_teammates",
    # vision
    "vision_score", "wards_placed", "wards_killed", "vision_wards_bought",
    # farming
    "total_minions_killed", "neutral_minions_killed", "cs",
    # multi-kills
    *(f"{size}_kills" for size in ("double", "triple", "quadra", "penta")),
    # summoner spells
    "summoner1_id", "summoner2_id",
    # item slots 0-6
    *(f"item{slot}" for slot in range(7)),
    # time-based stats
    "time_played", "time_ccing_others", "total_time_cc_dealt",
]

def log(message: str) -> None:
    """Print ``message`` prefixed with the current local time as ``[HH:MM:SS]``."""
    print("[{}] {}".format(time.strftime("%H:%M:%S"), message))

def throttle() -> None:
    """Sleep if the client-side request budget for the current window is used up.

    Reads the module-level ``REQUEST_TIMES`` deque (oldest timestamp first),
    discards entries at least ``REQUEST_WINDOW_SECONDS`` old, and, when
    ``REQUEST_WINDOW_LIMIT`` or more remain, sleeps until the oldest one leaves
    the window (plus a 0.05 s margin). A non-positive limit disables throttling.
    """
    if REQUEST_WINDOW_LIMIT <= 0:
        return

    current = time.time()

    # Forget requests that have already aged out of the window.
    while len(REQUEST_TIMES) > 0:
        if not (current - REQUEST_TIMES[0] >= REQUEST_WINDOW_SECONDS):
            break
        REQUEST_TIMES.popleft()

    if len(REQUEST_TIMES) < REQUEST_WINDOW_LIMIT:
        return

    oldest_age = current - REQUEST_TIMES[0]
    pause = REQUEST_WINDOW_SECONDS - oldest_age + 0.05
    if pause > 0:
        time.sleep(pause)

def riot_get(url: str, params: Optional[Dict[str, Any]] = None) -> Optional[Any]:
    """GET ``url`` from the Riot API with throttling and bounded retries.

    Up to ``REQUEST_RETRIES`` attempts are made. Network errors, HTTP 429 and
    HTTP 5xx responses are retried after a pause; every attempt is recorded in
    ``REQUEST_TIMES`` so ``throttle()`` can account for it.

    Args:
        url: Full endpoint URL.
        params: Optional query-string parameters.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` when the body is empty or
        not valid JSON, on HTTP 404, on any other non-retried status, or when
        all attempts are exhausted.

    Raises:
        RuntimeError: If no API key is configured.
    """
    if not API_KEY:
        raise RuntimeError("Set RIOT_API_KEY in your environment before running this notebook.")
    for attempt in range(REQUEST_RETRIES):
        throttle()
        try:
            resp = requests.get(url, headers=HEADERS, params=params, timeout=20)
        except requests.RequestException as exc:
            # Failed attempts still count towards the client-side rate window.
            REQUEST_TIMES.append(time.time())
            log(f"Request error ({exc}). Retrying...")
            time.sleep(1 + attempt)
            continue
        REQUEST_TIMES.append(time.time())
        status = resp.status_code
        if status == 200:
            if not resp.content:
                return None
            try:
                return resp.json()
            except ValueError as exc:
                # A truncated or non-JSON body must not abort a long crawl;
                # report it and let the caller treat the item as unavailable.
                log(f"Invalid JSON in response from {url}: {exc}")
                return None
        if status == 429:
            # Fall back to one second when Retry-After is absent or not a number.
            try:
                retry_after = max(0.0, float(resp.headers.get("Retry-After", "1")))
            except (TypeError, ValueError):
                retry_after = 1.0
            log(f"Rate limited. Sleeping for {retry_after} seconds.")
            time.sleep(retry_after + 0.1)
            continue
        if status == 404:
            return None
        if status >= 500:
            log(f"HTTP {status} server error. Retrying...")
            time.sleep(1 + attempt)
            continue
        log(f"HTTP {resp.status_code} error: {resp.text}")
        return None
    log(f"Gave up on {url} after {REQUEST_RETRIES} attempts.")
    return None

def read_seeds(path: str) -> List[str]:
    """Read seed entries (one per line) from a text file.

    Blank lines and lines starting with ``//`` are ignored; surrounding
    whitespace is stripped from every entry.

    Args:
        path: Location of the seeds file.

    Returns:
        The seed strings in file order, or an empty list (after logging) when
        the file does not exist.
    """
    if not os.path.exists(path):
        log(f"Seeds file not found at {path}.")
        return []
    seeds: List[str] = []
    # "utf-8-sig" decodes plain UTF-8 as well, and also drops a leading
    # byte-order mark. With plain "utf-8" the BOM would stay glued to the first
    # seed (str.strip() does not remove U+FEFF) and that seed would never resolve.
    with open(path, "r", encoding="utf-8-sig") as handle:
        for line in handle:
            value = line.strip()
            if not value or value.startswith("//"):
                continue
            seeds.append(value)
    return seeds

def looks_like_puuid(seed: str) -> bool:
    """Heuristically decide whether ``seed`` is a PUUID rather than a Riot ID.

    True when the string has at least 40 characters and each one is either
    alphanumeric or one of ``-``, ``_``, ``=``.
    """
    if len(seed) < 40:
        return False
    for ch in seed:
        if not (ch.isalnum() or ch in "-_="):
            return False
    return True

def resolve_seed(seed: str) -> Optional[Tuple[str, str]]:
    """Turn a seed entry into a ``(puuid, display_name)`` pair.

    A seed that already looks like a PUUID is returned unchanged as both
    elements. A ``Name#Tag`` seed is looked up through the account-v1
    ``by-riot-id`` endpoint.

    Args:
        seed: A PUUID or a Riot ID written as ``Name#Tag``.

    Returns:
        ``(puuid, display_name)`` on success, otherwise ``None`` after logging.
    """
    if looks_like_puuid(seed):
        return seed, seed
    if "#" in seed:
        # Tolerate stray spaces around the separator ("Name #Tag").
        game_name, tag_line = (part.strip() for part in seed.split("#", 1))
        # Skip the request when either half is empty; it cannot match an account.
        if game_name and tag_line:
            # safe="" so that "/" is percent-encoded too; quote() leaves it as-is
            # by default, which would change the URL path.
            url = (
                f"https://{REGION_ROUTING}.api.riotgames.com/riot/account/v1/accounts/by-riot-id/"
                f"{quote(game_name, safe='')}/{quote(tag_line, safe='')}"
            )
            data = riot_get(url)
            if isinstance(data, dict) and data.get("puuid"):
                display = f"{data.get('gameName', game_name)}#{data.get('tagLine', tag_line)}"
                return data["puuid"], display
    log(f"Could not resolve seed '{seed}'. Use Riot ID (Name#Tag) or a PUUID.")
    return None

def fetch_match_ids(puuid: str, start: int) -> List[str]:
    """Fetch one page of match IDs for a player.

    Requests ``MATCH_IDS_PER_PAGE`` IDs beginning at offset ``start``, adding
    the ``queue`` parameter when ``QUEUE_FILTER`` is set. Returns the response
    if it is a list, otherwise an empty list.
    """
    query: Dict[str, Any] = {
        "start": start,
        "count": MATCH_IDS_PER_PAGE,
        **({"queue": QUEUE_FILTER} if QUEUE_FILTER is not None else {}),
    }
    endpoint = "https://{}.api.riotgames.com/lol/match/v5/matches/by-puuid/{}/ids".format(
        REGION_ROUTING, puuid
    )
    payload = riot_get(endpoint, params=query)
    return payload if isinstance(payload, list) else []

def fetch_match(match_id: str) -> Optional[Dict[str, Any]]:
    """Download one match from the match-v5 endpoint.

    Returns the response when it is a dict, otherwise ``None``.
    """
    endpoint = "https://{}.api.riotgames.com/lol/match/v5/matches/{}".format(
        REGION_ROUTING, match_id
    )
    payload = riot_get(endpoint)
    return payload if isinstance(payload, dict) else None

def team_side(team_id: Optional[int]) -> str:
    """Map a team ID to its side label: 100 is "blue", 200 is "red", anything else ""."""
    return "blue" if team_id == 100 else "red" if team_id == 200 else ""

def extract_rows(match: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], Set[str]]:
    """Flatten one match payload into one row per participant.

    Args:
        match: Match payload with ``metadata`` and ``info`` sections.

    Returns:
        ``(rows, puuids)``: a list of row dicts (match-level fields plus that
        participant's stats) and the set of participant PUUIDs seen. Both are
        empty when ``QUEUE_FILTER`` is set and the match is from another queue.
    """
    meta = match.get("metadata") or {}
    details = match.get("info") or {}
    if QUEUE_FILTER is not None and details.get("queueId") != QUEUE_FILTER:
        return [], set()

    # gameDuration may arrive wrapped in a dict; unwrap its "duration" entry.
    duration = details.get("gameDuration")
    if isinstance(duration, dict):
        duration = duration.get("duration")

    # Fields shared by every participant row of this match.
    shared = {
        "match_id": meta.get("matchId") or details.get("gameId") or "",
        "data_version": meta.get("dataVersion", ""),
        "queue_id": details.get("queueId"),
        "map_id": details.get("mapId"),
        "game_mode": details.get("gameMode", ""),
        "game_type": details.get("gameType", ""),
        "game_version": details.get("gameVersion", ""),
        "game_creation": details.get("gameCreation"),
        "game_start_timestamp": details.get("gameStartTimestamp"),
        "game_end_timestamp": details.get("gameEndTimestamp"),
        "game_duration": duration,
        "tournament_code": details.get("tournamentCode", ""),
    }

    # (output column, payload key) pairs copied verbatim; missing values become "".
    text_fields = (
        ("participant_riotIdGameName", "riotIdGameName"),
        ("participant_riotIdTagline", "riotIdTagline"),
        ("participant_summonerId", "summonerId"),
        ("participant_summonerName", "summonerName"),
        ("team_position", "teamPosition"),
        ("individual_position", "individualPosition"),
        ("role", "role"),
        ("lane", "lane"),
        ("champion_name", "championName"),
    )
    # (output column, payload key) pairs copied verbatim; missing values become None.
    optional_fields = (
        ("profile_icon", "profileIcon"),
        ("win", "win"),
        ("champion_id", "championId"),
        ("champ_level", "champLevel"),
        ("gold_earned", "goldEarned"),
        ("gold_spent", "goldSpent"),
        ("total_damage_dealt", "totalDamageDealt"),
        ("total_damage_dealt_to_champions", "totalDamageDealtToChampions"),
        ("total_damage_taken", "totalDamageTaken"),
        ("damage_dealt_to_objectives", "damageDealtToObjectives"),
        ("damage_dealt_to_turrets", "damageDealtToTurrets"),
        ("total_heal", "totalHeal"),
        ("total_heals_on_teammates", "totalHealsOnTeammates"),
        ("total_damage_shielded_on_teammates", "totalDamageShieldedOnTeammates"),
        ("vision_score", "visionScore"),
        ("wards_placed", "wardsPlaced"),
        ("wards_killed", "wardsKilled"),
        ("vision_wards_bought", "visionWardsBoughtInGame"),
        ("double_kills", "doubleKills"),
        ("triple_kills", "tripleKills"),
        ("quadra_kills", "quadraKills"),
        ("penta_kills", "pentaKills"),
        ("summoner1_id", "summoner1Id"),
        ("summoner2_id", "summoner2Id"),
        ("time_played", "timePlayed"),
        ("time_ccing_others", "timeCCingOthers"),
        ("total_time_cc_dealt", "totalTimeCCDealt"),
    )

    rows: List[Dict[str, Any]] = []
    puuids: Set[str] = set()
    for player in details.get("participants") or []:
        player_puuid = player.get("puuid", "")
        puuids.add(player_puuid)

        # Missing or null K/D/A values count as zero.
        kills = player.get("kills", 0) or 0
        deaths = player.get("deaths", 0) or 0
        assists = player.get("assists", 0) or 0

        lane_minions = player.get("totalMinionsKilled")
        jungle_minions = player.get("neutralMinionsKilled")
        # Combined creep score stays None only when both inputs are absent.
        if lane_minions is None and jungle_minions is None:
            creep_score = None
        else:
            creep_score = (lane_minions or 0) + (jungle_minions or 0)

        side_id = player.get("teamId")

        row = {**shared, "participant_puuid": player_puuid}
        row.update((column, player.get(key, "")) for column, key in text_fields)
        row.update((column, player.get(key)) for column, key in optional_fields)
        row.update({f"item{slot}": player.get(f"item{slot}") for slot in range(7)})
        row.update(
            team_id=side_id,
            team_side=team_side(side_id),
            kills=kills,
            deaths=deaths,
            assists=assists,
            # Deaths are floored at 1 so a deathless game does not divide by zero.
            kda=round((kills + assists) / max(1, deaths), 3),
            total_minions_killed=lane_minions,
            neutral_minions_killed=jungle_minions,
            cs=creep_score,
        )
        rows.append(row)
    return rows, puuids


In [3]:
# Load the raw seed entries; there is nothing to crawl without them.
seeds = read_seeds(str(CONFIG["seeds_path"]))
if len(seeds) == 0:
    raise RuntimeError(f"No seeds found at {CONFIG['seeds_path']}. Populate the file with Riot IDs (Name#Tag) or PUUIDs.")

# Resolve every entry to (puuid, display_name) and keep only the successes.
resolved_seeds: List[Tuple[str, str]] = [
    resolution for resolution in map(resolve_seed, seeds) if resolution
]

if len(resolved_seeds) == 0:
    raise RuntimeError("None of the seeds resolved to a PUUID. Verify the entries and API key.")

len(resolved_seeds), resolved_seeds[:5]


[16:11:05] Could not resolve seed 'DNF Quantum#KR1'. Use Riot ID (Name#Tag) or a PUUID.
[16:11:33] Could not resolve seed 'NASA#쟁구'. Use Riot ID (Name#Tag) or a PUUID.
[16:12:48] Could not resolve seed '나만 없어 개냥이#냥멍'. Use Riot ID (Name#Tag) or a PUUID.
[16:13:15] Could not resolve seed 'min#ST'. Use Riot ID (Name#Tag) or a PUUID.
[16:13:47] Could not resolve seed '아이스크림 콘 뿔보#ST'. Use Riot ID (Name#Tag) or a PUUID.
[16:14:02] Could not resolve seed '아름다운 나라#KT'. Use Riot ID (Name#Tag) or a PUUID.
[16:14:34] Could not resolve seed '소프트#KT'. Use Riot ID (Name#Tag) or a PUUID.
[16:14:40] Could not resolve seed 'DK Despair#안산'. Use Riot ID (Name#Tag) or a PUUID.
[16:15:30] Could not resolve seed 'NoRoo#T1'. Use Riot ID (Name#Tag) or a PUUID.
[16:15:36] Could not resolve seed '렝 화#RH'. Use Riot ID (Name#Tag) or a PUUID.
[16:15:45] Could not resolve seed '해 태#T1'. Use Riot ID (Name#Tag) or a PUUID.
[16:15:54] Could not resolve seed '깜지곰#링곰'. Use Riot ID (Name#Tag) or a PUUID.
[16:15:57] Could

(469,
 [('N3t4YEyVIKhXM8TeWQXcsc7LuFW8DrrxLwBDkwUqYlBfFZ8tTiubJeZljFt_RqVCslJLPvFFkYXhlA',
   'DRX kyeahoo#0813'),
  ('cFWxSdXdtUfU6_tbByX0HAY-x4v0EzI0Bp9C7iZmvqwWEk4i2zRbBxx-XzzmTxNr3hXgHAYYWzFx2A',
   '진돗개#0415'),
  ('KQJxaCtJ9GOA-_2IWrkXabJhSsO8dqhiKkT3ZaBSRR9gPx_718IwWe1UeJ4BTPq8TJ5yO9frMas7dA',
   'Peyz#KR11'),
  ('cpwv88LWJbzd0poxei5Uj2EkvHZJNkU9ldroWdtMYqYFLgruoqQgXfFz-H1wh42YNqc2YgABysaZ9g',
   'Vincenzo#kr40'),
  ('NOu1vi3U3LJdz_vLRCHumr6GGqGJIMo7x6O1ECDd1OgfreTbw1LaV9x0f-iaVFfg-tSZwqRWWUTdQw',
   'Hype#KRR')])

In [4]:
def collect_matches(
    target_match_count: int,
    seeds: Sequence[Tuple[str, str]],
    output_csv: Path,
    max_player_queue: int,
    progress_every: int = 50,
) -> List[str]:
    """Harvest ranked matches and write per-participant rows to CSV.

    Players are visited round-robin. Each visit fetches the player's next page
    of match IDs, downloads every match not seen before, appends its rows to
    the CSV and enqueues newly discovered participants until
    ``max_player_queue`` players are known. The loop ends when
    ``target_match_count`` matches have been written or no player has history
    left.

    Args:
        target_match_count: Number of matches to write before stopping.
        seeds: ``(puuid, display_name)`` pairs used to start the crawl. Blank
            and repeated PUUIDs are ignored.
        output_csv: Destination file; it is created or overwritten.
        max_player_queue: Upper bound on the number of distinct players
            tracked, seeds included.
        progress_every: Print a progress line every this many written matches;
            zero or a negative value disables progress output.

    Returns:
        Sorted list of every match ID encountered. This includes IDs whose
        download failed or whose rows were filtered out, so it can be longer
        than the number of matches written to the CSV.
    """
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    queue: deque[str] = deque()
    cursor: Dict[str, int] = {}
    known_puuids: set[str] = set()
    for puuid, _ in seeds:
        # Two seed entries can resolve to the same player; queue each player once.
        if not puuid or puuid in known_puuids:
            continue
        queue.append(puuid)
        cursor[puuid] = 0
        known_puuids.add(puuid)

    seen_matches: set[str] = set()
    total_rows = 0
    matches_collected = 0
    start_time = time.time()

    with output_csv.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=FIELDNAMES, extrasaction="ignore")
        writer.writeheader()
        while queue and matches_collected < target_match_count:
            puuid = queue.popleft()
            start = cursor.get(puuid, 0)
            match_ids = fetch_match_ids(puuid, start)
            if not match_ids:
                # No further history (or the request failed): the player is
                # dropped from the rotation.
                continue
            cursor[puuid] = start + len(match_ids)
            queue.append(puuid)

            for match_id in match_ids:
                if match_id in seen_matches:
                    continue

                match_data = fetch_match(match_id)
                # Marked as seen even on failure so the ID is never re-requested.
                seen_matches.add(match_id)
                if not match_data:
                    continue

                rows, puuids = extract_rows(match_data)
                if not rows:
                    continue

                writer.writerows(rows)
                # Flush per match so an interrupted run keeps what it has written.
                handle.flush()
                total_rows += len(rows)
                matches_collected += 1

                for new_puuid in puuids:
                    if (
                        new_puuid
                        and new_puuid not in known_puuids
                        and len(known_puuids) < max_player_queue
                    ):
                        known_puuids.add(new_puuid)
                        cursor[new_puuid] = 0
                        queue.append(new_puuid)

                # Guard the modulo: progress_every <= 0 means "no progress output".
                if progress_every > 0 and matches_collected % progress_every == 0:
                    elapsed = time.time() - start_time
                    print(
                        f"Collected {matches_collected} matches ({total_rows} rows) in {elapsed/60:.1f} minutes."
                    )

                if matches_collected >= target_match_count:
                    break

                if SLEEP_BETWEEN_MATCHES > 0:
                    time.sleep(SLEEP_BETWEEN_MATCHES)

    print(
        f"Finished match harvest: {matches_collected} unique matches written to {output_csv}."
    )
    return sorted(seen_matches)


In [None]:
# Run the harvest with the notebook-level settings, then show how many match
# IDs the crawl returned.
unique_match_ids = collect_matches(
    CONFIG["target_match_count"],
    resolved_seeds,
    CONFIG["raw_output_path"],
    CONFIG["max_player_queue"],
)
len(unique_match_ids)

Collected 50 matches (500 rows) in 1.2 minutes.
Collected 100 matches (1000 rows) in 2.5 minutes.
Collected 150 matches (1500 rows) in 3.8 minutes.
Collected 200 matches (2000 rows) in 5.1 minutes.
Collected 250 matches (2500 rows) in 6.4 minutes.
Collected 300 matches (3000 rows) in 7.8 minutes.
Collected 350 matches (3500 rows) in 9.1 minutes.
Collected 400 matches (4000 rows) in 10.5 minutes.
Collected 450 matches (4500 rows) in 11.7 minutes.
Collected 500 matches (5000 rows) in 12.9 minutes.
Collected 550 matches (5500 rows) in 14.1 minutes.
Collected 600 matches (6000 rows) in 15.4 minutes.
Collected 650 matches (6500 rows) in 16.8 minutes.
Collected 700 matches (7000 rows) in 18.0 minutes.
Collected 750 matches (7500 rows) in 19.3 minutes.
Collected 800 matches (8000 rows) in 20.6 minutes.
Collected 850 matches (8500 rows) in 22.0 minutes.
Collected 900 matches (9000 rows) in 23.3 minutes.
Collected 950 matches (9500 rows) in 24.6 minutes.
Collected 1000 matches (10000 rows) in 2

10002

In [None]:
# Rebuild the match list from the CSV so it reflects what is actually on disk.
match_ids_series = pd.read_csv(
    CONFIG["raw_output_path"],
    usecols=["match_id"],
).squeeze("columns")
# Non-null IDs, de-duplicated in order of first appearance.
unique_match_ids = match_ids_series.dropna().unique().tolist()
print(f"{len(unique_match_ids)} unique matches available for timeline download.")

10000 unique matches available for timeline download.


In [4]:
def fetch_and_store_timelines(match_ids: Sequence[str], config: dict) -> None:
    """Download the timeline of each match into ``<timeline_dir>/<match_id>.json``.

    Matches whose file already exists are skipped, so the function can be
    re-run to resume an interrupted download.

    Args:
        match_ids: Match IDs to fetch timelines for.
        config: Settings dict; ``timeline_dir`` and ``region_routing`` are read.
    """
    timeline_dir = Path(config["timeline_dir"])
    timeline_dir.mkdir(parents=True, exist_ok=True)

    saved = 0
    skipped = 0
    total = len(match_ids)
    url_prefix = f"https://{config['region_routing']}.api.riotgames.com/lol/match/v5/matches"

    for idx, match_id in enumerate(match_ids, start=1):
        output_path = timeline_dir / f"{match_id}.json"
        if output_path.exists():
            skipped += 1
        else:
            payload = riot_get(f"{url_prefix}/{match_id}/timeline")
            if isinstance(payload, dict):
                # Write to a side file and rename it into place: an interrupted
                # run must not leave a truncated "<match_id>.json" behind,
                # because existing files are treated as finished on resume.
                partial_path = timeline_dir / f"{match_id}.json.part"
                with partial_path.open("w", encoding="utf-8") as handle:
                    json.dump(payload, handle)
                partial_path.replace(output_path)
                saved += 1
            else:
                print(f"Timeline unavailable for {match_id} (response type {type(payload)}).")

        # Reported on every path (saved, skipped or unavailable) so resumed
        # runs still show progress and the final item is always reported.
        if idx % 100 == 0 or idx == total:
            print(
                f"Processed {idx}/{total} timelines | saved={saved} skipped={skipped}."
            )

    print(f"Timeline download complete: {saved} new files, {skipped} skipped.")


In [None]:
# Download a timeline for every harvested match; files already on disk are skipped.
fetch_and_store_timelines(match_ids=unique_match_ids, config=CONFIG)

Processed 100/10000 timelines | saved=100 skipped=0.
Processed 200/10000 timelines | saved=200 skipped=0.
Processed 300/10000 timelines | saved=300 skipped=0.
Processed 400/10000 timelines | saved=400 skipped=0.
Processed 500/10000 timelines | saved=500 skipped=0.
Processed 600/10000 timelines | saved=600 skipped=0.
Processed 700/10000 timelines | saved=700 skipped=0.
Processed 800/10000 timelines | saved=800 skipped=0.
Processed 900/10000 timelines | saved=900 skipped=0.
Processed 1000/10000 timelines | saved=1000 skipped=0.
Processed 1100/10000 timelines | saved=1100 skipped=0.
Processed 1200/10000 timelines | saved=1200 skipped=0.
Processed 1300/10000 timelines | saved=1300 skipped=0.
Processed 1400/10000 timelines | saved=1400 skipped=0.
Processed 1500/10000 timelines | saved=1500 skipped=0.
Processed 1600/10000 timelines | saved=1600 skipped=0.
Processed 1700/10000 timelines | saved=1700 skipped=0.
Processed 1800/10000 timelines | saved=1800 skipped=0.
Processed 1900/10000 timelin