# Match Crawler

This notebook is responsible for the data collection process. It is a crawler for high-elo League of Legends matches. Crawling is done through the Riot Developer API using the `riotwatcher` package.

In [1]:
from riotwatcher import LolWatcher, ApiError
from tqdm.notebook import tqdm
import zilean

import os
import json
import re

# Server to crawl; every API request in this notebook passes this value.
REGION = 'kr'

# The key is obtained through the project's `zilean` helper instead of being
# written into the notebook, then handed to the riotwatcher client.
api_key = zilean.read_api_key()
watcher = LolWatcher(api_key=api_key)

### Fetch Challenger League

Fetch the summoners in the Challenger league of the Korea server and cache them locally in `data/kr_challenger_league.json`. The cache was last refreshed on 2022-05-23.

In [2]:
# Location of the cached challenger-league snapshot.
CHALLENGER_LEAGUE_PATH = 'data/kr_challenger_league.json'

if os.path.exists(CHALLENGER_LEAGUE_PATH):
    # Reuse the snapshot on disk so a re-run does not spend API quota.
    with open(CHALLENGER_LEAGUE_PATH, 'r') as f:
        kr_challenger_league = json.load(f)
    print("Loaded previously cached data.")
else:
    # open(..., 'w') does not create missing parent directories, so make sure
    # the cache directory exists *before* spending an API call.
    os.makedirs(os.path.dirname(CHALLENGER_LEAGUE_PATH), exist_ok=True)
    kr_challenger_league = watcher.league.challenger_by_queue(REGION, "RANKED_SOLO_5x5")
    with open(CHALLENGER_LEAGUE_PATH, 'w') as f:
        json.dump(kr_challenger_league, f)

Loaded previously cached data.


Fetch the `puuid` for each challenger account. This is because we can only access matches using `puuid`.

In [3]:
# List of challenger summonerIds
challengers = [entry['summonerId'] for entry in kr_challenger_league['entries']]

# Fetch puuid for each summonerId.
# `accounts` is a list of strings, one serialised account record per entry;
# the match-crawling cell parses each string with json.loads.
if os.path.exists('data/accounts.json'):
    with open('data/accounts.json', 'r') as f:
        accounts = json.load(f)
    print("Loaded previously cached accounts")
else:
    # Clean dirty files resulting from previous search
    if os.path.exists('data/accounts.txt'):
        os.remove('data/accounts.txt')

    accounts = []
    # Iterate over summonerIds
    for challenger in tqdm(challengers):
        account = watcher.summoner.by_id(REGION, challenger)
        # Serialise as real JSON (not str(dict), which is Python repr) so the
        # record parses reliably later, whatever characters the values contain.
        record = json.dumps(account)
        # Append one record per line as we go, so an interrupted crawl still
        # leaves the accounts fetched so far on disk for inspection.
        with open('data/accounts.txt', 'a') as f:
            f.write(record + '\n')
        accounts.append(record)

    with open('data/accounts.json', 'w') as f:
        json.dump(accounts, f)

Loaded previously cached accounts


### Fetch High Elo Matches and Timelines

Fetch the IDs of the most recent matches for each challenger account's `puuid`, then fetch the timeline of each of those matches.

In [4]:
# Upper bound on how many of the most recent matches are crawled per account.
MATCH_PER_PUUID = 5
# Match ids already fetched; challengers often share games, so this avoids
# requesting and storing the same timeline twice.
visited_matchids = set()

# Fetch match for each PUUID
if os.path.exists('data/matches.json'):
    with open('data/matches.json', 'r') as f:
        matches = json.load(f)
    print("Loaded previously cached matches")
else:
    # Iterate over accounts (n=300)
    for account in tqdm(accounts):
        # Each cached account is a stringified dict; swap single quotes for
        # double quotes so that it parses as JSON.
        account = json.loads(account.replace("'", "\""))
        matchids = watcher.match.matchlist_by_puuid(REGION, account['puuid'])
        # Slice rather than index: an account with fewer than MATCH_PER_PUUID
        # recent matches would otherwise raise IndexError and abort the crawl.
        for matchid in matchids[:MATCH_PER_PUUID]:
            # Skip over visited matches
            if matchid in visited_matchids:
                continue
            timeline = watcher.match.timeline_by_match(REGION, matchid)
            matchinfo = {"id": matchid, "timeline": timeline}
            # NOTE(review): presumably appends the record to the file in a
            # not-yet-valid JSON form that clean_json repairs below - confirm
            # against the zilean helpers.
            zilean.write_messy_json(matchinfo, "data/matches.json")
            visited_matchids.add(matchid)

    zilean.clean_json("data/matches.json")
    with open('data/matches.json', 'r') as f:
        matches = json.load(f)

Loaded previously cached matches


### Cleanup and store to JSON

The file `matches.json` may contain some unwanted matches. We iterate through each match and only retain matches that are at least `min_duration` (16) minutes long.

In [5]:
# Shortest match length, in minutes, that is kept
min_duration = 16

# Boolean mask aligned with `matches`: True when the timeline holds at least
# as many frames as `min_duration` minutes would span at the match's own
# frame interval (frameInterval is treated as milliseconds, hence 60000).
long_matchids = []
for match in matches:
    info = match['timeline']['info']
    required_frames = int(min_duration * 60000 / info['frameInterval'])
    long_matchids.append(len(info['frames']) >= required_frames)

# Keep only the matches flagged in the mask
matches = [matches[idx] for idx, keep in enumerate(long_matchids) if keep]
print(f"There are in total {len(matches)} crawled KR high elo matches longer than {min_duration} minutes.")

# Save to disk
with open('data/matches_cleaned.json', 'w') as f:
    json.dump(matches, f)

There are in total 787 crawled KR high elo matches longer than 16 minutes.
