# Match Crawler

This notebook handles the data collection process. It is a crawler for high-elo League of Legends matches. Crawling is done through the Riot Development API using the `riotwatcher` package.

In [9]:
import ast
import json
import os
import re

import pandas as pd
from riotwatcher import LolWatcher, ApiError

# Read the Riot API key from the environment instead of hardcoding it, so the
# secret never ends up in the notebook file, its outputs, or version control.
api_key = os.environ.get('RIOT_API_KEY')
if not api_key:
    raise RuntimeError(
        "RIOT_API_KEY is not set. Export your Riot API key "
        "(e.g. `export RIOT_API_KEY=RGAPI-...`) before running this notebook."
    )

# Client used for every Riot API request in the cells below.
watcher = LolWatcher(api_key=api_key)

# Server identifier passed to every API call below.
REGION = 'kr'

### Fetch Challenger League

Fetch the summoners in the challenger league of the Korea server and cache them locally in `data/kr_challenger_league.json`. The data was last cached on 2022-05-23.

In [10]:
# Load the challenger league from the local cache when it exists; otherwise
# fetch it from the API and cache the response for later runs.
if os.path.exists('data/kr_challenger_league.json'):
    with open('data/kr_challenger_league.json', 'r') as f:
        kr_challenger_league = json.load(f)
    print("Loaded previously cached data.")
else:
    kr_challenger_league = watcher.league.challenger_by_queue(REGION, "RANKED_SOLO_5x5")
    # Create the cache directory first: on a clean checkout `data/` may not
    # exist, and the freshly fetched response would be lost to a
    # FileNotFoundError when opening the cache file for writing.
    os.makedirs('data', exist_ok=True)
    with open('data/kr_challenger_league.json', 'w') as f:
        json.dump(kr_challenger_league, f)

Loaded previously cached data.


Fetch the `puuid` for each challenger account. This is necessary because matches can only be accessed by `puuid`.

In [11]:
# List of challenger summonerIds
challengers = [entry['summonerId'] for entry in kr_challenger_league['entries']]

# Fetch puuid for each summonerId.
#
# `accounts` is a list of strings, one per challenger, each being `str()` of
# the API response for that summoner. This string format is kept so that
# previously cached `data/accounts.json` files and the match-crawling cell
# (which parses each string itself) keep working.
if os.path.exists('data/accounts.json'):
    with open('data/accounts.json', 'r') as f:
        accounts = json.load(f)
    print("Loaded previously cached accounts")
else:
    # Make sure the cache directory exists before any API call is spent.
    os.makedirs('data', exist_ok=True)

    # Start from an empty scratch file: truncating cleans up leftovers from a
    # previous, interrupted crawl and guarantees the file exists.
    with open('data/accounts.txt', 'w', encoding='utf-8'):
        pass

    # Collect the records in memory instead of re-reading the scratch file
    # and regex-splitting it on '}{': that split breaks if a record ever
    # contains '}{' or '@@@', and the read fails when there are no challengers.
    accounts = []
    for i, challenger in enumerate(challengers):
        account = watcher.summoner.by_id(REGION, challenger)
        record = str(account)
        accounts.append(record)
        # Append to the scratch file as we go so that partial progress is
        # still inspectable if the crawl is interrupted.
        with open('data/accounts.txt', 'a', encoding='utf-8') as f:
            f.write(record)
        print(f"Counter: {i}.")

    with open('data/accounts.json', 'w') as f:
        json.dump(accounts, f)

Loaded previously cached accounts


### Fetch High Elo Matches and Timelines

Fetch the recent matches and timelines using the `puuid` of each challenger account.

In [37]:
# Maximum number of recent matches to crawl per account.
MATCH_PER_PUUID = 5
# Match ids already crawled; several challengers can share the same match.
visited_matchids = set()

matches_cached = os.path.exists('data/matches.json')
timelines_cached = os.path.exists('data/timelines.json')

# Fetch match for each PUUID
if matches_cached and timelines_cached:
    with open('data/matches.json', 'r') as f:
        matches = json.load(f)
    print("Loaded previously cached matches")
    with open('data/timelines.json', 'r') as f:
        timelines = json.load(f)
    print("Loaded previously cached timelines")

elif not matches_cached and not timelines_cached:

    # Make sure the cache directory exists before any API call is spent.
    os.makedirs('data', exist_ok=True)

    # Start from empty scratch files: truncating cleans up leftovers from a
    # previous, interrupted crawl and guarantees both files exist even when
    # nothing gets crawled (so the read-back below cannot fail).
    for scratch_path in ('data/matches.txt', 'data/timelines.txt'):
        with open(scratch_path, 'w', encoding='utf-8'):
            pass

    # Iterate over accounts
    for account_index, account in enumerate(accounts):
        # Each cached account is a string. Try strict JSON first, then fall
        # back to parsing it as a Python literal (the form produced by
        # `str()` on the API response). Unlike swapping quote characters,
        # this survives values containing apostrophes or double quotes.
        if isinstance(account, str):
            if not account.strip():
                continue  # skip blank entries from a malformed cache
            try:
                account = json.loads(account)
            except json.JSONDecodeError:
                account = ast.literal_eval(account)

        # NOTE(review): Riot's match-v5 endpoints are addressed by regional
        # routing values (e.g. 'asia'); confirm that the installed
        # riotwatcher version accepts the value in REGION here.
        matchids = watcher.match.matchlist_by_puuid(REGION, account['puuid'])

        # Iterate over recent matches (at most MATCH_PER_PUUID). Slicing
        # instead of indexing avoids an IndexError for accounts with fewer
        # than MATCH_PER_PUUID matches.
        for match_index, matchid in enumerate(matchids[:MATCH_PER_PUUID]):
            # Skip over visited matches
            if matchid in visited_matchids:
                continue
            match = watcher.match.by_id(REGION, matchid)
            timeline = watcher.match.timeline_by_match(REGION, matchid)
            with open('data/matches.txt', 'a', encoding='utf-8') as f:
                f.write(str(match))
                f.write("@@@@@@")
            with open('data/timelines.txt', 'a', encoding='utf-8') as f:
                f.write(str(timeline))
                f.write("@@@@@@")
            visited_matchids.add(matchid)
            print(f"Counter: {account_index * MATCH_PER_PUUID + match_index}.")

    # Save crawled matches to disk.
    # The on-disk format is kept unchanged: a list of `str()` records. Because
    # every record is followed by the delimiter, the list ends with one empty
    # string.
    with open('data/matches.txt', 'r', encoding='utf-8') as f:
        matches = f.read()
    matches = matches.split('@@@@@@')
    with open('data/matches.json', 'w') as f:
        json.dump(matches, f)

    # Save crawled timelines to disk (same format as the matches above).
    with open('data/timelines.txt', 'r', encoding='utf-8') as f:
        timelines = f.read()
    timelines = timelines.split('@@@@@@')
    with open('data/timelines.json', 'w') as f:
        json.dump(timelines, f)

else:
    # Exactly one of the two cache files exists; refuse to guess which one is
    # stale.
    print("Mismatch files, please check data directory")

Loaded previously cached matches
Loaded previously cached matches
