# Web scraper to obtain World Championship matches

## Workflow for scraping data

In [1]:
# Imports

import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# Matchlist pages to scrape: one URL per World Championship, for the years 2014 through 2022.
# The loop variable is the two-digit year that gets appended after the "20" prefix in the URL.
tournament_matchlist_urls = []
for two_digit_year in range(14, 23):
    tournament_matchlist_urls.append(
        f"https://gol.gg/tournament/tournament-matchlist/World%20Championship%2020{two_digit_year}/"
    )

In [3]:
# Display the list of URLs (as the cell output) to confirm one was generated per tournament year
tournament_matchlist_urls

['https://gol.gg/tournament/tournament-matchlist/World%20Championship%202014/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202015/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202016/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202017/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202018/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202019/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202020/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202021/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202022/']

In [4]:
def get_match_numbers(tournament_matchlist_url):
    '''
    Retrieves the list of game numbers from the matches in a given tournament.
    Input: URL of the tournament matchlist
    Returns: A list of match numbers (as strings) corresponding to the games in that tournament,
             sorted in ascending numeric order.
    '''

    # We need to use a request header to pretend we are using a popular browser or the website will
    # (correctly) think that we are a bot.
    request_headers = {'User-Agent': 'Mozilla/5.0'}

    # Load the URL using requests and parse the html using BeautifulSoup
    tournament_matchlist = requests.get(tournament_matchlist_url, headers=request_headers)
    tournament_matchlist_soup = BeautifulSoup(tournament_matchlist.text)

    # Select the table in which we are interested (uses CSS selectors)
    matchlist_table = tournament_matchlist_soup.select('table.table_list')[0]

    # Find all of the 'a' tags (recall that <a href... tags in HTML are links) in the table and pull out
    # their targets. An anchor without an href attribute yields None, which would break the substring
    # tests below, so those (and empty hrefs) are dropped here.
    links = [anchor.get("href") for anchor in matchlist_table.find_all('a')]
    links = [href for href in links if href]

    # If the link does not point at a summary page, then there is only one game in the match and the
    # game number is the 4th '/'-separated piece of the link.
    # NOTE(review): this relies on the hrefs looking like '../game/stats/<number>/...' - confirm against the site.
    match_numbers = [re.split('/', href)[3] for href in links if 'summary' not in href]

    # If the link has 'summary' in it, then there may be multiple games in the match.
    links_multiples = [href for href in links if 'summary' in href]

    # In this case, getting the match numbers is a little trickier. First, we find out how many games were
    # in the match by loading the summary page for each link in links_multiples.
    for link in links_multiples:
        # Drop the first two characters of the link before prepending the host name
        link_data = requests.get('https://gol.gg' + link[2:], headers=request_headers)
        link_soup = BeautifulSoup(link_data.text)
        # Count the number of times the div class 'row pb-1' appears in the html. This is the number of games played.
        n_games = len(link_soup.find_all("div", {"class": "row pb-1"}))
        # The link carries the number of the first game; the remaining games are obtained by adding 1, 2, ...
        # to it (offset 0 is the linked game itself).
        # NOTE(review): assumes the games of one match have consecutive numbers - verify against the site.
        first_game_number = int(re.split('/', link)[3])
        match_numbers.extend(str(first_game_number + offset) for offset in range(n_games))

    # The numbers are stored as strings, so sort on their integer value; a plain string sort would
    # place e.g. '1000' before '999'.
    return sorted(match_numbers, key=int)


def flatten(l):
    '''
    Collapses a list of lists by one level, returning a single list that holds
    the items of every sublist in their original order.
    '''

    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened

In [5]:
%%time
# Collect the match numbers of every tournament in tournament_matchlist_urls into one flat list.
# Each call to get_match_numbers downloads pages, so this cell takes a while to run.
match_numbers = flatten(get_match_numbers(url) for url in tournament_matchlist_urls)

Wall time: 21.7 s


In [6]:
def get_match_url(match_number):
    '''
    Builds the URL of the full-stats data table for the match with the given number.
    '''

    stats_root = 'https://gol.gg/game/stats'
    return f'{stats_root}/{match_number}/page-fullstats/'

In [7]:
# Build the full-stats URL for every match number, keeping the same order as match_numbers
match_urls = list(map(get_match_url, match_numbers))

In [11]:
def get_df(match_url):
    '''
    Takes a match URL and returns the match data in the form of a pandas dataframe.
    Input: URL of the page holding the match's stats table
    Returns: A dataframe with one row per player and one column per stat found in the first table of
             the page. The rows are indexed by 'Role', using the labels BLUE_TOP ... RED_SUP.
    '''

    # Labels written over the 'Role' value of the first ten rows, in row order.
    # NOTE(review): assumes the page lists the blue side first, then the red side, each in
    # top / jungle / mid / adc / support order - confirm against the site.
    role_labels = [
        'BLUE_TOP', 'BLUE_JNG', 'BLUE_MID', 'BLUE_ADC', 'BLUE_SUP',
        'RED_TOP', 'RED_JNG', 'RED_MID', 'RED_ADC', 'RED_SUP',
    ]

    # Same browser-like header as elsewhere so the site does not reject the request
    link_data = requests.get(match_url, headers={'User-Agent': 'Mozilla/5.0'})

    # Parse the first table of the page. The html text is wrapped in StringIO because passing literal
    # html to read_html is deprecated in recent pandas versions.
    stats = pd.read_html(StringIO(link_data.text))[0]

    # The first column holds the stat names: make it the index, then transpose so that the stats
    # become columns and each remaining original column becomes a row.
    stats = stats.set_index('Unnamed: 0')
    stats.index.name = None
    stats = stats.T

    # Overwrite the 'Role' value of each row with its side-qualified label. This is done with a single
    # .iloc[row, column] assignment per cell: assigning through stats.iloc[row].Role is chained
    # assignment, which only works when iloc happens to return a view and is a silent no-op under
    # pandas Copy-on-Write.
    role_position = stats.columns.get_loc('Role')
    for row_position, label in enumerate(role_labels):
        stats.iloc[row_position, role_position] = label

    return stats.set_index('Role')


In [12]:
# Fetch the stats table for the first URL in match_urls and display it as the cell output
get_df(match_urls[0])

Unnamed: 0_level_0,Player,Level,Kills,Deaths,Assists,KDA,CS,CS in Team's Jungle,CS in Enemy Jungle,CSM,...,Damage self mitigated,Total Damage Shielded On Teammates,Time ccing others,Total Time CC Dealt,Total damage taken,Total Time Spent Dead,Consumables purchased,Items Purchased,Shutdown bounty collected,Shutdown bounty lost
Role,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BLUE_TOP,Koro1,,0,6,6,1.0,322,7,2,7.3,...,,,,,,,,,,
BLUE_JNG,ClearLove,,2,3,7,3.0,131,91,4,3.0,...,,,,,,,,,,
BLUE_MID,U,,4,4,3,1.8,320,40,4,7.3,...,,,,,,,,,,
BLUE_ADC,NaMei,,4,4,5,2.3,380,37,8,8.6,...,,,,,,,,,,
BLUE_SUP,FZZF,,1,3,9,3.3,42,0,0,1.0,...,,,,,,,,,,
RED_TOP,Looper,,3,3,10,4.3,267,4,0,6.1,...,,,,,,,,,,
RED_JNG,DanDy,,3,1,9,12.0,198,102,31,4.5,...,,,,,,,,,,
RED_MID,PawN,,2,4,10,3.0,350,15,3,8.0,...,,,,,,,,,,
RED_ADC,imp,,9,2,7,8.0,411,32,17,9.4,...,,,,,,,,,,
RED_SUP,Mata,,3,1,14,17.0,27,0,0,0.6,...,,,,,,,,,,


In [16]:
# Fetch the stats table for the last URL in match_urls, keep it in df for the next cell, and display it
df = get_df(match_urls[-1])
df

Unnamed: 0_level_0,Player,Level,Kills,Deaths,Assists,KDA,CS,CS in Team's Jungle,CS in Enemy Jungle,CSM,...,Damage self mitigated,Total Damage Shielded On Teammates,Time ccing others,Total Time CC Dealt,Total damage taken,Total Time Spent Dead,Consumables purchased,Items Purchased,Shutdown bounty collected,Shutdown bounty lost
Role,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BLUE_TOP,Zeus,18,4,4,3,1.8,292,37,,6.9,...,31989,0,9,93,36130,100,11,31,,
BLUE_JNG,Oner,17,2,2,7,4.5,256,181,,6.1,...,30126,0,17,222,31569,96,12,32,,
BLUE_MID,Faker,18,2,5,4,1.2,325,24,,7.7,...,22574,0,21,473,20289,144,15,33,,
BLUE_ADC,Gumayusi,17,1,3,3,1.3,356,32,,8.4,...,8791,0,27,481,12202,84,13,31,,
BLUE_SUP,Keria,13,1,5,3,0.8,35,0,,0.8,...,11043,3543,20,224,15192,155,30,44,,
RED_TOP,kingen,18,6,3,6,4,269,27,,6.4,...,61310,0,20,189,40472,150,13,35,,
RED_JNG,Pyosik,16,5,4,8,3.3,192,163,,4.6,...,50572,0,19,223,40180,155,22,41,,
RED_MID,Zeka,18,3,2,9,6,387,29,,9.2,...,13752,0,15,312,15604,42,14,33,,
RED_ADC,Deft,18,5,0,4,Perfect KDA,388,37,,9.2,...,18651,0,12,76,22208,0,9,31,,
RED_SUP,BeryL,15,0,1,10,10,47,0,,1.1,...,15724,1047,24,194,21875,35,30,47,,


In [17]:
# Show df without the stats that are missing for every player (columns that are entirely null).
# dropna removes exactly those columns; reselecting them by label instead would bring back every
# column sharing a kept label if the table ever contained duplicate stat names.
df.dropna(axis='columns', how='all')

Unnamed: 0_level_0,Player,Level,Kills,Deaths,Assists,KDA,CS,CS in Team's Jungle,CSM,Golds,...,Total heal,Total Heals On Teammates,Damage self mitigated,Total Damage Shielded On Teammates,Time ccing others,Total Time CC Dealt,Total damage taken,Total Time Spent Dead,Consumables purchased,Items Purchased
Role,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BLUE_TOP,Zeus,18,4,4,3,1.8,292,37,6.9,15552,...,12842,0,31989,0,9,93,36130,100,11,31
BLUE_JNG,Oner,17,2,2,7,4.5,256,181,6.1,14713,...,16210,0,30126,0,17,222,31569,96,12,32
BLUE_MID,Faker,18,2,5,4,1.2,325,24,7.7,15380,...,1294,0,22574,0,21,473,20289,144,15,33
BLUE_ADC,Gumayusi,17,1,3,3,1.3,356,32,8.4,16301,...,3036,0,8791,0,27,481,12202,84,13,31
BLUE_SUP,Keria,13,1,5,3,0.8,35,0,0.8,10124,...,863,0,11043,3543,20,224,15192,155,30,44
RED_TOP,kingen,18,6,3,6,4,269,27,6.4,15346,...,21444,0,61310,0,20,189,40472,150,13,35
RED_JNG,Pyosik,16,5,4,8,3.3,192,163,4.6,13765,...,19270,0,50572,0,19,223,40180,155,22,41
RED_MID,Zeka,18,3,2,9,6,387,29,9.2,18568,...,2981,0,13752,0,15,312,15604,42,14,33
RED_ADC,Deft,18,5,0,4,Perfect KDA,388,37,9.2,17575,...,5462,0,18651,0,12,76,22208,0,9,31
RED_SUP,BeryL,15,0,1,10,10,47,0,1.1,9449,...,14408,2834,15724,1047,24,194,21875,35,30,47
