# Web scraper to obtain World Championship matches

## Workflow for scraping data

In [14]:
# Imports

import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Raise pandas' display limit to 200 columns so the wide per-match stats tables are shown in full
pd.set_option('display.max_columns', 200)


In [2]:
# Matchlist pages we will scrape: one URL per World Championship.
# range(14, 23) supplies the two-digit year suffix, i.e. the 2014 through 2022 tournaments.
tournament_matchlist_urls = list(
    f"https://gol.gg/tournament/tournament-matchlist/World%20Championship%2020{i}/"
    for i in range(14, 23)
)

In [3]:
# Display the generated URLs as the cell output (one matchlist page per year, 2014 through 2022)
tournament_matchlist_urls

['https://gol.gg/tournament/tournament-matchlist/World%20Championship%202014/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202015/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202016/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202017/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202018/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202019/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202020/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202021/',
 'https://gol.gg/tournament/tournament-matchlist/World%20Championship%202022/']

In [4]:


def get_match_numbers(tournament_matchlist_url):
    '''
    Retrieves the list of game numbers from the matches in a given tournament.

    Input: URL of the tournament matchlist
    Returns: A list of match numbers (as strings, sorted numerically) corresponding
             to the games in that tournament.
    Raises: requests.HTTPError if a page cannot be fetched; IndexError if the matchlist
            page contains no 'table.table_list' table.
    '''

    # Load and parse the matchlist page
    tournament_matchlist_soup = _fetch_soup(tournament_matchlist_url)

    # Select the table in which we are interested (uses CSS selectors)
    matchlist_table = tournament_matchlist_soup.select('table.table_list')[0]
    # Collect the target of every link (<a href=...>) in the table.
    # Anchors without an href return None and are skipped, since they cannot point to a game.
    links = [anchor.get("href") for anchor in matchlist_table.find_all('a')]
    links = [link for link in links if link]

    # A link without 'summary' in it points at a single game: its number is the 4th
    # '/'-separated piece of the link.
    match_numbers = [re.split('/', link)[3] for link in links if 'summary' not in link]

    # A link with 'summary' in it points at a match that may consist of several games.
    links_multiples = [link for link in links if 'summary' in link]

    # For those, load the summary page to find out how many games were played.
    for link in links_multiples:
        # The link's first two characters are dropped before appending it to the site root
        # (presumably the href is relative, i.e. starts with '..' -- confirm against the site).
        link_soup = _fetch_soup('https://gol.gg' + link[2:])
        # Count the number of times the div class 'row pb-1' appears in the html. This is the number of games played.
        n_games = len(link_soup.find_all("div", {"class": "row pb-1"}))
        # The link carries the number of the first game; this code assumes the remaining games
        # of the match are numbered consecutively after it. i = 0 yields the first game itself.
        first_game_number = int(re.split('/', link)[3])
        match_numbers.extend(str(first_game_number + i) for i in range(n_games))

    # Sort by numeric value: a plain string sort would place '1000' before '998'.
    return sorted(match_numbers, key=int)


def _fetch_soup(url, timeout=30):
    '''
    Downloads a page and returns it parsed with BeautifulSoup.
    '''

    # We need to use a request header to pretend we are using a popular browser or the website will (correctly) think that we are a bot.
    # The timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which parsers happen to be installed.
    return BeautifulSoup(response.text, 'html.parser')


def flatten(l):
    '''
    Collapses one level of nesting: takes an iterable of iterables and returns
    a single list holding every inner item, in the original order.
    '''

    flat_items = []
    for sublist in l:
        flat_items.extend(sublist)
    return flat_items

In [5]:
%%time
# Get all the match numbers in all the tournaments in tournament_matchlist_urls.
# get_match_numbers is called once per tournament URL (each call performs web requests),
# and the per-tournament lists are then flattened into a single list.
match_numbers = flatten([get_match_numbers(url) for url in tournament_matchlist_urls])

CPU times: total: 2.98 s
Wall time: 20.2 s


In [6]:
def get_match_url(match_number):
    '''
    Builds the URL of the full-stats page for one match.
    Input: a match number (anything that can be formatted into the URL)
    Returns: the URL as a string
    '''

    fullstats_url = f'https://gol.gg/game/stats/{match_number}/page-fullstats/'
    return fullstats_url

In [7]:
# Build the full-stats URL for every scraped match number, keeping the same order
match_urls = list(map(get_match_url, match_numbers))

In [67]:
# Target dtype for each stats column that get_df casts.
# The counting stats use pandas' nullable 'Int64'; CSM is a plain float.
# KDA is left out of the mapping for now, so it keeps whatever dtype it was parsed with.
col_dtypes_dict = dict.fromkeys(
    ['Level',
     'Kills',
     'Deaths',
     'Assists',
     'CS',
     "CS in Team's Jungle",
     'CS in Enemy Jungle'],
    'Int64',
)
col_dtypes_dict['CSM'] = 'float64'

def get_df(match_url):
    '''
    Takes a match URL and returns the match data in the form of a pandas dataframe.

    Input: URL of a match's full-stats page
    Returns: A dataframe with one row per player, indexed by 'Role' (BLUE_TOP ... RED_SUP),
             and one column per stat. Columns named in col_dtypes_dict are cast to the
             dtype given there; the '%' sign is stripped from the percentage columns.
    Raises: requests.HTTPError if the page cannot be fetched; ValueError if the stats
            table does not contain exactly ten players.
    '''

    # Same browser-like header as the other requests; the timeout stops a stalled
    # connection from hanging the notebook.
    link_data = requests.get(match_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    link_data.raise_for_status()

    # The first table on the page is the stats table. Wrap the HTML in StringIO because
    # passing a literal HTML string to read_html is deprecated.
    stats = pd.read_html(StringIO(link_data.text))[0]
    # The table has one row per stat and one column per player; the stat names sit in
    # the 'Unnamed: 0' column. Transpose so that each player becomes a row.
    stats = stats.set_index('Unnamed: 0')
    stats.index.name = None
    stats = stats.T

    # Overwrite the 'Role' column with side-qualified labels, relying on the row order
    # of the table (first five rows -> blue side, last five -> red side).
    # Assigning the whole column at once avoids chained assignment through a temporary
    # row (stats.iloc[i].Role = ...), which does not update the frame under Copy-on-Write.
    stats['Role'] = ['BLUE_TOP', 'BLUE_JNG', 'BLUE_MID', 'BLUE_ADC', 'BLUE_SUP',
                     'RED_TOP', 'RED_JNG', 'RED_MID', 'RED_ADC', 'RED_SUP']

    # Drop the trailing '%' so these columns can later be cast to a numeric dtype.
    for percent_col in ("GOLD%", "VS%", "DMG%", "KP%"):
        stats[percent_col] = stats[percent_col].str.rstrip('%')

    stats = stats.set_index('Role')

    # col_dtypes_dict is looked up at call time, so a later redefinition of it applies here.
    stats = stats.astype(col_dtypes_dict)

    return stats


In [73]:
# Preview the parsed stats table for the first match URL in the list
get_df(match_urls[0])

Unnamed: 0_level_0,Player,Level,Kills,Deaths,Assists,KDA,CS,CS in Team's Jungle,CS in Enemy Jungle,CSM,Golds,GPM,GOLD%,Vision Score,Wards placed,Wards destroyed,Control Wards Purchased,Detector Wards Placed,VSPM,WPM,VWPM,WCPM,VS%,Total damage to Champion,Physical Damage,Magic Damage,True Damage,DPM,DMG%,K+A Per Minute,KP%,Solo kills,Double kills,Triple kills,Quadra kills,Penta kills,GD@15,CSD@15,XPD@15,LVLD@15,Objectives Stolen,Damage dealt to turrets,Damage dealt to buildings,Total heal,Total Heals On Teammates,Damage self mitigated,Total Damage Shielded On Teammates,Time ccing others,Total Time CC Dealt,Total damage taken,Total Time Spent Dead,Consumables purchased,Items Purchased,Shutdown bounty collected,Shutdown bounty lost
Role,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
BLUE_TOP,Koro1,,0,6,6,1.0,322,7,2,7.3,13286,302,20.6,,12,4,8,,,0.27,0.18,0.09,,16719,1427,15292,0,380,21.8%,0.14,54.5%,,0,0,0,0,172,11,-1011,-1,,,,,,,,,,,,,,,
BLUE_JNG,ClearLove,,2,3,7,3.0,131,91,4,3.0,11185,254,17.4,,45,19,4,,,1.02,0.09,0.43,,7212,5668,999,545,164,9.4%,0.2,81.8%,,0,0,0,0,-672,-10,-477,-1,,,,,,,,,,,,,,,
BLUE_MID,U,,4,4,3,1.8,320,40,4,7.3,14217,323,22.1,,15,5,4,,,0.34,0.09,0.11,,16269,546,14807,916,370,21.2%,0.16,63.6%,,1,0,0,0,1185,17,-106,-1,,,,,,,,,,,,,,,
BLUE_ADC,NaMei,,4,4,5,2.3,380,37,8,8.6,16813,383,26.1,,11,11,2,,,0.25,0.05,0.25,,29872,26747,2619,506,680,39%,0.2,81.8%,,1,0,0,0,-352,3,-620,-1,,,,,,,,,,,,,,,
BLUE_SUP,FZZF,,1,3,9,3.3,42,0,0,1.0,8869,202,13.8,,55,11,11,,,1.25,0.25,0.25,,6558,561,5401,596,149,8.6%,0.23,90.9%,,0,0,0,0,-30,15,64,0,,,,,,,,,,,,,,,
RED_TOP,Looper,,3,3,10,4.3,267,4,0,6.1,15187,346,18.7,,18,5,8,,,0.41,0.18,0.11,,26117,930,25187,0,594,27.3%,0.3,65%,,0,0,0,0,-172,-11,1011,1,,,,,,,,,,,,,,,
RED_JNG,DanDy,,3,1,9,12.0,198,102,31,4.5,15662,356,19.3,,16,15,5,,,0.36,0.11,0.34,,16713,13822,1493,1398,380,17.5%,0.27,60%,,0,0,0,0,672,10,477,1,,,,,,,,,,,,,,,
RED_MID,PawN,,2,4,10,3.0,350,15,3,8.0,16868,384,20.7,,14,11,5,,,0.32,0.11,0.25,,18613,529,16738,1346,424,19.5%,0.27,60%,,0,0,0,0,-1185,-17,106,1,,,,,,,,,,,,,,,
RED_ADC,imp,,9,2,7,8.0,411,32,17,9.4,21101,480,25.9,,12,8,2,,,0.27,0.05,0.18,,26821,17145,8312,1364,610,28.1%,0.36,80%,,0,2,0,0,352,-3,620,1,,,,,,,,,,,,,,,
RED_SUP,Mata,,3,1,14,17.0,27,0,0,0.6,12531,285,15.4,,84,19,19,,,1.91,0.43,0.43,,7232,1387,5845,0,165,7.6%,0.39,85%,,0,0,0,0,30,-15,-64,0,,,,,,,,,,,,,,,


In [72]:
# List the stat columns returned for one match (the fifth URL in the list)
get_df(match_urls[4]).columns

Index(['Player', 'Level', 'Kills', 'Deaths', 'Assists', 'KDA', 'CS',
       'CS in Team's Jungle', 'CS in Enemy Jungle', 'CSM', 'Golds', 'GPM',
       'GOLD%', 'Vision Score', 'Wards placed', 'Wards destroyed',
       'Control Wards Purchased', 'Detector Wards Placed', 'VSPM', 'WPM',
       'VWPM', 'WCPM', 'VS%', 'Total damage to Champion', 'Physical Damage',
       'Magic Damage', 'True Damage', 'DPM', 'DMG%', 'K+A Per Minute', 'KP%',
       'Solo kills', 'Double kills', 'Triple kills', 'Quadra kills',
       'Penta kills', 'GD@15', 'CSD@15', 'XPD@15', 'LVLD@15',
       'Objectives Stolen', 'Damage dealt to turrets',
       'Damage dealt to buildings', 'Total heal', 'Total Heals On Teammates',
       'Damage self mitigated', 'Total Damage Shielded On Teammates',
       'Time ccing others', 'Total Time CC Dealt', 'Total damage taken',
       'Total Time Spent Dead', 'Consumables purchased', 'Items Purchased',
       'Shutdown bounty collected', 'Shutdown bounty lost'],
      dtype='obj

In [59]:
# Inspect the column dtypes produced by get_df for the last match URL in the list,
# to see which columns were cast via col_dtypes_dict and which are still 'object'
df = get_df(match_urls[-1])
df.dtypes

Player                                 object
Level                                   Int64
Kills                                   Int64
Deaths                                  Int64
Assists                                 Int64
KDA                                    object
CS                                      Int64
CS in Team's Jungle                     Int64
CS in Enemy Jungle                      Int64
CSM                                   float64
Golds                                   Int64
GPM                                    object
GOLD%                                  object
Vision Score                           object
Wards placed                           object
Wards destroyed                        object
Control Wards Purchased                object
Detector Wards Placed                  object
VSPM                                   object
WPM                                    object
VWPM                                   object
WCPM                              

In [64]:
# Parse the fourth-from-last match URL.
# NOTE(review): the output recorded for this cell is a ValueError about converting '21.5%' to float.
# It presumably comes from running an earlier get_df (one that did not strip '%' from every
# percentage column) together with a col_dtypes_dict that casts a percentage column to float64
# -- re-run from a fresh kernel to confirm.
get_df(match_urls[-4])

ValueError: could not convert string to float: '21.5%'

In [63]:
# Extended dtype mapping for get_df: on top of the basic counting stats it also casts
# the gold columns (Golds, GPM) and the gold share (GOLD%).
# Note that this rebinds col_dtypes_dict, so it affects every get_df call made after this cell runs.
# KDA is left out of the mapping for now, so it keeps whatever dtype it was parsed with.
col_dtypes_dict = {
    **dict.fromkeys(
        ["Level",
         "Kills",
         "Deaths",
         "Assists",
         "CS",
         "CS in Team's Jungle",
         "CS in Enemy Jungle"],
        "Int64",
    ),
    "CSM": 'float64',
    **dict.fromkeys(["Golds", "GPM"], "Int64"),
    "GOLD%": "float64",
}