# Scraping wikipedia to find allround ice-skating times

In this Notebook, I'll scrape the wikipedia pages containing the results for men's allround world championship results. I'll first start by creating a python environment

In [None]:
# !conda create -n csa python==3.8
# !conda activate csa
# !pip install ipykernel
!pip install -r requirements.txt

First, we'll have to import the packages.

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import math

## Writing functions to scrape web pages
Unfortunately, wikipedia does not have a standardized format of publishing the pages. Depending on the year, they use another weblink to publish the page. Using if statements, I'll find the wikiurl for each year. After the year 2006, their page syntax changed as well and I've fixed this using a simple string lookup.

In [12]:
def scrape(year = int, distance = int):
    # first, properly format the distance
    if distance == 500:
        distance = "500m"
    elif distance == 1500:
        distance = "1500m"
    elif distance == 5000:
        distance = "5000m"
    elif distance == 10000:
        distance = "10.000m"
    else:
        raise Exception("Invalid distance entered. Please enter a valid race distance.")

    # wikipedia has different URL's depending on the year. After 2006, the page structure also changed, so I'll need a different function to scrape.
    if year < 2006:
        if year < 1933:
            wikiurl = f"https://nl.wikipedia.org/wiki/Wereldkampioenschap_schaatsen_allround_{year}"
        elif year < 1996:
            wikiurl = f"https://nl.wikipedia.org/wiki/Wereldkampioenschap_schaatsen_allround_mannen_{year}"
        else:
            wikiurl = f"https://nl.wikipedia.org/wiki/Wereldkampioenschappen_schaatsen_allround_{year}"

        # a page should have response code 200, otherwise it's not legal to scrape the page. Let's check that first:
        response = requests.get(wikiurl)
        if response.status_code != 200:
            # print(f"URL for year {year} cannot be scraped. Repsonse: {response}")
            return "page not found"

        # table_class="wikitable sortable jquery-tablesorter"

        # parse data from the html into a beautifulsoup object
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table',{'class':"wikitable"})

        # soup to table:
        df=pd.read_html(str(table), header = 0)
        # convert list to dataframe
        df=pd.DataFrame(df[0])

        try:
            global df1
            df1 = df.copy()
            column = df[distance]
            try:
                row = list(column.str.contains("(1)", regex = False))
                row_number = row.index(True)
                result = df[distance][row_number]
            except:
                return "winner not found - please add data by hand"
        except:
            return f"no {distance} race"
        
            # finally, some postprocessing. We have more digits available in later years:
        if distance == "500m":
            if year < 1972:
                result = result[0:5]
            else:
                result = result[0:6]
        elif distance == "1500m":
            if year < 1972:
                result = result[0:6]
            else:
                result = result[0:7]
        elif distance == "5000m":            
            if year < 1972:
                result = result[0:6]
            else:
                result = result[0:7]
        elif distance == "10.000m":
            if year < 1972:
                result = result[0:7]
            else:
                result = result[0:8]
        
        return result
    else:
        wikiurl = f"https://nl.wikipedia.org/wiki/Wereldkampioenschappen_schaatsen_allround_{year}"

        # a page should have response code 200, otherwise it's not legal to scrape the page. Let's check that first:
        response = requests.get(wikiurl)
        if response.status_code != 200:
            # print(f"URL for year {year} cannot be scraped. Repsonse: {response}")
            return "page not found"

        # if there's more tables on a page, we'll just ignore the first ones. To make it hard, wikipedia wouldn't be wikipedia if they didn't have a different syntax on every page
        html = response.text
        if year < 2010:
            einduitslag_start = html.index("""<span class="mw-headline" id="Eindklassement_2">Eindklassement</span>""")
        else:
            einduitslag_start = html.index("""<span class="mw-headline" id="Klassement">Klassement</span>""")


        html = html[einduitslag_start : ]

        # parse data from the html into a beautifulsoup object
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table',{'class':"wikitable"})

        # soup to table:
        df=pd.read_html(str(table), header = 0)
        # convert list to dataframe
        df=pd.DataFrame(df[0])
        # print(df)

        try:
            column = df[distance]
            try:
                row = list(column.str.contains("(1)", regex = False))
                row_number = row.index(True)
                result = df[distance][row_number]
            except:
                return "winner not found - please add data by hand"
        except:
            return f"no {distance} race"
        
            # finally, some postprocessing. We have more digits available in later years, and the number of digits also depend on distance
        if distance == "500m":
            if year < 1972:
                result = result[0:5]
            else:
                result = result[0:6]
        elif distance == "1500m":
            if year < 1972:
                result = result[0:6]
            else:
                result = result[0:7]
        elif distance == "5000m":            
            if year < 1972:
                result = result[0:6]
            else:
                result = result[0:7]
        elif distance == "10.000m":
            if year < 1972:
                result = result[0:7]
            else:
                result = result[0:8]
        
        return result

for distance in (500, 1500, 5000, 10000):
    df = pd.DataFrame()
    for i in range(1888, 1913):
        this_result = pd.DataFrame([scrape(i, distance)], columns = ['time'], index = [i])
        df = pd.concat([df, this_result])
    
    df.to_csv(path_or_buf = f"../data/{distance}m data - test.csv", sep = "|", header = True, index = True)

                time
1888  page not found
1889    no 500m race
1890    no 500m race
1891    no 500m race
1892  page not found
1893           51,0 
1894           50,4 
1895           48,2 
1896           50,2 
1897    no 500m race
1898           47,2 
1899           50,5 
1900           46,4 
1901           54,0 
1902           47,0 
1903           49,4 
1904           46,6 
1905           49,8 
1906           50,8 
1907           47,4 
1908           44,8 
1909           45,6 
1910           46,3 
1911           46,4 
1912           44,2 


Exception: Invalid distance entered. Please enter a valid race distance.

Let's finally secure the data by saving it:

In [None]:
df.to_csv(path_or_buf = "10.000m data.csv", sep = "|", header = True, index = True)