In [None]:
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import ssl
import math

In [None]:
def get_soup(event, year, gender, pg_no="1", print_url=False):
    """Download one London Marathon results page and return it parsed.

    Parameters
    ----------
    event : str
        Either "mass" or "elite" (case-insensitive).
    year : int
        Race year; becomes the first path segment of the results URL.
    gender : str
        Value sent as the ``search[sex]`` query parameter (this notebook
        passes "M" or "W").
    pg_no : str or int, optional
        Page of the mass-start listing to request. Only used for the
        "mass" event; the elite URLs carry no page parameter.
    print_url : bool, optional
        When True, print the URL before fetching it.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``lxml`` parser.

    Raises
    ------
    ValueError
        If ``event`` is neither "mass" nor "elite". Previously this case
        failed later with an UnboundLocalError on ``url``.
    """
    base_url = "https://results.london-marathon.co.uk/" + str(year)
    event_key = event.lower()

    if event_key == "mass":
        url = (base_url +
               "/?event=MAS&num_results=1000&page=" + str(pg_no) +
               "&pid=list&pidp=start&search%5Bsex%5D=" + gender)
    elif event_key == "elite":
        # The 2020 women's elite listing uses its own event code.
        event_code = "LMRW" if (year == 2020 and gender == "W") else "ELIT"
        url = (base_url +
               "/?event=" + event_code +
               "&pid=list&pidp=start&search%5Bsex%5D=" + gender)
    else:
        raise ValueError(
            f"Unknown event {event!r}; expected 'mass' or 'elite'"
        )

    if print_url:
        print(url)

    # A browser-like User-Agent header is sent with the request.
    request_site = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(request_site) as response:
        webpage = response.read()

    return BeautifulSoup(webpage, 'lxml')
    

In [None]:
def get_n_pages(year, gender):
    """Return how many mass-start result pages exist for a year and gender.

    Fetches page 1 of the mass-start listing (printing its URL), reads the
    total result count from the first space-separated token of the summary
    element, and divides by the page size.

    Parameters
    ----------
    year : int
        Race year. Years from 2019 on read the count from the first
        ``li.list-group-item``; earlier years read it from
        ``div.list-info-text``.
    gender : str
        Gender filter forwarded to ``get_soup``.

    Returns
    -------
    int
        Number of pages, never less than 1.

    Raises
    ------
    ValueError
        If the element holding the count is missing, or its first token is
        not an integer.
    """
    # Must match the num_results value in the mass-start URL built by get_soup.
    results_per_page = 1000

    soup = get_soup("mass", year, gender, "1", print_url=True)

    if year >= 2019:
        count_tag = soup.find("li", class_="list-group-item")
    else:
        count_tag = soup.find("div", class_="list-info-text")

    if count_tag is None:
        raise ValueError(
            f"Could not find the result count on the {year} mass-start page for {gender}"
        )

    list_length = int(count_tag.text.split(" ")[0])

    # Ceiling division: the previous trunc(n/1000 + 1) reported one page too
    # many whenever the count was an exact multiple of the page size.
    return max(1, math.ceil(list_length / results_per_page))

In [None]:
def add_runners(year, df, max_pages=None):
    """Scrape mass-start results from the ``<li>``-based page layout into ``df``.

    ``get_mass_results`` uses this for 2019 and later. For each gender
    ("M", then "W") every result page is fetched and one row is appended to
    ``df`` per ``<li>`` element that exposes all expected fields. Elements
    that do not are skipped, and the exception is recorded in an error log
    that is printed once at the end.

    Parameters
    ----------
    year : int
        Race year. For 2019 the event is hard-coded to "Mass"; otherwise it
        is read from the page.
    df : pandas.DataFrame
        Frame with the 11 result columns in the order used below. It is
        extended in place and also returned.
    max_pages : int or None, optional
        Upper bound on pages fetched per gender, useful for a quick trial
        run. ``None`` (default) reads every page reported by
        ``get_n_pages``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the scraped rows appended.
    """
    errors = pd.DataFrame(columns=["Page", "tr item", "Error"])
    secondary_place_cls = "list-field type-place place-secondary hidden-xs numeric"

    for gender in ["M", "W"]:

        n_pages = get_n_pages(year, gender)
        # Previously a leftover debug loop (`for n in [1]`) read only page 1,
        # so the saved results were truncated to the first page per gender.
        last_page = n_pages if max_pages is None else min(n_pages, max_pages)

        for n in range(1, last_page + 1):

            soup = get_soup("mass", year, gender, str(n))
            datas = soup.find_all("li")

            for i, data in enumerate(datas):
                try:
                    # Overall and category place share one CSS class; look
                    # the pair up once and index into it.
                    secondary_places = data.find_all('div', class_=secondary_place_cls)
                    place_overall = secondary_places[0].text
                    place_gender = data.find('div', class_="list-field type-place place-primary numeric").text
                    place_category = secondary_places[1].text
                    if year == 2019:
                        event = "Mass"
                    else:
                        event = data.find('div', class_="list-field type-event_name").text[5:]
                    name = data.find('h4', class_="list-field type-fullname").text
                    # The fixed slice offsets presumably strip a label prefix
                    # embedded in each field's text - TODO confirm.
                    club = data.find('div', class_="list-field type-field hidden-xs").text[4:]
                    runner_no = data.find('div', class_="list-field type-field").text[13:]
                    category = data.find('div', class_='list-field type-age_class').text[8:]
                    half_time = data.find('div', class_="split list-field type-time hidden-xs").text[5:]
                    finish_time = data.find('div', class_="split list-field type-time").text[6:]

                    df.loc[len(df.index)] = [place_overall, place_gender, place_category, name, club,
                                            runner_no, gender, category, event, half_time, finish_time]
                except Exception as e:
                    # Best effort: an <li> lacking any expected field is
                    # logged and skipped rather than aborting the scrape.
                    errors.loc[len(errors.index)] = [n, i, e]
                    continue
            print(f'DataFrame length after reading page {n} of {n_pages} in {gender} = {len(df.index)} rows')

    print(errors)

    return df

In [None]:
def add_runners_old(year, df, max_pages=None):
    """Scrape mass-start results from the table-based page layout into ``df``.

    ``get_mass_results`` uses this for 2018 and earlier. For each gender
    ("M", then "W") every result page is fetched and one row is appended to
    ``df`` per ``<tr>`` that has enough ``<td>`` cells. Rows that do not are
    skipped, and the exception is recorded in an error log that is printed
    once after both genders have been read.

    Parameters
    ----------
    year : int
        Race year. For 2014 the cell at index 4 is not read, so every field
        after the name sits one column further right, and only the first
        character of the name is trimmed.
    df : pandas.DataFrame
        Frame with the 11 result columns in the order used below. It is
        extended in place and also returned.
    max_pages : int or None, optional
        Upper bound on pages fetched per gender, useful for a quick trial
        run. ``None`` (default) reads every page reported by
        ``get_n_pages``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the scraped rows appended.
    """
    errors = pd.DataFrame(columns=["Page", "tr item", "Error"])

    # Column offset applied to every field after the name (see docstring).
    shift = 1 if year == 2014 else 0

    for gender in ["M", "W"]:

        n_pages = get_n_pages(year, gender)
        # Previously a leftover debug loop (`for n in [1]`) read only page 1,
        # so the saved results were truncated to the first page per gender.
        last_page = n_pages if max_pages is None else min(n_pages, max_pages)

        for n in range(1, last_page + 1):

            soup = get_soup("mass", year, gender, n)
            datas = soup.find_all("tr")

            print(f'Reading page {n} of {n_pages} in {gender}')

            for i, data in enumerate(datas):
                try:
                    # Collect the cells once instead of re-querying the row
                    # for every field.
                    cells = data.find_all('td')

                    place_overall = cells[0].text
                    place_gender = cells[1].text
                    place_category = cells[2].text
                    event = "Mass"

                    raw_name = cells[3].text
                    name = raw_name[1:] if year == 2014 else raw_name[1:-1]
                    club = cells[4 + shift].text
                    runner_no = cells[5 + shift].text
                    category = cells[6 + shift].text
                    half_time = cells[7 + shift].text
                    finish_time = cells[8 + shift].text

                    df.loc[len(df.index)] = [place_overall, place_gender, place_category, name, club,
                                            runner_no, gender, category, event, half_time, finish_time]
                except Exception as e:
                    # Best effort: a row with too few cells is logged and
                    # skipped rather than aborting the scrape.
                    errors.loc[len(errors.index)] = [n, i, e]
                    continue

    # Printed once after both genders, matching the other scrapers; it used
    # to sit inside the gender loop and print the cumulative log twice.
    print(errors)

    return df

In [None]:
def numeric_places(df):
    """Convert the three place columns of ``df`` to integers, in place.

    Values that cannot be parsed as numbers become missing. The columns are
    stored with pandas' nullable ``Int64`` dtype so that they stay integers
    even when some values are missing. The previous
    ``astype(int, errors="ignore")`` silently left the columns as floats in
    that case.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Overall Place", "Gender Place" and "Category Place".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns converted.
    """
    for column in ("Overall Place", "Gender Place", "Category Place"):
        numeric = pd.to_numeric(df[column], errors="coerce")
        try:
            df[column] = numeric.astype("Int64")
        except (TypeError, ValueError):
            # Best effort, as before: if the values cannot be represented as
            # integers (e.g. a fractional number), keep the numeric column
            # as it is instead of failing.
            df[column] = numeric
    return df

In [None]:
# Get results from mass start

def get_mass_results(years):
    """Scrape the mass-start results for each year and save one CSV per year.

    Every year except 2020 (which is skipped) is scraped into a fresh
    DataFrame, its place columns are made numeric, the rows are sorted by
    overall place and the result is written to
    ``London_<year>_mass_results.csv`` in the working directory.

    Parameters
    ----------
    years : iterable of int
        Race years to process, in the order given.
    """
    result_columns = [
        "Overall Place", "Gender Place", "Category Place", "Name", "Club",
        "Runner Number", "Gender", "Category", "Event", "Half Time",
        "Finish Time",
    ]

    for year in years:

        if year == 2020:
            continue

        # Deliberately skip HTTPS certificate verification for all later
        # urllib requests, see
        # https://clay-atlas.com/us/blog/2021/09/26/python-en-urllib-error-ssl-certificate/
        ssl._create_default_https_context = ssl._create_unverified_context

        print()
        print(year)

        empty_df = pd.DataFrame(columns=result_columns)

        # The results site switched from an HTML table to a list layout
        # between the 2018 and 2019 pages, hence the two scrapers.
        if year <= 2018:
            df = add_runners_old(year, empty_df)
        elif year >= 2019:
            df = add_runners(year, empty_df)

        csv_name = "London_" + str(year) + "_mass_results.csv"
        ranked = numeric_places(df).sort_values("Overall Place")
        ranked.to_csv(csv_name, index=False)
        print("end")

In [None]:
def add_elite_runners(year, df):
    """Scrape elite results from the ``<li>``-based page layout into ``df``.

    ``get_elite_results`` uses this for 2019 and later. For each gender
    ("M", then "W") the single elite page is fetched (its URL is printed)
    and one row is printed and appended to ``df`` per ``<li>`` element that
    exposes all expected fields. Elements that do not are skipped and the
    exception is logged; the log is printed at the end.

    Year-specific handling:
      * 2022 - category place is set to "-" instead of being read.
      * 2020 - the name comes from the ``type-eval`` heading, the finish
        time from the non-split time field, and half time is "-".
      * 2019 - the half-time text is trimmed by 4 characters instead of 5.
      * 2019 and 2020 - the event is "Elite " plus the gender instead of
        being read from the page.

    Parameters
    ----------
    year : int
        Race year.
    df : pandas.DataFrame
        Frame with the 11 result columns; extended in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the scraped rows appended.
    """
    failures = pd.DataFrame(columns=["Page", "tr item", "Error"])

    secondary_cls = "list-field type-place place-secondary hidden-xs numeric"
    is_2020 = year == 2020
    name_cls = "list-field type-eval" if is_2020 else "list-field type-fullname"
    finish_cls = "list-field type-time" if is_2020 else "split list-field type-time"

    for gender in ("M", "W"):

        page = get_soup("elite", year, gender, print_url=True)

        for position, item in enumerate(page.find_all("li")):
            try:
                # Overall and category place share one CSS class.
                secondary = item.find_all('div', class_=secondary_cls)
                place_overall = secondary[0].text
                place_gender = item.find('div', class_="list-field type-place place-primary numeric").text
                club = item.find('div', class_="list-field type-field hidden-xs").text[4:]
                category = item.find('div', class_='list-field type-age_class').text[8:]

                place_category = "-" if year == 2022 else secondary[1].text

                name = item.find('h4', class_=name_cls).text
                runner_no = item.find('div', class_="list-field type-field").text[14:]
                finish_time = item.find('div', class_=finish_cls).text[6:]

                if is_2020:
                    half_time = "-"
                else:
                    half_text = item.find('div', class_="split list-field type-time hidden-xs").text
                    half_time = half_text[4:] if year == 2019 else half_text[5:]

                if year in (2019, 2020):
                    event = "Elite " + gender
                else:
                    event = item.find('div', class_="list-field type-event_name").text[5:]

                row = [place_overall, place_gender, place_category, name, club,
                       runner_no, gender, category, event, half_time, finish_time]
                print(row)
                df.loc[len(df.index)] = row
            except Exception as exc:
                # Elite listings are a single page, so the page is always "1".
                failures.loc[len(failures.index)] = ["1", position, exc]

    print(failures)

    return df

In [None]:
def add_elite_runners_old(year, df):
    """Scrape elite results from the table-based page layout into ``df``.

    ``get_elite_results`` uses this for 2018 and earlier. For each gender
    ("M", then "W") the single elite page is fetched (its URL is printed)
    and one row is printed and appended to ``df`` per ``<tr>`` that has
    enough ``<td>`` cells. Rows that do not are skipped and the exception is
    logged; the log is printed at the end.

    Parameters
    ----------
    year : int
        Race year. For 2014 the cell at index 4 is not read, so every field
        after the name sits one column further right, and only the first
        character of the name is trimmed.
    df : pandas.DataFrame
        Frame with the 11 result columns; extended in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the scraped rows appended.
    """
    failures = pd.DataFrame(columns=["Page", "tr item", "Error"])

    is_2014 = year == 2014
    # Column offset applied to every field after the name.
    shift = 1 if is_2014 else 0

    for gender in ("M", "W"):

        page = get_soup("elite", year, gender, print_url=True)

        for position, table_row in enumerate(page.find_all("tr")):
            try:
                cells = table_row.find_all('td')

                place_overall = cells[0].text
                place_gender = cells[1].text
                place_category = cells[2].text
                event = "Elite"

                raw_name = cells[3].text
                name = raw_name[1:] if is_2014 else raw_name[1:-1]
                club = cells[4 + shift].text
                runner_no = cells[5 + shift].text
                category = cells[6 + shift].text
                half_time = cells[7 + shift].text
                finish_time = cells[8 + shift].text

                row = [place_overall, place_gender, place_category, name, club,
                       runner_no, gender, category, event, half_time, finish_time]
                print(row)
                df.loc[len(df.index)] = row
            except Exception as exc:
                # Elite listings are a single page, so the page is always "1".
                failures.loc[len(failures.index)] = ["1", position, exc]

    print(failures)

    return df

In [None]:
# Get results from elite start

def get_elite_results(years):
    """Scrape the elite results for each year and save one CSV per year.

    Each year is scraped into a fresh DataFrame, its place columns are made
    numeric, the rows are sorted by overall place and the result is written
    to ``London_<year>_elite_results.csv`` in the working directory.

    Parameters
    ----------
    years : iterable of int
        Race years to process, in the order given.
    """
    result_columns = [
        "Overall Place", "Gender Place", "Category Place", "Name", "Club",
        "Runner Number", "Gender", "Category", "Event", "Half Time",
        "Finish Time",
    ]

    for year in years:

        # Deliberately skip HTTPS certificate verification for all later
        # urllib requests, see
        # https://clay-atlas.com/us/blog/2021/09/26/python-en-urllib-error-ssl-certificate/
        ssl._create_default_https_context = ssl._create_unverified_context

        print()
        print(year)

        empty_df = pd.DataFrame(columns=result_columns)

        # The results site switched from an HTML table to a list layout
        # between the 2018 and 2019 pages, hence the two scrapers.
        if year <= 2018:
            df = add_elite_runners_old(year, empty_df)
        elif year >= 2019:
            df = add_elite_runners(year, empty_df)

        csv_name = "London_" + str(year) + "_elite_results.csv"
        ranked = numeric_places(df).sort_values("Overall Place")
        ranked.to_csv(csv_name, index=False)
        print("end")

In [None]:
# Scrape every edition from 2022 back to 2014 (newest first): mass start
# first, then the elite fields.
all_years = [*range(2022, 2013, -1)]
get_mass_results(all_years)
get_elite_results(all_years)