# Data Scraping from the Boston Marathon 2024 Website

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

### Create a list of urls associated with each runner

In [37]:
%%time
## Each runner's results live on their own page. Walk the paginated search results
## and collect the link to every runner's detail page.
## NOTE: scrape_race_data is defined in the helper-functions cell further down;
## that cell must be run before this one.

url_list = []

for i in range(1,1175):
    soup = scrape_race_data('https://results.baa.org/2024/?page='+ str(i) + '&event=R&event_main_group=runner&pid=search')

    # scrape_race_data returns None when the request fails or the status is not 200;
    # skip that page instead of crashing on None.find_all(...) mid-crawl
    if soup is None:
        print('Skipping results page ' + str(i) + ': request failed')
        continue

    fullname_elements = soup.find_all(class_='list-field type-fullname')

    # Keep the href of each full-name field that actually contains a link
    for element in fullname_elements:
        a_tag = element.find('a', href=True)
        if a_tag:
            url_list.append('https://results.baa.org/2024/' + str(a_tag['href']))

CPU times: total: 5min 8s
Wall time: 29min 13s


In [4]:
# Verify that the number of URLs matches the number of runners expected
len(url_list)

29330

In [39]:
# Save the list of runner URLs (uncomment after a fresh scrape to refresh the file)
# with open('url_list.pkl', 'wb') as file:
#     pickle.dump(url_list, file)

# Load the list of runner URLs if previously run and saved.
# NOTE(review): this replaces any url_list currently in memory and raises
# FileNotFoundError when url_list.pkl does not exist — confirm the intended workflow.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open('url_list.pkl', 'rb') as file:
    url_list = pickle.load(file)

### Extract data for each race participant

In [2]:
## Dictionary of all variables to be collected from each url

## Columns are the runner-level fields followed by four measurements for every
## timing split. Each column starts out as its own empty list.
marathon_dict = {
    column: []
    for column in [
        'name', 'team', 'age_group', 'bib_num', 'state', 'place_mw', 'place_ac',
        'place_total', 'time_net', 'time_gun', 'mile_split', 'race_status', 'last_split',
    ] + [
        split + '_' + measure
        for split in [
            '5K', '10K', '15K', '20K', 'HALF', '25K', '30K', '20_miles', '21_miles',
            '35K', '23_miles', '24_miles', '40K', '25_2_miles', '26_2_miles',
        ]
        for measure in ['time_of_day', 'time', 'time_diff', 'min_per_mile']
    ]
}

In [5]:
## Dictionaries to map from html locations to the dictionary names for each variable collected

# Runner-level fields: marathon_dict column name -> HTML class string to search for.
# Every class string has the form 'f-<field> last', so only <field> is listed here.
participant_elements_dict = {
    column: 'f-' + field + ' last'
    for column, field in [
        ('name', '__fullname'),
        ('team', 'team'),
        ('age_group', 'age_class'),
        ('bib_num', 'start_no_text'),
        ('state', 'state'),
        ('place_mw', 'place_all'),
        ('place_ac', 'place_age'),
        ('place_total', 'place_nosex'),
        ('time_net', 'time_finish_netto'),
        ('time_gun', 'time_finish_brutto'),
        ('mile_split', 'total_min_mile'),
        ('race_status', 'race_status'),
        ('last_split', '__last_split_name'),
    ]
}

# Timing splits, in course order: split label -> HTML class string of the row to search for
race_elements_dict = dict([
    ('5K', 'f-time_01 split'),
    ('10K', 'list-highlight f-time_02 split'),
    ('15K', 'f-time_03 split'),
    ('20K', 'list-highlight f-time_04 split'),
    ('HALF', 'f-time_05 split'),
    ('25K', 'list-highlight f-time_06 split'),
    ('30K', 'f-time_07 split'),
    ('20_miles', 'list-highlight f-time_42 split'),
    ('21_miles', 'f-time_43 split'),
    ('35K', 'list-highlight f-time_08 split'),
    ('23_miles', 'f-time_49 split'),
    ('24_miles', 'list-highlight f-time_50 split'),
    ('40K', 'f-time_09 split'),
    ('25_2_miles', 'list-highlight f-time_44 split'),
    ('26_2_miles', 'f-time_finish_netto highlight split'),
])

# Measurements recorded for each split: column suffix -> HTML class string of the cell to search for
race_subelements_dict = dict(
    time_of_day='time_day',
    time='time',
    time_diff='diff right',
    min_per_mile='min_km right opt',
)

In [5]:
## Functions created to facilitate repetitive data collection tasks

## Participant data is located in table cells with distinct HTML classes
def get_participant_data(class_name, soup):
    """Return the text of the first <td> in ``soup`` with class ``class_name``.

    Returns None when no such cell is found. The text is returned exactly as
    ``get_text()`` produces it.
    """
    cell = soup.find('td', class_=class_name)
    return None if cell is None else cell.get_text()

## Race performance data sits in a cell nested inside a row, so two lookups are needed
def get_race_data(class_name, subclass_name, soup):
    """Return the text of the <td> with class ``subclass_name`` inside the first
    <tr> with class ``class_name``.

    Returns None when either the row or the cell within it is not found.
    """
    row = soup.find('tr', class_=class_name)
    if row is None:
        return None

    cell = row.find('td', class_=subclass_name)
    if cell is None:
        return None

    return cell.get_text()

## Collect and parse the HTML from a URL
def scrape_race_data(url, timeout=30):
    """Download ``url`` and return its parsed HTML, or None if that fails.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a
        timeout, a single stalled connection would block the whole crawl.

    Returns
    -------
    BeautifulSoup or None
        The page parsed with 'html.parser', or None when the request raised a
        requests exception (connection error, timeout, invalid URL, ...) or the
        response status code was not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Catch only request failures: a bare except would also swallow
        # KeyboardInterrupt, making a long crawl impossible to stop.
        return None

    # Anything other than 200 OK is treated as "no data for this page"
    if response.status_code != 200:
        return None

    return BeautifulSoup(response.content, 'html.parser')

In [None]:
%%time

## Collect the data from each url using the provided functions.
## URLs whose page could not be downloaded add no row and are recorded in failed_urls,
## so missing runners can be retried instead of being dropped silently.
failed_urls = []

for url in url_list:
    html_data = scrape_race_data(url)
    if html_data is None:
        failed_urls.append(url)
        continue

    # Runner-level fields. These must be appended for every runner: otherwise their
    # marathon_dict lists stay empty while the split lists grow, and a dictionary of
    # unequal-length lists cannot be converted to a DataFrame.
    for element, class_name in participant_elements_dict.items():
        participant_data = get_participant_data(class_name, html_data)
        marathon_dict[element].append(participant_data)

    # Split-level fields: one value per (split, measurement) pair
    for element_race, race_class_name in race_elements_dict.items():
        for subelement_race, race_subclass_name in race_subelements_dict.items():
            race_data = get_race_data(race_class_name, race_subclass_name, html_data)
            marathon_dict[element_race + '_' + subelement_race].append(race_data)

In [None]:
## Save the dictionary of runner data (uncomment after a fresh scrape to refresh the file)
# with open('marathon_dict.pkl', 'wb') as file:
#     pickle.dump(marathon_dict, file)

## Load the previously saved dictionary of runner data.
## NOTE(review): this replaces any marathon_dict currently in memory and raises
## FileNotFoundError when marathon_dict.pkl does not exist — confirm the intended workflow.
## pickle.load can execute arbitrary code, so only load files you created yourself.
with open('marathon_dict.pkl', 'rb') as file:
    marathon_dict = pickle.load(file)

## Convert data to a DataFrame (one column per marathon_dict key)
marathon_data = pd.DataFrame.from_dict(marathon_dict)

### Extract Gender Data for each Runner

In [None]:
## The runner gender was provided on a separate webpage and had to be extracted separately and joined to the existing dataset

In [13]:
%%time
## Extract the names of all female runners
women_list = []

for i in range(1,438):
    soup = scrape_race_data('https://results.baa.org/2024/?page='+ str(i) + '&event=R&event_main_group=runner&pid=list&search%5Bsex%5D=W&search%5Bage_class%5D=%25')

    # scrape_race_data returns None when the request fails or the status is not 200;
    # skip that page instead of crashing on None.find_all(...)
    if soup is None:
        print('Skipping results page ' + str(i) + ': request failed')
        continue

    fullname_elements = soup.find_all(class_='list-field type-fullname')

    # Collect the displayed text of every full-name field on the page.
    # NOTE(review): there is no <a>-tag filter here, so any non-runner element
    # carrying the same class (e.g. a header) would be collected too — verify.
    for name in fullname_elements:
        name_data = name.get_text()

        if name_data:
            women_list.append(str(name_data))

CPU times: total: 2min 16s
Wall time: 13min 12s


In [15]:
# Persist the scraped list of female runners' names to disk
with open('women_list.pkl', 'wb') as file:
    pickle.dump(women_list, file)

In [10]:
%%time
## Extract the names of all male runners
men_list = []

for i in range(1,585):
    soup = scrape_race_data('https://results.baa.org/2024/?page='+ str(i) + '&event=R&event_main_group=runner&pid=list&search%5Bsex%5D=M&search%5Bage_class%5D=%25')

    # scrape_race_data returns None when the request fails or the status is not 200;
    # skip that page instead of crashing on None.find_all(...)
    if soup is None:
        print('Skipping results page ' + str(i) + ': request failed')
        continue

    fullname_elements = soup.find_all(class_='list-field type-fullname')

    # Collect the displayed text of every full-name field on the page.
    # NOTE(review): there is no <a>-tag filter here, so any non-runner element
    # carrying the same class (e.g. a header) would be collected too — verify.
    for name in fullname_elements:
        name_data = name.get_text()

        if name_data:
            men_list.append(str(name_data))

CPU times: total: 3min 9s
Wall time: 18min 32s


In [12]:
# Persist the scraped list of male runners' names to disk
with open('men_list.pkl', 'wb') as file:
    pickle.dump(men_list, file)

In [19]:
%%time
## Extract the names of all non-binary runners
bi_list = []

for i in range(1,4):
    soup = scrape_race_data('https://results.baa.org/2024/?page='+ str(i) + '&event=R&event_main_group=runner&pid=list&search%5Bsex%5D=X&search%5Bage_class%5D=%25')

    # scrape_race_data returns None when the request fails or the status is not 200;
    # skip that page instead of crashing on None.find_all(...)
    if soup is None:
        print('Skipping results page ' + str(i) + ': request failed')
        continue

    fullname_elements = soup.find_all(class_='list-field type-fullname')

    # Collect the displayed text of every full-name field on the page.
    # NOTE(review): there is no <a>-tag filter here, so any non-runner element
    # carrying the same class (e.g. a header) would be collected too — verify.
    for name in fullname_elements:
        name_data = name.get_text()

        if name_data:
            bi_list.append(str(name_data))

CPU times: total: 828 ms
Wall time: 4.87 s


In [21]:
# Persist the scraped list of non-binary runners' names to disk.
# NOTE(review): despite the file name 'binary_list.pkl', bi_list holds the
# non-binary (sex code X) runners.
with open('binary_list.pkl', 'wb') as file:
    pickle.dump(bi_list, file)

In [None]:
# Combine the three scraped name lists and label every name with the sex code
# ('M', 'W' or 'X') of the list it came from; both lists keep the same order.
name_list = [*men_list, *women_list, *bi_list]
sex_list = [
    code
    for code, names in (('M', men_list), ('W', women_list), ('X', bi_list))
    for _ in names
]
sex_dict = dict(name=name_list, sex=sex_list)

In [None]:
## Create a DataFrame with one row per scraped name (columns "name" and "sex")
sex_data = pd.DataFrame.from_dict(sex_dict)

In [47]:
## Save the name/sex DataFrame to disk for future use
with open('sex_data.pkl', 'wb') as file:
    pickle.dump(sex_data, file)