In [None]:
!pip install BeautifulSoup4 html5lib requests lxml

In [None]:
import pandas as pd

# Web page that lists the marathon records
DATA_URL = 'https://www.runnersworld.com/races-places/a20823734/these-are-the-worlds-fastest-marathoners-and-marathon-courses/'

# read_html() turns every HTML table on the page into its own DataFrame
# and hands them back together in a list.
# header=0 -> the first row of each table supplies the column names.
html_tables = pd.read_html(io=DATA_URL, header=0)
# Sanity check: what kind of object came back and how many tables it holds
print(f'TYPE: {type(html_tables)}, SIZE: {len(html_tables)}')

TYPE: <class 'list'>, SIZE: 8


In [None]:
# The first table on the web page lists the fastest men;
# pick it out of the list returned by read_html().
fastest_men = html_tables[0]
# Show the type of the list element
print(type(fastest_men))
# A bare expression on the last line makes the notebook render the table
fastest_men

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Runner,Finish Time,Pace/Mile,Marathon
0,Kelvin Kiptum (Kenya),2:00:35,4:36.0,"Chicago, 2023"
1,Eliud Kipchoge (Kenya),2:01:09,4:37.2,"Berlin, 2022"
2,Kenenisa Bekele (Ethiopia),2:01:41,4:38.5,"Berlin, 2019"
3,Sisay Lemma (Ethiopia),2:01:48,4:38.7,"Valencia, 2023"
4,Birhanu Legese (Ethiopia),2:02:48,4:41.0,"Berlin, 2019"
5,Mosinet Geremew (Ethiopia),2:02:55,4:41.3,"London, 2019"
6,Dennis Kimetto (Kenya),2:02:57,4:41.4,"Berlin, 2014"
7,Evans Chebet (Kenya),2:03:00,4:41.5,"Valencia, 2020"
8,Gabriel Geay (Tanzania),2:03:00,4:41.5,"Valencia, 2022"
9,Lawrence Cherono (Kenya),2:03:04,4:41.6,"Valencia, 2022"


In [None]:
# Write the men's table to 'fastest_men.csv'.
# index=False: don't write the row index as an extra column.
fastest_men.to_csv(path_or_buf='fastest_men.csv', index=False)

In [None]:
# The second DataFrame in the list holds the fastest female runners.
fastest_women = html_tables[1]
# Render the table; the output includes a row where every column is empty
fastest_women

Unnamed: 0,Runner,Finish Time,Pace/Mile,Marathon
0,Tigist Assefa (Ethiopia),2:11:53,5:01.8,"Berlin, 2023"
1,Sifan Hassan (Netherlands),2:13:44,5:06.0,"Chicago, 2023"
2,Brigid Kosgei (Kenya),2:14:04,5:06.8,"Chicago, 2019"
3,,,,
4,Ruth Chepngetich (Kenya),2:14:18,5:07.3,"Chicago, 2022"
5,Amane Beriso (Ethiopia),2:14:58,5:08.9,"Valencia, 2022"
6,Paula Radcliffe (Great Britain),2:15:25,5:09.9,"London, 2003"
7,Worknesh Degefa (Ethiopia),2:15:51,5:10.9,"Valencia, 2023"
8,Tigist Ketema (Ethiopia),2:16:07,5:11.5,"Dubai, 2024"
9,Almaz Ayana (Ethiopia),2:16:22,5:12.1,"Valencia, 2023"


In [None]:
# Keep only the rows that have at least one non-missing value, i.e. drop
# the rows where every column is empty. The original row labels are kept.
has_any_value = fastest_women.notna().any(axis='columns')
fastest_women = fastest_women[has_any_value]
fastest_women

Unnamed: 0,Runner,Finish Time,Pace/Mile,Marathon
0,Tigist Assefa (Ethiopia),2:11:53,5:01.8,"Berlin, 2023"
1,Sifan Hassan (Netherlands),2:13:44,5:06.0,"Chicago, 2023"
2,Brigid Kosgei (Kenya),2:14:04,5:06.8,"Chicago, 2019"
4,Ruth Chepngetich (Kenya),2:14:18,5:07.3,"Chicago, 2022"
5,Amane Beriso (Ethiopia),2:14:58,5:08.9,"Valencia, 2022"
6,Paula Radcliffe (Great Britain),2:15:25,5:09.9,"London, 2003"
7,Worknesh Degefa (Ethiopia),2:15:51,5:10.9,"Valencia, 2023"
8,Tigist Ketema (Ethiopia),2:16:07,5:11.5,"Dubai, 2024"
9,Almaz Ayana (Ethiopia),2:16:22,5:12.1,"Valencia, 2023"
10,Rosemary Wanjiru (Kenya),2:16:28,5:12.3,"Tokyo, 2023"


In [None]:
# Write the cleaned women's table to 'fastest_women.csv'.
# index=False: don't write the row index as an extra column.
fastest_women.to_csv(path_or_buf='fastest_women.csv', index=False)

In [None]:
# StringIO lets us hand the downloaded HTML to read_html() as a file-like
# object; passing the raw HTML string directly is deprecated in pandas 2.1+.
from io import StringIO

# We'll need requests library for this
import requests

# Prepare a dictionary with the User-Agent header
# and its value (UA string for Firefox)
request_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0'
}
# Use requests.get() to read the webpage, passing the headers dictionary.
# The timeout (seconds) keeps the cell from hanging forever if the server
# never answers.
response = requests.get(DATA_URL, headers=request_headers, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of trying to parse an error page
response.raise_for_status()

# Now use Pandas read_html() on the page HTML (response.text).
# header=0    -> the first row holds the column names (same as the first
#                read_html() call); without it the header row ends up
#                as a data row.
# index_col=0 -> the first column (Runner) becomes the index.
html_tables = pd.read_html(StringIO(response.text), header=0, index_col=0)
# The first table is what we wanted.
# NOTE(review): despite its name, `fastest_bikes` holds the fastest male
# marathon runners; the name is kept in case other cells refer to it.
fastest_bikes = html_tables[0]
fastest_bikes

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Runner,Finish Time,Pace/Mile,Marathon
Kelvin Kiptum (Kenya),2:00:35,4:36.0,"Chicago, 2023"
Eliud Kipchoge (Kenya),2:01:09,4:37.2,"Berlin, 2022"
Kenenisa Bekele (Ethiopia),2:01:41,4:38.5,"Berlin, 2019"
Sisay Lemma (Ethiopia),2:01:48,4:38.7,"Valencia, 2023"
Birhanu Legese (Ethiopia),2:02:48,4:41.0,"Berlin, 2019"
Mosinet Geremew (Ethiopia),2:02:55,4:41.3,"London, 2019"
Dennis Kimetto (Kenya),2:02:57,4:41.4,"Berlin, 2014"
Evans Chebet (Kenya),2:03:00,4:41.5,"Valencia, 2020"
Gabriel Geay (Tanzania),2:03:00,4:41.5,"Valencia, 2022"


In [None]:
# One tuple per sprinter: (rank, runner, time in seconds, country)
_sprint_rows = [
    (1, 'Usain Bolt', 9.58, 'Jamaica'),
    (2, 'Tyson Gay', 9.69, 'USA'),
    (3, 'Yohan Blake', 9.69, 'Jamaica'),
    (4, 'Asafa Powell', 9.72, 'Jamaica'),
    (5, 'Justin Gatlin', 9.74, 'USA'),
]
# Build the table row by row, then use the rank as the row index
fastest_100m = pd.DataFrame(
    _sprint_rows,
    columns=['Rank', 'Runner', 'Time', 'Country'],
).set_index('Rank')
fastest_100m

Unnamed: 0_level_0,Runner,Time,Country
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Usain Bolt,9.58,Jamaica
2,Tyson Gay,9.69,USA
3,Yohan Blake,9.69,Jamaica
4,Asafa Powell,9.72,Jamaica
5,Justin Gatlin,9.74,USA
