# Scrape Data
Jack Wilson
9/23/2025

# Import Modules

In [2]:
import pandas as pd
import time, random

from selenium import webdriver
from selenium.webdriver.common.by import By

In [3]:
# Lift pandas' display limits: no cap on the number of columns shown and
# no truncation of long cell contents.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Scrape F1 Website [Race Results 2001-2017]

In [None]:
# Scrape the formula1.com results pages for the 2001-2017 seasons.
#
# Step 1 visits each season page and collects the URL of every race.
# Step 2 visits each race URL and reads the result table row by row.
#
# Outputs (parallel lists, one entry per result row):
#   race_url, position, driver_name, points, driver_id
# plus driver_id_map (driver name -> ID) and next_id (next unused ID).

# Establish web browser
browser = webdriver.Chrome()

# Establish year begin and end
year_begin = 2001
year_end = 2017

# Establish empty lists
race_urls = []

# Create a dictionary to map driver names to unique IDs
driver_id_map = {}
next_id = 1

# Establish empty lists
race_url = []
position = []
driver_name = []
points = []
driver_id = []

# try/finally guarantees the browser is shut down even if a page fails to
# load or a lookup raises part-way through the scrape.
try:
    browser.maximize_window()

    print('SCRAPING 2001 - 2017 URLS . . .')
    # Iterate with range() so year_begin/year_end keep their configured values
    for season in range(year_begin, year_end + 1):

        # Use the season year to crawl across season pages
        url = "https://www.formula1.com/en/results/" + str(season) + "/races"
        browser.get(url)

        # Walk every table on the page, skipping each table's header row
        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Rows without data cells cannot hold a race link
                if not cells:
                    continue

                # Url for each specific race (first anchor in the first cell)
                anchors = cells[0].find_elements(By.TAG_NAME, "a")
                if anchors:
                    race_urls.append(anchors[0].get_attribute("href"))

    print('SCRAPING 2001 - 2017 DATA . . .')
    # For each race link, open it and get data from the table
    for link in race_urls:
        browser.get(link)

        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # The points value is read from cells[6], so a row needs at
                # least 7 cells. Checking before any append keeps all the
                # output lists the same length.
                if len(cells) < 7:
                    continue

                current_driver = cells[2].text

                race_url.append(link)
                position.append(cells[0].text)
                driver_name.append(current_driver)
                points.append(cells[6].text)

                # Assign unique driver ID (first appearance gets the next ID)
                if current_driver not in driver_id_map:
                    driver_id_map[current_driver] = next_id
                    next_id += 1
                driver_id.append(driver_id_map[current_driver])

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session, not just the current window
    browser.quit()

SCRAPING 2001 - 2017 URLS . . .
SCRAPING 2001 - 2017 DATA . . .
COMPLETE


In [5]:
# Assemble the scraped columns into a single table (one row per result)
races_2001 = pd.DataFrame(
    dict(
        race_url=race_url,
        driver_id=driver_id,
        driver_name=driver_name,
        position=position,
        points=points,
    )
)

# Write the raw scrape to disk without the DataFrame index
races_2001.to_csv(
    '../data/raw/races_results_raw_2001-2017.csv',
    index=False,
    encoding='utf-8',
)

# Scrape F1 Website [Race Results 2018-2025]

In [7]:
# Read the saved 2001-2017 results back from disk
races_2001 = pd.read_csv(
    '../data/raw/races_results_raw_2001-2017.csv',
    encoding='utf-8',
)

# Largest driver ID present in that file
max_driver_id = races_2001.loc[:, 'driver_id'].max()
print(max_driver_id)

105


In [17]:
# Scrape the formula1.com results pages for the 2018-2025 seasons.
#
# Step 1 visits each season page and collects the URL of every race.
# Step 2 visits each race URL and reads the result table row by row.
#
# Requires races_2001 and max_driver_id from the preceding cell so that
# driver IDs continue the numbering used in the 2001-2017 file.
#
# Outputs (parallel lists, one entry per result row):
#   driver_id, race_id, circuit_id, year, rounds, driver_name, team_name,
#   circuit_names, end_position, points, laps_completed

# Establish web browser
browser = webdriver.Chrome()

# Establish year begin and end
year_begin = 2018
year_end = 2025

# Establish empty lists
race_urls = []

# Create dictionaries to map driver, race, and circuit names to unique IDs.
# The driver map is seeded with the name -> ID pairs already stored in the
# 2001-2017 results, so a driver who appears in both files keeps one ID
# instead of being issued a second one.
driver_id_map = dict(zip(races_2001['driver_name'], races_2001['driver_id']))
race_id_map = {}
circuit_id_map = {}

next_driver_id = max_driver_id + 1
next_race_id = 1
next_circuit_id = 1

# Establish empty lists
driver_id = []
race_id = []
circuit_id = []
year = []
rounds = []
driver_name = []
team_name = []
circuit_names = []
end_position = []
points = []
laps_completed = []

# try/finally guarantees the browser is shut down even if a page fails to
# load or a lookup raises part-way through the scrape.
try:
    browser.maximize_window()

    print('SCRAPING 2018 - 2025 URLS . . .')
    # Iterate with range() so year_begin/year_end keep their configured values
    for season in range(year_begin, year_end + 1):

        # Use the season year to crawl across season pages
        url = "https://www.formula1.com/en/results/" + str(season) + "/races"
        browser.get(url)

        # Walk every table on the page, skipping each table's header row
        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Rows without data cells cannot hold a race link
                if not cells:
                    continue

                # Url for each specific race (first anchor in the first cell)
                anchors = cells[0].find_elements(By.TAG_NAME, "a")
                if anchors:
                    race_urls.append(anchors[0].get_attribute("href"))

    print('SCRAPING 2018 - 2025 DATA . . .')
    # For each race link, open it and get data from the table
    current_year = None
    round_number = 0

    for link in race_urls:
        browser.get(link)

        url_parts = link.split('/')

        # Extract year from URL
        race_year = url_parts[5]

        # Extract circuit name from URL
        # NOTE(review): this takes path segment 7 of the race URL; confirm
        # that segment is the circuit slug (and not a numeric race id) for
        # the URL layout the site currently serves.
        circuit_name = url_parts[7]

        # Reset round counter when year changes
        if race_year != current_year:
            current_year = race_year
            round_number = 1
        else:
            round_number += 1

        # Assign unique race ID (combination of year and round)
        race_key = f"{race_year}_{round_number}"
        if race_key not in race_id_map:
            race_id_map[race_key] = next_race_id
            next_race_id += 1

        # Assign unique circuit ID (circuit name should repeat in subsequent years)
        if circuit_name not in circuit_id_map:
            circuit_id_map[circuit_name] = next_circuit_id
            next_circuit_id += 1

        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # The points value is read from cells[6], so a row needs at
                # least 7 cells. Checking before any append keeps all the
                # output lists the same length.
                if len(cells) < 7:
                    continue

                current_driver = cells[2].text

                end_position.append(cells[0].text)
                driver_name.append(current_driver)
                team_name.append(cells[3].text)
                laps_completed.append(cells[4].text)
                points.append(cells[6].text)
                circuit_names.append(circuit_name)
                year.append(race_year)
                rounds.append(round_number)

                # Assign unique driver, race, and circuit IDs
                if current_driver not in driver_id_map:
                    driver_id_map[current_driver] = next_driver_id
                    next_driver_id += 1
                driver_id.append(driver_id_map[current_driver])
                race_id.append(race_id_map[race_key])
                circuit_id.append(circuit_id_map[circuit_name])

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session, not just the current window
    browser.quit()

SCRAPING 2018 - 2025 URLS . . .
SCRAPING 2018 - 2025 DATA . . .
COMPLETE


In [20]:
# Assemble the scraped columns into a single table (one row per result)
races_2018 = pd.DataFrame(
    dict(
        driver_id=driver_id,
        race_id=race_id,
        circuit_id=circuit_id,
        year=year,
        round_number=rounds,
        circuit_name=circuit_names,
        driver_name=driver_name,
        team_name=team_name,
        end_position=end_position,
        points=points,
        laps_completed=laps_completed,
    )
)

# Write the raw scrape to disk without the DataFrame index
races_2018.to_csv(
    '../data/raw/races_results_raw_2018-2025.csv',
    index=False,
    encoding='utf-8',
)

# Weather Collection

In [32]:
import fastf1
import pandas as pd

# An explicit cache directory can be enabled here; the call is left commented out
#fastf1.Cache.enable_cache("cache")  # creates a folder "cache" to store data

# Pick the session to inspect: 2018 Australian Grand Prix, first practice
session = fastf1.get_session(2018, 'australia', 'fp1')

# Load the session while skipping laps, telemetry and messages
session.load(messages=False, telemetry=False, laps=False)

# Take the weather readings from the loaded session and wrap them in a DataFrame
weather_array = session.weather_data
weather_df = pd.DataFrame(weather_array)

# Preview the first few weather samples
print(weather_df.head())

core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for weather_data. Loading data...
_api           INFO 	Fetching weather data...
req            INFO 	Data has been written to cache!
core           INFO 	Finished loading data for 20 drivers: ['2', '3', '5', '7', '8', '9', '10', '11', '14', '16', '18', '20', '27', '28', '31', '33', '35', '44', '55', '77']


                    Time  AirTemp  Humidity  Pressure  Rainfall  TrackTemp  \
0 0 days 00:00:24.964000     25.3      36.7    1020.2     False       38.5   
1 0 days 00:01:24.977000     25.4      36.9    1020.0     False       38.5   
2 0 days 00:02:24.990000     25.2      36.8    1020.1     False       38.5   
3 0 days 00:03:25.002000     25.3      36.1    1020.1     False       38.5   
4 0 days 00:04:25.014000     25.4      35.9    1020.0     False       38.6   

   WindDirection  WindSpeed  
0            330        2.6  
1            308        2.5  
2            305        2.7  
3            305        2.5  
4            325        2.8  


In [20]:
# Display the weather samples table as this cell's output
weather_df

Unnamed: 0,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:00:41.425000,24.7,17.0,1017.5,False,29.8,338,0.8
1,0 days 00:01:41.221000,24.6,17.0,1017.5,False,29.7,17,0.8
2,0 days 00:02:41.235000,24.6,17.0,1017.5,False,29.6,323,0.5
3,0 days 00:03:41.250000,24.6,17.0,1017.5,False,29.6,345,1.0
4,0 days 00:04:41.264000,24.6,18.0,1017.4,False,29.5,355,0.8
...,...,...,...,...,...,...,...,...
90,0 days 01:30:41.640000,23.1,33.0,1017.9,False,26.7,0,1.1
91,0 days 01:31:41.639000,23.1,33.0,1017.9,False,26.7,311,0.4
92,0 days 01:32:41.638000,23.1,33.0,1017.9,False,26.7,318,1.2
93,0 days 01:33:41.637000,23.1,33.0,1017.9,False,26.7,332,1.0


In [None]:
def aggregate_weather(weather_df):
    """Collapse a table of weather samples into one row of summary features.

    For each of the columns AirTemp, TrackTemp, WindSpeed, Humidity and
    Pressure, the mean, min, max and standard deviation are computed and
    stored under ``<column>_<stat>``. Two rain features are added from the
    ``Rainfall`` column: ``RainAffected`` (1 if any sample is truthy, else 0)
    and ``RainFraction`` (the mean of the column).

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Must contain the five numeric columns listed above plus ``Rainfall``.

    Returns
    -------
    pandas.Series
        Feature values indexed by feature name, grouped column by column.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    measured_columns = ('AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'Pressure')
    statistics = ('mean', 'min', 'max', 'std')

    # Column-major order: all four statistics of one column, then the next
    features = {
        f'{column}_{stat}': getattr(weather_df[column], stat)()
        for column in measured_columns
        for stat in statistics
    }

    # Rain flag and proportion of samples with rain
    rainfall = weather_df['Rainfall']
    features['RainAffected'] = int(rainfall.any())
    features['RainFraction'] = rainfall.mean()

    return pd.Series(features)

# Usage: summarise the weather samples held in weather_df into a single
# Series of features and print it
session_weather_features = aggregate_weather(weather_df)
print(session_weather_features)


AirTemp_mean        23.880000
AirTemp_min         23.000000
AirTemp_max         24.700000
AirTemp_std          0.527660
TrackTemp_mean      28.085263
TrackTemp_min       26.600000
TrackTemp_max       29.800000
TrackTemp_std        0.970629
WindSpeed_mean       0.607368
WindSpeed_min        0.000000
WindSpeed_max        1.300000
WindSpeed_std        0.356174
Humidity_mean       21.652632
Humidity_min        15.000000
Humidity_max        34.000000
Humidity_std         6.799200
Pressure_mean     1017.641053
Pressure_min      1017.400000
Pressure_max      1017.900000
Pressure_std         0.132484
RainAffected         0.000000
RainFraction         0.000000
dtype: float64
