# Scrape Data
Jack Wilson
9/23/2025

# Import Modules

In [12]:
import pandas as pd
import time, random

import pickle


from selenium import webdriver
from selenium.webdriver.common.by import By

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)

# Scrape F1 Website [Race Results 2001-2017]

In [None]:
# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

# Establish year begin and end
year_begin = 2001
year_end = 2017

# Establish empty lists
race_urls = []

print('SCRAPING 2001 - 2017 URLS . . .')
while year_begin <= year_end:

    # Use the year begin to be able to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)

    # Find the table using the table tag
    table = browser.find_elements(By.TAG_NAME, "table")

    # Find the rows for each tr in the table
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]

        # Find the data in each of the rows and append it to the proper list
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))
    
    year_begin += 1

print('SCRAPING 2001 - 2017 DATA . . .')
# Create a dictionary to map driver names to unique IDs
driver_id_map = {}
next_id = 1

# Establish empty lists
race_url = []
position = []
driver_name = []
points = []
driver_id = []

# For each race link, open it and get data from the table
for link in race_urls:
    browser.get(link)

    # Find the table using the table tag
    table = browser.find_elements(By.TAG_NAME, "table")

    # Find the rows for each tr in the table
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]

        # Find the data in each of the rows and append it to the proper list
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            if len(cells) >= 6:
                race_url.append(link)
                position.append(cells[0].text)
                driver_name.append(cells[2].text)
                points.append(cells[6].text)
                
                # Assign unique driver ID
                current_driver = cells[2].text
                if current_driver not in driver_id_map:
                    driver_id_map[current_driver] = next_id
                    next_id += 1
                driver_id.append(driver_id_map[current_driver])

print('COMPLETE')
browser.close()

SCRAPING 2001 - 2017 URLS . . .
SCRAPING 2001 - 2017 DATA . . .
COMPLETE


In [5]:
# Turn the lists into a dataframe
races_2001 = pd.DataFrame({
    "race_url": race_url,
    "driver_id": driver_id,
    "driver_name": driver_name,
    "position": position,
    "points": points
})

# Save results
races_2001.to_csv('../data/raw/races_results_raw_2001-2017.csv', encoding='utf-8', index=False)

# Scrape F1 Website [Race Results 2001-2017]

In [5]:
races_2001 = pd.read_csv('../data/raw/races_results_raw_2001-2017.csv', encoding='utf-8')
max_driver_id = races_2001['driver_id'].max()
print(max_driver_id)

105


In [30]:
# Map constructors common name early so proper IDs can be made
constructor_mapping = {
    # Red Bull
    'Red Bull Racing Renault': 'Red Bull',
    'Red Bull Renault': 'Red Bull',
    'RBR Renault': 'Red Bull',
    'RBR Cosworth': 'Red Bull',
    'RBR Ferrari': 'Red Bull',
    'Red Bull Racing TAG Heuer': 'Red Bull',
    'Red Bull Racing Honda': 'Red Bull',
    'Red Bull Racing RBPT': 'Red Bull',
    'Red Bull Racing Honda RBPT': 'Red Bull',
    'Red Bull Racing': 'Red Bull',
    
    # AlphaTauri/Toro Rosso
    'Toro Rosso': 'Toro Rosso',
    'STR Ferrari': 'Toro Rosso',
    'STR Renault': 'Toro Rosso',
    'STR Cosworth': 'Toro Rosso',
    'Toro Rosso Ferrari': 'Toro Rosso',
    'Scuderia Toro Rosso Honda': 'Toro Rosso',
    'AlphaTauri Honda': 'AlphaTauri',
    'AlphaTauri RBPT': 'AlphaTauri',
    'AlphaTauri Honda RBPT': 'AlphaTauri',
    
    # Racing Bulls
    'RB Honda RBPT': 'Racing Bulls',
    
    # Ferrari
    'Ferrari': 'Ferrari',
    'Ferrari Jaguar': 'Ferrari',
    'Thin Wall Ferrari': 'Ferrari',
    
    # Mercedes
    'Mercedes': 'Mercedes',
    'Mercedes-Benz': 'Mercedes',
    
    # Aston Martin
    'Aston Martin Mercedes': 'Aston Martin',
    'Aston Martin Aramco Mercedes': 'Aston Martin',
    'Aston Butterworth': 'Aston Martin',
    'Aston Martin': 'Aston Martin',
    
    # McLaren
    'McLaren Ford': 'McLaren',
    'McLaren TAG': 'McLaren',
    'McLaren Honda': 'McLaren',
    'McLaren Peugeot': 'McLaren',
    'McLaren Renault': 'McLaren',
    'McLaren BRM': 'McLaren',
    'McLaren Mercedes': 'McLaren',
    'McLaren Serenissima': 'McLaren',
    'Mclaren BRM': 'McLaren',
    'McLaren Alfa Romeo': 'McLaren',
    
    # Williams
    'Williams Ford': 'Williams',
    'Williams Renault': 'Williams',
    'Williams Honda': 'Williams',
    'Williams Judd': 'Williams',
    'Williams BMW': 'Williams',
    'Williams Toyota': 'Williams',
    'Williams Cosworth': 'Williams',
    'Williams Mecachrome': 'Williams',
    'Williams Supertec': 'Williams',
    'Williams Mercedes': 'Williams',
    'Frank Williams Racing Cars/Williams': 'Williams',
    
    # Renault
    'Renault': 'Renault',

    # Alpine
    'Alpine Renault': 'Alpine',
    
    # Lotus
    'Lotus Renault': 'Lotus',
    'Lotus Ford': 'Lotus',
    'Lotus Climax': 'Lotus',
    'Lotus BRM': 'Lotus',
    'Lotus Honda': 'Lotus',
    'Lotus Judd': 'Lotus',
    'Lotus Lamborghini': 'Lotus',
    'Lotus Mugen Honda': 'Lotus',
    'Lotus Mercedes': 'Lotus',
    'Lotus Cosworth': 'Lotus',
    'Lotus Maserati': 'Lotus',
    'Lotus Pratt & Whitney': 'Lotus',
    
    # Force India
    'Force India Ferrari': 'Force India',
    'Force India Mercedes': 'Force India',

    # Racing Point
    'Racing Point BWT Mercedes': 'Racing Point',

    # Sauber
    'Sauber': 'Sauber',
    'Sauber Ferrari': 'Sauber',
    'Sauber Petronas': 'Sauber',
    'Sauber BMW': 'Sauber',
    'Sauber Mercedes': 'Sauber',
    'Sauber Ford': 'Sauber',
    'Kick Sauber Ferrari': 'Sauber',

    # Alfa Romeo
    'Alfa Romeo Racing Ferrari': 'Alfa Romeo',
    'Alfa Romeo Ferrari': 'Alfa Romeo',
    'Alfa Romeo': 'Alfa Romeo',
    
    # Haas
    'Haas Ferrari': 'Haas',
    
    # Jordan
    'Jordan Ford': 'Jordan',
    'Jordan Peugeot': 'Jordan',
    'Jordan Hart': 'Jordan',
    'Jordan Honda': 'Jordan',
    'Jordan Yamaha': 'Jordan',
    'Jordan Toyota': 'Jordan',
    'Jordan Mugen Honda': 'Jordan',
    
    # BAR
    'BAR Honda': 'BAR',
    'BAR Supertec': 'BAR',
    
    # Honda
    'Honda': 'Honda',
    
    # Benetton
    'Benetton Ford': 'Benetton',
    'Benetton BMW': 'Benetton',
    'Benetton Renault': 'Benetton',
    'Benetton Playlife': 'Benetton',
    
    # Toyota
    'Toyota': 'Toyota',
    
    # Jaguar
    'Jaguar Cosworth': 'Jaguar',
    
    # Stewart
    'Stewart Ford': 'Stewart',
    
    # BRM
    'BRM': 'BRM',
    'BRM Climax': 'BRM',

    # JBW
    'JBW Maserati': 'JBW',
    'JBW Climax': 'JBW',
    
    # Cooper
    'Cooper Climax': 'Cooper',
    'Cooper Maserati': 'Cooper',
    'Cooper Bristol': 'Cooper',
    'Cooper Castellotti': 'Cooper',
    'Cooper BRM': 'Cooper',
    'Cooper JAP': 'Cooper',
    'Cooper Alta': 'Cooper',
    'Cooper Borgward': 'Cooper',
    'Cooper Alfa Romeo': 'Cooper',
    'Cooper Ferrari': 'Cooper',
    'Cooper ATS': 'Cooper',
    'Cooper Ford': 'Cooper',
    'Cooper OSCA': 'Cooper',
    
    # Brabham
    'Brabham Climax': 'Brabham',
    'Brabham Repco': 'Brabham',
    'Brabham Ford': 'Brabham',
    'Brabham Alfa Romeo': 'Brabham',
    'Brabham BMW': 'Brabham',
    'Brabham BRM': 'Brabham',
    'Brabham Judd': 'Brabham',
    'Brabham Yamaha': 'Brabham',
    
    # Maserati
    'Maserati': 'Maserati',
    'Maserati Offenhauser': 'Maserati',
    'Maserati Milano': 'Maserati',
    'Maserati-Offenhauser': 'Maserati',
    'Maserati OSCA': 'Maserati',
    'Maserati Plate': 'Maserati',
    
    # Ligier
    'Ligier Matra': 'Ligier',
    'Ligier Ford': 'Ligier',
    'Ligier Renault': 'Ligier',
    'Ligier Megatron': 'Ligier',
    'Ligier Mugen Honda': 'Ligier',
    
    # Tyrrell
    'Tyrrell Ford': 'Tyrrell',
    'Tyrrell Renault': 'Tyrrell',
    'Tyrrell Honda': 'Tyrrell',
    'Tyrrell Yamaha': 'Tyrrell',
    'Tyrrell Ilmor': 'Tyrrell',
    
    # Arrows/Footwork
    'Arrows Ford': 'Arrows',
    'Arrows BMW': 'Arrows',
    'Arrows Megatron': 'Arrows',
    'Arrows Yamaha': 'Arrows',
    'Arrows Supertec': 'Arrows',
    'Arrows Asiatech': 'Arrows',
    'Arrows Cosworth': 'Arrows',
    'Arrows': 'Arrows',
    'Footwork Ford': 'Footwork',
    'Footwork Hart': 'Footwork',
    'Footwork Mugen Honda': 'Footwork',
    'Footwork Porsche': 'Footwork',
    
    # Vanwall
    'Vanwall': 'Vanwall',
    
    # Wolf
    'Wolf Ford': 'Wolf',
    'Wolf-Williams': 'Wolf',
    
    # Lola
    'Lola Ford': 'Lola',
    'Lola Lamborghini': 'Lola',
    'Lola Climax': 'Lola',
    'Lola BMW': 'Lola',
    'Lola Hart': 'Lola',
    'Lola Ferrari': 'Lola',

    # March
    'March Ford': 'March',
    'March Judd': 'March',
    'March Ilmor': 'March',
    'March Alfa Romeo': 'March',

    # Minardi
    'Minardi Ford': 'Minardi',
    'Minardi Ferrari': 'Minardi',
    'Minardi Lamborghini': 'Minardi',
    'Minardi Asiatech': 'Minardi',
    'Minardi Cosworth': 'Minardi',
    'Minardi Fondmetal': 'Minardi',
    'Minardi European': 'Minardi',
    'Minardi Hart': 'Minardi',
    'Minardi Motori Moderni': 'Minardi',
    
    # LDS
    'LDS Alfa Romeo': 'LDS',
    'LDS Climax': 'LDS',
    'LDS Repco': 'LDS',

    # Porche
    'Porsche (F2)': 'Porsche',
    'Porsche': 'Porsche',
    'Behra-Porsche': 'Porsche',

    # Scirocco
    'Scirocco BRM': 'Scirocco',
    'Scirocco Climax': 'Scirocco',

    # AFM
    'AFM Kuchen': 'AFM',
    'AFM BMW': 'AFM',
    'AFM Bristol': 'AFM',

    # ATS
    'ATS Ford': 'ATS',
    'ATS': 'ATS',
    'ATS BMW': 'ATS',
    'Derrington-Francis ATS': 'ATS',

    # Leyton House
    'Leyton House Judd': 'Leyton House',
    'Leyton House Ilmor': 'Leyton House',

    # Prost
    'Prost Mugen Honda': 'Prost',
    'Prost Peugeot': 'Prost',
    'Prost Acer': 'Prost',

    # Dallara
    'Dallara Judd': 'Dallara',
    'Dallara Ferrari': 'Dallara',
    'Dallara Ford': 'Dallara',

    # Larrousse
    'Larrousse Lamborghini': 'Larrousse',
    'Larrousse Ford': 'Larrousse',

    # Osella
    'Osella Ford': 'Osella',
    'Osella Alfa Romeo': 'Osella',
    'Osella': 'Osella',
    'Osella Hart': 'Osella',

    # Kurtis Kraft
    'Kurtis Kraft Offenhauser': 'Kurtis Kraft',
    'Kurtis Kraft Novi': 'Kurtis Kraft',
    'Kurtis Kraft Cummins': 'Kurtis Kraft',

    # Marussia
    'Marussia Cosworth': 'Marussia',
    'Marussia Ferrari': 'Marussia',

    # Gordini
    'Simca-Gordini': 'Gordini',
    'Gordini': 'Gordini',

    # Connaught
    'Connaught Lea Francis': 'Connaught',
    'Connaught Alta': 'Connaught',

    # Eagle
    'Eagle Climax': 'Eagle',
    'Eagle Weslake': 'Eagle',

    # RAM
    'RAM Ford': 'RAM',
    'RAM Hart': 'RAM',

    # Shadow
    'Shadow Ford': 'Shadow',
    'Shadow Matra': 'Shadow',

    # Matra
    'Matra Ford': 'Matra',
    'Matra': 'Matra',
    'Matra Cosworth': 'Matra',
    'Matra BRM': 'Matra',

    # ERA
    'ERA': 'ERA',
    'ERA Bristol': 'ERA',

    # Spirit
    'Spirit Honda': 'Spirit',   
    'Spirit Hart': 'Spirit',

    # Frazer Nash
    'Frazer Nash': 'Frazer Nash',
    'Frazer Nash Bristol': 'Frazer Nash',

    # Emeryson
    'Emeryson Alta': 'Emeryson',
    'Emeryson Climax': 'Emeryson',

    # De Tomaso
    'De Tomaso OSCA': 'De Tomaso',
    'De Tomaso Alfa Romeo': 'De Tomaso',
    'De Tomaso Ford': 'De Tomaso',

    # Gilby
    'Gilby Climax': 'Gilby',
    'Gilby BRM': 'Gilby',

    # Tecno
    'Tecno': 'Tecno',
    'Tecno Cosworth': 'Tecno',

    # Ligier
    'Ligier Judd': 'Ligier',
    'Ligier Lamborghini': 'Ligier',

    # Euro Brun
    'Euro Brun Judd': 'Euro Brun',
    'Euro Brun Ford': 'Euro Brun',


    # Other
    'No Team': 'Privateer',
    'Toleman Hart': 'Toleman',       
    'Venturi Lamborghini': 'Venturi',        
    'Onyx Ford': 'Onyx',
    'AGS Ford': 'AGS',   
    'Rial Ford': 'Rial',
    'Zakspeed': 'Zakspeed',
    'Theodore Ford': 'Theodore',
    'Deidt Offenhauser': 'Deidt',
    'Sherman Offenhauser': 'Sherman',
    'Schroeder Offenhauser': 'Schroeder',
    'Kuzma Offenhauser': 'Kuzma',
    'Lesovsky Offenhauser': 'Lesovsky',
    'Watson Offenhauser': 'Watson',
    'Phillips Offenhauser': 'Phillips',
    'Epperly Offenhauser': 'Epperly',
    'Trevis Offenhauser': 'Trevis',
    'HRT Cosworth': 'HRT',
    'Virgin Cosworth': 'Virgin',
    'Caterham Renault': 'Caterham',
    'Milano Speluzzi': 'Milano',
    'Turner Offenhauser': 'Turner',
    'Alta': 'Alta',    
    'Moore Offenhauser': 'Moore',
    'Nichels Offenhauser': 'Nichels',
    'Marchese Offenhauser': 'Marchese',
    'Stevens Offenhauser': 'Stevens',
    'Langley Offenhauser': 'Langley',
    'Ewing Offenhauser': 'Ewing',   
    'Rae Offenhauser': 'Rae',
    'Olson Offenhauser': 'Olson',
    'Wetteroth Offerhauser': 'Wetteroth',
    'Snowberger Offenhauser': 'Snowberger',
    'Adams Offenhauser': 'Adams',
    'HWM Alta': 'HWM',    
    'Lancia': 'Lancia',
    'Talbot-Lago': 'Talbot-Lago',
    'BRP BRM': 'BRP',
    'Hesketh Ford': 'Hesketh',
    'Hill Ford': 'Hill',
    'Ensign Ford': 'Ensign',
    'Penske Ford': 'Penske',
    'Fittipaldi Ford': 'Fittipaldi',
    'ISO Marlboro Ford': 'ISO Marlboro',
    'Iso Marlboro Ford': 'ISO Marlboro',
    'Surtees Ford': 'Surtees',
    'Parnelli Ford': 'Parnelli',
    'Super Aguri Honda': 'Super Aguri',
    'MRT Mercedes': 'Manor',
    'Brawn Mercedes': 'Brawn',
    'Spyker Ferrari': 'Spyker',
    'MF1 Toyota': 'Midland',
    'Veritas': 'Veritas',
    'Pawl Offenhauser': 'Pawl',
    'Hall Offenhauser': 'Hall',
    'Bromme Offenhauser': 'Bromme',
    'OSCA': 'OSCA',
    'BMW': 'BMW',
    'EMW': 'EMW',
    'Pankratz Offenhauser': 'Pankratz',
    'Bugatti': 'Bugatti',
    'Klenk BMW': 'Klenk',
    'Dunn Offenhauser': 'Dunn',    
    'Elder Offenhauser': 'Elder',
    'Christensen Offenhauser': 'Christensen',
    'Sutton Offenhauser': 'Sutton',
    'Tec-Mec Maserati': 'Tec-Mec',
    'Meskowski Offenhauser': 'Meskowski',
    'Scarab': 'Scarab',
    'Ferguson Climax': 'Ferguson',
    'ENB Maserati': 'ENB',
    'Stebro Ford': 'Stebro',               
    'Shannon Climax': 'Shannon',     
    'Protos Cosworth': 'Protos',   
    'Bellasi Ford': 'Bellasi',       
    'Eifelland Ford': 'Eifelland',
    'Politoys Ford': 'Politoys',
    'Connew Ford': 'Connew',
    'Trojan Ford': 'Trojan',
    'Amon Ford': 'Amon',
    'Token Ford': 'Token',
    'Lyncar Ford': 'Lyncar',
    'Boro Ford': 'Boro',
    'Kojima Ford': 'Kojima',
    'LEC Ford': 'LEC',
    'Merzario Ford': 'Merzario',
    'Martini Ford': 'Martini',
    'Rebaque Ford': 'Rebaque',
    'AGS Motori Moderni': 'AGS',
    'Coloni Ford': 'Coloni',
    'Zakspeed Yamaha': 'Zakspeed',
    'Fondmetal Ford': 'Fondmetal',
    'Moda Judd': 'Moda',    
    'Simtek Ford': 'Simtek',
    'Pacific Ilmor': 'Pacific',
    'Forti Ford': 'Forti',
    'Lambo Lamborghini': 'Modena'
}

In [31]:
# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

# Establish year begin and end
year_begin = 2018
year_end = 2025

# Establish empty lists
link_data = []

print('SCRAPING 2018 - 2025 URLS . . .')
while year_begin <= year_end:

    # Use the year begin to be able to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)

    # Find the table using the table tag
    table = browser.find_elements(By.TAG_NAME, "table")

    # Find the rows for each tr in the table
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]

        # Find the data in each of the rows and append it to the proper list
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_url = link.get_attribute("href")
            circuit_name = link.text
            link_data.append((race_url, circuit_name))

    year_begin += 1

print('SCRAPING 2018 - 2025 DATA . . .')
# Create a dictionary to map driver, race, circuit, and team names to unique IDs
driver_id_map = {}
race_id_map = {}
circuit_id_map = {}
team_id_map = {}

next_driver_id = max_driver_id + 1
next_race_id = 1
next_circuit_id = 1
next_team_id = 1

# Establish empty lists
driver_id = []
race_id = []
circuit_id = []
team_id = []
year = []
rounds = []
driver_name = []
team_name = []
end_position = []
points = []
laps_completed = []
circuit_names = []


# For each race data tuple, open it and get data from the table
current_year = None
r = 0

for link, circuit in link_data:
    browser.get(link)

    # Extract year from URL
    race_year = link.split('/')[5]
    
    # Reset round counter when year changes
    if race_year != current_year:
        current_year = race_year
        r = 1
    else:
        r += 1

    # Assign unique race ID
    race_key = str(race_year) + '_' + str(r)
    if race_key not in race_id_map:
        race_id_map[race_key] = next_race_id
        next_race_id += 1
    
    # Assign unique circuit ID
    if circuit not in circuit_id_map:
        circuit_id_map[circuit] = next_circuit_id
        next_circuit_id += 1

    # Find the table using the table tag
    table = browser.find_elements(By.TAG_NAME, "table")

    # Find the rows for each tr in the table
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]

        # Find the data in each of the rows and append it to the proper list
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            if len(cells) >= 6:
                end_position.append(cells[0].text)
                driver_name.append(cells[2].text)
                team = cells[3].text
                laps_completed.append(cells[4].text)
                points.append(cells[6].text)
                circuit_names.append(circuit)
                year.append(race_year)
                rounds.append(r)
                
                # Assign unique driver, race, and circuit IDs
                current_driver = cells[2].text
                if current_driver not in driver_id_map:
                    driver_id_map[current_driver] = next_driver_id
                    next_driver_id += 1
                
                # Map team name using constructor_mapping for common names
                mapped_team = constructor_mapping.get(team, team)
                team_name.append(mapped_team)
                
                # Assign unique team ID using mapped team name
                if mapped_team not in team_id_map:
                    team_id_map[mapped_team] = next_team_id
                    next_team_id += 1
                
                driver_id.append(driver_id_map[current_driver])
                race_id.append(race_id_map[race_key])
                circuit_id.append(circuit_id_map[circuit])
                team_id.append(team_id_map[mapped_team])

print('COMPLETE')
browser.close()

SCRAPING 2018 - 2025 URLS . . .
SCRAPING 2018 - 2025 DATA . . .
COMPLETE


In [32]:
# Turn the lists into a dataframe
races_2018 = pd.DataFrame({
    'driver_id': driver_id,
    'race_id': race_id,
    'circuit_id': circuit_id,
    'team_id': team_id,
    'year': year,
    'round_number': rounds,
    'circuit_name': circuit_names,
    'driver_name': driver_name,
    'team_name': team_name,
    'end_position': end_position,
    'points': points,
    'laps_completed': laps_completed
})

# Save results
races_2018.to_csv('../data/raw/races_results_raw_2018-2025.csv', encoding='utf-8', index=False)

In [33]:
# Save the ID maps and link_data for future use
with open('../data/raw/driver_id_map.pkl', 'wb') as f:
    pickle.dump(driver_id_map, f)

with open('../data/raw/circuit_id_map.pkl', 'wb') as f:
    pickle.dump(circuit_id_map, f)

with open('../data/raw/race_id_map.pkl', 'wb') as f:
    pickle.dump(race_id_map, f)

with open('../data/raw/team_id_map.pkl', 'wb') as f:
    pickle.dump(team_id_map, f)

with open('../data/raw/link_data.pkl', 'wb') as f:
    pickle.dump(link_data, f)

In [34]:
races_2018['team_name'].unique()

array(['Ferrari', 'Mercedes', 'Red Bull', 'McLaren', 'Renault',
       'Force India', 'Sauber', 'Williams', 'Toro Rosso', 'Haas',
       'Alfa Romeo', 'Racing Point', 'AlphaTauri', 'Aston Martin',
       'Alpine', 'Racing Bulls', 'Kick Sauber'], dtype=object)

# Scrape F1 Website [Practices]

In [35]:
# Reopen the ID maps and link_data from the saved pickle files
with open('../data/raw/driver_id_map.pkl', 'rb') as f:
    driver_id_map = pickle.load(f)

with open('../data/raw/circuit_id_map.pkl', 'rb') as f:
    circuit_id_map = pickle.load(f)

with open('../data/raw/race_id_map.pkl', 'rb') as f:
    race_id_map = pickle.load(f)

with open('../data/raw/team_id_map.pkl', 'rb') as f:
    team_id_map = pickle.load(f)

with open('../data/raw/link_data.pkl', 'rb') as f:
    link_data = pickle.load(f)

In [14]:
link_data

[('https://www.formula1.com/en/results/2018/races/979/australia/race-result',
  'Australia'),
 ('https://www.formula1.com/en/results/2018/races/980/bahrain/race-result',
  'Bahrain'),
 ('https://www.formula1.com/en/results/2018/races/981/china/race-result',
  'China'),
 ('https://www.formula1.com/en/results/2018/races/982/azerbaijan/race-result',
  'Azerbaijan'),
 ('https://www.formula1.com/en/results/2018/races/983/spain/race-result',
  'Spain'),
 ('https://www.formula1.com/en/results/2018/races/984/monaco/race-result',
  'Monaco'),
 ('https://www.formula1.com/en/results/2018/races/985/canada/race-result',
  'Canada'),
 ('https://www.formula1.com/en/results/2018/races/986/france/race-result',
  'France'),
 ('https://www.formula1.com/en/results/2018/races/987/austria/race-result',
  'Austria'),
 ('https://www.formula1.com/en/results/2018/races/988/great-britain/race-result',
  'Great Britain'),
 ('https://www.formula1.com/en/results/2018/races/989/germany/race-result',
  'Germany'),
 (

In [23]:
team_id_map

{'Mercedes': 1,
 'Red Bull Racing Honda': 2,
 'Ferrari': 3,
 'Haas Ferrari': 4,
 'Renault': 5,
 'Alfa Romeo Racing Ferrari': 6,
 'Racing Point BWT Mercedes': 7,
 'Scuderia Toro Rosso Honda': 8,
 'McLaren Renault': 9,
 'Williams Mercedes': 10,
 'AlphaTauri Honda': 11,
 'McLaren Mercedes': 12,
 'Aston Martin Mercedes': 13,
 'Alpine Renault': 14,
 'Alfa Romeo Ferrari': 15,
 'AlphaTauri RBPT': 16,
 'Aston Martin Aramco Mercedes': 17,
 'Red Bull Racing RBPT': 18,
 'Red Bull Racing Honda RBPT': 19,
 'AlphaTauri Honda RBPT': 20,
 'Kick Sauber Ferrari': 21,
 'RB Honda RBPT': 22,
 'McLaren': 23,
 'Red Bull Racing': 24,
 'Williams': 25,
 'Aston Martin': 26,
 'Kick Sauber': 27,
 'Alpine': 28,
 'Racing Bulls': 29,
 'Haas': 30}

In [16]:
race_id_map['2025_5']

154

In [None]:
# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

print('SCRAPING PRACTICE DATA . . .')
# Initiate data lists
race_id = []
driver_id = []
team_id = []
session_type = []
lap_time = []
lap_count = []
position = []

current_year = None
r = 0

for link, circuit in link_data:
    
    # Set the practice number to start at 1
    p = 1
    
    # Extract year from URL
    race_year = link.split('/')[5]

    # Reset round counter when year changes
    if race_year != current_year:
        current_year = race_year
        r = 1
    else:
        r += 1

    while True:
        # Get the practice URL
        practice_url = link.replace('/race-result', '/practice/') + str(p)
        browser.get(practice_url)
        
        try:
            # Find the table
            table = browser.find_elements(By.TAG_NAME, 'table')
            for tr in table:
                # Find the table rows
                rows = tr.find_elements(By.TAG_NAME, 'tr')[1:]
                for row in rows:
                    # Find the table data
                    cells = row.find_elements(By.TAG_NAME, 'td')
                    
                    # Append constant data
                    race_id.append(race_id_map[f'{race_year}_{r}'])
                    driver_id.append(driver_id_map[f'{cells[2].text}'])
                    team_id.append()
                    session.append('practice ' + str(p))

                    # Append table data
                    position.append(cells[0].text)
                    driver_name.append(cells[2].text)
                    team_name.append(cells[3].text)
                    
                    # For the first row after header, save that lap_time as the base time
                    if row == rows[0]:
                        # Find raw lap time
                        lap_time = cells[4].text
                        # Split into parts (min, sec, millisec)
                        time_parts = re.split(r"[:.]", lap_time)
                        minutes = int(time_parts[0])
                        seconds = int(time_parts[1])
                        milliseconds = int(time_parts[2])
                        # Convert that into timedelta so it can be added later
                        base_time = timedelta(minutes=minutes, seconds=seconds, milliseconds=milliseconds)
                        # Append it to the list
                        lap_times.append(base_time)
                    else:
                        # Find raw lap time
                        lap_time = cells[4].text
                        # Get rid of the + and s
                        time_clean = lap_time.strip('+s')
                        # Split into parts (sec, millisec)
                        time_parts = time_clean.split('.')
                        gap_seconds = int(time_parts[0])
                        gap_milliseconds = int(time_parts[1])
                        # Convert that into timedelta so it can be added
                        gap = timedelta(seconds=gap_seconds, milliseconds=gap_milliseconds)

                        # Add the time gap to the base time
                        new_time = base_time + gap
                        lap_times.append(new_time)
                    
                    laps.append(cells[5].text)
            p += 1
            if p > 3:
                break
        except:
            p += 1
            if p > 3:
                break

print('COMPLETE')
browser.close()

# Weather Collection

In [32]:
import fastf1
import pandas as pd

# Enable cache (important for performance)
#fastf1.Cache.enable_cache("cache")  # creates a folder "cache" to store data

# Load a session: example Bahrain GP 2023 Qualifying
session = fastf1.get_session(2018, 'australia', 'fp1')
session.load(laps=False, telemetry=False, messages=False)  # downloads and parses the data

# Weather data is stored in session.weather_data (a structured numpy array)
weather_array = session.weather_data

# Convert to DataFrame
weather_df = pd.DataFrame(weather_array)

print(weather_df.head())

core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for weather_data. Loading data...
_api           INFO 	Fetching weather data...
req            INFO 	Data has been written to cache!
core           INFO 	Finished loading data for 20 drivers: ['2', '3', '5', '7', '8', '9', '10', '11', '14', '16', '18', '20', '27', '28', '31', '33', '35', '44', '55', '77']


                    Time  AirTemp  Humidity  Pressure  Rainfall  TrackTemp  \
0 0 days 00:00:24.964000     25.3      36.7    1020.2     False       38.5   
1 0 days 00:01:24.977000     25.4      36.9    1020.0     False       38.5   
2 0 days 00:02:24.990000     25.2      36.8    1020.1     False       38.5   
3 0 days 00:03:25.002000     25.3      36.1    1020.1     False       38.5   
4 0 days 00:04:25.014000     25.4      35.9    1020.0     False       38.6   

   WindDirection  WindSpeed  
0            330        2.6  
1            308        2.5  
2            305        2.7  
3            305        2.5  
4            325        2.8  


In [20]:
weather_df

Unnamed: 0,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:00:41.425000,24.7,17.0,1017.5,False,29.8,338,0.8
1,0 days 00:01:41.221000,24.6,17.0,1017.5,False,29.7,17,0.8
2,0 days 00:02:41.235000,24.6,17.0,1017.5,False,29.6,323,0.5
3,0 days 00:03:41.250000,24.6,17.0,1017.5,False,29.6,345,1.0
4,0 days 00:04:41.264000,24.6,18.0,1017.4,False,29.5,355,0.8
...,...,...,...,...,...,...,...,...
90,0 days 01:30:41.640000,23.1,33.0,1017.9,False,26.7,0,1.1
91,0 days 01:31:41.639000,23.1,33.0,1017.9,False,26.7,311,0.4
92,0 days 01:32:41.638000,23.1,33.0,1017.9,False,26.7,318,1.2
93,0 days 01:33:41.637000,23.1,33.0,1017.9,False,26.7,332,1.0


In [None]:
def aggregate_weather(weather_df):
    agg = {}
    for col in ['AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'Pressure']:
        agg[f'{col}_mean'] = weather_df[col].mean()
        agg[f'{col}_min'] = weather_df[col].min()
        agg[f'{col}_max'] = weather_df[col].max()
        agg[f'{col}_std'] = weather_df[col].std()
    
    # Rain flag and proportion
    agg['RainAffected'] = int(weather_df['Rainfall'].any())
    agg['RainFraction'] = weather_df['Rainfall'].mean()
    
    return pd.Series(agg)

# Usage
session_weather_features = aggregate_weather(weather_df)
print(session_weather_features)


AirTemp_mean        23.880000
AirTemp_min         23.000000
AirTemp_max         24.700000
AirTemp_std          0.527660
TrackTemp_mean      28.085263
TrackTemp_min       26.600000
TrackTemp_max       29.800000
TrackTemp_std        0.970629
WindSpeed_mean       0.607368
WindSpeed_min        0.000000
WindSpeed_max        1.300000
WindSpeed_std        0.356174
Humidity_mean       21.652632
Humidity_min        15.000000
Humidity_max        34.000000
Humidity_std         6.799200
Pressure_mean     1017.641053
Pressure_min      1017.400000
Pressure_max      1017.900000
Pressure_std         0.132484
RainAffected         0.000000
RainFraction         0.000000
dtype: float64
