# Scrape Data
Jack Wilson
9/23/2025

# Import Modules

In [2]:
import pandas as pd
import numpy as np
import time, random, re, os
from math import e

import pickle
from datetime import timedelta

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Configure DataFrame Display Options

In [2]:
# Lift pandas' display limits: show every column, full cell contents, and every row
for _display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Define Functions and Mapping

In [None]:
# Constructors common name mapping
# Keys are constructor names as they appear in the scraped results tables (usually
# "<chassis> <engine>"); values are the common team name they are grouped under.
# The scraping cells look names up with constructor_mapping.get(team, team), so a name
# that is missing here passes through unchanged.
constructor_mapping = {
    # Red Bull
    'Red Bull Racing Renault': 'Red Bull',
    'Red Bull Renault': 'Red Bull',
    'RBR Renault': 'Red Bull',
    'RBR Cosworth': 'Red Bull',
    'RBR Ferrari': 'Red Bull',
    'Red Bull Racing TAG Heuer': 'Red Bull',
    'Red Bull Racing Honda': 'Red Bull',
    'Red Bull Racing RBPT': 'Red Bull',
    'Red Bull Racing Honda RBPT': 'Red Bull',
    'Red Bull Racing': 'Red Bull',
    
    # AlphaTauri/Toro Rosso (kept as two separate common names)
    'Toro Rosso': 'Toro Rosso',
    'STR Ferrari': 'Toro Rosso',
    'STR Renault': 'Toro Rosso',
    'STR Cosworth': 'Toro Rosso',
    'Toro Rosso Ferrari': 'Toro Rosso',
    'Scuderia Toro Rosso Honda': 'Toro Rosso',
    'AlphaTauri Honda': 'AlphaTauri',
    'AlphaTauri RBPT': 'AlphaTauri',
    'AlphaTauri Honda RBPT': 'AlphaTauri',
    
    # Racing Bulls
    'RB Honda RBPT': 'Racing Bulls',
    
    # Ferrari
    'Ferrari': 'Ferrari',
    'Ferrari Jaguar': 'Ferrari',
    'Thin Wall Ferrari': 'Ferrari',
    
    # Mercedes
    'Mercedes': 'Mercedes',
    'Mercedes-Benz': 'Mercedes',
    
    # Aston Martin
    'Aston Martin Mercedes': 'Aston Martin',
    'Aston Martin Aramco Mercedes': 'Aston Martin',
    # NOTE(review): 'Aston Butterworth' is grouped with Aston Martin here - confirm this is the intended team
    'Aston Butterworth': 'Aston Martin',
    'Aston Martin': 'Aston Martin',
    
    # McLaren
    'McLaren Ford': 'McLaren',
    'McLaren TAG': 'McLaren',
    'McLaren Honda': 'McLaren',
    'McLaren Peugeot': 'McLaren',
    'McLaren Renault': 'McLaren',
    'McLaren BRM': 'McLaren',
    'McLaren Mercedes': 'McLaren',
    'McLaren Serenissima': 'McLaren',
    'Mclaren BRM': 'McLaren',  # lowercase-"l" spelling variant of 'McLaren BRM'
    'McLaren Alfa Romeo': 'McLaren',
    
    # Williams
    'Williams Ford': 'Williams',
    'Williams Renault': 'Williams',
    'Williams Honda': 'Williams',
    'Williams Judd': 'Williams',
    'Williams BMW': 'Williams',
    'Williams Toyota': 'Williams',
    'Williams Cosworth': 'Williams',
    'Williams Mecachrome': 'Williams',
    'Williams Supertec': 'Williams',
    'Williams Mercedes': 'Williams',
    'Frank Williams Racing Cars/Williams': 'Williams',
    
    # Renault
    'Renault': 'Renault',

    # Alpine
    'Alpine Renault': 'Alpine',
    
    # Lotus
    'Lotus Renault': 'Lotus',
    'Lotus Ford': 'Lotus',
    'Lotus Climax': 'Lotus',
    'Lotus BRM': 'Lotus',
    'Lotus Honda': 'Lotus',
    'Lotus Judd': 'Lotus',
    'Lotus Lamborghini': 'Lotus',
    'Lotus Mugen Honda': 'Lotus',
    'Lotus Mercedes': 'Lotus',
    'Lotus Cosworth': 'Lotus',
    'Lotus Maserati': 'Lotus',
    'Lotus Pratt & Whitney': 'Lotus',
    
    # Force India
    'Force India Ferrari': 'Force India',
    'Force India Mercedes': 'Force India',

    # Racing Point
    'Racing Point BWT Mercedes': 'Racing Point',

    # Sauber
    'Sauber': 'Sauber',
    'Sauber Ferrari': 'Sauber',
    'Sauber Petronas': 'Sauber',
    'Sauber BMW': 'Sauber',
    'Sauber Mercedes': 'Sauber',
    'Sauber Ford': 'Sauber',
    'Kick Sauber Ferrari': 'Sauber',

    # Alfa Romeo
    'Alfa Romeo Racing Ferrari': 'Alfa Romeo',
    'Alfa Romeo Ferrari': 'Alfa Romeo',
    'Alfa Romeo': 'Alfa Romeo',
    
    # Haas
    'Haas Ferrari': 'Haas',
    
    # Jordan
    'Jordan Ford': 'Jordan',
    'Jordan Peugeot': 'Jordan',
    'Jordan Hart': 'Jordan',
    'Jordan Honda': 'Jordan',
    'Jordan Yamaha': 'Jordan',
    'Jordan Toyota': 'Jordan',
    'Jordan Mugen Honda': 'Jordan',
    
    # BAR
    'BAR Honda': 'BAR',
    'BAR Supertec': 'BAR',
    
    # Honda
    'Honda': 'Honda',
    
    # Benetton
    'Benetton Ford': 'Benetton',
    'Benetton BMW': 'Benetton',
    'Benetton Renault': 'Benetton',
    'Benetton Playlife': 'Benetton',
    
    # Toyota
    'Toyota': 'Toyota',
    
    # Jaguar
    'Jaguar Cosworth': 'Jaguar',
    
    # Stewart
    'Stewart Ford': 'Stewart',
    
    # BRM
    'BRM': 'BRM',
    'BRM Climax': 'BRM',

    # JBW
    'JBW Maserati': 'JBW',
    'JBW Climax': 'JBW',
    
    # Cooper
    'Cooper Climax': 'Cooper',
    'Cooper Maserati': 'Cooper',
    'Cooper Bristol': 'Cooper',
    'Cooper Castellotti': 'Cooper',
    'Cooper BRM': 'Cooper',
    'Cooper JAP': 'Cooper',
    'Cooper Alta': 'Cooper',
    'Cooper Borgward': 'Cooper',
    'Cooper Alfa Romeo': 'Cooper',
    'Cooper Ferrari': 'Cooper',
    'Cooper ATS': 'Cooper',
    'Cooper Ford': 'Cooper',
    'Cooper OSCA': 'Cooper',
    
    # Brabham
    'Brabham Climax': 'Brabham',
    'Brabham Repco': 'Brabham',
    'Brabham Ford': 'Brabham',
    'Brabham Alfa Romeo': 'Brabham',
    'Brabham BMW': 'Brabham',
    'Brabham BRM': 'Brabham',
    'Brabham Judd': 'Brabham',
    'Brabham Yamaha': 'Brabham',
    
    # Maserati
    'Maserati': 'Maserati',
    'Maserati Offenhauser': 'Maserati',
    'Maserati Milano': 'Maserati',
    'Maserati-Offenhauser': 'Maserati',
    'Maserati OSCA': 'Maserati',
    'Maserati Plate': 'Maserati',
    
    # Ligier (more Ligier entries follow in a second Ligier section further down)
    'Ligier Matra': 'Ligier',
    'Ligier Ford': 'Ligier',
    'Ligier Renault': 'Ligier',
    'Ligier Megatron': 'Ligier',
    'Ligier Mugen Honda': 'Ligier',
    
    # Tyrrell
    'Tyrrell Ford': 'Tyrrell',
    'Tyrrell Renault': 'Tyrrell',
    'Tyrrell Honda': 'Tyrrell',
    'Tyrrell Yamaha': 'Tyrrell',
    'Tyrrell Ilmor': 'Tyrrell',
    
    # Arrows/Footwork (kept as two separate common names)
    'Arrows Ford': 'Arrows',
    'Arrows BMW': 'Arrows',
    'Arrows Megatron': 'Arrows',
    'Arrows Yamaha': 'Arrows',
    'Arrows Supertec': 'Arrows',
    'Arrows Asiatech': 'Arrows',
    'Arrows Cosworth': 'Arrows',
    'Arrows': 'Arrows',
    'Footwork Ford': 'Footwork',
    'Footwork Hart': 'Footwork',
    'Footwork Mugen Honda': 'Footwork',
    'Footwork Porsche': 'Footwork',
    
    # Vanwall
    'Vanwall': 'Vanwall',
    
    # Wolf
    'Wolf Ford': 'Wolf',
    'Wolf-Williams': 'Wolf',
    
    # Lola
    'Lola Ford': 'Lola',
    'Lola Lamborghini': 'Lola',
    'Lola Climax': 'Lola',
    'Lola BMW': 'Lola',
    'Lola Hart': 'Lola',
    'Lola Ferrari': 'Lola',

    # March
    'March Ford': 'March',
    'March Judd': 'March',
    'March Ilmor': 'March',
    'March Alfa Romeo': 'March',

    # Minardi
    'Minardi Ford': 'Minardi',
    'Minardi Ferrari': 'Minardi',
    'Minardi Lamborghini': 'Minardi',
    'Minardi Asiatech': 'Minardi',
    'Minardi Cosworth': 'Minardi',
    'Minardi Fondmetal': 'Minardi',
    'Minardi European': 'Minardi',
    'Minardi Hart': 'Minardi',
    'Minardi Motori Moderni': 'Minardi',
    
    # LDS
    'LDS Alfa Romeo': 'LDS',
    'LDS Climax': 'LDS',
    'LDS Repco': 'LDS',

    # Porsche
    'Porsche (F2)': 'Porsche',
    'Porsche': 'Porsche',
    'Behra-Porsche': 'Porsche',

    # Scirocco
    'Scirocco BRM': 'Scirocco',
    'Scirocco Climax': 'Scirocco',

    # AFM
    'AFM Kuchen': 'AFM',
    'AFM BMW': 'AFM',
    'AFM Bristol': 'AFM',

    # ATS
    'ATS Ford': 'ATS',
    'ATS': 'ATS',
    'ATS BMW': 'ATS',
    'Derrington-Francis ATS': 'ATS',

    # Leyton House
    'Leyton House Judd': 'Leyton House',
    'Leyton House Ilmor': 'Leyton House',

    # Prost
    'Prost Mugen Honda': 'Prost',
    'Prost Peugeot': 'Prost',
    'Prost Acer': 'Prost',

    # Dallara
    'Dallara Judd': 'Dallara',
    'Dallara Ferrari': 'Dallara',
    'Dallara Ford': 'Dallara',

    # Larrousse
    'Larrousse Lamborghini': 'Larrousse',
    'Larrousse Ford': 'Larrousse',

    # Osella
    'Osella Ford': 'Osella',
    'Osella Alfa Romeo': 'Osella',
    'Osella': 'Osella',
    'Osella Hart': 'Osella',

    # Kurtis Kraft
    'Kurtis Kraft Offenhauser': 'Kurtis Kraft',
    'Kurtis Kraft Novi': 'Kurtis Kraft',
    'Kurtis Kraft Cummins': 'Kurtis Kraft',

    # Marussia
    'Marussia Cosworth': 'Marussia',
    'Marussia Ferrari': 'Marussia',

    # Gordini
    'Simca-Gordini': 'Gordini',
    'Gordini': 'Gordini',

    # Connaught
    'Connaught Lea Francis': 'Connaught',
    'Connaught Alta': 'Connaught',

    # Eagle
    'Eagle Climax': 'Eagle',
    'Eagle Weslake': 'Eagle',

    # RAM
    'RAM Ford': 'RAM',
    'RAM Hart': 'RAM',

    # Shadow
    'Shadow Ford': 'Shadow',
    'Shadow Matra': 'Shadow',

    # Matra
    'Matra Ford': 'Matra',
    'Matra': 'Matra',
    'Matra Cosworth': 'Matra',
    'Matra BRM': 'Matra',

    # ERA
    'ERA': 'ERA',
    'ERA Bristol': 'ERA',

    # Spirit
    'Spirit Honda': 'Spirit',   
    'Spirit Hart': 'Spirit',

    # Frazer Nash
    'Frazer Nash': 'Frazer Nash',
    'Frazer Nash Bristol': 'Frazer Nash',

    # Emeryson
    'Emeryson Alta': 'Emeryson',
    'Emeryson Climax': 'Emeryson',

    # De Tomaso
    'De Tomaso OSCA': 'De Tomaso',
    'De Tomaso Alfa Romeo': 'De Tomaso',
    'De Tomaso Ford': 'De Tomaso',

    # Gilby
    'Gilby Climax': 'Gilby',
    'Gilby BRM': 'Gilby',

    # Tecno
    'Tecno': 'Tecno',
    'Tecno Cosworth': 'Tecno',

    # Ligier (continued from the Ligier section above)
    'Ligier Judd': 'Ligier',
    'Ligier Lamborghini': 'Ligier',

    # Euro Brun
    'Euro Brun Judd': 'Euro Brun',
    'Euro Brun Ford': 'Euro Brun',


    # Other (constructors with one or two entries, not grouped into their own section)
    'No Team': 'Privateer',
    'Toleman Hart': 'Toleman',       
    'Venturi Lamborghini': 'Venturi',        
    'Onyx Ford': 'Onyx',
    'AGS Ford': 'AGS',   
    'Rial Ford': 'Rial',
    'Zakspeed': 'Zakspeed',
    'Theodore Ford': 'Theodore',
    'Deidt Offenhauser': 'Deidt',
    'Sherman Offenhauser': 'Sherman',
    'Schroeder Offenhauser': 'Schroeder',
    'Kuzma Offenhauser': 'Kuzma',
    'Lesovsky Offenhauser': 'Lesovsky',
    'Watson Offenhauser': 'Watson',
    'Phillips Offenhauser': 'Phillips',
    'Epperly Offenhauser': 'Epperly',
    'Trevis Offenhauser': 'Trevis',
    'HRT Cosworth': 'HRT',
    'Virgin Cosworth': 'Virgin',
    'Caterham Renault': 'Caterham',
    'Milano Speluzzi': 'Milano',
    'Turner Offenhauser': 'Turner',
    'Alta': 'Alta',    
    'Moore Offenhauser': 'Moore',
    'Nichels Offenhauser': 'Nichels',
    'Marchese Offenhauser': 'Marchese',
    'Stevens Offenhauser': 'Stevens',
    'Langley Offenhauser': 'Langley',
    'Ewing Offenhauser': 'Ewing',   
    'Rae Offenhauser': 'Rae',
    'Olson Offenhauser': 'Olson',
    # NOTE(review): this key is spelled 'Offerhauser' while every other entry uses
    # 'Offenhauser' - confirm it matches the scraped text exactly
    'Wetteroth Offerhauser': 'Wetteroth',
    'Snowberger Offenhauser': 'Snowberger',
    'Adams Offenhauser': 'Adams',
    'HWM Alta': 'HWM',    
    'Lancia': 'Lancia',
    'Talbot-Lago': 'Talbot-Lago',
    'BRP BRM': 'BRP',
    'Hesketh Ford': 'Hesketh',
    'Hill Ford': 'Hill',
    'Ensign Ford': 'Ensign',
    'Penske Ford': 'Penske',
    'Fittipaldi Ford': 'Fittipaldi',
    'ISO Marlboro Ford': 'ISO Marlboro',
    'Iso Marlboro Ford': 'ISO Marlboro',  # capitalisation variant of the key above
    'Surtees Ford': 'Surtees',
    'Parnelli Ford': 'Parnelli',
    'Super Aguri Honda': 'Super Aguri',
    'MRT Mercedes': 'Manor',
    'Brawn Mercedes': 'Brawn',
    'Spyker Ferrari': 'Spyker',
    'MF1 Toyota': 'Midland',
    'Veritas': 'Veritas',
    'Pawl Offenhauser': 'Pawl',
    'Hall Offenhauser': 'Hall',
    'Bromme Offenhauser': 'Bromme',
    'OSCA': 'OSCA',
    'BMW': 'BMW',
    'EMW': 'EMW',
    'Pankratz Offenhauser': 'Pankratz',
    'Bugatti': 'Bugatti',
    'Klenk BMW': 'Klenk',
    'Dunn Offenhauser': 'Dunn',    
    'Elder Offenhauser': 'Elder',
    'Christensen Offenhauser': 'Christensen',
    'Sutton Offenhauser': 'Sutton',
    'Tec-Mec Maserati': 'Tec-Mec',
    'Meskowski Offenhauser': 'Meskowski',
    'Scarab': 'Scarab',
    'Ferguson Climax': 'Ferguson',
    'ENB Maserati': 'ENB',
    'Stebro Ford': 'Stebro',               
    'Shannon Climax': 'Shannon',     
    'Protos Cosworth': 'Protos',   
    'Bellasi Ford': 'Bellasi',       
    'Eifelland Ford': 'Eifelland',
    'Politoys Ford': 'Politoys',
    'Connew Ford': 'Connew',
    'Trojan Ford': 'Trojan',
    'Amon Ford': 'Amon',
    'Token Ford': 'Token',
    'Lyncar Ford': 'Lyncar',
    'Boro Ford': 'Boro',
    'Kojima Ford': 'Kojima',
    'LEC Ford': 'LEC',
    'Merzario Ford': 'Merzario',
    'Martini Ford': 'Martini',
    'Rebaque Ford': 'Rebaque',
    'AGS Motori Moderni': 'AGS',
    'Coloni Ford': 'Coloni',
    'Zakspeed Yamaha': 'Zakspeed',
    'Fondmetal Ford': 'Fondmetal',
    'Moda Judd': 'Moda',    
    'Simtek Ford': 'Simtek',
    'Pacific Ilmor': 'Pacific',
    'Forti Ford': 'Forti',
    'Lambo Lamborghini': 'Modena'
}

In [35]:
def load_id_map(path: str):
    """
    Read a pickled ID map from disk

    Returns the unpickled object stored at `path`, or an empty dictionary when no file
    exists there yet

    """
    if not os.path.exists(path):
        return {}

    with open(path, 'rb') as map_file:
        return pickle.load(map_file)

def save_id_map(path: str, id_map):
    """
    Save the ID map to a pickle file

    Parameters
    ----------
    path : str
        Destination file path; an existing file is overwritten
    id_map
        The object to pickle (the scraper passes its {name: id} dictionaries)

    Notes
    -----
    Missing parent folders are created first, so saving into a data folder that does not
    exist yet no longer fails with FileNotFoundError

    """
    parent_folder = os.path.dirname(path)
    # A bare file name has no parent folder to create
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    with open(path, 'wb') as f:
        pickle.dump(id_map, f)

In [36]:
def init_col_map(col_map: dict):
    """
    Build the per-column accumulator used while scraping

    Every {column_name: column_index} pair in `col_map` becomes
    column_name -> {'index': column_index, 'values': []}, each column getting its own
    empty list

    """
    prepared = {}
    for name, position in col_map.items():
        prepared[name] = {'index': position, 'values': []}
    return prepared

In [None]:
def _id_map_path(col_name: str, data_folder: str) -> str:
    """Return the pickle path that stores the ID map for `col_name`."""
    return f'{data_folder}/{col_name}_map.pkl' if data_folder else f'{col_name}_map.pkl'


def _resolve_id_key(id_map: dict, scraped_value: str) -> str:
    """Return the ID-map key `scraped_value` refers to, or `scraped_value` itself if none matches."""
    # An exact key must win over a longer key that merely contains the scraped text
    if scraped_value in id_map:
        return scraped_value

    # Otherwise reuse the first stored key that contains the scraped text
    # (this also covers stored keys that end with it)
    for existing_key in id_map:
        if scraped_value in existing_key:
            return existing_key

    return scraped_value


def scrape_f1_website(urls: list, total_col: int, col_idx_map: dict, id_cols: list, data_folder: str = '../data/raw') -> pd.DataFrame:
    """
    Scrapes a table from a website and returns a dataframe of scraped values


    Parameters
    ----------
    urls : list
        The webpage URL(s) to scrape; a single URL string is also accepted
    total_col : int
        Number of columns in the table; rows with a different number of cells are skipped
    col_idx_map : dict
        A dictionary mapping desired column names to column indices
        Example: {'race_id': None, 'start_pos': 1, 'driver_name': 3...}
        A column whose index is None (or beyond the row's cells) is filled with None
    id_cols : list
        List of the names of ID columns in the col_idx_map. For these columns the cell
        text is translated to an integer ID through a pickled {text: id} map; unseen
        text gets the next free ID and an empty cell gets None
    data_folder : str, optional
        File path of data folder for saving any ID maps


    Returns
    -------
    pd.DataFrame
        A DataFrame containing the scraped table. If the DataFrame cannot be built, the
        error is printed and an 'ERROR: ...' string is returned instead (legacy behaviour
        kept for existing callers)

    """
    # A single URL string would otherwise be iterated character by character
    if isinstance(urls, str):
        urls = [urls]

    # Nothing to scrape: return the empty table without launching a browser
    if not urls:
        return pd.DataFrame({col_name: [] for col_name in col_idx_map})

    # Initiate data mapping
    col_data = init_col_map(col_idx_map)

    # Load each ID map once up front instead of re-reading the pickle for every table row
    id_maps = {
        col_name: load_id_map(_id_map_path(col_name, data_folder))
        for col_name in col_data
        if col_name in id_cols
    }
    changed_maps = set()

    # Establish web browser
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()

        for url in urls:

            # Validate URL
            try:
                browser.get(url)
            except Exception as exc:
                print(f'URL ERROR: "{url}"\n{exc}')
                continue

            try:
                # Find table data
                for table in browser.find_elements(By.TAG_NAME, 'table'):
                    # [1:] skips the header row
                    for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
                        cells = row.find_elements(By.TAG_NAME, 'td')

                        # Validate table has the right number of columns
                        if len(cells) != total_col:
                            continue

                        # Build the complete row first so a failure half-way through
                        # cannot leave the column lists with different lengths
                        row_values = {}
                        for col_name, col_info in col_data.items():
                            index = col_info['index']

                            # No usable cell for this column
                            if index is None or index >= len(cells):
                                row_values[col_name] = None
                                continue

                            scraped_value = cells[index].text

                            # Handle non-ID columns
                            if col_name not in id_maps:
                                row_values[col_name] = scraped_value
                                continue

                            # An empty cell would "match" every stored key, so it gets no ID
                            if not scraped_value:
                                row_values[col_name] = None
                                continue

                            # Reuse the existing ID or create a new key-value pair
                            id_map = id_maps[col_name]
                            lookup_key = _resolve_id_key(id_map, scraped_value)
                            if lookup_key not in id_map:
                                id_map[lookup_key] = max(id_map.values()) + 1 if id_map else 1
                                changed_maps.add(col_name)
                            row_values[col_name] = id_map[lookup_key]

                        for col_name, value in row_values.items():
                            col_data[col_name]['values'].append(value)
            except Exception as exc:
                print(f'NO DATA FOUND ERROR: {exc}')
    finally:
        try:
            # quit() ends the whole WebDriver session, so the driver process is released
            # even when scraping failed
            browser.quit()
        finally:
            # Persist only the ID maps that gained new entries
            for col_name in changed_maps:
                save_id_map(_id_map_path(col_name, data_folder), id_maps[col_name])

    # Convert column data to DataFrame
    df_data = {col_name: col_info['values'] for col_name, col_info in col_data.items()}

    try:
        df = pd.DataFrame(df_data)
    except Exception as exc:
        print(f'ARRAY LENGTH ERROR: {exc}')
        return(f'ERROR: {exc}')

    return df

In [53]:
# Example run: scrape two 2025 race-result pages with the generic scraper
urls = [
    'https://www.formula1.com/en/results/2025/races/1270/singapore/race-result',
    'https://www.formula1.com/en/results/2025/races/1269/azerbaijan/race-result',
]
total_cols = 7
# Output column name -> index of the table cell it is read from
col_map = dict(driver_id=2, end_position=0, driver_name=2, team_name=3, points=6)
id_cols = ['driver_id']
df = scrape_f1_website(urls=urls, total_col=total_cols, col_idx_map=col_map, id_cols=id_cols)
df

Unnamed: 0,driver_id,end_position,driver_name,team_name,points
0,130,1,George Russell,Mercedes,25
1,111,2,Max Verstappen,Red Bull Racing,18
2,127,3,Lando Norris,McLaren,15
3,141,4,Oscar Piastri,McLaren,12
4,146,5,Kimi Antonelli,Mercedes,10
5,118,6,Charles Leclerc,Ferrari,8
6,110,7,Fernando Alonso,Aston Martin,6
7,107,8,Lewis Hamilton,Ferrari,4
8,143,9,Oliver Bearman,Haas,2
9,115,10,Carlos Sainz,Williams,1


---

# Testing

In [None]:

                        # Append constant data with fallback for missing IDs
                        driver_name = cells[2].text
                        driver_name = re.sub(r'\s+', ' ', driver_name).strip()
                        for existing_name in driver_id_map.keys():
                            if driver_name in existing_name or existing_name.endswith(driver_name):
                                driver_name = existing_name
                                break
                        if driver_name in driver_id_map:
                            driver_id.append(driver_id_map[driver_name])
                        else:
                            new_driver_id = max(driver_id_map.values()) + 1 if driver_id_map else 1
                            driver_id_map[driver_name] = new_driver_id
                            driver_id.append(new_driver_id)
                        
                        race_key = circuit + '_' + str(race_year)
                        if race_key in race_id_map:
                            race_id.append(race_id_map[race_key])
                        else:
                            new_race_id = max(race_id_map.values()) + 1 if race_id_map else 1
                            race_id_map[race_key] = new_race_id
                            race_id.append(new_race_id)
                        
                        # Map team name using constructor_mapping for common names
                        team = cells[3].text
                        mapped_team = constructor_mapping.get(team, team)
                        if mapped_team in team_id_map:
                            team_id.append(team_id_map[mapped_team])
                        else:
                            new_team_id = max(team_id_map.values()) + 1 if team_id_map else 1
                            team_id_map[mapped_team] = new_team_id
                            team_id.append(new_team_id)

                        # Append table data
                        session_type.append('Practice ' + str(p))
                        position.append(cells[0].text)
                        lap_count.append(cells[5].text)

                        # For the first row, save that lap_time as the base time, add gaps to that time
                        if row == rows[0]:
                            # Find raw lap time
                            lap_time = cells[4].text
                            
                            # Check if lap time is blank
                            if not lap_time.strip():
                                lap_times.append('NULL')
                            
                            # Distinguish between times over and under a minute
                            if ':' in lap_time:
                                # Time in "min:sec.millisec" format
                                time_parts = re.split(r"[:.]", lap_time)
                                minutes = int(time_parts[0])
                                seconds = int(time_parts[1])
                                milliseconds = int(time_parts[2])
                            else:
                                # Time in "sec.millisec" format
                                time_parts = lap_time.split('.')
                                minutes = 0
                                seconds = int(time_parts[0])
                                milliseconds = int(time_parts[1])
                            
                            # Convert that into timedelta so it can be added later
                            base_time = timedelta(minutes=minutes, seconds=seconds, milliseconds=milliseconds)
                            
                            # Append it to the list
                            lap_times.append(base_time)
                        else:
                            # Find raw lap time
                            lap_time = cells[4].text

                            # Check if lap time is blank
                            if not lap_time.strip():
                                lap_times.append('NULL')
                            else:
                                # Get rid of the + and s
                                time_clean = lap_time.strip('+s')
                                
                                # Distinguish between times over and under a minute
                                if ':' in time_clean:
                                    # Gap time in "min:sec.millisec" format
                                    time_parts = re.split(r"[:.]", time_clean)
                                    gap_minutes = int(time_parts[0])
                                    gap_seconds = int(time_parts[1])
                                    gap_milliseconds = int(time_parts[2])
                                else:
                                    # Gap time in "sec.millisec" format
                                    time_parts = time_clean.split('.')
                                    gap_minutes = 0
                                    gap_seconds = int(time_parts[0])
                                    gap_milliseconds = int(time_parts[1])
                                
                                # Convert that into timedelta so it can be added
                                gap = timedelta(minutes=gap_minutes, seconds=gap_seconds, milliseconds=gap_milliseconds)

                                # Add the time gap to the base time
                                new_time = base_time + gap
                                lap_times.append(new_time)
                    
    except Exception as e:
        print(f'ERROR: {e}')
        return(f'ERROR: {e}')

    print('COMPLETE')
    browser.close()

    

In [52]:
# Inspect the saved race ID map ('<circuit>_<year>' -> race_id).
# The result is bound to race_id_map: the file holds race IDs, so storing it under
# driver_id_map mislabelled the data and overwrote the driver map in the kernel.
with open('../data/raw/race_id_map.pkl', 'rb') as f:
    race_id_map = pickle.load(f)
race_id_map

{'Australia_2018': 1,
 'Bahrain_2018': 2,
 'China_2018': 3,
 'Azerbaijan_2018': 4,
 'Spain_2018': 5,
 'Monaco_2018': 6,
 'Canada_2018': 7,
 'France_2018': 8,
 'Austria_2018': 9,
 'Great Britain_2018': 10,
 'Germany_2018': 11,
 'Hungary_2018': 12,
 'Belgium_2018': 13,
 'Italy_2018': 14,
 'Singapore_2018': 15,
 'Russia_2018': 16,
 'Japan_2018': 17,
 'United States_2018': 18,
 'Mexico_2018': 19,
 'Brazil_2018': 20,
 'Abu Dhabi_2018': 21,
 'Australia_2019': 22,
 'Bahrain_2019': 23,
 'China_2019': 24,
 'Azerbaijan_2019': 25,
 'Spain_2019': 26,
 'Monaco_2019': 27,
 'Canada_2019': 28,
 'France_2019': 29,
 'Austria_2019': 30,
 'Great Britain_2019': 31,
 'Germany_2019': 32,
 'Hungary_2019': 33,
 'Belgium_2019': 34,
 'Italy_2019': 35,
 'Singapore_2019': 36,
 'Russia_2019': 37,
 'Japan_2019': 38,
 'Mexico_2019': 39,
 'United States_2019': 40,
 'Brazil_2019': 41,
 'Abu Dhabi_2019': 42,
 'Austria_2020': 43,
 'Styria_2020': 44,
 'Hungary_2020': 45,
 'Great Britain_2020': 46,
 '70th Anniversary_2020'

# Scrape F1 Website [Race Results 2001-2017]

In [22]:
# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

# Establish year begin and end
year_begin = 2001
year_end = 2017

# Establish empty lists
race_urls = []

# Create a dictionary to map driver names to unique IDs
driver_id_map = {}
next_id = 1

# Establish empty lists
race_url = []
position = []
driver_name = []
points = []
driver_id = []

# try/finally so the browser is always released, even if a page fails half-way through
try:
    print('SCRAPING 2001 - 2017 URLS . . .')
    # range() leaves year_begin untouched, so the loop bounds stay valid on a re-run
    for season in range(year_begin, year_end + 1):

        # Crawl across season pages
        url = "https://www.formula1.com/en/results/" + str(season) + "/races"
        browser.get(url)

        # Find every table on the page, then each row below its header
        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Rows without data cells have no race link to follow
                if not cells:
                    continue

                # Url for each specific race
                link = cells[0].find_element(By.TAG_NAME, "a")
                race_urls.append(link.get_attribute("href"))

    print('SCRAPING 2001 - 2017 DATA . . .')
    # For each race link, open it and get data from the table
    for link in race_urls:
        browser.get(link)

        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # cells[6] is read below, so a row needs at least 7 cells
                # (the previous ">= 6" check let 6-cell rows raise IndexError)
                if len(cells) >= 7:
                    current_driver = cells[2].text

                    race_url.append(link)
                    position.append(cells[0].text)
                    driver_name.append(current_driver)
                    points.append(cells[6].text)

                    # Assign unique driver ID
                    if current_driver not in driver_id_map:
                        driver_id_map[current_driver] = next_id
                        next_id += 1
                    driver_id.append(driver_id_map[current_driver])

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session rather than only closing the window
    browser.quit()

SCRAPING 2001 - 2017 URLS . . .
SCRAPING 2001 - 2017 DATA . . .
COMPLETE


In [23]:
# Assemble the scraped columns into one table (one row per scraped results row)
races_2001 = pd.DataFrame(dict(
    race_url=race_url,
    driver_id=driver_id,
    driver_name=driver_name,
    position=position,
    points=points,
))

# Persist the raw results
races_2001.to_csv('../data/raw/races_results_raw_2001-2017.csv', index=False, encoding='utf-8')

# Scrape F1 Website [Race Results 2018-2025]

In [24]:
# Reload the saved 2001-2017 results and find the highest driver ID already in use;
# the 2018-2025 scrape numbers its drivers starting right after it
races_2001 = pd.read_csv('../data/raw/races_results_raw_2001-2017.csv', encoding='utf-8')
max_driver_id = races_2001.loc[:, 'driver_id'].max()
print(max_driver_id)

105


In [26]:
# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

# Establish year begin and end
year_begin = 2018
year_end = 2025

# (race URL, circuit name) pairs collected from the season pages
link_data = []

# Create a dictionary to map driver, race, circuit, and team names to unique IDs
# NOTE(review): driver_id_map starts empty and numbering continues after max_driver_id,
# so a driver who also appears in the 2001-2017 data receives a second, different ID
# here - confirm that is intended.
driver_id_map = {}
race_id_map = {}
circuit_id_map = {}
team_id_map = {}

next_driver_id = max_driver_id + 1
next_race_id = 1
next_circuit_id = 1
next_team_id = 1

# Establish empty lists
driver_id = []
race_id = []
circuit_id = []
team_id = []
year = []
rounds = []
driver_name = []
team_name = []
end_position = []
points = []
laps_completed = []
circuit_names = []

# Round counter, restarted whenever the season changes
current_year = None
r = 0

# try/finally so the browser is always released, even if a page fails half-way through
try:
    print('SCRAPING 2018 - 2025 URLS . . .')
    # range() leaves year_begin untouched, so the loop bounds stay valid on a re-run
    for season in range(year_begin, year_end + 1):

        # Crawl across season pages
        url = "https://www.formula1.com/en/results/" + str(season) + "/races"
        browser.get(url)

        # Find every table on the page, then each row below its header
        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Rows without data cells have no race link to follow
                if not cells:
                    continue

                # Url and circuit name for each specific race; appended directly so the
                # race_url list built by the 2001-2017 cell is not overwritten
                anchor = cells[0].find_element(By.TAG_NAME, "a")
                link_data.append((anchor.get_attribute("href"), anchor.text))

    print('SCRAPING 2018 - 2025 DATA . . .')
    # For each race data tuple, open it and get data from the table
    for link, circuit in link_data:
        browser.get(link)

        # Extract year from URL (6th "/"-separated part of the race URL)
        race_year = link.split('/')[5]

        # Reset round counter when year changes
        if race_year != current_year:
            current_year = race_year
            r = 1
        else:
            r += 1

        # Assign unique race ID
        race_key = circuit + '_' + str(race_year)
        if race_key not in race_id_map:
            race_id_map[race_key] = next_race_id
            next_race_id += 1

        # Assign unique circuit ID
        if circuit not in circuit_id_map:
            circuit_id_map[circuit] = next_circuit_id
            next_circuit_id += 1

        for table in browser.find_elements(By.TAG_NAME, "table"):
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # cells[6] is read below, so a row needs at least 7 cells
                # (the previous ">= 6" check let 6-cell rows raise IndexError)
                if len(cells) < 7:
                    continue

                current_driver = cells[2].text
                team = cells[3].text

                end_position.append(cells[0].text)
                driver_name.append(current_driver)
                laps_completed.append(cells[4].text)
                points.append(cells[6].text)
                circuit_names.append(circuit)
                year.append(race_year)
                rounds.append(r)

                # Assign unique driver ID
                if current_driver not in driver_id_map:
                    driver_id_map[current_driver] = next_driver_id
                    next_driver_id += 1

                # Map team name using constructor_mapping for common names
                mapped_team = constructor_mapping.get(team, team)
                team_name.append(mapped_team)

                # Assign unique team ID using mapped team name
                if mapped_team not in team_id_map:
                    team_id_map[mapped_team] = next_team_id
                    next_team_id += 1

                driver_id.append(driver_id_map[current_driver])
                race_id.append(race_id_map[race_key])
                circuit_id.append(circuit_id_map[circuit])
                team_id.append(team_id_map[mapped_team])

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session rather than only closing the window
    browser.quit()

SCRAPING 2018 - 2025 URLS . . .
SCRAPING 2018 - 2025 DATA . . .
COMPLETE


In [27]:
# Collect the scraped race-result lists under their output column names
race_result_columns = {
    'driver_id': driver_id,
    'race_id': race_id,
    'circuit_id': circuit_id,
    'team_id': team_id,
    'year': year,
    'round_number': rounds,
    'circuit_name': circuit_names,
    'driver_name': driver_name,
    'team_name': team_name,
    'end_position': end_position,
    'points': points,
    'laps_completed': laps_completed
}
races_2018 = pd.DataFrame(data=race_result_columns)

# Write the raw race results to CSV (UTF-8, without the index column)
races_2018.to_csv(
    '../data/raw/races_results_raw_2018-2025.csv',
    index=False,
    encoding='utf-8',
)

In [28]:
# Persist the ID maps and link_data so later sections can reload them
pickle_targets = [
    ('../data/raw/driver_id_map.pkl', driver_id_map),
    ('../data/raw/circuit_id_map.pkl', circuit_id_map),
    ('../data/raw/race_id_map.pkl', race_id_map),
    ('../data/raw/team_id_map.pkl', team_id_map),
    ('../data/raw/link_data.pkl', link_data),
]

# Write each object to its own pickle file, in the order listed above
for pickle_path, pickle_object in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_object, f)

In [29]:
# List the distinct team names stored in races_2018 (values after constructor_mapping was applied)
races_2018['team_name'].unique()

array(['Ferrari', 'Mercedes', 'Red Bull', 'McLaren', 'Renault',
       'Force India', 'Sauber', 'Williams', 'Toro Rosso', 'Haas',
       'Alfa Romeo', 'Racing Point', 'AlphaTauri', 'Aston Martin',
       'Alpine', 'Racing Bulls', 'Kick Sauber'], dtype=object)

# Scrape F1 Website [Practices]

In [30]:
# Reload the ID maps and link_data written by the race-results section.
# NOTE: pickle.load executes arbitrary code; only load files this notebook produced.
pickle_sources = [
    '../data/raw/driver_id_map.pkl',
    '../data/raw/circuit_id_map.pkl',
    '../data/raw/race_id_map.pkl',
    '../data/raw/team_id_map.pkl',
    '../data/raw/link_data.pkl',
]

restored_objects = []
for pickle_path in pickle_sources:
    with open(pickle_path, 'rb') as f:
        restored_objects.append(pickle.load(f))

# Unpack in the same order as pickle_sources
driver_id_map, circuit_id_map, race_id_map, team_id_map, link_data = restored_objects

In [None]:
def _practice_time_to_timedelta(time_text):
    """Parse a practice lap time or gap into a timedelta.

    Accepts "min:sec.millisec" or "sec.millisec", optionally wrapped in the
    "+" prefix / "s" suffix used for gaps to the fastest lap.

    Returns None when the text is blank or its parts are not integers.
    """
    cleaned = time_text.strip().strip('+s').strip()
    if not cleaned:
        return None
    try:
        if ':' in cleaned:
            # Time in "min:sec.millisec" format
            minutes, seconds, milliseconds = (int(part) for part in re.split(r"[:.]", cleaned)[:3])
        else:
            # Time in "sec.millisec" format
            minutes = 0
            seconds, milliseconds = (int(part) for part in cleaned.split('.')[:2])
    except ValueError:
        # Non-numeric text or a missing component
        return None
    return timedelta(minutes=minutes, seconds=seconds, milliseconds=milliseconds)


# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

print('SCRAPING PRACTICE DATA . . .')
# Initiate data lists
race_id = []
driver_id = []
team_id = []
session_type = []
lap_times = []
lap_count = []
position = []

try:
    for link, circuit in link_data:

        # Extract year from URL
        race_year = link.split('/')[5]
        race_key = circuit + '_' + str(race_year)

        # Visit practice sessions 1 to 3 for this race
        for p in range(1, 4):
            # Get the practice URL
            practice_url = link.replace('/race-result', '/practice/') + str(p)
            browser.get(practice_url)

            try:
                for table in browser.find_elements(By.TAG_NAME, 'table'):
                    # Find the table rows (skip the header row)
                    rows = table.find_elements(By.TAG_NAME, 'tr')[1:]

                    # Fastest lap of this table; the other rows hold gaps to it.
                    # Reset per table so a value from an earlier session is never reused.
                    base_time = None

                    for row_index, row in enumerate(rows):
                        # Find the table data
                        cells = row.find_elements(By.TAG_NAME, 'td')
                        if len(cells) < 6:
                            continue

                        # Read and parse the whole row first; nothing is appended
                        # until every value is known, so the lists stay aligned
                        driver_name = re.sub(r'\s+', ' ', cells[2].text).strip()
                        for existing_name in driver_id_map:
                            if driver_name in existing_name or existing_name.endswith(driver_name):
                                driver_name = existing_name
                                break

                        # Map team name using constructor_mapping for common names
                        team = cells[3].text
                        mapped_team = constructor_mapping.get(team, team)

                        position_text = cells[0].text
                        lap_count_text = cells[5].text
                        parsed_time = _practice_time_to_timedelta(cells[4].text)

                        if row_index == 0:
                            # First row: absolute lap time, kept as the base for later gaps
                            base_time = parsed_time
                            lap_value = parsed_time if parsed_time is not None else 'NULL'
                        elif parsed_time is None or base_time is None:
                            # Blank/unparseable gap, or no base time to add it to
                            lap_value = 'NULL'
                        else:
                            # Add the time gap to the base time
                            lap_value = base_time + parsed_time

                        # Fallback for missing IDs: next ID is current maximum + 1 (1 if the map is empty)
                        if driver_name not in driver_id_map:
                            driver_id_map[driver_name] = max(driver_id_map.values(), default=0) + 1
                        if race_key not in race_id_map:
                            race_id_map[race_key] = max(race_id_map.values(), default=0) + 1
                        if mapped_team not in team_id_map:
                            team_id_map[mapped_team] = max(team_id_map.values(), default=0) + 1

                        # Append the complete row
                        driver_id.append(driver_id_map[driver_name])
                        race_id.append(race_id_map[race_key])
                        team_id.append(team_id_map[mapped_team])
                        session_type.append('Practice ' + str(p))
                        position.append(position_text)
                        lap_count.append(lap_count_text)
                        lap_times.append(lap_value)
            except Exception as exc:
                # Best effort: carry on with the next session, but report the
                # failure instead of silently swallowing it
                print(f'WARNING: stopped reading {practice_url}: {exc!r}')

    print('COMPLETE') #9040
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally block releases the browser even if scraping fails
    browser.quit()

SCRAPING PRACTICE DATA . . .
COMPLETE


In [33]:
# Collect the scraped practice lists under their output column names
practice_columns = {
    'driver_id': driver_id,
    'race_id': race_id,
    'team_id': team_id,
    'session': session_type,
    'lap_time': lap_times,
    'lap_count': lap_count,
    'position': position
}
practice_data = pd.DataFrame(data=practice_columns)

# Write the raw practice data to CSV (UTF-8, without the index column)
practice_data.to_csv(
    '../data/raw/practice_data_raw.csv',
    index=False,
    encoding='utf-8',
)

In [34]:
# Persist the ID maps and link_data as they stand after the practice scrape
pickle_targets = [
    ('../data/raw/driver_id_map1.pkl', driver_id_map),
    ('../data/raw/circuit_id_map1.pkl', circuit_id_map),
    ('../data/raw/race_id_map1.pkl', race_id_map),
    ('../data/raw/team_id_map1.pkl', team_id_map),
    ('../data/raw/link_data1.pkl', link_data),
]

# Write each object to its own pickle file, in the order listed above
for pickle_path, pickle_object in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_object, f)

# Scrape F1 Website [Qualifying]

In [35]:
# Reload the ID maps and link_data written after the practice scrape.
# NOTE: pickle.load executes arbitrary code; only load files this notebook produced.
pickle_sources = [
    '../data/raw/driver_id_map1.pkl',
    '../data/raw/circuit_id_map1.pkl',
    '../data/raw/race_id_map1.pkl',
    '../data/raw/team_id_map1.pkl',
    '../data/raw/link_data1.pkl',
]

restored_objects = []
for pickle_path in pickle_sources:
    with open(pickle_path, 'rb') as f:
        restored_objects.append(pickle.load(f))

# Unpack in the same order as pickle_sources
driver_id_map, circuit_id_map, race_id_map, team_id_map, link_data = restored_objects

In [36]:
def _qualifying_time_to_timedelta(time_text):
    """Convert a qualifying lap time to a timedelta, or 'NULL' when unavailable.

    Handles "min:sec.millisec" and "sec.millisec". Blank cells, entries without
    a period, and text whose parts are not integers all yield 'NULL'.
    """
    lap_time = time_text.strip()
    if '.' not in lap_time:
        # Blank or not a lap time
        return 'NULL'
    try:
        if ':' in lap_time:
            # Time in "min:sec.millisec" format
            minutes, seconds, milliseconds = (int(part) for part in re.split(r"[:.]", lap_time)[:3])
        else:
            # Time in "sec.millisec" format
            minutes = 0
            seconds, milliseconds = (int(part) for part in lap_time.split('.')[:2])
    except ValueError:
        # Non-numeric text or a missing component
        return 'NULL'
    return timedelta(minutes=minutes, seconds=seconds, milliseconds=milliseconds)


# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

print('SCRAPING QUALIFYING DATA . . .')
# Initiate data lists
race_id = []
driver_id = []
team_id = []
q1_time = []
q2_time = []
q3_time = []
qual_position = []
qual_laps = []

try:
    for link, circuit in link_data:

        # Extract year from URL
        race_year = link.split('/')[5]
        race_key = circuit + '_' + str(race_year)

        # Get the qualifying URL
        qual_url = link.replace('/race-result', '/qualifying')
        browser.get(qual_url)

        for table in browser.find_elements(By.TAG_NAME, 'table'):
            # Find the table rows (skip the header row)
            rows = table.find_elements(By.TAG_NAME, 'tr')[1:]
            for row in rows:
                # Find the table data
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) < 8:
                    continue

                # Read and parse the whole row first; nothing is appended
                # until every value is known, so the lists stay aligned
                driver_name = re.sub(r'\s+', ' ', cells[2].text).strip()
                for existing_name in driver_id_map:
                    if driver_name in existing_name or existing_name.endswith(driver_name):
                        driver_name = existing_name
                        break

                # Map team name using constructor_mapping for common names
                team = cells[3].text
                mapped_team = constructor_mapping.get(team, team)

                position_text = cells[0].text
                laps_text = cells[7].text

                # Cells 4, 5, 6 correspond to Q1, Q2, Q3
                q1_value, q2_value, q3_value = (
                    _qualifying_time_to_timedelta(cells[i].text) for i in range(4, 7)
                )

                # Fallback for missing IDs: next ID is current maximum + 1 (1 if the map is empty)
                if driver_name not in driver_id_map:
                    driver_id_map[driver_name] = max(driver_id_map.values(), default=0) + 1
                if race_key not in race_id_map:
                    race_id_map[race_key] = max(race_id_map.values(), default=0) + 1
                if mapped_team not in team_id_map:
                    team_id_map[mapped_team] = max(team_id_map.values(), default=0) + 1

                # Append the complete row
                driver_id.append(driver_id_map[driver_name])
                race_id.append(race_id_map[race_key])
                team_id.append(team_id_map[mapped_team])
                qual_position.append(position_text)
                qual_laps.append(laps_text)
                q1_time.append(q1_value)
                q2_time.append(q2_value)
                q3_time.append(q3_value)

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally block releases the browser even if scraping fails
    browser.quit()

SCRAPING QUALIFYING DATA . . .
COMPLETE


In [37]:
# Collect the scraped qualifying lists under their output column names
qualifying_columns = {
    'driver_id': driver_id,
    'race_id': race_id,
    'team_id': team_id,
    'q1_time': q1_time,
    'q2_time': q2_time,
    'q3_time': q3_time,
    'qual_position': qual_position,
    'qual_laps': qual_laps
}
qualifying_data = pd.DataFrame(data=qualifying_columns)

# Write the raw qualifying data to CSV (UTF-8, without the index column)
qualifying_data.to_csv(
    '../data/raw/qualifying_data_raw.csv',
    index=False,
    encoding='utf-8',
)

In [38]:
# Persist the ID maps and link_data as they stand after the qualifying scrape
pickle_targets = [
    ('../data/raw/driver_id_map2.pkl', driver_id_map),
    ('../data/raw/circuit_id_map2.pkl', circuit_id_map),
    ('../data/raw/race_id_map2.pkl', race_id_map),
    ('../data/raw/team_id_map2.pkl', team_id_map),
    ('../data/raw/link_data2.pkl', link_data),
]

# Write each object to its own pickle file, in the order listed above
for pickle_path, pickle_object in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_object, f)

# Scrape F1 Website [Starting Grid]

In [13]:
# Reload the ID maps and link_data written after the qualifying scrape.
# NOTE: pickle.load executes arbitrary code; only load files this notebook produced.
pickle_sources = [
    '../data/raw/driver_id_map2.pkl',
    '../data/raw/circuit_id_map2.pkl',
    '../data/raw/race_id_map2.pkl',
    '../data/raw/team_id_map2.pkl',
    '../data/raw/link_data2.pkl',
]

restored_objects = []
for pickle_path in pickle_sources:
    with open(pickle_path, 'rb') as f:
        restored_objects.append(pickle.load(f))

# Unpack in the same order as pickle_sources
driver_id_map, circuit_id_map, race_id_map, team_id_map, link_data = restored_objects

In [None]:
# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

print('SCRAPING STARTING GRID DATA . . .')
# Initiate data lists
race_id = []
driver_id = []
team_id = []
start_position = []

try:
    for link, circuit in link_data:

        # Extract year from URL
        race_year = link.split('/')[5]
        race_key = circuit + '_' + str(race_year)

        # Get the starting grid URL
        grid_url = link.replace('/race-result', '/starting-grid')
        browser.get(grid_url)

        for table in browser.find_elements(By.TAG_NAME, 'table'):
            # Find the table rows (skip the header row)
            rows = table.find_elements(By.TAG_NAME, 'tr')[1:]
            for row in rows:
                # Find the table data
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) < 5:
                    continue

                # Read the whole row first; nothing is appended until every
                # value is known, so the lists stay aligned
                driver_name = re.sub(r'\s+', ' ', cells[2].text).strip()
                for existing_name in driver_id_map:
                    if driver_name in existing_name or existing_name.endswith(driver_name):
                        driver_name = existing_name
                        break

                # Map team name using constructor_mapping for common names
                team = cells[3].text
                mapped_team = constructor_mapping.get(team, team)

                position_text = cells[0].text

                # Fallback for missing IDs: next ID is current maximum + 1 (1 if the map is empty)
                if driver_name not in driver_id_map:
                    driver_id_map[driver_name] = max(driver_id_map.values(), default=0) + 1
                if race_key not in race_id_map:
                    race_id_map[race_key] = max(race_id_map.values(), default=0) + 1
                if mapped_team not in team_id_map:
                    team_id_map[mapped_team] = max(team_id_map.values(), default=0) + 1

                # Append the complete row
                driver_id.append(driver_id_map[driver_name])
                race_id.append(race_id_map[race_key])
                team_id.append(team_id_map[mapped_team])
                start_position.append(position_text)

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally block releases the browser even if scraping fails
    browser.quit()

SCRAPING STARTING GRID DATA . . .
COMPLETE


In [17]:
# Collect the scraped starting-grid lists under their output column names
starting_grid_columns = {
    'driver_id': driver_id,
    'race_id': race_id,
    'team_id': team_id,
    'start_position': start_position
}
starting_grid_data = pd.DataFrame(data=starting_grid_columns)

# Write the raw starting-grid data to CSV (UTF-8, without the index column)
starting_grid_data.to_csv(
    '../data/raw/starting_grid_data_raw.csv',
    index=False,
    encoding='utf-8',
)

In [18]:
# Persist the ID maps and link_data as they stand after the starting-grid scrape
pickle_targets = [
    ('../data/raw/driver_id_map3.pkl', driver_id_map),
    ('../data/raw/circuit_id_map3.pkl', circuit_id_map),
    ('../data/raw/race_id_map3.pkl', race_id_map),
    ('../data/raw/team_id_map3.pkl', team_id_map),
    ('../data/raw/link_data3.pkl', link_data),
]

# Write each object to its own pickle file, in the order listed above
for pickle_path, pickle_object in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_object, f)

# Scrape F1 Website [Pit Stops]

In [None]:
def _pit_time_to_timedelta(time_text):
    """Convert a pit-stop duration to a timedelta, or 'NULL' when unavailable.

    Handles "min:sec.millisec" and "sec.millisec". Blank cells, entries without
    a period, and text whose parts are not integers all yield 'NULL'.
    """
    stop_time = time_text.strip()
    if '.' not in stop_time:
        # Blank or not a duration
        return 'NULL'
    try:
        if ':' in stop_time:
            # Time in "min:sec.millisec" format
            minutes, seconds, milliseconds = (int(part) for part in re.split(r"[:.]", stop_time)[:3])
        else:
            # Time in "sec.millisec" format
            minutes = 0
            seconds, milliseconds = (int(part) for part in stop_time.split('.')[:2])
    except ValueError:
        # Non-numeric text or a missing component
        return 'NULL'
    return timedelta(minutes=minutes, seconds=seconds, milliseconds=milliseconds)


# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

print('SCRAPING PIT STOP DATA . . .')
# Initiate data lists
race_id = []
driver_id = []
team_id = []
stop_number = []
stop_lap = []
pits_time = []

try:
    for link, circuit in link_data:

        # Extract year from URL
        race_year = link.split('/')[5]
        race_key = circuit + '_' + str(race_year)

        # Get the pit stop summary URL
        pit_url = link.replace('/race-result', '/pit-stop-summary')
        browser.get(pit_url)

        for table in browser.find_elements(By.TAG_NAME, 'table'):
            # Find the table rows (skip the header row)
            rows = table.find_elements(By.TAG_NAME, 'tr')[1:]
            for row in rows:
                # Find the table data
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) < 8:
                    continue

                # Read and parse the whole row first; nothing is appended
                # until every value is known, so the lists stay aligned
                driver_name = re.sub(r'\s+', ' ', cells[2].text).strip()
                for existing_name in driver_id_map:
                    if driver_name in existing_name or existing_name.endswith(driver_name):
                        driver_name = existing_name
                        break

                # Map team name using constructor_mapping for common names
                team = cells[3].text
                mapped_team = constructor_mapping.get(team, team)

                # NOTE(review): column positions assume the pit-stop summary table is
                # laid out as Stops | No | Driver | Car | Lap | Time of day | Time | Total
                # -- confirm against the live page before relying on this data.
                stop_number_text = cells[0].text
                stop_lap_text = cells[4].text
                stop_duration = _pit_time_to_timedelta(cells[6].text)

                # Fallback for missing IDs: next ID is current maximum + 1 (1 if the map is empty)
                if driver_name not in driver_id_map:
                    driver_id_map[driver_name] = max(driver_id_map.values(), default=0) + 1
                if race_key not in race_id_map:
                    race_id_map[race_key] = max(race_id_map.values(), default=0) + 1
                if mapped_team not in team_id_map:
                    team_id_map[mapped_team] = max(team_id_map.values(), default=0) + 1

                # Append the complete row to the pit-stop lists. The earlier version
                # appended to the qualifying lists instead, a copy-paste error that
                # left the pit-stop lists empty and corrupted the qualifying data.
                driver_id.append(driver_id_map[driver_name])
                race_id.append(race_id_map[race_key])
                team_id.append(team_id_map[mapped_team])
                stop_number.append(stop_number_text)
                stop_lap.append(stop_lap_text)
                pits_time.append(stop_duration)

    print('COMPLETE')
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally block releases the browser even if scraping fails
    browser.quit()

# Weather Collection

In [40]:
# NOTE(review): these imports sit mid-notebook; pandas is already imported in the
# first cell, and fastf1 would normally be imported there too.
import fastf1
import pandas as pd

# Enable cache (important for performance)
#fastf1.Cache.enable_cache("cache")  # creates a folder "cache" to store data

# Load a session: 2018 Australian Grand Prix, first practice (FP1)
session = fastf1.get_session(2018, 'australia', 'fp1')
# Lap, telemetry, and message loading are switched off; only the remaining session data is fetched
session.load(laps=False, telemetry=False, messages=False)  # downloads and parses the data

# Weather samples for the session are exposed as session.weather_data
# NOTE(review): FastF1 appears to return this as a pandas DataFrame rather than a
# numpy array -- confirm against the FastF1 docs
weather_array = session.weather_data

# Convert to DataFrame
weather_df = pd.DataFrame(weather_array)

# Preview the first rows as plain text
print(weather_df.head())

core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for weather_data
core           INFO 	Finished loading data for 20 drivers: ['2', '3', '5', '7', '8', '9', '10', '11', '14', '16', '18', '20', '27', '28', '31', '33', '35', '44', '55', '77']


                    Time  AirTemp  Humidity  Pressure  Rainfall  TrackTemp  \
0 0 days 00:00:24.964000     25.3      36.7    1020.2     False       38.5   
1 0 days 00:01:24.977000     25.4      36.9    1020.0     False       38.5   
2 0 days 00:02:24.990000     25.2      36.8    1020.1     False       38.5   
3 0 days 00:03:25.002000     25.3      36.1    1020.1     False       38.5   
4 0 days 00:04:25.014000     25.4      35.9    1020.0     False       38.6   

   WindDirection  WindSpeed  
0            330        2.6  
1            308        2.5  
2            305        2.7  
3            305        2.5  
4            325        2.8  


In [41]:
# Display the session weather samples for visual inspection
weather_df

Unnamed: 0,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:00:24.964000,25.3,36.7,1020.2,False,38.5,330,2.6
1,0 days 00:01:24.977000,25.4,36.9,1020.0,False,38.5,308,2.5
2,0 days 00:02:24.990000,25.2,36.8,1020.1,False,38.5,305,2.7
3,0 days 00:03:25.002000,25.3,36.1,1020.1,False,38.5,305,2.5
4,0 days 00:04:25.014000,25.4,35.9,1020.0,False,38.6,325,2.8
5,0 days 00:05:25.026000,25.4,36.6,1020.1,False,38.7,325,2.8
6,0 days 00:06:25.039000,25.4,36.4,1020.0,False,38.9,301,2.4
7,0 days 00:07:25.050000,25.5,36.5,1020.1,False,38.9,334,2.0
8,0 days 00:08:25.062000,25.4,36.5,1020.0,False,39.0,298,1.7
9,0 days 00:09:25.075000,25.5,36.3,1020.0,False,39.4,312,2.0


In [42]:
def aggregate_weather(weather_df):
    """Summarise a session's weather samples as a single feature vector.

    For each of AirTemp, TrackTemp, WindSpeed, Humidity and Pressure the
    mean, min, max and standard deviation are stored as `<column>_<stat>`.
    Two rain features follow: `RainAffected` (1 if any Rainfall sample is
    truthy, otherwise 0) and `RainFraction` (mean of the Rainfall column).

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Must contain the five numeric columns above plus `Rainfall`.

    Returns
    -------
    pandas.Series
        Feature values indexed by feature name, in the order described.
    """
    measured_columns = ('AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'Pressure')
    statistics = ('mean', 'min', 'max', 'std')

    # One entry per (column, statistic) pair, column-major to keep the feature order
    features = {
        f'{column}_{statistic}': getattr(weather_df[column], statistic)()
        for column in measured_columns
        for statistic in statistics
    }

    # Rain flag and proportion
    rainfall = weather_df['Rainfall']
    features['RainAffected'] = int(rainfall.any())
    features['RainFraction'] = rainfall.mean()

    return pd.Series(features)

# Usage: reduce the weather_df samples loaded earlier to one feature vector and print it
session_weather_features = aggregate_weather(weather_df)
print(session_weather_features)


AirTemp_mean        26.781373
AirTemp_min         25.200000
AirTemp_max         28.900000
AirTemp_std          0.917781
TrackTemp_mean      42.300000
TrackTemp_min       38.500000
TrackTemp_max       45.200000
TrackTemp_std        1.937642
WindSpeed_mean       1.917647
WindSpeed_min        0.400000
WindSpeed_max        3.500000
WindSpeed_std        0.681933
Humidity_mean       33.859804
Humidity_min        30.600000
Humidity_max        36.900000
Humidity_std         1.612099
Pressure_mean     1019.061765
Pressure_min      1017.700000
Pressure_max      1020.200000
Pressure_std         0.679843
RainAffected         0.000000
RainFraction         0.000000
dtype: float64
