# Introduction
Jack Wilson
9/23/2025

This notebook outlines the scraping and collection of all raw data used in the model.

# Import Modules

In [2]:
import pandas as pd
import numpy as np
import time, random, re, os, pickle, tempfile, shutil
from math import e

from datetime import timedelta, datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# DataFrame Display Options

In [3]:
# Remove pandas' display truncation so frames render with every column, row and full cell text
for _display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Functions and Mapping

## Constructor Common Name Map

In [4]:
# Constructors common name mapping
#
# Shape: {column_name: {scraped_constructor_name: common_name}}. The outer key is the
# column the mask applies to, so the whole dictionary can be passed straight to
# scrape_url_table(..., id_mask=constructor_mapping). Names that are not listed are
# left unchanged by the scraper.
constructor_mapping = {'team_id': {
    # Red Bull
    'Red Bull Racing Renault': 'Red Bull',
    'Red Bull Renault': 'Red Bull',
    'RBR Renault': 'Red Bull',
    'RBR Cosworth': 'Red Bull',
    'RBR Ferrari': 'Red Bull',
    'Red Bull Racing TAG Heuer': 'Red Bull',
    'Red Bull Racing Honda': 'Red Bull',
    'Red Bull Racing RBPT': 'Red Bull',
    'Red Bull Racing Honda RBPT': 'Red Bull',
    'Red Bull Racing': 'Red Bull',

    # AlphaTauri/Toro Rosso
    'Toro Rosso': 'Toro Rosso',
    'STR Ferrari': 'Toro Rosso',
    'STR Renault': 'Toro Rosso',
    'STR Cosworth': 'Toro Rosso',
    'Toro Rosso Ferrari': 'Toro Rosso',
    'Scuderia Toro Rosso Honda': 'Toro Rosso',
    'AlphaTauri Honda': 'AlphaTauri',
    'AlphaTauri RBPT': 'AlphaTauri',
    'AlphaTauri Honda RBPT': 'AlphaTauri',

    # Racing Bulls
    'RB Honda RBPT': 'Racing Bulls',

    # Ferrari
    'Ferrari': 'Ferrari',
    'Ferrari Jaguar': 'Ferrari',
    'Thin Wall Ferrari': 'Ferrari',

    # Mercedes
    'Mercedes': 'Mercedes',
    'Mercedes-Benz': 'Mercedes',

    # Aston Martin
    'Aston Martin Mercedes': 'Aston Martin',
    'Aston Martin Aramco Mercedes': 'Aston Martin',
    'Aston Butterworth': 'Aston Martin',
    'Aston Martin': 'Aston Martin',

    # McLaren
    'McLaren Ford': 'McLaren',
    'McLaren TAG': 'McLaren',
    'McLaren Honda': 'McLaren',
    'McLaren Peugeot': 'McLaren',
    'McLaren Renault': 'McLaren',
    'McLaren BRM': 'McLaren',
    'McLaren Mercedes': 'McLaren',
    'McLaren Serenissima': 'McLaren',
    'Mclaren BRM': 'McLaren',
    'McLaren Alfa Romeo': 'McLaren',

    # Williams
    'Williams Ford': 'Williams',
    'Williams Renault': 'Williams',
    'Williams Honda': 'Williams',
    'Williams Judd': 'Williams',
    'Williams BMW': 'Williams',
    'Williams Toyota': 'Williams',
    'Williams Cosworth': 'Williams',
    'Williams Mecachrome': 'Williams',
    'Williams Supertec': 'Williams',
    'Williams Mercedes': 'Williams',
    'Frank Williams Racing Cars/Williams': 'Williams',

    # Renault
    'Renault': 'Renault',

    # Alpine
    'Alpine Renault': 'Alpine',

    # Lotus
    'Lotus Renault': 'Lotus',
    'Lotus Ford': 'Lotus',
    'Lotus Climax': 'Lotus',
    'Lotus BRM': 'Lotus',
    'Lotus Honda': 'Lotus',
    'Lotus Judd': 'Lotus',
    'Lotus Lamborghini': 'Lotus',
    'Lotus Mugen Honda': 'Lotus',
    'Lotus Mercedes': 'Lotus',
    'Lotus Cosworth': 'Lotus',
    'Lotus Maserati': 'Lotus',
    'Lotus Pratt & Whitney': 'Lotus',

    # Force India
    'Force India Ferrari': 'Force India',
    'Force India Mercedes': 'Force India',

    # Racing Point
    'Racing Point BWT Mercedes': 'Racing Point',

    # Sauber
    'Sauber': 'Sauber',
    'Sauber Ferrari': 'Sauber',
    'Sauber Petronas': 'Sauber',
    'Sauber BMW': 'Sauber',
    'Sauber Mercedes': 'Sauber',
    'Sauber Ford': 'Sauber',
    'Kick Sauber Ferrari': 'Sauber',

    # Alfa Romeo
    'Alfa Romeo Racing Ferrari': 'Alfa Romeo',
    'Alfa Romeo Ferrari': 'Alfa Romeo',
    'Alfa Romeo': 'Alfa Romeo',

    # Haas
    'Haas Ferrari': 'Haas',
    'Haas F1 Team': 'Haas',

    # Jordan
    'Jordan Ford': 'Jordan',
    'Jordan Peugeot': 'Jordan',
    'Jordan Hart': 'Jordan',
    'Jordan Honda': 'Jordan',
    'Jordan Yamaha': 'Jordan',
    'Jordan Toyota': 'Jordan',
    'Jordan Mugen Honda': 'Jordan',

    # BAR
    'BAR Honda': 'BAR',
    'BAR Supertec': 'BAR',

    # Honda
    'Honda': 'Honda',

    # Benetton
    'Benetton Ford': 'Benetton',
    'Benetton BMW': 'Benetton',
    'Benetton Renault': 'Benetton',
    'Benetton Playlife': 'Benetton',

    # Toyota
    'Toyota': 'Toyota',

    # Jaguar
    'Jaguar Cosworth': 'Jaguar',

    # Stewart
    'Stewart Ford': 'Stewart',

    # BRM
    'BRM': 'BRM',
    'BRM Climax': 'BRM',

    # JBW
    'JBW Maserati': 'JBW',
    'JBW Climax': 'JBW',

    # Cooper
    'Cooper Climax': 'Cooper',
    'Cooper Maserati': 'Cooper',
    'Cooper Bristol': 'Cooper',
    'Cooper Castellotti': 'Cooper',
    'Cooper BRM': 'Cooper',
    'Cooper JAP': 'Cooper',
    'Cooper Alta': 'Cooper',
    'Cooper Borgward': 'Cooper',
    'Cooper Alfa Romeo': 'Cooper',
    'Cooper Ferrari': 'Cooper',
    'Cooper ATS': 'Cooper',
    'Cooper Ford': 'Cooper',
    'Cooper OSCA': 'Cooper',

    # Brabham
    'Brabham Climax': 'Brabham',
    'Brabham Repco': 'Brabham',
    'Brabham Ford': 'Brabham',
    'Brabham Alfa Romeo': 'Brabham',
    'Brabham BMW': 'Brabham',
    'Brabham BRM': 'Brabham',
    'Brabham Judd': 'Brabham',
    'Brabham Yamaha': 'Brabham',

    # Maserati
    'Maserati': 'Maserati',
    'Maserati Offenhauser': 'Maserati',
    'Maserati Milano': 'Maserati',
    'Maserati-Offenhauser': 'Maserati',
    'Maserati OSCA': 'Maserati',
    'Maserati Plate': 'Maserati',

    # Ligier
    'Ligier Matra': 'Ligier',
    'Ligier Ford': 'Ligier',
    'Ligier Renault': 'Ligier',
    'Ligier Megatron': 'Ligier',
    'Ligier Mugen Honda': 'Ligier',

    # Tyrrell
    'Tyrrell Ford': 'Tyrrell',
    'Tyrrell Renault': 'Tyrrell',
    'Tyrrell Honda': 'Tyrrell',
    'Tyrrell Yamaha': 'Tyrrell',
    'Tyrrell Ilmor': 'Tyrrell',

    # Arrows/Footwork
    'Arrows Ford': 'Arrows',
    'Arrows BMW': 'Arrows',
    'Arrows Megatron': 'Arrows',
    'Arrows Yamaha': 'Arrows',
    'Arrows Supertec': 'Arrows',
    'Arrows Asiatech': 'Arrows',
    'Arrows Cosworth': 'Arrows',
    'Arrows': 'Arrows',
    'Footwork Ford': 'Footwork',
    'Footwork Hart': 'Footwork',
    'Footwork Mugen Honda': 'Footwork',
    'Footwork Porsche': 'Footwork',

    # Vanwall
    'Vanwall': 'Vanwall',

    # Wolf
    'Wolf Ford': 'Wolf',
    'Wolf-Williams': 'Wolf',

    # Lola
    'Lola Ford': 'Lola',
    'Lola Lamborghini': 'Lola',
    'Lola Climax': 'Lola',
    'Lola BMW': 'Lola',
    'Lola Hart': 'Lola',
    'Lola Ferrari': 'Lola',

    # March
    'March Ford': 'March',
    'March Judd': 'March',
    'March Ilmor': 'March',
    'March Alfa Romeo': 'March',

    # Minardi
    'Minardi Ford': 'Minardi',
    'Minardi Ferrari': 'Minardi',
    'Minardi Lamborghini': 'Minardi',
    'Minardi Asiatech': 'Minardi',
    'Minardi Cosworth': 'Minardi',
    'Minardi Fondmetal': 'Minardi',
    'Minardi European': 'Minardi',
    'Minardi Hart': 'Minardi',
    'Minardi Motori Moderni': 'Minardi',

    # LDS
    'LDS Alfa Romeo': 'LDS',
    'LDS Climax': 'LDS',
    'LDS Repco': 'LDS',

    # Porsche
    'Porsche (F2)': 'Porsche',
    'Porsche': 'Porsche',
    'Behra-Porsche': 'Porsche',

    # Scirocco
    'Scirocco BRM': 'Scirocco',
    'Scirocco Climax': 'Scirocco',

    # AFM
    'AFM Kuchen': 'AFM',
    'AFM BMW': 'AFM',
    'AFM Bristol': 'AFM',

    # ATS
    'ATS Ford': 'ATS',
    'ATS': 'ATS',
    'ATS BMW': 'ATS',
    'Derrington-Francis ATS': 'ATS',

    # Leyton House
    'Leyton House Judd': 'Leyton House',
    'Leyton House Ilmor': 'Leyton House',

    # Prost
    'Prost Mugen Honda': 'Prost',
    'Prost Peugeot': 'Prost',
    'Prost Acer': 'Prost',

    # Dallara
    'Dallara Judd': 'Dallara',
    'Dallara Ferrari': 'Dallara',
    'Dallara Ford': 'Dallara',

    # Larrousse
    'Larrousse Lamborghini': 'Larrousse',
    'Larrousse Ford': 'Larrousse',

    # Osella
    'Osella Ford': 'Osella',
    'Osella Alfa Romeo': 'Osella',
    'Osella': 'Osella',
    'Osella Hart': 'Osella',

    # Kurtis Kraft
    'Kurtis Kraft Offenhauser': 'Kurtis Kraft',
    'Kurtis Kraft Novi': 'Kurtis Kraft',
    'Kurtis Kraft Cummins': 'Kurtis Kraft',

    # Marussia
    'Marussia Cosworth': 'Marussia',
    'Marussia Ferrari': 'Marussia',

    # Gordini
    'Simca-Gordini': 'Gordini',
    'Gordini': 'Gordini',

    # Connaught
    'Connaught Lea Francis': 'Connaught',
    'Connaught Alta': 'Connaught',

    # Eagle
    'Eagle Climax': 'Eagle',
    'Eagle Weslake': 'Eagle',

    # RAM
    'RAM Ford': 'RAM',
    'RAM Hart': 'RAM',

    # Shadow
    'Shadow Ford': 'Shadow',
    'Shadow Matra': 'Shadow',

    # Matra
    'Matra Ford': 'Matra',
    'Matra': 'Matra',
    'Matra Cosworth': 'Matra',
    'Matra BRM': 'Matra',

    # ERA
    'ERA': 'ERA',
    'ERA Bristol': 'ERA',

    # Spirit
    'Spirit Honda': 'Spirit',
    'Spirit Hart': 'Spirit',

    # Frazer Nash
    'Frazer Nash': 'Frazer Nash',
    'Frazer Nash Bristol': 'Frazer Nash',

    # Emeryson
    'Emeryson Alta': 'Emeryson',
    'Emeryson Climax': 'Emeryson',

    # De Tomaso
    'De Tomaso OSCA': 'De Tomaso',
    'De Tomaso Alfa Romeo': 'De Tomaso',
    'De Tomaso Ford': 'De Tomaso',

    # Gilby
    'Gilby Climax': 'Gilby',
    'Gilby BRM': 'Gilby',

    # Tecno
    'Tecno': 'Tecno',
    'Tecno Cosworth': 'Tecno',

    # Ligier (additional engine variants; the main Ligier group is further up)
    'Ligier Judd': 'Ligier',
    'Ligier Lamborghini': 'Ligier',

    # Euro Brun
    'Euro Brun Judd': 'Euro Brun',
    'Euro Brun Ford': 'Euro Brun',


    # Other
    'No Team': 'Privateer',
    'Toleman Hart': 'Toleman',
    'Venturi Lamborghini': 'Venturi',
    'Onyx Ford': 'Onyx',
    'AGS Ford': 'AGS',
    'Rial Ford': 'Rial',
    'Zakspeed': 'Zakspeed',
    'Theodore Ford': 'Theodore',
    'Deidt Offenhauser': 'Deidt',
    'Sherman Offenhauser': 'Sherman',
    'Schroeder Offenhauser': 'Schroeder',
    'Kuzma Offenhauser': 'Kuzma',
    'Lesovsky Offenhauser': 'Lesovsky',
    'Watson Offenhauser': 'Watson',
    'Phillips Offenhauser': 'Phillips',
    'Epperly Offenhauser': 'Epperly',
    'Trevis Offenhauser': 'Trevis',
    'HRT Cosworth': 'HRT',
    'Virgin Cosworth': 'Virgin',
    'Caterham Renault': 'Caterham',
    'Milano Speluzzi': 'Milano',
    'Turner Offenhauser': 'Turner',
    'Alta': 'Alta',
    'Moore Offenhauser': 'Moore',
    'Nichels Offenhauser': 'Nichels',
    'Marchese Offenhauser': 'Marchese',
    'Stevens Offenhauser': 'Stevens',
    'Langley Offenhauser': 'Langley',
    'Ewing Offenhauser': 'Ewing',
    'Rae Offenhauser': 'Rae',
    'Olson Offenhauser': 'Olson',
    # The misspelled 'Offerhauser' key is kept for backward compatibility; the
    # correctly spelled variant is added so it is normalised as well.
    'Wetteroth Offerhauser': 'Wetteroth',
    'Wetteroth Offenhauser': 'Wetteroth',
    'Snowberger Offenhauser': 'Snowberger',
    'Adams Offenhauser': 'Adams',
    'HWM Alta': 'HWM',
    'Lancia': 'Lancia',
    'Talbot-Lago': 'Talbot-Lago',
    'BRP BRM': 'BRP',
    'Hesketh Ford': 'Hesketh',
    'Hill Ford': 'Hill',
    'Ensign Ford': 'Ensign',
    'Penske Ford': 'Penske',
    'Fittipaldi Ford': 'Fittipaldi',
    'ISO Marlboro Ford': 'ISO Marlboro',
    'Iso Marlboro Ford': 'ISO Marlboro',
    'Surtees Ford': 'Surtees',
    'Parnelli Ford': 'Parnelli',
    'Super Aguri Honda': 'Super Aguri',
    'MRT Mercedes': 'Manor',
    'Brawn Mercedes': 'Brawn',
    'Spyker Ferrari': 'Spyker',
    'MF1 Toyota': 'Midland',
    'Veritas': 'Veritas',
    'Pawl Offenhauser': 'Pawl',
    'Hall Offenhauser': 'Hall',
    'Bromme Offenhauser': 'Bromme',
    'OSCA': 'OSCA',
    'BMW': 'BMW',
    'EMW': 'EMW',
    'Pankratz Offenhauser': 'Pankratz',
    'Bugatti': 'Bugatti',
    'Klenk BMW': 'Klenk',
    'Dunn Offenhauser': 'Dunn',
    'Elder Offenhauser': 'Elder',
    'Christensen Offenhauser': 'Christensen',
    'Sutton Offenhauser': 'Sutton',
    'Tec-Mec Maserati': 'Tec-Mec',
    'Meskowski Offenhauser': 'Meskowski',
    'Scarab': 'Scarab',
    'Ferguson Climax': 'Ferguson',
    'ENB Maserati': 'ENB',
    'Stebro Ford': 'Stebro',
    'Shannon Climax': 'Shannon',
    'Protos Cosworth': 'Protos',
    'Bellasi Ford': 'Bellasi',
    'Eifelland Ford': 'Eifelland',
    'Politoys Ford': 'Politoys',
    'Connew Ford': 'Connew',
    'Trojan Ford': 'Trojan',
    'Amon Ford': 'Amon',
    'Token Ford': 'Token',
    'Lyncar Ford': 'Lyncar',
    'Boro Ford': 'Boro',
    'Kojima Ford': 'Kojima',
    'LEC Ford': 'LEC',
    'Merzario Ford': 'Merzario',
    'Martini Ford': 'Martini',
    'Rebaque Ford': 'Rebaque',
    'AGS Motori Moderni': 'AGS',
    'Coloni Ford': 'Coloni',
    'Zakspeed Yamaha': 'Zakspeed',
    'Fondmetal Ford': 'Fondmetal',
    'Moda Judd': 'Moda',
    'Simtek Ford': 'Simtek',
    'Pacific Ilmor': 'Pacific',
    'Forti Ford': 'Forti',
    'Lambo Lamborghini': 'Modena'
}}

## ID Maps

In [5]:
def load_id_map(path: str, default: dict | list | None = None):
    """
    Load the pickle file ID maps if they exist, otherwise return an empty dictionary or list
    
    """
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    else:
        return {} if default is None else default

def is_file_locked(filepath: str) -> bool:
    """
    Report whether ``filepath`` cannot currently be opened for reading and writing

    Any OSError raised by the open attempt is treated as "locked". That includes a
    path that does not exist, so callers should check for existence first.

    """
    try:
        # Open and immediately release the handle; only the ability to open matters
        open(filepath, 'r+b').close()
    except (IOError, OSError):
        return True
    return False

def save_id_map(path: str, id_map, max_retries: int = 3):
    """
    Save the ID map to a pickle file with retry mechanism for permission errors

    The map is first written directly to ``path``. A PermissionError triggers a
    retry after one second. If the last direct attempt is also denied, the map is
    pickled to a temporary file in the same directory and moved over ``path``.

    Parameters
    ----------
    path : str
        Destination pickle file
    id_map : object
        Picklable object to store (the notebook passes dicts and lists)
    max_retries : int, optional
        Number of direct write attempts. Values below 1 are treated as 1 so the
        map is never silently left unsaved.
        Default: 3

    Raises
    ------
    Exception
        Any non-permission error from the direct write, or whatever error makes
        the temporary-file fallback fail. The partially written temporary file
        is removed before re-raising.

    """
    # Check if file is locked before attempting to save
    if os.path.exists(path) and is_file_locked(path):
        print(f"Warning: {path} appears to be locked by another process")

    # Always make at least one attempt, even if the caller passes max_retries <= 0
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            # NOTE: opening with 'wb' truncates the target first, so this write is not atomic
            with open(path, 'wb') as f:
                pickle.dump(id_map, f)
            return  # Success, exit function

        except PermissionError:
            if attempt < attempts - 1:
                print(f"Permission denied on attempt {attempt + 1}, retrying in 1 second...")
                time.sleep(1)
            # On the final attempt fall through to the temporary-file fallback below

        except Exception as e:
            print(f"Unexpected error saving {path}: {e}")
            raise

    # Every direct attempt was denied: write to a temporary file and move it into place
    temp_name = None
    try:
        temp_dir = os.path.dirname(path)
        with tempfile.NamedTemporaryFile(mode='wb', dir=temp_dir, delete=False, suffix='.pkl') as f:
            temp_name = f.name
            pickle.dump(id_map, f)

        shutil.move(temp_name, path)
        print(f"Successfully saved {path} using temporary file method")

    except Exception as final_e:
        # Do not leave an orphaned temporary pickle next to the target
        if temp_name is not None and os.path.exists(temp_name):
            try:
                os.remove(temp_name)
            except OSError:
                pass
        print(f"Failed to save {path} after {attempts} attempts: {final_e}")
        raise

## Column-Index Mapping

In [6]:
def init_col_map(col_map: dict):
    """
    Build the per-column accumulator used while scraping

    Takes a {column_name: column_index} dictionary and returns
    {column_name: {'index': column_index, 'values': []}}, with a separate empty
    list for every column.

    """
    accumulator = {}
    for name, position in col_map.items():
        accumulator[name] = {'index': position, 'values': []}
    return accumulator

## Scrape URL

In [7]:
def _id_map_path(data_folder, stem):
    """Return the pickle path for the ID map named ``stem``, inside ``data_folder`` when one is given."""
    return f'{data_folder}/{stem}.pkl' if data_folder else f'{stem}.pkl'


def _read_column_value(col_name, index, cells, browser, page_lvl_cols):
    """Return one column's raw value for a row: stripped cell text for int indexes, or the page-level callable's result."""
    if isinstance(index, int):
        return cells[index].text.strip()
    if page_lvl_cols and col_name in page_lvl_cols:
        return index(browser)
    raise ValueError(f"Unsupported index type for {col_name}: {type(index)}")


def _get_or_create_id(id_map, value):
    """Return ``(id, created)`` for ``value``, adding a new sequential ID to ``id_map`` when nothing matches."""
    # Exact hit first: cheap, and never shadowed by a longer key that merely contains the value
    if value in id_map:
        return id_map[value], False

    # Fall back to the substring match. An empty value is a substring of every
    # key, so it is excluded to avoid aliasing it to an arbitrary existing entry.
    if value:
        for existing_key in id_map:
            if value in existing_key:
                return id_map[existing_key], False

    new_id = max(id_map.values()) + 1 if id_map else 1
    id_map[value] = new_id
    return new_id, True


def scrape_url_table(urls: list, total_col: int, col_idx_map: dict, id_cols: list, page_lvl_cols: list = None, data_folder: str = '../data/raw', id_mask: dict = None, auto_url_id: bool = False) -> pd.DataFrame:
    """
    Scrapes a table from a website and returns a dataframe of scraped values

    Every ``<table>`` on each page is read, the first ``<tr>`` of each table is
    skipped, and only rows with exactly ``total_col`` ``<td>`` cells are kept.
    A row is added to the result only once every column of that row has been
    read, so a failure part-way through a row cannot leave columns with
    different lengths.


    Parameters
    ----------
    urls : list
        The webpage URL(s) to scrape
    total_col : int
        Number of columns in the table
    col_idx_map : dict
        A dictionary mapping desired column names to column indices. An int
        reads that table cell, a callable taking the browser is used for
        page-level columns, and None is skipped (no values are collected).
        Example: {'race_id': None, 'start_pos': 1, 'driver_name': 3...}
    id_cols : list
        List of the names of ID columns in the col_idx_map. Their scraped values
        are replaced by integer IDs stored in '<column>_map.pkl'.
    page_lvl_cols : list, optional
        List of columns that need scraping on the page level, index will
        contain path to scrape that data
    data_folder : str, optional
        File path of data folder for saving any ID maps
        Default: ../data/raw
    id_mask : dict, optional
        Dictionary mapping column names to value mapping dictionaries, applied
        to ID columns before the ID lookup
        Example: {'team_name': {'Red Bull Racing': 'Red Bull'}}
    auto_url_id : bool, optional
        Whether to automatically create URL IDs for each row
        Default: False


    Returns
    -------
    pd.DataFrame
        A DataFrame containing the scraped table. If the collected columns have
        unequal lengths (for example a None-index column alongside scraped rows)
        the error is printed and the string 'ERROR: ...' is returned instead.

    """
    # Initiate data mapping
    col_data = init_col_map(col_idx_map)

    # Load URL ID map only if auto_url_id is True
    if auto_url_id:
        url_id_map = load_id_map(_id_map_path(data_folder, 'url_id_map'))

    # ID maps are read once on first use and written back only when they change,
    # instead of a pickle load and save for every cell of every row
    id_maps = {}
    dirty_maps = set()

    def flush_id_maps():
        for name in list(dirty_maps):
            save_id_map(_id_map_path(data_folder, f'{name}_map'), id_maps[name])
            dirty_maps.discard(name)

    # Establish web browser
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()

        for url in urls:

            # Validate URL
            try:
                browser.get(url)
            except Exception as e:
                print(f'URL ERROR: "{url}"\n{e}')
                continue

            # Get or create URL ID only if auto_url_id is True
            if auto_url_id:
                if url in url_id_map:
                    url_id_val = url_id_map[url]
                else:
                    url_id_val = max(url_id_map.values()) + 1 if url_id_map else 1
                    url_id_map[url] = url_id_val
                    # Save the updated URL ID map
                    save_id_map(_id_map_path(data_folder, 'url_id_map'), url_id_map)

            try:
                # Find table data
                for table in browser.find_elements(By.TAG_NAME, 'table'):
                    # [1:] skips the first row of each table
                    for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
                        cells = row.find_elements(By.TAG_NAME, 'td')

                        # Validate table has the right number of columns
                        if len(cells) != total_col:
                            continue

                        # Collect the whole row first so a failure cannot leave a partial row behind
                        row_values = {}
                        for col_name, col_info in col_data.items():
                            index = col_info['index']

                            # Skip indexes with None
                            if index is None:
                                continue

                            value = _read_column_value(col_name, index, cells, browser, page_lvl_cols)

                            # Create IDs and ID maps
                            if col_name in id_cols:
                                # Apply ID mask if provided
                                if id_mask and col_name in id_mask:
                                    value = id_mask[col_name].get(value, value)

                                if col_name not in id_maps:
                                    id_maps[col_name] = load_id_map(_id_map_path(data_folder, f'{col_name}_map'))

                                value, created = _get_or_create_id(id_maps[col_name], value)
                                if created:
                                    dirty_maps.add(col_name)

                            row_values[col_name] = value

                        # Row is complete: commit it to every column at once
                        for col_name, value in row_values.items():
                            col_data[col_name]['values'].append(value)

                        # Append the same URL ID for every row from this URL only if auto_url_id is True
                        if auto_url_id:
                            if 'url_id' not in col_data:
                                col_data['url_id'] = {'index': None, 'values': []}
                            col_data['url_id']['values'].append(url_id_val)

            except Exception as e:
                print(f'NO DATA FOUND ERROR: {e}')

            # Persist any new IDs after each page so an interrupted run keeps them
            flush_id_maps()

    finally:
        # Save pending IDs and always end the WebDriver session, even on errors
        try:
            flush_id_maps()
        finally:
            browser.quit()

    # Convert column data to DataFrame
    df_data = {col_name: col_info['values'] for col_name, col_info in col_data.items()}

    try:
        df = pd.DataFrame(df_data)
    except Exception as e:
        print(f'ARRAY LENGTH ERROR: {e}')
        return(f'ERROR: {e}')

    return df

## Aggregate Columns

In [20]:
def aggregate_columns(df, columns: list = None, boolean_columns: list = None):
    """
    Summarise numeric and boolean columns of a DataFrame into one flat Series

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame
    columns : list
        Numeric column names to summarise. When None, every numeric column of
        ``df`` is used. Names not present in ``df`` are ignored.
    boolean_columns : list
        Boolean column names to summarise. Names not present in ``df`` are ignored.

    Returns
    -------
    pd.Series
        For each numeric column: '<col>_mean', '<col>_min', '<col>_max' and
        '<col>_std'. For each boolean column: '<col>_any' and '<col>_mean'.

    """
    numeric_names = columns
    if numeric_names is None:
        # No explicit selection: fall back to every numeric column
        numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    stats = {}

    for name in numeric_names:
        if name not in df.columns:
            continue
        stats[f'{name}_mean'] = df[name].mean()
        stats[f'{name}_min'] = df[name].min()
        stats[f'{name}_max'] = df[name].max()
        stats[f'{name}_std'] = df[name].std()

    flag_names = boolean_columns if boolean_columns is not None else []
    for name in flag_names:
        if name not in df.columns:
            continue
        stats[f'{name}_any'] = bool(df[name].any())
        stats[f'{name}_mean'] = df[name].mean()

    return pd.Series(stats)

# F1 Site 2001-2017

## Race Links

In [26]:
# Collect the URL of every race listed on the 2001-2017 season results pages
year_begin = 2001
year_end = 2017
race_urls = []

# Establish web browser
browser = webdriver.Chrome()
try:
    browser.maximize_window()

    # Use the years to crawl across season pages; range() leaves year_begin
    # untouched so the cell can be re-run without resetting it
    for season in range(year_begin, year_end + 1):
        browser.get(f"https://www.formula1.com/en/results/{season}/races")

        for table in browser.find_elements(By.TAG_NAME, "table"):
            # [1:] skips the first row of each table
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Rows without data cells carry no race link
                if not cells:
                    continue

                # Url for each specific race
                link = cells[0].find_element(By.TAG_NAME, "a")
                race_urls.append(link.get_attribute("href"))
finally:
    # Always end the WebDriver session, even if a page fails to load or parse
    browser.quit()

# Save links to file
save_id_map('../data/raw/links_2001_2017.pkl', race_urls)

## Race Results

In [None]:
# Inputs for the 2001-2017 race result pages: saved race links plus the table layout
urls = load_id_map('../data/raw/links_2001_2017.pkl')
total_cols = 7
id_cols = ['driver_id']

# Cell 2 feeds both columns: 'driver_id' becomes an integer ID, 'driver_name' keeps the text
col_idx_map = dict(driver_id=2, position=0, driver_name=2, points=6)

# Scrape 2001-2017 results and persist the raw output
df = scrape_url_table(
    urls=urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols)
df.to_csv('../data/raw/race_results_raw_2001-2017.csv', encoding='utf-8', index=False)

# F1 Site 2018+

## Race Links & Circuit Data

In [28]:
# Collect the URL and in-season round number of every race from 2018 to the current year
year_begin = 2018
year_end = datetime.now().year
race_urls = []
round_number = []

# Establish web browser
browser = webdriver.Chrome()
try:
    browser.maximize_window()

    # Use the years to crawl across season pages; range() leaves year_begin
    # untouched so the cell can be re-run without resetting it
    for season in range(year_begin, year_end + 1):
        browser.get(f"https://www.formula1.com/en/results/{season}/races")

        # Round numbers restart at 1 for every season and count listed races in page order
        season_round = 0

        for table in browser.find_elements(By.TAG_NAME, "table"):
            # [1:] skips the first row of each table
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Rows without data cells carry no race link
                if not cells:
                    continue

                # Url for each specific race; read it before appending so both
                # lists always stay the same length
                link = cells[0].find_element(By.TAG_NAME, "a")
                href = link.get_attribute("href")

                season_round += 1
                race_urls.append(href)
                round_number.append(season_round)
finally:
    # Always end the WebDriver session, even if a page fails to load or parse
    browser.quit()

link_data = pd.DataFrame({'race_url': race_urls, 'round_number': round_number})
link_data.to_csv('../data/raw/rounds_raw.csv', encoding='utf-8', index=False)

# Save links to file
save_id_map('../data/raw/links_2018+.pkl', race_urls)

## Race Results

In [None]:
# Inputs for the 2018+ race result pages
urls = load_id_map('../data/raw/links_2018+.pkl')
total_cols = 7
id_cols = ['race_id', 'driver_id', 'circuit_id', 'team_id']
page_lvl_cols = ['race_id', 'circuit_id', 'year', 'race_url', 'circuit_name']

# Integers are table cell positions; callables read page-level values from the browser.
# Key order is the column order of the resulting frame.
col_idx_map = {
    'race_id': lambda page: page.find_element(By.ID, "content-dropdown").text + '_' + page.current_url.split("/")[5],
    'driver_id': 2,
    'circuit_id': lambda page: page.find_element(By.ID, "content-dropdown").text,
    'team_id': 3,
    'year': lambda page: int(page.current_url.split("/")[5]),
    'race_url': lambda page: page.current_url,
    'circuit_name': lambda page: page.find_element(By.ID, "content-dropdown").text,
    'driver_name': 2,
    'team_name': 3,
    'end_position': 0,
    'points': 6,
    'laps_completed': 4,
}

# Scrape 2018+ results and persist the raw output
df = scrape_url_table(
    urls=urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/race_results_raw_2018+.csv', encoding='utf-8', index=False)

## Practices

In [None]:
# Derive the three practice-session URLs from every saved race-result link
urls = load_id_map('../data/raw/links_2018+.pkl')
practice_urls = [
    race_url.replace('/race-result', f'/practice/{session_no}')
    for race_url in urls
    for session_no in [1, 2, 3]
]

# Table layout of a practice results page
total_cols = 6
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id', 'session_type']

# Integers are table cell positions; callables read page-level values from the browser
col_idx_map = {
    'race_id': lambda page: page.find_element(By.ID, "content-dropdown").text + '_' + page.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    # URL segments 9 and 10 are joined into the session label
    'session_type': lambda page: page.current_url.split("/")[9] + page.current_url.split("/")[10],
    'lap_time': 4,
    'lap_count': 5,
    'position': 0,
}

# Scrape practice results
df = scrape_url_table(
    urls=practice_urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)

# NOTE(review): the file name is spelled 'pratice'; kept as-is because other code may read it
df.to_csv('../data/raw/pratice_results_raw.csv', encoding='utf-8', index=False)

## Qualifying

In [None]:
# Derive the qualifying URL from every saved race-result link
urls = load_id_map('../data/raw/links_2018+.pkl')
qualifying_urls = [race_url.replace('/race-result', '/qualifying') for race_url in urls]

# Table layout of a qualifying results page
total_cols = 8
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Integers are table cell positions; callables read page-level values from the browser
col_idx_map = {
    'race_id': lambda page: page.find_element(By.ID, "content-dropdown").text + '_' + page.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'q1_time': 4,
    'q2_time': 5,
    'q3_time': 6,
    'qual_position': 0,
    'qual_laps': 7,
}

# Scrape qualifying results
df = scrape_url_table(
    urls=qualifying_urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/qualifying_results_raw.csv', encoding='utf-8', index=False)

## Starting Grid

In [None]:
# Derive the starting-grid URL from every saved race-result link
urls = load_id_map('../data/raw/links_2018+.pkl')
starting_urls = [race_url.replace('/race-result', '/starting-grid') for race_url in urls]

# Table layout of a starting grid page
total_cols = 5
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Integers are table cell positions; callables read page-level values from the browser
col_idx_map = {
    'race_id': lambda page: page.find_element(By.ID, "content-dropdown").text + '_' + page.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'start_position': 0,
}

# Scrape starting grid results
df = scrape_url_table(
    urls=starting_urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/starting_grid_results_raw.csv', encoding='utf-8', index=False)

## Pit Stops

In [None]:
# Derive the pit-stop summary URL from every saved race-result link
urls = load_id_map('../data/raw/links_2018+.pkl')
pit_urls = [race_url.replace('/race-result', '/pit-stop-summary') for race_url in urls]

# Table layout of a pit stop summary page
total_cols = 8
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Integers are table cell positions; callables read page-level values from the browser
col_idx_map = {
    'race_id': lambda page: page.find_element(By.ID, "content-dropdown").text + '_' + page.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'stop_number': 0,
    'stop_lap': 4,
    'pits_time': 6,
}

# Scrape pit stop results
df = scrape_url_table(
    urls=pit_urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pit_stop_results_raw.csv', encoding='utf-8', index=False)

## Fastest Laps

In [None]:
# Derive the fastest-laps URL from every saved race-result link
urls = load_id_map('../data/raw/links_2018+.pkl')
fastest_lap_urls = [race_url.replace('/race-result', '/fastest-laps') for race_url in urls]

# Table layout of a fastest laps page
total_cols = 8
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Integers are table cell positions; callables read page-level values from the browser
col_idx_map = {
    'race_id': lambda page: page.find_element(By.ID, "content-dropdown").text + '_' + page.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'fastest_lap_time': 6,
    'lap_number': 4,
}

# Scrape fastest lap results
df = scrape_url_table(
    urls=fastest_lap_urls,
    total_col=total_cols,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/fastest_lap_results_raw.csv', encoding='utf-8', index=False)

Permission denied on attempt 1, retrying in 1 second...


# Weather

In [None]:
import fastf1

# Ask FastF1 for every practice, qualifying and race session of each event
# whose race-result link was stored, reporting the ones it cannot resolve.
urls = load_id_map('../data/raw/links_2018+.pkl')
sessions_collected = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
for url in urls:
    # Split once; pieces 5 and 8 of the link are used as the season and the
    # event name (hyphens turned into spaces for FastF1's name matching).
    url_parts = url.split('/')
    year = url_parts[5]
    gp = url_parts[8].replace('-', ' ')
    for s in sessions_collected:
        try:
            # FastF1 documents `year` as an int, while the URL piece is text,
            # so convert here. The conversion sits inside the try so that a
            # malformed link is reported like any other lookup failure.
            session = fastf1.get_session(int(year), gp, s)
        except Exception as exc:
            # Deliberately not bound to `e`: this cell runs at module scope,
            # and Python deletes the `as` name when the handler exits, which
            # would wipe out the `e` imported from `math` at the top of the
            # notebook.
            print(f'No Data: {exc}')

In [40]:

# Enable cache (important for performance)
#fastf1.Cache.enable_cache("cache")  # creates a folder "cache" to store data

# Load a single session as a worked example: 2025 Australian GP, race
session = fastf1.get_session(2025, 'australia', 'race')
session.load(laps=True, telemetry=True, weather=True, messages=True)  # downloads and parses the data

# Pull the loaded data off the session object.
# NOTE(review): per the FastF1 docs, weather_data and race_control_messages
# are already pandas DataFrames, laps is a DataFrame subclass, and car_data
# is a dict of per-driver telemetry rather than a single frame - confirm
# against the installed FastF1 version.
weather_array = session.weather_data
laps_df = session.laps
telemetry_df = session.car_data
messages_df = session.race_control_messages

# Wrap weather and messages in plain DataFrames for downstream use
weather_df = pd.DataFrame(weather_array)
messages_df = pd.DataFrame(messages_df)

# Only the last expression of a cell is rendered, so show a single preview.
# display.max_rows is set to None earlier in the notebook, so a bare
# `laps_df` would print every lap of the race.
laps_df.head(10)

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate
0,0 days 01:13:00.002000,VER,1,0 days 00:01:59.392000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:20.705000,0 days 00:00:55.250000,NaT,0 days 01:12:04.853000,0 days 01:13:00.058000,249.0,292.0,247.0,215.0,False,INTERMEDIATE,1.0,True,Red Bull Racing,0 days 01:11:00.355000,2025-03-16 04:18:22.974,124,2.0,False,,False,False
1,0 days 01:15:49.358000,VER,1,NaT,2.0,1.0,NaT,0 days 01:15:38.205000,0 days 00:00:58.141000,0 days 00:00:37.976000,0 days 00:01:13.205000,0 days 01:13:58.233000,0 days 01:14:36.257000,0 days 01:15:49.446000,66.0,222.0,,210.0,False,INTERMEDIATE,2.0,True,Red Bull Racing,0 days 01:13:00.002000,2025-03-16 04:20:22.621,4,2.0,False,,False,False
2,0 days 01:18:31.526000,VER,1,NaT,3.0,2.0,0 days 01:15:51.658000,0 days 01:18:20.223000,0 days 00:00:56.230000,0 days 00:00:33.683000,0 days 00:01:12.232000,0 days 01:16:45.667000,0 days 01:17:19.489000,0 days 01:18:31.643000,78.0,203.0,,180.0,False,INTERMEDIATE,3.0,False,Red Bull Racing,0 days 01:15:49.358000,2025-03-16 04:23:11.977,4,2.0,False,,False,False
3,0 days 01:21:07.226000,VER,1,NaT,4.0,3.0,0 days 01:18:34.029000,0 days 01:20:56.543000,0 days 00:00:54.351000,0 days 00:00:32.712000,0 days 00:01:08.712000,0 days 01:19:26.041000,0 days 01:19:58.680000,0 days 01:21:07.378000,103.0,240.0,,187.0,False,INTERMEDIATE,4.0,False,Red Bull Racing,0 days 01:18:31.526000,2025-03-16 04:25:54.145,4,2.0,False,,False,False
4,0 days 01:23:30.835000,VER,1,0 days 00:02:23.609000,5.0,4.0,0 days 01:21:09.534000,NaT,0 days 00:00:53.513000,0 days 00:00:32.627000,0 days 00:00:57.469000,0 days 01:22:00.795000,0 days 01:22:33.422000,0 days 01:23:30.891000,79.0,130.0,237.0,194.0,False,INTERMEDIATE,5.0,False,Red Bull Racing,0 days 01:21:07.226000,2025-03-16 04:28:29.845,4,2.0,False,,False,False
5,0 days 01:25:50.489000,VER,1,0 days 00:02:19.654000,6.0,4.0,NaT,NaT,0 days 00:00:47.887000,0 days 00:00:33.228000,0 days 00:00:58.539000,0 days 01:24:18.778000,0 days 01:24:52.006000,0 days 01:25:50.545000,158.0,183.0,208.0,127.0,True,INTERMEDIATE,6.0,False,Red Bull Racing,0 days 01:23:30.835000,2025-03-16 04:30:53.454,4,2.0,False,,False,False
6,0 days 01:28:06.936000,VER,1,0 days 00:02:16.447000,7.0,4.0,NaT,NaT,0 days 00:00:45.898000,0 days 00:00:29.735000,0 days 00:01:00.814000,0 days 01:26:36.443000,0 days 01:27:06.178000,0 days 01:28:06.992000,174.0,197.0,277.0,208.0,True,INTERMEDIATE,7.0,False,Red Bull Racing,0 days 01:25:50.489000,2025-03-16 04:33:13.108,41,2.0,False,,False,False
7,0 days 01:29:43.766000,VER,1,0 days 00:01:36.830000,8.0,4.0,NaT,NaT,0 days 00:00:35.249000,0 days 00:00:20.145000,0 days 00:00:41.436000,0 days 01:28:42.241000,0 days 01:29:02.386000,0 days 01:29:43.822000,257.0,297.0,278.0,245.0,True,INTERMEDIATE,8.0,False,Red Bull Racing,0 days 01:28:06.936000,2025-03-16 04:35:29.555,1,2.0,False,,False,True
8,0 days 01:31:18.081000,VER,1,0 days 00:01:34.315000,9.0,4.0,NaT,NaT,0 days 00:00:33.850000,0 days 00:00:19.608000,0 days 00:00:40.857000,0 days 01:30:17.672000,0 days 01:30:37.280000,0 days 01:31:18.137000,,301.0,279.0,258.0,True,INTERMEDIATE,9.0,False,Red Bull Racing,0 days 01:29:43.766000,2025-03-16 04:37:06.385,1,2.0,False,,False,True
9,0 days 01:32:50.731000,VER,1,0 days 00:01:32.650000,10.0,4.0,NaT,NaT,0 days 00:00:33.117000,0 days 00:00:19.414000,0 days 00:00:40.119000,0 days 01:31:51.254000,0 days 01:32:10.668000,0 days 01:32:50.787000,259.0,302.0,278.0,263.0,True,INTERMEDIATE,10.0,False,Red Bull Racing,0 days 01:31:18.081000,2025-03-16 04:38:40.700,1,2.0,False,,False,True


In [35]:

# Collapse the session's weather readings into one set of summary features
numeric_columns = [
    'AirTemp',
    'TrackTemp',
    'WindSpeed',
    'Humidity',
    'Pressure',
]
boolean_columns = ['Rainfall']

# Numeric and boolean columns are passed separately to aggregate_columns
session_weather_features = aggregate_columns(weather_df, columns=numeric_columns, boolean_columns=boolean_columns)

print(session_weather_features)

AirTemp_mean        15.707865
AirTemp_min              15.1
AirTemp_max              16.6
AirTemp_std           0.37574
TrackTemp_mean      18.942135
TrackTemp_min            18.3
TrackTemp_max            19.4
TrackTemp_std        0.276315
WindSpeed_mean       3.475281
WindSpeed_min             0.7
WindSpeed_max             6.9
WindSpeed_std        1.242267
Humidity_mean       78.421348
Humidity_min             68.0
Humidity_max             92.0
Humidity_std          6.50658
Pressure_mean     1009.901685
Pressure_min           1009.0
Pressure_max           1010.7
Pressure_std         0.444994
Rainfall_any             True
Rainfall_mean        0.325843
dtype: object
