# Introduction
Jack Wilson
9/23/2025

This notebook outlines scraping and collecting of all data raw data used in the model

# Import Modules

In [3]:
import pandas as pd
import numpy as np
import time, random, re, os, gc, shutil, pickle, tempfile, os, sys
from math import e

import fastf1
import logging

from datetime import timedelta, datetime

from selenium import webdriver
from selenium.webdriver.common.by import By

from sklearn.linear_model import LinearRegression

In [4]:
# Path Setup: Connects Notebook to 'src' Package

# Add the project root (one level up from 'notebooks/') to the system path
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.data_functions import load_id_map, save_id_map, init_col_map, scrape_url_table, constructor_mapping

# DataFrame Display Options

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)

# F1 Site 2001-2017

## Race Links

In [26]:
# Establish web browser and initial variables
browser = webdriver.Chrome()
browser.maximize_window()
year_begin = 2001
year_end = 2017
race_urls = []

while year_begin <= year_end:

    # Use the years to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)
    
    table = browser.find_elements(By.TAG_NAME, "table")
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))
    
    year_begin += 1

browser.close()

# Save links to file
load_id_map('../data/raw/links_2001_2017.pkl')
save_id_map('../data/raw/links_2001_2017.pkl', race_urls)

## Race Results

In [None]:
# Establish variables
urls = load_id_map('../data/raw/links_2001_2017.pkl')
total_cols = 7
col_idx_map = {
    'driver_id': 2,
    'position': 0,
    'driver_name': 2,
    'points': 6}
id_cols = ['driver_id']

# Scrape 2001-2017 results
df = scrape_url_table(
    urls,
    total_cols,
    col_idx_map,
    id_cols)
df.to_csv('../data/raw/race_results_raw_2001-2017.csv', encoding='utf-8', index=False)

# F1 Site 2018+

## Race Links & Circuit Data

In [28]:
# Establish web browser and initial variables
browser = webdriver.Chrome()
browser.maximize_window()
year_begin = 2018
year_end = datetime.now().year
race_urls = []
round_number = []

while year_begin <= year_end:
    r = 1  

    # Use the years to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)
    
    table = browser.find_elements(By.TAG_NAME, "table")
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))
            round_number.append(r)
            r += 1 

    year_begin += 1

browser.close()

link_data = pd.DataFrame({'race_url': race_urls, 'round_number': round_number})
link_data.to_csv('../data/raw/rounds_raw.csv', encoding='utf-8', index=False)

# Save links to file
load_id_map('../data/raw/links_2018+.pkl')
save_id_map('../data/raw/links_2018+.pkl', race_urls)

## Race Results

In [None]:
# Establish variables
urls = load_id_map('../data/raw/links_2018+.pkl')
total_cols = 7
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'circuit_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text,
    'team_id': 3,
    'year': lambda browser: int(browser.current_url.split("/")[5]),
    'race_url': lambda browser: browser.current_url,
    'circuit_name': lambda browser: browser.find_element(By.ID, "content-dropdown").text,
    'driver_name': 2,
    'team_name': 3,
    'end_position': 0,
    'points': 6,
    'laps_completed': 4}
id_cols = ['race_id', 'driver_id', 'circuit_id', 'team_id']
page_lvl_cols = ['race_id', 'circuit_id', 'year', 'race_url', 'circuit_name']

# Scrape 2018+ results
df = scrape_url_table(
    urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/race_results_raw_2018+.csv', encoding='utf-8', index=False)

## Practices

In [None]:
# Create practice URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
practice_urls = []
for url in urls:
    for practice_num in [1, 2, 3]:
        practice_url = url.replace('/race-result', f'/practice/{practice_num}')
        practice_urls.append(practice_url)

# Establish other variables
total_cols = 6
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'session_type': lambda browser: browser.current_url.split("/")[9] + browser.current_url.split("/")[10],
    'lap_time': 4,
    'lap_count': 5,
    'position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id', 'session_type']

# Scrape practice results
df = scrape_url_table(
    practice_urls,
    total_cols, col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pratice_results_raw.csv', encoding='utf-8', index=False)

## Qualifying

In [None]:
# Create qualifying URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
qualifying_urls = []
for url in urls:
    qual_url = url.replace('/race-result', '/qualifying')
    qualifying_urls.append(qual_url)

# Establish other variables
total_cols = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'q1_time': 4,
    'q2_time': 5,
    'q3_time': 6,
    'qual_position': 0,
    'qual_laps': 7}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape qualifying results
df = scrape_url_table(
    qualifying_urls,
    total_cols, col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/qualifying_results_raw.csv', encoding='utf-8', index=False)

## Starting Grid

In [None]:
# Create starting grid URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
starting_urls = []
for url in urls:
    start_url = url.replace('/race-result', '/starting-grid')
    starting_urls.append(start_url)

# Establish other variables
total_cols = 5
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'start_position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape starting grid results
df = scrape_url_table(
    starting_urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/starting_grid_results_raw.csv', encoding='utf-8', index=False)

## Pit Stops

In [None]:
# Create pit stop URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
pit_urls = []
for url in urls:
    ps_url = url.replace('/race-result', '/pit-stop-summary')
    pit_urls.append(ps_url)

# Establish other variables
total_cols = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'stop_number': 0,
    'stop_lap': 4,
    'pits_time': 6}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape pit stop results
df = scrape_url_table(
    pit_urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pit_stop_results_raw.csv', encoding='utf-8', index=False)

## Fastest Laps

In [None]:
# Create fastest lap URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
fastest_lap_urls = []
for url in urls:
    fastest_url = url.replace('/race-result', '/fastest-laps')
    fastest_lap_urls.append(fastest_url)

# Establish other variables
total_cols = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'fastest_lap_time': 6,
    'lap_number': 4}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape fastest lap results
df = scrape_url_table(
    fastest_lap_urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/fastest_lap_results_raw.csv', encoding='utf-8', index=False)

Permission denied on attempt 1, retrying in 1 second...


# FastF1

## Create Driver Code Map

In [None]:
# Establish web browser and initial variables
urls = load_id_map('../data/raw/links_2018+.pkl')
practice_urls = []
for url in urls:
    for practice_num in [1, 2, 3]:
        practice_url = url.replace('/race-result', f'/practice/{practice_num}')
        practice_urls.append(practice_url)

browser = webdriver.Chrome()
browser.maximize_window()

driver_code_map = {}
failed_urls = []

# Parse urls
for url in practice_urls:
    try:
        browser.get(url)
        time.sleep(1)
        
        tables = browser.find_elements(By.TAG_NAME, "table")
        if not tables:
            print(f"No tables found on {url}")
            continue
            
        for table in tables:
            rows = table.find_elements(By.TAG_NAME, "tr")[1:]
            
            for row in rows:
                try:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    
                    if len(cells) < 3:
                        continue

                    driver_cell = cells[2]
                    
                    # Get all text content from the cell
                    cell_html = driver_cell.get_attribute('innerHTML')
                    
                    # Extract name and code
                    try:
                        # Get all span text
                        all_spans = driver_cell.find_elements(By.TAG_NAME, 'span')
                        
                        # Collect name parts
                        name_parts = []
                        driver_code = ''
                        
                        for span in all_spans:
                            classes = span.get_attribute('class') or ''
                            text = span.get_attribute('textContent').strip()
                            
                            if not text:
                                continue
                            
                            # Name spans
                            if 'max-lg:hidden' in classes or 'max-md:hidden' in classes:
                                name_parts.append(text)
                            # Code span
                            elif 'md:hidden' in classes and not driver_code:
                                driver_code = text
                        
                        full_name = " ".join(name_parts)
                        
                        # Save if we have both name and code and haven't seen this name
                        if full_name and driver_code and full_name not in driver_code_map:
                            driver_code_map[full_name] = driver_code
                            print(f"{full_name}: {driver_code}")
                    
                    except Exception as e:
                        # Skip rows with parsing issues
                        continue
                        
                except Exception as e:
                    # Skip problematic rows
                    continue
                    
    except Exception as e:
        print(f"Failed to load {url}: {e}")
        failed_urls.append(url)
        continue

browser.close()

print(f"\n{'='*50}")
print(f"Total drivers found: {len(driver_code_map)}")
print(f"Failed URLs: {len(failed_urls)}")
if failed_urls:
    print("Failed URLs:")
    for url in failed_urls[:5]:
        print(f"  - {url}")

# Save driver code map to file
save_id_map('../data/raw/driver_code_map.pkl', driver_code_map)

✓ Lewis Hamilton: HAM
✓ Valtteri Bottas: BOT
✓ Max Verstappen: VER
✓ Kimi Räikkönen: RAI
✓ Sebastian Vettel: VET
✓ Daniel Ricciardo: RIC
✓ Romain Grosjean: GRO
✓ Fernando Alonso: ALO
✓ Carlos Sainz: SAI
✓ Stoffel Vandoorne: VAN
✓ Pierre Gasly: GAS
✓ Sergey Sirotkin: SIR
✓ Nico Hulkenberg: HUL
✓ Esteban Ocon: OCO
✓ Lance Stroll: STR
✓ Sergio Perez: PER
✓ Kevin Magnussen: MAG
✓ Brendon Hartley: HAR
✓ Marcus Ericsson: ERI
✓ Charles Leclerc: LEC
✓ Robert Kubica: KUB
✓ Nicholas Latifi: LAT
✓ Antonio Giovinazzi: GIO
✓ Lando Norris: NOR
✓ Artem Markelov: MAR
✓ Sean Gelael: GEL
✓ Daniil Kvyat: KVY
✓ Alexander Albon: ALB
✓ George Russell: RUS
✓ Naoki Yamamoto: YAM
✓ Jack Aitken: AIT
✓ Roy Nissany: NIS
✓ Pietro Fittipaldi: FIT
✓ Mick Schumacher: MSC
✓ Yuki Tsunoda: TSU
✓ Nikita Mazepin: MAZ
✓ Callum Ilott: ILO
✓ Zhou Guanyu: ZHO
✓ Nyck De Vries: DEV
✓ Juri Vips: VIP
✓ Liam Lawson: LAW
✓ Robert Shwartzman: SHW
✓ Alex Palou: PAL
✓ Theo Pourchaire: POU
✓ Logan Sargeant: SAR
✓ Jack Doohan: DOO
✓ Pat

In [23]:
driver_code_map = load_id_map('../data/raw/driver_code_map.pkl')
driver_id_map = pd.read_pickle('../data/raw/driver_id_map.pkl')

# Compare driver names between the two maps
code_names = set(driver_code_map.keys())
id_names = set(driver_id_map.keys())

# Find names in driver_code_map but not in driver_id_map
missing_in_id = code_names - id_names
print(f"Names in driver_code_map but not in driver_id_map ({len(missing_in_id)}):")
for name in sorted(missing_in_id):
    print(f"  {name} -> {driver_code_map[name]}")

# Find names in driver_id_map but not in driver_code_map
missing_in_code = id_names - code_names
print(f"\nNames in driver_id_map but not in driver_code_map ({len(missing_in_code)}):")
for name in sorted(missing_in_code):
    print(f"  {name} -> {driver_id_map[name]}")

print(f"\nTotal driver_code_map entries: {len(driver_code_map)}")
print(f"Total driver_id_map entries: {len(driver_id_map)}")

Names in driver_code_map but not in driver_id_map (0):

Names in driver_id_map but not in driver_code_map (90):
  Adrian Sutil -> 59
  Alex Yoong -> 26
  Alexander Dunne -> 148
  Alexander Rossi -> 96
  Alexander Wurz -> 48
  Allan McNish -> 31
  Andre Lotterer -> 90
  Anthony Davidson -> 32
  Antonio Pizzonia -> 33
  Arvid Lindblad -> 149
  Bruno Senna -> 71
  Charles Pic -> 81
  Christian Klien -> 40
  Christijan Albers -> 47
  Cristiano da Matta -> 35
  David Coulthard -> 2
  Dino Beganovic -> 146
  Eddie Irvine -> 11
  Enrique Bernoldi -> 21
  Esteban Gutierrez -> 82
  Felipe Massa -> 30
  Felipe Nasr -> 92
  Franck Montagny -> 54
  Gabriel Bortoleto -> 126
  Gaston Mazzacane -> 22
  Giancarlo Fisichella -> 13
  Gianmaria Bruni -> 42
  Giedo van der Garde -> 86
  Giorgio Pantano -> 41
  Heikki Kovalainen -> 58
  Heinz-Harald Frentzen -> 5
  Jacques Villeneuve -> 19
  Jaime Alguersuari -> 66
  Jarno Trulli -> 16
  Jean Alesi -> 9
  Jean-Eric Vergne -> 80
  Jenson Button -> 14
  Jero

## Aggregate Data

In [28]:
x = pd.read_parquet('../data/raw/fastf1/2025_Singapore_Race_laps.parquet')

# Filter rows where trackstatus is 1 and both pitouttime and pitintime are NaT
filtered_data = x[
    (x['TrackStatus'] == '1')
    & x['PitOutTime'].isna()
    & x['PitInTime'].isna()
    & x['IsAccurate'] == True
    & (x['LapTime'] < x['LapTime'].quantile(0.95))
].copy()

filtered_data['LapTime'] = filtered_data['LapTime'].dt.total_seconds()

# Load maps
driver_id_map = pd.read_pickle('../data/raw/driver_id_map.pkl')
race_id_map = pd.read_pickle('../data/raw/race_id_map.pkl')
circuit_id_map = pd.read_pickle('../data/raw/circuit_id_map.pkl')

# Add driver_id column
filtered_data['driver_id'] = filtered_data['Driver'].map(
    lambda name: driver_id_map.get(name[:3].upper(), np.nan)
)

# Add race_id and session columns
if 'race_id' in x.columns:
    filtered_data['race_id'] = x.loc[filtered_data.index, 'race_id']
if 'session' in x.columns:
    filtered_data['session'] = x.loc[filtered_data.index, 'session']

# Add circuit_id column
if 'race_id' in filtered_data.columns:
    filtered_data['circuit_id'] = filtered_data['race_id'].map(
        lambda rid: circuit_id_map.get(
            ''.join([c for c in race_id_map.get(rid, '') if not c.isdigit() and c != '_']).strip(),
            np.nan
        ) if rid in race_id_map else np.nan
    )

# Check if we have any data after filtering
if len(filtered_data) == 0:
    print("No data found after filtering. Check your filter conditions.")
    print(f"Original data shape: {x.shape}")
    print(f"TrackStatus=1 count: {len(x[x['TrackStatus'] == 1])}")
    print(f"PitOutTime isna count: {len(x[x['PitOutTime'].isna()])}")
    print(f"PitInTime isna count: {len(x[x['PitInTime'].isna()])}")
else:
    print(f"Found {len(filtered_data)} rows after filtering")

# Helper functions for summary computation
def get_degradation_rate(lap_times, lap_numbers):
    if len(lap_times) > 3:
        X = np.array(lap_numbers).reshape(-1, 1)
        y = np.array(lap_times)
        model = LinearRegression().fit(X, y)
        return model.coef_[0]
    return np.nan

def compute_compound_stats(comp_data, compound):
    if comp_data.empty:
        return {
            f'avg_pace_{compound.lower()}': np.nan,
            f'std_pace_{compound.lower()}': np.nan,
            f'laps_on_{compound.lower()}': 0,
            f'deg_rate_{compound.lower()}': np.nan,
        }
    
    lap_times = comp_data['LapTime'].values
    lap_numbers = comp_data['LapNumber'].values
    
    return {
        f'avg_pace_{compound.lower()}': np.nanmean(lap_times),
        f'std_pace_{compound.lower()}': np.nanstd(lap_times),
        f'laps_on_{compound.lower()}': len(lap_times),
        f'deg_rate_{compound.lower()}': get_degradation_rate(lap_times, lap_numbers),
    }

# Build summary dataframe
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
summaries = []

for driver, group in filtered_data.groupby('Driver'):
    summary = {'driver_name': driver}
    
    # Add IDs from the first row of this driver's group (same for all rows)
    if 'driver_id' in group.columns:
        summary['driver_id'] = group['driver_id'].iloc[0]
    if 'race_id' in group.columns:
        summary['race_id'] = group['race_id'].iloc[0]
    if 'circuit_id' in group.columns:
        summary['circuit_id'] = group['circuit_id'].iloc[0]
    if 'session' in group.columns:
        summary['session'] = group['session'].iloc[0]
    
    # Add compound-specific stats
    for comp in compounds:
        comp_data = group[group['Compound'] == comp]
        summary.update(compute_compound_stats(comp_data, comp))
    
    summaries.append(summary)

result = pd.DataFrame(summaries)
result

Found 1180 rows after filtering


Unnamed: 0,driver_name,driver_id,race_id,circuit_id,session,avg_pace_soft,std_pace_soft,laps_on_soft,deg_rate_soft,avg_pace_medium,std_pace_medium,laps_on_medium,deg_rate_medium,avg_pace_hard,std_pace_hard,laps_on_hard,deg_rate_hard,avg_pace_intermediate,std_pace_intermediate,laps_on_intermediate,deg_rate_intermediate,avg_pace_wet,std_pace_wet,laps_on_wet,deg_rate_wet
0,ALB,,167,,Race,98.376167,4.715031,18,-0.469674,99.533585,3.187425,41,-0.072078,,,0,,,,0,,,,0,
1,ALO,,167,,Race,99.135769,2.682735,26,-0.088043,97.925529,4.407023,34,-0.194301,,,0,,,,0,,,,0,
2,ANT,,167,,Race,,,0,,98.304652,2.216822,23,-0.183065,97.172278,3.012149,36,-0.047907,,,0,,,,0,
3,BEA,,167,,Race,,,0,,99.463955,2.736229,22,-0.144508,98.132789,2.906845,38,-0.103551,,,0,,,,0,
4,BOR,,167,,Race,,,0,,101.087083,4.192109,12,-0.431698,99.11817,3.808767,47,-0.018198,,,0,,,,0,
5,COL,,167,,Race,100.676769,3.755982,13,-0.341254,99.005711,2.517406,45,-0.02223,,,0,,,,0,,,,0,
6,GAS,,167,,Race,98.1532,5.213129,10,-0.462618,100.21313,3.667841,23,-0.207696,99.402731,3.824296,26,0.058163,,,0,,,,0,
7,HAD,,167,,Race,99.721105,2.75283,19,-0.137761,,,0,,98.7292,2.958193,40,-0.066699,,,0,,,,0,
8,HAM,,167,,Race,96.814357,4.200845,14,-0.093629,98.779043,2.508297,23,-0.105267,97.1708,4.017064,20,-0.275383,,,0,,,,0,
9,HUL,,167,,Race,98.061882,6.046062,17,-0.318838,99.598708,2.76241,24,-0.118865,100.818368,6.393824,19,0.16363,,,0,,,,0,


In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

x = pd.read_parquet('../data/raw/fastf1/2025_Singapore_Race_laps.parquet')

# Filter rows where trackstatus is 1 and both pitouttime and pitintime are NaT
filtered_data = x[
    (x['TrackStatus'] == '1')
    & x['PitOutTime'].isna()
    & x['PitInTime'].isna()
    & x['IsAccurate'] == True
    & (x['LapTime'] < x['LapTime'].quantile(0.95))
].copy()

filtered_data['LapTime'] = filtered_data['LapTime'].dt.total_seconds()

# Load maps
driver_id_map = pd.read_pickle('../data/raw/driver_id_map.pkl')
race_id_map = pd.read_pickle('../data/raw/race_id_map.pkl')
circuit_id_map = pd.read_pickle('../data/raw/circuit_id_map.pkl')

# Debug: Check what we have
print("Sample driver names in data:", filtered_data['Driver'].unique()[:5])
print("Sample driver_id_map keys:", list(driver_id_map.keys())[:5])

# Add driver_id column - try exact match first, then fuzzy matching if needed
filtered_data['driver_id'] = filtered_data['Driver'].map(
    lambda name: driver_id_map.get(name, np.nan)
)

# Add race_id and session columns
if 'race_id' in x.columns:
    filtered_data['race_id'] = x.loc[filtered_data.index, 'race_id']
if 'session' in x.columns:
    filtered_data['session'] = x.loc[filtered_data.index, 'session']

# Add circuit_id column
if 'race_id' in filtered_data.columns:
    filtered_data['circuit_id'] = filtered_data['race_id'].map(
        lambda rid: circuit_id_map.get(
            ''.join([c for c in race_id_map.get(rid, '') if not c.isdigit() and c != '_']).strip(),
            np.nan
        ) if rid in race_id_map else np.nan
    )

# Check if we have any data after filtering
if len(filtered_data) == 0:
    print("No data found after filtering. Check your filter conditions.")
    print(f"Original data shape: {x.shape}")
    print(f"TrackStatus=1 count: {len(x[x['TrackStatus'] == 1])}")
    print(f"PitOutTime isna count: {len(x[x['PitOutTime'].isna()])}")
    print(f"PitInTime isna count: {len(x[x['PitInTime'].isna()])}")
else:
    print(f"Found {len(filtered_data)} rows after filtering")

# Helper functions for summary computation
def get_degradation_rate(lap_times, lap_numbers):
    if len(lap_times) > 3:
        X = np.array(lap_numbers).reshape(-1, 1)
        y = np.array(lap_times)
        model = LinearRegression().fit(X, y)
        return model.coef_[0]
    return np.nan

def compute_compound_stats(comp_data, compound):
    if comp_data.empty:
        return {
            f'avg_pace_{compound.lower()}': np.nan,
            f'std_pace_{compound.lower()}': np.nan,
            f'laps_on_{compound.lower()}': 0,
            f'deg_rate_{compound.lower()}': np.nan,
        }
    
    lap_times = comp_data['LapTime'].values
    lap_numbers = comp_data['LapNumber'].values
    
    return {
        f'avg_pace_{compound.lower()}': np.nanmean(lap_times),
        f'std_pace_{compound.lower()}': np.nanstd(lap_times),
        f'laps_on_{compound.lower()}': len(lap_times),
        f'deg_rate_{compound.lower()}': get_degradation_rate(lap_times, lap_numbers),
    }

# Build summary dataframe
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
summaries = []

for driver, group in filtered_data.groupby('Driver'):
    summary = {'driver_name': driver}
    
    # Add IDs from the first row of this driver's group (same for all rows)
    if 'driver_id' in group.columns:
        summary['driver_id'] = group['driver_id'].iloc[0]
    if 'race_id' in group.columns:
        summary['race_id'] = group['race_id'].iloc[0]
    if 'circuit_id' in group.columns:
        summary['circuit_id'] = group['circuit_id'].iloc[0]
    if 'session' in group.columns:
        summary['session'] = group['session'].iloc[0]
    
    # Add compound-specific stats
    for comp in compounds:
        comp_data = group[group['Compound'] == comp]
        summary.update(compute_compound_stats(comp_data, comp))
    
    summaries.append(summary)

result = pd.DataFrame(summaries)
result

Sample driver names in data: ['RUS' 'VER' 'NOR' 'PIA' 'ANT']
Sample driver_id_map keys: ['Michael Schumacher', 'David Coulthard', 'Rubens Barrichello', 'Nick Heidfeld', 'Heinz-Harald Frentzen']
Found 1180 rows after filtering


Unnamed: 0,driver_name,driver_id,race_id,circuit_id,session,avg_pace_soft,std_pace_soft,laps_on_soft,deg_rate_soft,avg_pace_medium,std_pace_medium,laps_on_medium,deg_rate_medium,avg_pace_hard,std_pace_hard,laps_on_hard,deg_rate_hard,avg_pace_intermediate,std_pace_intermediate,laps_on_intermediate,deg_rate_intermediate,avg_pace_wet,std_pace_wet,laps_on_wet,deg_rate_wet
0,ALB,,167,,Race,98.376167,4.715031,18,-0.469674,99.533585,3.187425,41,-0.072078,,,0,,,,0,,,,0,
1,ALO,,167,,Race,99.135769,2.682735,26,-0.088043,97.925529,4.407023,34,-0.194301,,,0,,,,0,,,,0,
2,ANT,,167,,Race,,,0,,98.304652,2.216822,23,-0.183065,97.172278,3.012149,36,-0.047907,,,0,,,,0,
3,BEA,,167,,Race,,,0,,99.463955,2.736229,22,-0.144508,98.132789,2.906845,38,-0.103551,,,0,,,,0,
4,BOR,,167,,Race,,,0,,101.087083,4.192109,12,-0.431698,99.11817,3.808767,47,-0.018198,,,0,,,,0,
5,COL,,167,,Race,100.676769,3.755982,13,-0.341254,99.005711,2.517406,45,-0.02223,,,0,,,,0,,,,0,
6,GAS,,167,,Race,98.1532,5.213129,10,-0.462618,100.21313,3.667841,23,-0.207696,99.402731,3.824296,26,0.058163,,,0,,,,0,
7,HAD,,167,,Race,99.721105,2.75283,19,-0.137761,,,0,,98.7292,2.958193,40,-0.066699,,,0,,,,0,
8,HAM,,167,,Race,96.814357,4.200845,14,-0.093629,98.779043,2.508297,23,-0.105267,97.1708,4.017064,20,-0.275383,,,0,,,,0,
9,HUL,,167,,Race,98.061882,6.046062,17,-0.318838,99.598708,2.76241,24,-0.118865,100.818368,6.393824,19,0.16363,,,0,,,,0,


In [29]:
y = load_id_map('../data/raw/driver_id_map.pkl')
y

{'Michael Schumacher': 1,
 'David Coulthard': 2,
 'Rubens Barrichello': 3,
 'Nick Heidfeld': 4,
 'Heinz-Harald Frentzen': 5,
 'Kimi Räikkönen': 6,
 'Olivier Panis': 7,
 'Luciano Burti': 8,
 'Jean Alesi': 9,
 'Jos Verstappen': 10,
 'Eddie Irvine': 11,
 'Fernando Alonso': 12,
 'Giancarlo Fisichella': 13,
 'Jenson Button': 14,
 'Juan Pablo Montoya': 15,
 'Jarno Trulli': 16,
 'Mika Hakkinen': 17,
 'Ralf Schumacher': 18,
 'Jacques Villeneuve': 19,
 'Tarso Marques': 20,
 'Enrique Bernoldi': 21,
 'Gaston Mazzacane': 22,
 'Pedro de la Rosa': 23,
 'Ricardo Zonta': 24,
 'Tomas Enge': 25,
 'Alex Yoong': 26,
 'Mark Webber': 27,
 'Mika Salo': 28,
 'Takuma Sato': 29,
 'Felipe Massa': 30,
 'Allan McNish': 31,
 'Anthony Davidson': 32,
 'Antonio Pizzonia': 33,
 'Justin Wilson': 34,
 'Cristiano da Matta': 35,
 'Ralph Firman': 36,
 'Nicolas Kiesa': 37,
 'Zsolt Baumgartner': 38,
 'Marc Gene': 39,
 'Christian Klien': 40,
 'Giorgio Pantano': 41,
 'Gianmaria Bruni': 42,
 'Timo Glock': 43,
 'Narain Karthikeya

## Weather

In [None]:
# Initialize urls and sessions
urls = load_id_map('../data/raw/links_2018+.pkl')
sessions_collected = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
fastf1.Cache.disabled = True

# Suppress FastF1 logging output
fastf1_logger = logging.getLogger('fastf1')
fastf1_logger.setLevel(logging.CRITICAL)

# Initialize empty DataFrames to collect all data
all_laps = pd.DataFrame()
all_weather = pd.DataFrame()
all_messages = pd.DataFrame()

race_id_map = load_id_map('../data/raw/race_id_map.pkl')

for url_idx, url in enumerate(urls):
    
    # Sort year and grand prix from the url
    year = int(url.split('/')[5])
    gp = url.split('/')[8].replace('-', ' ').title().replace('Emilia Romagna', 'Emilia-Romagna')
    
    print(f"\nProcessing race {url_idx + 1}/{len(urls)}: {year} {gp}")
    
    for s in sessions_collected:
        max_retries = 5
        retry_count = 0
        success = False
        
        laps_df = None
        weather_df = None
        messages_df = None
        session = None

        # Load session with retry
        while retry_count < max_retries and not success:
            try:
                gc.collect()
                
                session = fastf1.get_session(year, gp, s)
                if retry_count > 0:
                    time.sleep(3)
                session.load(laps=True, telemetry=False, weather=True, messages=True)
                
                # Extract data with error handling
                try:
                    laps_df = session.laps.copy() if hasattr(session, 'laps') and session.laps is not None else None
                except:
                    laps_df = None
                try:
                    weather_df = pd.DataFrame(session.weather_data) if hasattr(session, 'weather_data') and session.weather_data is not None else None
                except:
                    weather_df = None
                try:
                    messages_df = pd.DataFrame(session.race_control_messages) if hasattr(session, 'race_control_messages') and session.race_control_messages is not None else None
                except:
                    messages_df = None
                
                success = True
                print(f"  Loaded {s}")

            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    sleep_time = 2 ** retry_count
                    print(f'  Retry {retry_count}/{max_retries} for {year} {gp} {s}: {e}. Sleeping for {sleep_time}s...')
                    time.sleep(sleep_time)
                else:
                    print(f'  Failed after {max_retries} retries for {year} {gp} {s}: {e}')
            
            finally:
                # Delete session object immediately to release resources
                if session is not None:
                    del session
                gc.collect()
        
        if not success:
            continue
        
        # Get race ID
        race_key = f'{gp}_{year}'
        race_id_value = race_id_map.get(race_key)
        if race_id_value is None:
            print(f'  Warning: No race_id found for: {race_key}')
        
        # Add race_id and session columns to each DataFrame and merge
        if laps_df is not None and not laps_df.empty:
            laps_df['race_id'] = race_id_value
            laps_df['session'] = s
            all_laps = pd.concat([all_laps, laps_df], ignore_index=True)
        if weather_df is not None and not weather_df.empty:
            weather_df['race_id'] = race_id_value
            weather_df['session'] = s
            all_weather = pd.concat([all_weather, weather_df], ignore_index=True)
        if messages_df is not None and not messages_df.empty:
            messages_df['race_id'] = race_id_value
            messages_df['session'] = s
            all_messages = pd.concat([all_messages, messages_df], ignore_index=True)

        # Clean up DataFrames after each session
        del laps_df, weather_df, messages_df
    
    # Force garbage collection after each race
    gc.collect()
    
    # Save intermediate results every 5 races
    if (url_idx + 1) % 5 == 0:
        print(f"Saving intermediate results after race {url_idx + 1}...")
        all_laps.to_csv('../data/raw/lap_data_raw_temp.csv', index=False)
        all_weather.to_csv('../data/raw/weather_data_raw_temp.csv', index=False)
        all_messages.to_csv('../data/raw/messages_data_raw_temp.csv', index=False)
        print(f"  Saved: {len(all_laps)} laps, {len(all_weather)} weather records, {len(all_messages)} messages")
    
# Save final DataFrames to CSVs
all_laps.to_csv('../data/raw/lap_data_raw.csv', index=False)
all_weather.to_csv('../data/raw/weather_data_raw.csv', index=False)
all_messages.to_csv('../data/raw/messages_data_raw.csv', index=False)


Processing race 1/167: 2018 Australia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 2/167: 2018 Bahrain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 3/167: 2018 China
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 4/167: 2018 Azerbaijan


In [18]:
import os
import sys
import fastf1
import pandas as pd
import time
import gc
import subprocess
import pickle

# If we're inside a subprocess, these will be passed in as args
if len(sys.argv) > 1 and sys.argv[1] == "--race":
    # Subprocess mode: handle a single race
    year = int(sys.argv[2])
    gp = sys.argv[3]
    race_id_value = sys.argv[4]
    cache_dir = sys.argv[5]
    output_dir = sys.argv[6]

    sessions = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
    fastf1.Cache.enable_cache(cache_dir)

    for s in sessions:
        try:
            gc.collect()
            session = fastf1.get_session(year, gp, s)
            session.load(laps=True, telemetry=False, weather=True, messages=True)
            print(f" Loaded {year} {gp} {s}")

            # Extract safely
            laps = getattr(session, 'laps', pd.DataFrame())
            weather = getattr(session, 'weather_data', pd.DataFrame())
            messages = getattr(session, 'race_control_messages', pd.DataFrame())

            for df in [laps, weather, messages]:
                if isinstance(df, pd.DataFrame) and not df.empty:
                    df["race_id"] = race_id_value
                    df["session"] = s

            # Save
            prefix = f"{output_dir}/{year}_{gp}_{s}"
            if not laps.empty:
                laps.to_parquet(f"{prefix}_laps.parquet")
            if not weather.empty:
                weather.to_parquet(f"{prefix}_weather.parquet")
            if not messages.empty:
                messages.to_parquet(f"{prefix}_messages.parquet")

            print(f" Saved {year} {gp} {s}")
            del session
            gc.collect()
            time.sleep(2)

        except Exception as e:
            print(f" Error for {year} {gp} {s}: {e}")
            time.sleep(3)
    sys.exit(0)

# === Main controller ===
CACHE_DIR = "../data/cache"
OUTPUT_DIR = "../data/raw/fastf1"
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

urls = load_id_map('../data/raw/links_2018+.pkl')
race_id_map = load_id_map('../data/raw/race_id_map.pkl')

for idx, url in enumerate(urls):
    year = int(url.split('/')[5])
    gp = (
        url.split('/')[8]
        .replace('-', ' ')
        .title()
        .replace('Emilia Romagna', 'Emilia-Romagna')
    )
    race_key = f"{gp}_{year}"
    race_id_value = race_id_map.get(race_key, "unknown")

    print(f"\n=== {idx+1}/{len(urls)} | {year} {gp} ===")

    # Launch a new Python process for this race
    subprocess.run(
        [sys.executable, __file__, "--race", str(year), gp, str(race_id_value), CACHE_DIR, OUTPUT_DIR],
        check=False,
    )

    # Pause briefly between races
    time.sleep(5)



=== 1/167 | 2018 Australia ===


NameError: name '__file__' is not defined

In [11]:
import fastf1
import pandas as pd
import logging
import time
import gc

# Initialize urls and sessions
urls = load_id_map('../data/raw/links_2018+.pkl')
sessions_collected = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
fastf1.Cache.disabled = True

# Suppress FastF1 logging output
fastf1_logger = logging.getLogger('fastf1')
fastf1_logger.setLevel(logging.CRITICAL)

# Initialize empty DataFrames to collect all data
all_laps = pd.DataFrame()
all_weather = pd.DataFrame()
all_messages = pd.DataFrame()

race_id_map = load_id_map('../data/raw/race_id_map.pkl')

for url_idx, url in enumerate(urls):
    
    # Sort year and grand prix from the url
    year = int(url.split('/')[5])
    gp = url.split('/')[8].replace('-', ' ').title().replace('Emilia Romagna', 'Emilia-Romagna')
    
    print(f"\nProcessing race {url_idx + 1}/{len(urls)}: {year} {gp}")
    
    for s in sessions_collected:
        max_retries = 5
        retry_count = 0
        success = False
        
        laps_df = None
        weather_df = None
        messages_df = None
        session = None

        # Load session with retry
        while retry_count < max_retries and not success:
            try:
                gc.collect()
                
                session = fastf1.get_session(year, gp, s)
                session.load(laps=True, telemetry=False, weather=True, messages=True)
                
                # Extract data with error handling
                try:
                    laps_df = session.laps.copy() if hasattr(session, 'laps') and session.laps is not None else None
                except:
                    laps_df = None
                try:
                    weather_df = pd.DataFrame(session.weather_data) if hasattr(session, 'weather_data') and session.weather_data is not None else None
                except:
                    weather_df = None
                try:
                    messages_df = pd.DataFrame(session.race_control_messages) if hasattr(session, 'race_control_messages') and session.race_control_messages is not None else None
                except:
                    messages_df = None
                
                success = True
                print(f"  Loaded {s}")

            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    sleep_time = 2 ** retry_count
                    print(f'  Retry {retry_count}/{max_retries} for {year} {gp} {s}: {e}. Sleeping for {sleep_time}s...')
                    time.sleep(sleep_time)
                else:
                    print(f'  Failed after {max_retries} retries for {year} {gp} {s}: {e}')
            
            finally:
                # Delete session object immediately to release resources
                if session is not None:
                    del session
                gc.collect()
        
        if not success:
            continue
        
        # Get race ID
        race_key = f'{gp}_{year}'
        race_id_value = race_id_map.get(race_key)
        if race_id_value is None:
            print(f'  Warning: No race_id found for: {race_key}')
        
        # Add race_id and session columns to each DataFrame and merge
        if laps_df is not None and not laps_df.empty:
            laps_df['race_id'] = race_id_value
            laps_df['session'] = s
            all_laps = pd.concat([all_laps, laps_df], ignore_index=True)
        if weather_df is not None and not weather_df.empty:
            weather_df['race_id'] = race_id_value
            weather_df['session'] = s
            all_weather = pd.concat([all_weather, weather_df], ignore_index=True)
        if messages_df is not None and not messages_df.empty:
            messages_df['race_id'] = race_id_value
            messages_df['session'] = s
            all_messages = pd.concat([all_messages, messages_df], ignore_index=True)

        # Clean up DataFrames after each session
        del laps_df, weather_df, messages_df
        
        # Add delay between sessions to avoid rate limiting
        time.sleep(3)
    
    # Force garbage collection after each race
    gc.collect()
    
    # Longer delay between races
    time.sleep(5)
    
    # Save intermediate results every 5 races
    if (url_idx + 1) % 5 == 0:
        print(f"Saving intermediate results after race {url_idx + 1}...")
        all_laps.to_csv('../data/raw/lap_data_raw_temp.csv', index=False)
        all_weather.to_csv('../data/raw/weather_data_raw_temp.csv', index=False)
        all_messages.to_csv('../data/raw/messages_data_raw_temp.csv', index=False)
        print(f"  Saved: {len(all_laps)} laps, {len(all_weather)} weather records, {len(all_messages)} messages")
    
# Save final DataFrames to CSVs
all_laps.to_csv('../data/raw/lap_data_raw.csv', index=False)
all_weather.to_csv('../data/raw/weather_data_raw.csv', index=False)
all_messages.to_csv('../data/raw/messages_data_raw.csv', index=False)


Processing race 1/167: 2018 Australia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 2/167: 2018 Bahrain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 3/167: 2018 China
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 4/167: 2018 Azerbaijan
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 5/167: 2018 Spain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 5...
  Saved: 13193 laps, 2186 weather records, 1001 messages

Processing race 6/167: 2018 Monaco
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 7/167: 2018 Canada
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 8/167: 2018 France
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 9/16

  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 15/167: 2018 Singapore
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 15...
  Saved: 42036 laps, 6709 weather records, 2666 messages

Processing race 16/167: 2018 Russia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 17/167: 2018 Japan
  Loaded FP1
  Loaded FP2


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 18/167: 2018 United States
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 19/167: 2018 Mexico
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 20/167: 2018 Brazil
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 20...
  Saved: 55359 laps, 9187 weather records, 3459 messages

Processing race 21/167: 2018 Abu Dhabi
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 22/167: 2019 Australia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 23/167: 2019 Bahrain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 24/167: 2019 China
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 25/167: 2019 Azerbaijan
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 25...
  Saved: 68148 laps, 11698 weather records, 4273 messages

Processing race 26/167: 2019 Spain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 27/167: 2019 Monaco
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 28/167: 2019 Canada
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 29/167: 2019 France
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 30/167: 2019 Austria
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


Saving intermediate results after race 30...
  Saved: 84657 laps, 14213 weather records, 5178 messages

Processing race 31/167: 2019 Great Britain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 32/167: 2019 Germany
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying
  Loaded Race

Processing race 33/167: 2019 Hungary
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 34/167: 2019 Belgium
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 35/167: 2019 Italy
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 35...
  Saved: 97870 laps, 16744 weather records, 6032 messages

Processing race 36/167: 2019 Singapore
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 37/167: 2019 Russia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 38/167: 2019 Japan
  Loaded FP1


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 39/167: 2019 Mexico
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying
  Loaded Race

Processing race 40/167: 2019 United States
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 40...
  Saved: 110061 laps, 19022 weather records, 6843 messages

Processing race 41/167: 2019 Brazil
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 42/167: 2019 Abu Dhabi
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 43/167: 2020 Austria
  Retry 1/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 2s...
  Retry 2/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 4s...
  Retry 3/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 8s...
  Retry 4/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 16s...
  Failed after 5 retries for 2020 Austria FP1: Failed to load any schedule data.
  Retry 1/5 for 2020 Austria FP2: Failed to load a

KeyboardInterrupt: 

In [7]:
# Enable cache (important for performance)
# First ensure the cache directory exists
#import os
#cache_dir = "cache"
#if not os.path.exists(cache_dir):
    #os.makedirs(cache_dir)
    #print(f"Created cache directory: {cache_dir}")

#fastf1.Cache.enable_cache(cache_dir)  # uses the created "cache" folder to store data

# Load a session: example Bahrain GP 2023 Qualifying
session = fastf1.get_session(2020, 'styria', 'fp1')
session.load(weather=True)  # only load weather data as requested

# Weather data is stored in session.weather_data (a structured numpy array)
weather_array = session.weather_data

# Convert weather data to DataFrame
weather_df = pd.DataFrame(weather_array)

# Display weather data
weather_df

# Save weather dataframe to CSV file
weather_df.to_csv("example_weather.csv", index=False)


core           INFO 	Loading data for Styrian Grand Prix - Practice 1 [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 

In [35]:

# Usage example with weather data
numeric_columns = ['AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'Pressure']
boolean_columns = ['Rainfall']

session_weather_features = aggregate_columns(
    weather_df, 
    columns=numeric_columns, 
    boolean_columns=boolean_columns
)

print(session_weather_features)

AirTemp_mean        15.707865
AirTemp_min              15.1
AirTemp_max              16.6
AirTemp_std           0.37574
TrackTemp_mean      18.942135
TrackTemp_min            18.3
TrackTemp_max            19.4
TrackTemp_std        0.276315
WindSpeed_mean       3.475281
WindSpeed_min             0.7
WindSpeed_max             6.9
WindSpeed_std        1.242267
Humidity_mean       78.421348
Humidity_min             68.0
Humidity_max             92.0
Humidity_std          6.50658
Pressure_mean     1009.901685
Pressure_min           1009.0
Pressure_max           1010.7
Pressure_std         0.444994
Rainfall_any             True
Rainfall_mean        0.325843
dtype: object
