# Introduction
Jack Wilson
9/23/2025

This notebook outlines scraping and collecting of all data raw data used in the model

# Import Modules

In [2]:
import pandas as pd
import numpy as np
import time, random, re, os, gc, shutil, pickle, tempfile, os, sys
from math import e

import fastf1
import logging

from datetime import timedelta, datetime

from selenium import webdriver
from selenium.webdriver.common.by import By

In [3]:
# Path Setup: Connects Notebook to 'src' Package

# Add the project root (one level up from 'notebooks/') to the system path
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.data_functions import load_id_map, save_id_map, init_col_map, scrape_url_table, constructor_mapping

# DataFrame Display Options

In [8]:
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)

# F1 Site 2001-2017

## Race Links

In [26]:
# Establish web browser and initial variables
browser = webdriver.Chrome()
browser.maximize_window()
year_begin = 2001
year_end = 2017
race_urls = []

while year_begin <= year_end:

    # Use the years to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)
    
    table = browser.find_elements(By.TAG_NAME, "table")
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))
    
    year_begin += 1

browser.close()

# Save links to file
load_id_map('../data/raw/links_2001_2017.pkl')
save_id_map('../data/raw/links_2001_2017.pkl', race_urls)

## Race Results

In [None]:
# Establish variables
urls = load_id_map('../data/raw/links_2001_2017.pkl')
total_cols = 7
col_idx_map = {
    'driver_id': 2,
    'position': 0,
    'driver_name': 2,
    'points': 6}
id_cols = ['driver_id']

# Scrape 2001-2017 results
df = scrape_url_table(
    urls,
    total_cols,
    col_idx_map,
    id_cols)
df.to_csv('../data/raw/race_results_raw_2001-2017.csv', encoding='utf-8', index=False)

# F1 Site 2018+

## Race Links & Circuit Data

In [28]:
# Establish web browser and initial variables
browser = webdriver.Chrome()
browser.maximize_window()
year_begin = 2018
year_end = datetime.now().year
race_urls = []
round_number = []

while year_begin <= year_end:
    r = 1  

    # Use the years to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)
    
    table = browser.find_elements(By.TAG_NAME, "table")
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))
            round_number.append(r)
            r += 1 

    year_begin += 1

browser.close()

link_data = pd.DataFrame({'race_url': race_urls, 'round_number': round_number})
link_data.to_csv('../data/raw/rounds_raw.csv', encoding='utf-8', index=False)

# Save links to file
load_id_map('../data/raw/links_2018+.pkl')
save_id_map('../data/raw/links_2018+.pkl', race_urls)

## Race Results

In [None]:
# Establish variables
urls = load_id_map('../data/raw/links_2018+.pkl')
total_cols = 7
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'circuit_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text,
    'team_id': 3,
    'year': lambda browser: int(browser.current_url.split("/")[5]),
    'race_url': lambda browser: browser.current_url,
    'circuit_name': lambda browser: browser.find_element(By.ID, "content-dropdown").text,
    'driver_name': 2,
    'team_name': 3,
    'end_position': 0,
    'points': 6,
    'laps_completed': 4}
id_cols = ['race_id', 'driver_id', 'circuit_id', 'team_id']
page_lvl_cols = ['race_id', 'circuit_id', 'year', 'race_url', 'circuit_name']

# Scrape 2018+ results
df = scrape_url_table(
    urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/race_results_raw_2018+.csv', encoding='utf-8', index=False)

## Practices

In [None]:
# Create practice URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
practice_urls = []
for url in urls:
    for practice_num in [1, 2, 3]:
        practice_url = url.replace('/race-result', f'/practice/{practice_num}')
        practice_urls.append(practice_url)

# Establish other variables
total_cols = 6
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'session_type': lambda browser: browser.current_url.split("/")[9] + browser.current_url.split("/")[10],
    'lap_time': 4,
    'lap_count': 5,
    'position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id', 'session_type']

# Scrape practice results
df = scrape_url_table(
    practice_urls,
    total_cols, col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pratice_results_raw.csv', encoding='utf-8', index=False)

## Qualifying

In [None]:
# Create qualifying URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
qualifying_urls = []
for url in urls:
    qual_url = url.replace('/race-result', '/qualifying')
    qualifying_urls.append(qual_url)

# Establish other variables
total_cols = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'q1_time': 4,
    'q2_time': 5,
    'q3_time': 6,
    'qual_position': 0,
    'qual_laps': 7}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape qualifying results
df = scrape_url_table(
    qualifying_urls,
    total_cols, col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/qualifying_results_raw.csv', encoding='utf-8', index=False)

## Starting Grid

In [None]:
# Create starting grid URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
starting_urls = []
for url in urls:
    start_url = url.replace('/race-result', '/starting-grid')
    starting_urls.append(start_url)

# Establish other variables
total_cols = 5
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'start_position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape starting grid results
df = scrape_url_table(
    starting_urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/starting_grid_results_raw.csv', encoding='utf-8', index=False)

## Pit Stops

In [None]:
# Create pit stop URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
pit_urls = []
for url in urls:
    ps_url = url.replace('/race-result', '/pit-stop-summary')
    pit_urls.append(ps_url)

# Establish other variables
total_cols = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'stop_number': 0,
    'stop_lap': 4,
    'pits_time': 6}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape pit stop results
df = scrape_url_table(
    pit_urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pit_stop_results_raw.csv', encoding='utf-8', index=False)

## Fastest Laps

In [None]:
# Create fastest lap URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
fastest_lap_urls = []
for url in urls:
    fastest_url = url.replace('/race-result', '/fastest-laps')
    fastest_lap_urls.append(fastest_url)

# Establish other variables
total_cols = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'fastest_lap_time': 6,
    'lap_number': 4}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape fastest lap results
df = scrape_url_table(
    fastest_lap_urls,
    total_cols,
    col_idx_map,
    id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/fastest_lap_results_raw.csv', encoding='utf-8', index=False)

Permission denied on attempt 1, retrying in 1 second...


# FastF1

In [9]:
x = pd.read_parquet('../data/raw/fastf1/2025_Singapore_Race_laps.parquet')
x.head(50)

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,race_id,session
0,0 days 00:56:41.841000,RUS,63,0 days 00:01:43.905000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:42.718000,0 days 00:00:29.061000,NaT,0 days 00:56:13.007000,0 days 00:56:42.036000,281.0,258.0,239.0,185.0,False,MEDIUM,1.0,True,Mercedes,0 days 00:54:57.735000,NaT,1,1.0,False,,False,False,167,Race
1,0 days 00:58:20.990000,RUS,63,0 days 00:01:39.149000,2.0,1.0,NaT,NaT,0 days 00:00:29.263000,0 days 00:00:41.666000,0 days 00:00:28.220000,0 days 00:57:11.136000,0 days 00:57:52.802000,0 days 00:58:21.022000,283.0,259.0,244.0,279.0,True,MEDIUM,2.0,True,Mercedes,0 days 00:56:41.841000,NaT,1,1.0,False,,False,True,167,Race
2,0 days 00:59:59.025000,RUS,63,0 days 00:01:38.035000,3.0,1.0,NaT,NaT,0 days 00:00:28.980000,0 days 00:00:41.190000,0 days 00:00:27.865000,0 days 00:58:50.002000,0 days 00:59:31.192000,0 days 00:59:59.057000,284.0,259.0,246.0,278.0,True,MEDIUM,3.0,True,Mercedes,0 days 00:58:20.990000,NaT,1,1.0,False,,False,True,167,Race
3,0 days 01:01:36.595000,RUS,63,0 days 00:01:37.570000,4.0,1.0,NaT,NaT,0 days 00:00:28.833000,0 days 00:00:41.147000,0 days 00:00:27.590000,0 days 01:00:27.890000,0 days 01:01:09.037000,0 days 01:01:36.627000,285.0,259.0,248.0,,True,MEDIUM,4.0,True,Mercedes,0 days 00:59:59.025000,NaT,1,1.0,False,,False,True,167,Race
4,0 days 01:03:13.779000,RUS,63,0 days 00:01:37.184000,5.0,1.0,NaT,NaT,0 days 00:00:28.639000,0 days 00:00:40.996000,0 days 00:00:27.549000,0 days 01:02:05.266000,0 days 01:02:46.262000,0 days 01:03:13.811000,284.0,259.0,248.0,281.0,True,MEDIUM,5.0,True,Mercedes,0 days 01:01:36.595000,NaT,1,1.0,False,,False,True,167,Race
5,0 days 01:04:50.633000,RUS,63,0 days 00:01:36.854000,6.0,1.0,NaT,NaT,0 days 00:00:28.614000,0 days 00:00:40.866000,0 days 00:00:27.374000,0 days 01:03:42.425000,0 days 01:04:23.291000,0 days 01:04:50.665000,286.0,260.0,249.0,280.0,True,MEDIUM,6.0,True,Mercedes,0 days 01:03:13.779000,NaT,1,1.0,False,,False,True,167,Race
6,0 days 01:06:27.401000,RUS,63,0 days 00:01:36.768000,7.0,1.0,NaT,NaT,0 days 00:00:28.599000,0 days 00:00:40.750000,0 days 00:00:27.419000,0 days 01:05:19.264000,0 days 01:06:00.014000,0 days 01:06:27.433000,287.0,260.0,249.0,281.0,True,MEDIUM,7.0,True,Mercedes,0 days 01:04:50.633000,NaT,12,1.0,False,,False,True,167,Race
7,0 days 01:08:04.056000,RUS,63,0 days 00:01:36.655000,8.0,1.0,NaT,NaT,0 days 00:00:28.581000,0 days 00:00:40.764000,0 days 00:00:27.310000,0 days 01:06:56.014000,0 days 01:07:36.778000,0 days 01:08:04.088000,286.0,259.0,249.0,282.0,True,MEDIUM,8.0,True,Mercedes,0 days 01:06:27.401000,NaT,1,1.0,False,,False,True,167,Race
8,0 days 01:09:40.782000,RUS,63,0 days 00:01:36.726000,9.0,1.0,NaT,NaT,0 days 00:00:28.598000,0 days 00:00:40.836000,0 days 00:00:27.292000,0 days 01:08:32.686000,0 days 01:09:13.522000,0 days 01:09:40.814000,,260.0,251.0,281.0,False,MEDIUM,9.0,True,Mercedes,0 days 01:08:04.056000,NaT,1,1.0,False,,False,True,167,Race
9,0 days 01:11:17.447000,RUS,63,0 days 00:01:36.665000,10.0,1.0,NaT,NaT,0 days 00:00:28.559000,0 days 00:00:40.825000,0 days 00:00:27.281000,0 days 01:10:09.373000,0 days 01:10:50.198000,0 days 01:11:17.479000,,260.0,250.0,283.0,False,MEDIUM,10.0,True,Mercedes,0 days 01:09:40.782000,NaT,1,1.0,False,,False,True,167,Race


In [5]:
y = load_id_map('../data/raw/race_id_map.pkl')
y

{'Australia_2018': 1,
 'Bahrain_2018': 2,
 'China_2018': 3,
 'Azerbaijan_2018': 4,
 'Spain_2018': 5,
 'Monaco_2018': 6,
 'Canada_2018': 7,
 'France_2018': 8,
 'Austria_2018': 9,
 'Great Britain_2018': 10,
 'Germany_2018': 11,
 'Hungary_2018': 12,
 'Belgium_2018': 13,
 'Italy_2018': 14,
 'Singapore_2018': 15,
 'Russia_2018': 16,
 'Japan_2018': 17,
 'United States_2018': 18,
 'Mexico_2018': 19,
 'Brazil_2018': 20,
 'Abu Dhabi_2018': 21,
 'Australia_2019': 22,
 'Bahrain_2019': 23,
 'China_2019': 24,
 'Azerbaijan_2019': 25,
 'Spain_2019': 26,
 'Monaco_2019': 27,
 'Canada_2019': 28,
 'France_2019': 29,
 'Austria_2019': 30,
 'Great Britain_2019': 31,
 'Germany_2019': 32,
 'Hungary_2019': 33,
 'Belgium_2019': 34,
 'Italy_2019': 35,
 'Singapore_2019': 36,
 'Russia_2019': 37,
 'Japan_2019': 38,
 'Mexico_2019': 39,
 'United States_2019': 40,
 'Brazil_2019': 41,
 'Abu Dhabi_2019': 42,
 'Austria_2020': 43,
 'Styria_2020': 44,
 'Hungary_2020': 45,
 'Great Britain_2020': 46,
 '70th Anniversary_2020'

## Weather

In [None]:
# Initialize urls and sessions
urls = load_id_map('../data/raw/links_2018+.pkl')
sessions_collected = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
fastf1.Cache.disabled = True

# Suppress FastF1 logging output
fastf1_logger = logging.getLogger('fastf1')
fastf1_logger.setLevel(logging.CRITICAL)

# Initialize empty DataFrames to collect all data
all_laps = pd.DataFrame()
all_weather = pd.DataFrame()
all_messages = pd.DataFrame()

race_id_map = load_id_map('../data/raw/race_id_map.pkl')

for url_idx, url in enumerate(urls):
    
    # Sort year and grand prix from the url
    year = int(url.split('/')[5])
    gp = url.split('/')[8].replace('-', ' ').title().replace('Emilia Romagna', 'Emilia-Romagna')
    
    print(f"\nProcessing race {url_idx + 1}/{len(urls)}: {year} {gp}")
    
    for s in sessions_collected:
        max_retries = 5
        retry_count = 0
        success = False
        
        laps_df = None
        weather_df = None
        messages_df = None
        session = None

        # Load session with retry
        while retry_count < max_retries and not success:
            try:
                gc.collect()
                
                session = fastf1.get_session(year, gp, s)
                if retry_count > 0:
                    time.sleep(3)
                session.load(laps=True, telemetry=False, weather=True, messages=True)
                
                # Extract data with error handling
                try:
                    laps_df = session.laps.copy() if hasattr(session, 'laps') and session.laps is not None else None
                except:
                    laps_df = None
                try:
                    weather_df = pd.DataFrame(session.weather_data) if hasattr(session, 'weather_data') and session.weather_data is not None else None
                except:
                    weather_df = None
                try:
                    messages_df = pd.DataFrame(session.race_control_messages) if hasattr(session, 'race_control_messages') and session.race_control_messages is not None else None
                except:
                    messages_df = None
                
                success = True
                print(f"  Loaded {s}")

            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    sleep_time = 2 ** retry_count
                    print(f'  Retry {retry_count}/{max_retries} for {year} {gp} {s}: {e}. Sleeping for {sleep_time}s...')
                    time.sleep(sleep_time)
                else:
                    print(f'  Failed after {max_retries} retries for {year} {gp} {s}: {e}')
            
            finally:
                # Delete session object immediately to release resources
                if session is not None:
                    del session
                gc.collect()
        
        if not success:
            continue
        
        # Get race ID
        race_key = f'{gp}_{year}'
        race_id_value = race_id_map.get(race_key)
        if race_id_value is None:
            print(f'  Warning: No race_id found for: {race_key}')
        
        # Add race_id and session columns to each DataFrame and merge
        if laps_df is not None and not laps_df.empty:
            laps_df['race_id'] = race_id_value
            laps_df['session'] = s
            all_laps = pd.concat([all_laps, laps_df], ignore_index=True)
        if weather_df is not None and not weather_df.empty:
            weather_df['race_id'] = race_id_value
            weather_df['session'] = s
            all_weather = pd.concat([all_weather, weather_df], ignore_index=True)
        if messages_df is not None and not messages_df.empty:
            messages_df['race_id'] = race_id_value
            messages_df['session'] = s
            all_messages = pd.concat([all_messages, messages_df], ignore_index=True)

        # Clean up DataFrames after each session
        del laps_df, weather_df, messages_df
    
    # Force garbage collection after each race
    gc.collect()
    
    # Save intermediate results every 5 races
    if (url_idx + 1) % 5 == 0:
        print(f"Saving intermediate results after race {url_idx + 1}...")
        all_laps.to_csv('../data/raw/lap_data_raw_temp.csv', index=False)
        all_weather.to_csv('../data/raw/weather_data_raw_temp.csv', index=False)
        all_messages.to_csv('../data/raw/messages_data_raw_temp.csv', index=False)
        print(f"  Saved: {len(all_laps)} laps, {len(all_weather)} weather records, {len(all_messages)} messages")
    
# Save final DataFrames to CSVs
all_laps.to_csv('../data/raw/lap_data_raw.csv', index=False)
all_weather.to_csv('../data/raw/weather_data_raw.csv', index=False)
all_messages.to_csv('../data/raw/messages_data_raw.csv', index=False)


Processing race 1/167: 2018 Australia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 2/167: 2018 Bahrain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 3/167: 2018 China
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 4/167: 2018 Azerbaijan


In [18]:
import os
import sys
import fastf1
import pandas as pd
import time
import gc
import subprocess
import pickle

# If we're inside a subprocess, these will be passed in as args
if len(sys.argv) > 1 and sys.argv[1] == "--race":
    # Subprocess mode: handle a single race
    year = int(sys.argv[2])
    gp = sys.argv[3]
    race_id_value = sys.argv[4]
    cache_dir = sys.argv[5]
    output_dir = sys.argv[6]

    sessions = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
    fastf1.Cache.enable_cache(cache_dir)

    for s in sessions:
        try:
            gc.collect()
            session = fastf1.get_session(year, gp, s)
            session.load(laps=True, telemetry=False, weather=True, messages=True)
            print(f" Loaded {year} {gp} {s}")

            # Extract safely
            laps = getattr(session, 'laps', pd.DataFrame())
            weather = getattr(session, 'weather_data', pd.DataFrame())
            messages = getattr(session, 'race_control_messages', pd.DataFrame())

            for df in [laps, weather, messages]:
                if isinstance(df, pd.DataFrame) and not df.empty:
                    df["race_id"] = race_id_value
                    df["session"] = s

            # Save
            prefix = f"{output_dir}/{year}_{gp}_{s}"
            if not laps.empty:
                laps.to_parquet(f"{prefix}_laps.parquet")
            if not weather.empty:
                weather.to_parquet(f"{prefix}_weather.parquet")
            if not messages.empty:
                messages.to_parquet(f"{prefix}_messages.parquet")

            print(f" Saved {year} {gp} {s}")
            del session
            gc.collect()
            time.sleep(2)

        except Exception as e:
            print(f" Error for {year} {gp} {s}: {e}")
            time.sleep(3)
    sys.exit(0)

# === Main controller ===
CACHE_DIR = "../data/cache"
OUTPUT_DIR = "../data/raw/fastf1"
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

urls = load_id_map('../data/raw/links_2018+.pkl')
race_id_map = load_id_map('../data/raw/race_id_map.pkl')

for idx, url in enumerate(urls):
    year = int(url.split('/')[5])
    gp = (
        url.split('/')[8]
        .replace('-', ' ')
        .title()
        .replace('Emilia Romagna', 'Emilia-Romagna')
    )
    race_key = f"{gp}_{year}"
    race_id_value = race_id_map.get(race_key, "unknown")

    print(f"\n=== {idx+1}/{len(urls)} | {year} {gp} ===")

    # Launch a new Python process for this race
    subprocess.run(
        [sys.executable, __file__, "--race", str(year), gp, str(race_id_value), CACHE_DIR, OUTPUT_DIR],
        check=False,
    )

    # Pause briefly between races
    time.sleep(5)



=== 1/167 | 2018 Australia ===


NameError: name '__file__' is not defined

In [11]:
import fastf1
import pandas as pd
import logging
import time
import gc

# Initialize urls and sessions
urls = load_id_map('../data/raw/links_2018+.pkl')
sessions_collected = ['FP1', 'FP2', 'FP3', 'Qualifying', 'Race']
fastf1.Cache.disabled = True

# Suppress FastF1 logging output
fastf1_logger = logging.getLogger('fastf1')
fastf1_logger.setLevel(logging.CRITICAL)

# Initialize empty DataFrames to collect all data
all_laps = pd.DataFrame()
all_weather = pd.DataFrame()
all_messages = pd.DataFrame()

race_id_map = load_id_map('../data/raw/race_id_map.pkl')

for url_idx, url in enumerate(urls):
    
    # Sort year and grand prix from the url
    year = int(url.split('/')[5])
    gp = url.split('/')[8].replace('-', ' ').title().replace('Emilia Romagna', 'Emilia-Romagna')
    
    print(f"\nProcessing race {url_idx + 1}/{len(urls)}: {year} {gp}")
    
    for s in sessions_collected:
        max_retries = 5
        retry_count = 0
        success = False
        
        laps_df = None
        weather_df = None
        messages_df = None
        session = None

        # Load session with retry
        while retry_count < max_retries and not success:
            try:
                gc.collect()
                
                session = fastf1.get_session(year, gp, s)
                session.load(laps=True, telemetry=False, weather=True, messages=True)
                
                # Extract data with error handling
                try:
                    laps_df = session.laps.copy() if hasattr(session, 'laps') and session.laps is not None else None
                except:
                    laps_df = None
                try:
                    weather_df = pd.DataFrame(session.weather_data) if hasattr(session, 'weather_data') and session.weather_data is not None else None
                except:
                    weather_df = None
                try:
                    messages_df = pd.DataFrame(session.race_control_messages) if hasattr(session, 'race_control_messages') and session.race_control_messages is not None else None
                except:
                    messages_df = None
                
                success = True
                print(f"  Loaded {s}")

            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    sleep_time = 2 ** retry_count
                    print(f'  Retry {retry_count}/{max_retries} for {year} {gp} {s}: {e}. Sleeping for {sleep_time}s...')
                    time.sleep(sleep_time)
                else:
                    print(f'  Failed after {max_retries} retries for {year} {gp} {s}: {e}')
            
            finally:
                # Delete session object immediately to release resources
                if session is not None:
                    del session
                gc.collect()
        
        if not success:
            continue
        
        # Get race ID
        race_key = f'{gp}_{year}'
        race_id_value = race_id_map.get(race_key)
        if race_id_value is None:
            print(f'  Warning: No race_id found for: {race_key}')
        
        # Add race_id and session columns to each DataFrame and merge
        if laps_df is not None and not laps_df.empty:
            laps_df['race_id'] = race_id_value
            laps_df['session'] = s
            all_laps = pd.concat([all_laps, laps_df], ignore_index=True)
        if weather_df is not None and not weather_df.empty:
            weather_df['race_id'] = race_id_value
            weather_df['session'] = s
            all_weather = pd.concat([all_weather, weather_df], ignore_index=True)
        if messages_df is not None and not messages_df.empty:
            messages_df['race_id'] = race_id_value
            messages_df['session'] = s
            all_messages = pd.concat([all_messages, messages_df], ignore_index=True)

        # Clean up DataFrames after each session
        del laps_df, weather_df, messages_df
        
        # Add delay between sessions to avoid rate limiting
        time.sleep(3)
    
    # Force garbage collection after each race
    gc.collect()
    
    # Longer delay between races
    time.sleep(5)
    
    # Save intermediate results every 5 races
    if (url_idx + 1) % 5 == 0:
        print(f"Saving intermediate results after race {url_idx + 1}...")
        all_laps.to_csv('../data/raw/lap_data_raw_temp.csv', index=False)
        all_weather.to_csv('../data/raw/weather_data_raw_temp.csv', index=False)
        all_messages.to_csv('../data/raw/messages_data_raw_temp.csv', index=False)
        print(f"  Saved: {len(all_laps)} laps, {len(all_weather)} weather records, {len(all_messages)} messages")
    
# Save final DataFrames to CSVs
all_laps.to_csv('../data/raw/lap_data_raw.csv', index=False)
all_weather.to_csv('../data/raw/weather_data_raw.csv', index=False)
all_messages.to_csv('../data/raw/messages_data_raw.csv', index=False)


Processing race 1/167: 2018 Australia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 2/167: 2018 Bahrain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 3/167: 2018 China
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 4/167: 2018 Azerbaijan
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 5/167: 2018 Spain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 5...
  Saved: 13193 laps, 2186 weather records, 1001 messages

Processing race 6/167: 2018 Monaco
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 7/167: 2018 Canada
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 8/167: 2018 France
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 9/16

  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 15/167: 2018 Singapore
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 15...
  Saved: 42036 laps, 6709 weather records, 2666 messages

Processing race 16/167: 2018 Russia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 17/167: 2018 Japan
  Loaded FP1
  Loaded FP2


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 18/167: 2018 United States
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 19/167: 2018 Mexico
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 20/167: 2018 Brazil
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 20...
  Saved: 55359 laps, 9187 weather records, 3459 messages

Processing race 21/167: 2018 Abu Dhabi
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 22/167: 2019 Australia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 23/167: 2019 Bahrain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 24/167: 2019 China
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 25/167: 2019 Azerbaijan
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 25...
  Saved: 68148 laps, 11698 weather records, 4273 messages

Processing race 26/167: 2019 Spain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 27/167: 2019 Monaco
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 28/167: 2019 Canada
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 29/167: 2019 France
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 30/167: 2019 Austria
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


Saving intermediate results after race 30...
  Saved: 84657 laps, 14213 weather records, 5178 messages

Processing race 31/167: 2019 Great Britain
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Race

Processing race 32/167: 2019 Germany
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying
  Loaded Race

Processing race 33/167: 2019 Hungary
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 34/167: 2019 Belgium
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 35/167: 2019 Italy
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 35...
  Saved: 97870 laps, 16744 weather records, 6032 messages

Processing race 36/167: 2019 Singapore
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 37/167: 2019 Russia
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 38/167: 2019 Japan
  Loaded FP1


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 39/167: 2019 Mexico
  Loaded FP1
  Loaded FP2
  Loaded FP3


  all_messages = pd.concat([all_messages, messages_df], ignore_index=True)


  Loaded Qualifying
  Loaded Race

Processing race 40/167: 2019 United States
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race
Saving intermediate results after race 40...
  Saved: 110061 laps, 19022 weather records, 6843 messages

Processing race 41/167: 2019 Brazil
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 42/167: 2019 Abu Dhabi
  Loaded FP1
  Loaded FP2
  Loaded FP3
  Loaded Qualifying
  Loaded Race

Processing race 43/167: 2020 Austria
  Retry 1/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 2s...
  Retry 2/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 4s...
  Retry 3/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 8s...
  Retry 4/5 for 2020 Austria FP1: Failed to load any schedule data.. Sleeping for 16s...
  Failed after 5 retries for 2020 Austria FP1: Failed to load any schedule data.
  Retry 1/5 for 2020 Austria FP2: Failed to load a

KeyboardInterrupt: 

In [7]:
# Enable cache (important for performance)
# First ensure the cache directory exists
#import os
#cache_dir = "cache"
#if not os.path.exists(cache_dir):
    #os.makedirs(cache_dir)
    #print(f"Created cache directory: {cache_dir}")

#fastf1.Cache.enable_cache(cache_dir)  # uses the created "cache" folder to store data

# Load a session: example Bahrain GP 2023 Qualifying
session = fastf1.get_session(2020, 'styria', 'fp1')
session.load(weather=True)  # only load weather data as requested

# Weather data is stored in session.weather_data (a structured numpy array)
weather_array = session.weather_data

# Convert weather data to DataFrame
weather_df = pd.DataFrame(weather_array)

# Display weather data
weather_df

# Save weather dataframe to CSV file
weather_df.to_csv("example_weather.csv", index=False)


core           INFO 	Loading data for Styrian Grand Prix - Practice 1 [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 

In [35]:

# Usage example with weather data
numeric_columns = ['AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'Pressure']
boolean_columns = ['Rainfall']

session_weather_features = aggregate_columns(
    weather_df, 
    columns=numeric_columns, 
    boolean_columns=boolean_columns
)

print(session_weather_features)

AirTemp_mean        15.707865
AirTemp_min              15.1
AirTemp_max              16.6
AirTemp_std           0.37574
TrackTemp_mean      18.942135
TrackTemp_min            18.3
TrackTemp_max            19.4
TrackTemp_std        0.276315
WindSpeed_mean       3.475281
WindSpeed_min             0.7
WindSpeed_max             6.9
WindSpeed_std        1.242267
Humidity_mean       78.421348
Humidity_min             68.0
Humidity_max             92.0
Humidity_std          6.50658
Pressure_mean     1009.901685
Pressure_min           1009.0
Pressure_max           1010.7
Pressure_std         0.444994
Rainfall_any             True
Rainfall_mean        0.325843
dtype: object
