# Introduction
Jack Wilson
9/23/2025

This notebook outlines scraping and collecting of all data raw data used in the model

# Import Modules

In [None]:
import pandas as pd
import numpy as np
import time, os, sys, re

from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from sklearn.linear_model import LinearRegression

from pathlib import Path

In [2]:
# Connects notebook to 'src' Package
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.utils.utils import load_id_map, save_id_map, scrape_url_table, aggregate_columns, get_location_data
from src.utils.project_functions import constructor_mapping

# DataFrame Display Options

In [22]:
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)

# F1 Site 2001-2017

## Race Links

In [9]:
# Establish web browser and initial variables
browser = webdriver.Chrome()
browser.maximize_window()
year_begin = 2001
year_end = 2017
race_urls = []

while year_begin <= year_end:
    round_number = 1

    # Use the years to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)
    
    table = browser.find_elements(By.TAG_NAME, "table")
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))

    year_begin += 1

browser.close()

# Save links to file
load_id_map('../data/raw/links_2001_2017.pkl')
save_id_map('../data/raw/links_2001_2017.pkl', race_urls)

## Race Results

In [23]:
def get_date(browser):
    """Extract date with wait and retry"""
    try:
        # Wait for the element to be present
        element = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='Container-module_container__0e4ac']//p[contains(@class, 'display-s-bold')]"))
        )
        
        # Wait a bit for any re-rendering to finish
        time.sleep(0.5)
        
        # Get fresh reference to avoid stale element
        element = browser.find_element(By.XPATH, "//div[@class='Container-module_container__0e4ac']//p[contains(@class, 'display-s-bold')]")
        return element.text
    except Exception as e:
        print(f"Failed to extract date: {e}")
        return None

In [24]:
# Establish variables
urls = load_id_map('../data/raw/links_2001_2017.pkl')
min_col = 7
max_col = 7
col_idx_map = {
    'date': get_date,
    'driver_id': 2,
    'team_id': 3,
    'position': 0,
    'driver_name': 2,
    'points': 6}
id_cols = ['driver_id', 'team_id']
page_lvl_cols = ['date']

# Scrape 2001-2017 results
df = scrape_url_table(
    urls=urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/race_results_raw_2001-2017.csv', encoding='utf-8', index=False)

## Pit Stops

In [25]:
# Create pit stop URLs
urls = load_id_map('../data/raw/links_2001_2017.pkl')
pit_urls = []
for url in urls:
    if url.split('/')[5] in ['2016', '2017']:
        ps_url = url.replace('/race-result', '/pit-stop-summary')
        pit_urls.append(ps_url)

# Establish other variables
min_col = 8
max_col = 8
col_idx_map = {
    'date': get_date,
    'driver_id': 2,
    'team_id': 3,
    'stop_number': 0,
    'stop_lap': 4,
    'pits_time': 6}
id_cols = ['driver_id', 'team_id']
page_lvl_cols = ['date']

# Scrape pit stop results
df = scrape_url_table(
    urls=pit_urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pit_stop_results_raw_2016-2017.csv', encoding='utf-8', index=False)

# F1 Site 2018+

## Race Links & Circuit Data

In [28]:
# Establish web browser and initial variables
browser = webdriver.Chrome()
browser.maximize_window()
year_begin = 2018
year_end = datetime.now().year
race_urls = []
round_number = []

while year_begin <= year_end:
    r = 1  

    # Use the years to crawl across season pages
    url = "https://www.formula1.com/en/results/" + str(year_begin) + "/races"
    browser.get(url)
    
    table = browser.find_elements(By.TAG_NAME, "table")
    for tr in table:
        rows = tr.find_elements(By.TAG_NAME, "tr")[1:]
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            
            # Url for each specific race
            link = cells[0].find_element(By.TAG_NAME, "a")
            race_urls.append(link.get_attribute("href"))
            round_number.append(r)
            r += 1 

    year_begin += 1

browser.close()

link_data = pd.DataFrame({'race_url': race_urls, 'round_number': round_number})
link_data.to_csv('../data/raw/rounds_raw.csv', encoding='utf-8', index=False)

# Save links to file
load_id_map('../data/raw/links_2018+.pkl')
save_id_map('../data/raw/links_2018+.pkl', race_urls)

## Race Results

In [26]:
# Establish variables
urls = load_id_map('../data/raw/links_2018+.pkl')
min_col = 7
max_col = 7
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'circuit_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text,
    'team_id': 3,
    'year': lambda browser: int(browser.current_url.split("/")[5]),
    'race_url': lambda browser: browser.current_url,
    'circuit_name': lambda browser: browser.find_element(By.ID, "content-dropdown").text,
    'driver_name': 2,
    'team_name': 3,
    'end_position': 0,
    'points': 6,
    'laps_completed': 4}
id_cols = ['race_id', 'driver_id', 'circuit_id', 'team_id']
page_lvl_cols = ['race_id', 'circuit_id', 'year', 'race_url', 'circuit_name']

# Scrape 2018+ results
df = scrape_url_table(
    urls=urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/race_results_raw_2018+.csv', encoding='utf-8', index=False)

## Practices

In [27]:
# Create practice URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
practice_urls = []
for url in urls:
    for practice_num in [1, 2, 3]:
        practice_url = url.replace('/race-result', f'/practice/{practice_num}')
        practice_urls.append(practice_url)

# Establish other variables
min_col = 6
max_col = 6
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'session_type': lambda browser: browser.current_url.split("/")[9] + browser.current_url.split("/")[10],
    'lap_time': 4,
    'lap_count': 5,
    'position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id', 'session_type']

# Scrape practice results
df = scrape_url_table(
    urls=practice_urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pratice_results_raw.csv', encoding='utf-8', index=False)

In [28]:
# Emilia-Romagna 2020 has a different URL and didnt get scraped correctly
urls = ['https://www.formula1.com/en/results/2020/races/1057/emilia-romagna/practice/0']

# Establish other variables
min_col = 6
max_col = 6
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'session_type': lambda browser: browser.current_url.split("/")[9] + browser.current_url.split("/")[10],
    'lap_time': 4,
    'lap_count': 5,
    'position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id', 'session_type']

# Scrape practice results
df = scrape_url_table(
    urls=urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pratice_results_ER2020_raw.csv', encoding='utf-8', index=False)

## Qualifying

In [29]:
# Create qualifying URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
qualifying_urls = []
for url in urls:
    qual_url = url.replace('/race-result', '/qualifying')
    qualifying_urls.append(qual_url)

# Establish other variables
min_col = 8
max_col = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'q1_time': 4,
    'q2_time': 5,
    'q3_time': 6,
    'qual_position': 0,
    'qual_laps': 7}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape qualifying results
df = scrape_url_table(
    urls=qualifying_urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/qualifying_results_raw.csv', encoding='utf-8', index=False)

## Starting Grid

In [30]:
# Create starting grid URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
starting_urls = []
for url in urls:
    start_url = url.replace('/race-result', '/starting-grid')
    starting_urls.append(start_url)

# Establish other variables
min_col = 4
max_col = 5
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'start_position': 0}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape starting grid results
df = scrape_url_table(
    urls=starting_urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/starting_grid_results_raw.csv', encoding='utf-8', index=False)

## Pit Stops

In [31]:
# Create pit stop URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
pit_urls = []
for url in urls:
    ps_url = url.replace('/race-result', '/pit-stop-summary')
    pit_urls.append(ps_url)

# Establish other variables
min_col = 8
max_col = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'stop_number': 0,
    'stop_lap': 4,
    'pits_time': 6}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape pit stop results
df = scrape_url_table(
    urls=pit_urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/pit_stop_results_raw.csv', encoding='utf-8', index=False)

## Fastest Laps

In [32]:
# Create fastest lap URLs
urls = load_id_map('../data/raw/links_2018+.pkl')
fastest_lap_urls = []
for url in urls:
    fastest_url = url.replace('/race-result', '/fastest-laps')
    fastest_lap_urls.append(fastest_url)

# Establish other variables
min_col = 8
max_col = 8
col_idx_map = {
    'race_id': lambda browser: browser.find_element(By.ID, "content-dropdown").text + '_' + browser.current_url.split("/")[5],
    'driver_id': 2,
    'team_id': 3,
    'fastest_lap_time': 6,
    'lap_number': 4}
id_cols = ['race_id', 'driver_id', 'team_id']
page_lvl_cols = ['race_id']

# Scrape fastest lap results
df = scrape_url_table(
    urls=fastest_lap_urls,
    min_col=min_col,
    max_col=max_col,
    col_idx_map=col_idx_map,
    id_cols=id_cols,
    page_lvl_cols=page_lvl_cols,
    id_mask=constructor_mapping)
df.to_csv('../data/raw/fastest_lap_results_raw.csv', encoding='utf-8', index=False)

# FastF1

## Driver Code Map

In [None]:
x = load_id_map('../data/raw/successful_urls_fastf1.pkl')
x

['https://www.formula1.com/en/results/2018/races/979/australia/race-result',
 'https://www.formula1.com/en/results/2018/races/980/bahrain/race-result',
 'https://www.formula1.com/en/results/2018/races/981/china/race-result',
 'https://www.formula1.com/en/results/2018/races/982/azerbaijan/race-result',
 'https://www.formula1.com/en/results/2018/races/983/spain/race-result',
 'https://www.formula1.com/en/results/2018/races/984/monaco/race-result',
 'https://www.formula1.com/en/results/2018/races/985/canada/race-result',
 'https://www.formula1.com/en/results/2018/races/986/france/race-result',
 'https://www.formula1.com/en/results/2018/races/987/austria/race-result',
 'https://www.formula1.com/en/results/2018/races/988/great-britain/race-result',
 'https://www.formula1.com/en/results/2018/races/989/germany/race-result',
 'https://www.formula1.com/en/results/2018/races/990/hungary/race-result',
 'https://www.formula1.com/en/results/2018/races/991/belgium/race-result',
 'https://www.formula

In [4]:
# Establish web browser and initial variables
urls = load_id_map('../data/raw/links_2018+.pkl')
practice_urls = []
for url in urls:
    for practice_num in [1, 2, 3]:
        practice_url = url.replace('/race-result', f'/practice/{practice_num}')
        practice_urls.append(practice_url)

browser = webdriver.Chrome()
browser.maximize_window()

driver_code_map = {}
failed_urls = []

# Parse urls
for url in practice_urls:
    try:
        browser.get(url)
        time.sleep(1)
        
        tables = browser.find_elements(By.TAG_NAME, "table")
        if not tables:
            print(f"No tables found on {url}")
            continue
            
        for table in tables:
            rows = table.find_elements(By.TAG_NAME, "tr")[1:]
            
            for row in rows:
                try:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    
                    if len(cells) < 3:
                        continue

                    driver_cell = cells[2]
                    
                    # Get all text content from the cell
                    cell_html = driver_cell.get_attribute('innerHTML')
                    
                    # Extract name and code
                    try:
                        # Get all span text
                        all_spans = driver_cell.find_elements(By.TAG_NAME, 'span')
                        
                        # Collect name parts
                        name_parts = []
                        driver_code = ''
                        
                        for span in all_spans:
                            classes = span.get_attribute('class') or ''
                            text = span.get_attribute('textContent').strip()
                            
                            if not text:
                                continue
                            
                            # Name spans
                            if 'max-lg:hidden' in classes or 'max-md:hidden' in classes:
                                name_parts.append(text)
                            # Code span
                            elif 'md:hidden' in classes and not driver_code:
                                driver_code = text
                        
                        full_name = " ".join(name_parts)
                        
                        # Save if we have both name and code and haven't seen this name
                        if full_name and driver_code and full_name not in driver_code_map:
                            driver_code_map[full_name] = driver_code
                            print(f"{full_name}: {driver_code}")
                    
                    except Exception as e:
                        # Skip rows with parsing issues
                        continue
                        
                except Exception as e:
                    # Skip problematic rows
                    continue
                    
    except Exception as e:
        print(f"Failed to load {url}: {e}")
        failed_urls.append(url)
        continue

browser.close()

print(f"\n{'='*50}")
print(f"Total drivers found: {len(driver_code_map)}")
print(f"Failed URLs: {len(failed_urls)}")
if failed_urls:
    print("Failed URLs:")
    for url in failed_urls[:5]:
        print(f"  - {url}")

# Save driver code map to file
save_id_map('../data/raw/driver_code_map.pkl', driver_code_map)

Lewis Hamilton: HAM
Valtteri Bottas: BOT
Max Verstappen: VER
Kimi Räikkönen: RAI
Sebastian Vettel: VET
Daniel Ricciardo: RIC
Romain Grosjean: GRO
Fernando Alonso: ALO
Carlos Sainz: SAI
Stoffel Vandoorne: VAN
Pierre Gasly: GAS
Sergey Sirotkin: SIR
Nico Hulkenberg: HUL
Esteban Ocon: OCO
Lance Stroll: STR
Sergio Perez: PER
Kevin Magnussen: MAG
Brendon Hartley: HAR
Marcus Ericsson: ERI
Charles Leclerc: LEC
Robert Kubica: KUB
Nicholas Latifi: LAT
Antonio Giovinazzi: GIO
Lando Norris: NOR
Artem Markelov: MAR
Sean Gelael: GEL
Daniil Kvyat: KVY
Alexander Albon: ALB
George Russell: RUS
Naoki Yamamoto: YAM
Jack Aitken: AIT
Roy Nissany: NIS
Pietro Fittipaldi: FIT
Mick Schumacher: MSC
Yuki Tsunoda: TSU
Nikita Mazepin: MAZ
Callum Ilott: ILO
Zhou Guanyu: ZHO
Nyck De Vries: DEV
Juri Vips: VIP
Liam Lawson: LAW
Robert Shwartzman: SHW
Alex Palou: PAL
Theo Pourchaire: POU
Logan Sargeant: SAR
Jack Doohan: DOO
Patricio O'Ward: OWA
Felipe Drugovich: DRU
Oscar Piastri: PIA
Oliver Bearman: BEA
Isack Hadjar: H

## Laps

In [47]:
# Load maps
driver_id_map = pd.read_pickle('../data/raw/driver_id_map.pkl')
race_id_map = pd.read_pickle('../data/raw/race_id_map.pkl')
circuit_id_map = pd.read_pickle('../data/raw/circuit_id_map.pkl')
driver_code_map = pd.read_pickle('../data/raw/driver_code_map.pkl')

code_to_name_map = {code: name for name, code in driver_code_map.items()}

# Degredation function
def get_degradation_rate(lap_times, lap_numbers):
    valid_mask = ~np.isnan(lap_times)
    clean_lap_times = lap_times[valid_mask]
    clean_lap_numbers = lap_numbers[valid_mask]
    
    if len(clean_lap_times) > 3:
        X = np.array(clean_lap_numbers).reshape(-1, 1)
        y = np.array(clean_lap_times)
        model = LinearRegression().fit(X, y)
        return model.coef_[0]
    return np.nan

# Compound stat function
def compute_compound_stats(comp_data, compound):
    if comp_data.empty:
        return {
            f'avg_pace_{compound.lower()}': np.nan,
            f'std_pace_{compound.lower()}': np.nan,
            f'laps_on_{compound.lower()}': 0,
            f'deg_rate_{compound.lower()}': np.nan,
        }
    
    lap_times = comp_data['LapTime'].values
    lap_numbers = comp_data['LapNumber'].values
    
    return {
        f'avg_pace_{compound.lower()}': np.nanmean(lap_times),
        f'std_pace_{compound.lower()}': np.nanstd(lap_times),
        f'laps_on_{compound.lower()}': len(lap_times),
        f'deg_rate_{compound.lower()}': get_degradation_rate(lap_times, lap_numbers),
    }

# Aggregation function
def process_file(filepath):
    """Process a single laps file and return aggregated driver stats"""
    print(f"Processing: {filepath.name}")
    
    fh = pd.read_parquet(filepath)
    
    # Filter rows
    filtered_data = fh[
        (fh['TrackStatus'] == '1') # No flags
        & fh['PitOutTime'].isna() # Not an OUT lap
        & fh['PitInTime'].isna() # Not an IN lap
        & fh['IsAccurate'] == True # Full lap completed and is accurate
        & (fh['LapTime'] < fh['LapTime'].quantile(0.95)) # Get rid of outliers
    ].copy()
    
    if len(filtered_data) == 0:
        print(f"    No data after filtering")
        return pd.DataFrame()
    
    # Convert lap times
    filtered_data['LapTime'] = filtered_data['LapTime'].dt.total_seconds()
    filtered_data = filtered_data[filtered_data['LapTime'].notna()].copy()
    
    # Map driver codes and add metadata
    filtered_data['driver_id'] = filtered_data['Driver'].map(code_to_name_map).map(driver_id_map)
    if 'race_id' in fh.columns:
        filtered_data['race_id'] = fh.loc[filtered_data.index, 'race_id']
    if 'session' in fh.columns:
        filtered_data['session'] = fh.loc[filtered_data.index, 'session']
    
    # Extract year from filename for compound mapping
    year = int(filepath.stem.split('_')[0])
    
    # Apply compound mapping for 2018
    if year == 2018:
        compound_mapping = {
            'SUPERSOFT': 'SOFT',
            'HYPERSOFT': 'SOFT',
            'ULTRASOFT': 'SOFT',
            'SOFT': 'MEDIUM',
            'MEDIUM': 'HARD',
            'HARD': 'HARD'
        }
        filtered_data['Compound'] = filtered_data['Compound'].map(compound_mapping).fillna(filtered_data['Compound'])
    
    # Aggregate by driver
    compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
    summaries = []
    
    for driver, group in filtered_data.groupby('Driver'):
        summary = {'driver_name': driver}
        
        # Add metadata from first row
        for col in ['driver_id', 'race_id', 'session']:
            if col in group.columns:
                summary[col] = group[col].iloc[0]
        
        # Add compound stats
        for comp in compounds:
            comp_data = group[group['Compound'] == comp]
            summary.update(compute_compound_stats(comp_data, comp))
        
        summaries.append(summary)
    
    result = pd.DataFrame(summaries)
    print(f"    Created {len(result)} driver records")
    return result

# Main processing loop
fastf1_dir = Path('../data/raw/fastf1')
laps_files = sorted(fastf1_dir.glob('*_laps.parquet'))

print(f"Found {len(laps_files)} files with 'laps' in name\n")

lap_results = pd.DataFrame()

for filepath in laps_files:
    file_result = process_file(filepath)
    if not file_result.empty:
        lap_results = pd.concat([lap_results, file_result], ignore_index=True)

# Count rows where all compound lap counts are 0 (null)
compound_columns = ['laps_on_soft', 'laps_on_medium', 'laps_on_hard', 'laps_on_intermediate', 'laps_on_wet']
all_null_count = ((lap_results[compound_columns] == 0) | lap_results[compound_columns].isna()).all(axis=1).sum()

print(f"Rows with no laps on any compound: {all_null_count}")
print(f"\nShape: {lap_results.shape}\n")

lap_results.to_csv('../data/raw/lap_results_raw.csv', encoding='utf-8', index=False)

Found 790 files with 'laps' in name

Processing: 2018_Abu Dhabi_FP1_laps.parquet
    Created 20 driver records
Processing: 2018_Abu Dhabi_FP2_laps.parquet
    Created 20 driver records
Processing: 2018_Abu Dhabi_FP3_laps.parquet
    Created 20 driver records
Processing: 2018_Abu Dhabi_Qualifying_laps.parquet
    Created 20 driver records
Processing: 2018_Abu Dhabi_Race_laps.parquet
    Created 19 driver records
Processing: 2018_Australia_FP1_laps.parquet
    Created 20 driver records
Processing: 2018_Australia_FP2_laps.parquet
    Created 20 driver records
Processing: 2018_Australia_FP3_laps.parquet
    Created 15 driver records
Processing: 2018_Australia_Qualifying_laps.parquet
    Created 13 driver records
Processing: 2018_Australia_Race_laps.parquet
    Created 18 driver records
Processing: 2018_Austria_FP1_laps.parquet
    Created 20 driver records
Processing: 2018_Austria_FP2_laps.parquet
    Created 20 driver records
Processing: 2018_Austria_FP3_laps.parquet
    Created 20 driver

## Weather

In [48]:
# Get directory and files
fastf1_dir = Path('../data/raw/fastf1')
weather_files = sorted(fastf1_dir.glob('*_weather.parquet'))

print(f"Found {len(weather_files)} files with 'weather' in name\n")

weather_results = pd.DataFrame()

# Init columns to aggregate
numeric_columns = ['AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'Pressure']
boolean_columns = ['Rainfall']
string_columns = ['race_id', 'session']

# Main aggregation loop
for filepath in weather_files:
    print(f"Processing: {filepath.name}")
    fh = pd.read_parquet(filepath)
    file_result = aggregate_columns(df=fh, columns=numeric_columns, boolean_columns=boolean_columns, string_columns=string_columns)
    print(f"    Aggregated {fh.shape[0]} rows for {file_result.shape[1]} columns")
    if not file_result.empty:
        weather_results = pd.concat([weather_results, file_result], ignore_index=True)

print(f"\nShape: {weather_results.shape}\n")

weather_results.to_csv('../data/raw/weather_results_raw.csv', encoding='utf-8', index=False)

Found 790 files with 'weather' in name

Processing: 2018_Abu Dhabi_FP1_weather.parquet
    Aggregated 111 rows for 24 columns
Processing: 2018_Abu Dhabi_FP2_weather.parquet
    Aggregated 112 rows for 24 columns
Processing: 2018_Abu Dhabi_FP3_weather.parquet
    Aggregated 82 rows for 24 columns
Processing: 2018_Abu Dhabi_Qualifying_weather.parquet
    Aggregated 78 rows for 24 columns
Processing: 2018_Abu Dhabi_Race_weather.parquet
    Aggregated 137 rows for 24 columns
Processing: 2018_Australia_FP1_weather.parquet
    Aggregated 102 rows for 24 columns
Processing: 2018_Australia_FP2_weather.parquet
    Aggregated 68 rows for 24 columns
Processing: 2018_Australia_FP3_weather.parquet
    Aggregated 68 rows for 24 columns
Processing: 2018_Australia_Qualifying_weather.parquet
    Aggregated 78 rows for 24 columns
Processing: 2018_Australia_Race_weather.parquet
    Aggregated 111 rows for 24 columns
Processing: 2018_Austria_FP1_weather.parquet
    Aggregated 99 rows for 24 columns
Proces

## Flags

In [49]:
# Get directory and files
fastf1_dir = Path('../data/raw/fastf1')
flag_files = sorted(fastf1_dir.glob('*_messages.parquet'))

print(f"Found {len(flag_files)} files with 'messages' in name\n")

flag_results = pd.DataFrame()

# Init columns to aggregate
basic_count_flags = ['YELLOW', 'DOUBLE YELLOW', 'RED', 'CLEAR']
string_columns = ['race_id', 'session']

# Main aggregation loop
for filepath in flag_files:
    print(f"Processing: {filepath.name}")
    fh = pd.read_parquet(filepath)
    
    # Initialize aggregation dictionary
    agg = {}
    
    # Count each flag type
    if 'Flag' in fh.columns:
        for flag in basic_count_flags:
            agg[f'flag_{flag.lower().replace(" ", "_")}_count'] = (fh['Flag'] == flag).sum()
    
    # Count safety car deployments
    if all(col in fh.columns for col in ['Category', 'Status', 'Message']):
        sc_deployments = fh[(fh['Category'] == 'SafetyCar') & (fh['Status'] == 'DEPLOYED') & 
                           (~fh['Message'].str.contains('VIRTUAL', case=False, na=False))]
        agg['safety_car_deployments'] = len(sc_deployments)
    
    # Count virtual safety Car deployments
    if all(col in fh.columns for col in ['Category', 'Status', 'Message']):
        vsc_deployments = fh[(fh['Category'] == 'SafetyCar') & (fh['Status'] == 'DEPLOYED') & 
                            (fh['Message'].str.contains('VIRTUAL', case=False, na=False))]
        agg['virtual_safety_car_deployments'] = len(vsc_deployments)
    
    # Calculate safety car lap percentages
    if 'Lap' in fh.columns:
        total_laps = fh['Lap'].max()
        if total_laps > 0:
            # Find all SC laps
            sc_deployed_laps = fh[(fh['Category'] == 'SafetyCar') & (fh['Status'] == 'DEPLOYED') & 
                                 (~fh['Message'].str.contains('VIRTUAL', case=False, na=False))]['Lap'].values
            sc_in_laps = fh[(fh['Category'] == 'SafetyCar') & (fh['Status'] == 'IN THIS LAP') & 
                           (~fh['Message'].str.contains('VIRTUAL', case=False, na=False))]['Lap'].values
            
            # Find all VSC laps
            vsc_deployed_laps = fh[(fh['Category'] == 'SafetyCar') & (fh['Status'] == 'DEPLOYED') & 
                                  (fh['Message'].str.contains('VIRTUAL', case=False, na=False))]['Lap'].values
            vsc_ending_laps = fh[(fh['Category'] == 'SafetyCar') & (fh['Status'] == 'ENDING') & 
                                (fh['Message'].str.contains('VIRTUAL', case=False, na=False))]['Lap'].values
            
            # Sum up all the safety car periods
            total_sc_laps = sum(sc_in_laps[i] - sc_deployed_laps[i] + 1 for i in range(min(len(sc_deployed_laps), len(sc_in_laps))))
            total_vsc_laps = sum(vsc_ending_laps[i] - vsc_deployed_laps[i] + 1 for i in range(min(len(vsc_deployed_laps), len(vsc_ending_laps))))
            agg['total_sc_laps'] = total_sc_laps
            agg['total_vsc_laps'] = total_vsc_laps
    
    # Add string columns
    for col in string_columns:
        if col in fh.columns and not fh[col].empty:
            agg[col] = fh[col].iloc[0]
    
    file_result = pd.DataFrame([agg])
    print(f"    Aggregated {fh.shape[0]} rows for {file_result.shape[1]} columns")
    if not file_result.empty:
        flag_results = pd.concat([flag_results, file_result], ignore_index=True)

print(f"\nShape: {flag_results.shape}\n")

flag_results.to_csv('../data/raw/flag_results_raw.csv', encoding='utf-8', index=False)

Found 790 files with 'messages' in name

Processing: 2018_Abu Dhabi_FP1_messages.parquet
    Aggregated 13 rows for 8 columns
Processing: 2018_Abu Dhabi_FP2_messages.parquet
    Aggregated 8 rows for 8 columns
Processing: 2018_Abu Dhabi_FP3_messages.parquet
    Aggregated 19 rows for 8 columns
Processing: 2018_Abu Dhabi_Qualifying_messages.parquet
    Aggregated 8 rows for 8 columns
Processing: 2018_Abu Dhabi_Race_messages.parquet
    Aggregated 70 rows for 10 columns
Processing: 2018_Australia_FP1_messages.parquet
    Aggregated 8 rows for 8 columns
Processing: 2018_Australia_FP2_messages.parquet
    Aggregated 32 rows for 8 columns
Processing: 2018_Australia_FP3_messages.parquet
    Aggregated 5 rows for 8 columns
Processing: 2018_Australia_Qualifying_messages.parquet
    Aggregated 28 rows for 8 columns
Processing: 2018_Australia_Race_messages.parquet
    Aggregated 55 rows for 10 columns
Processing: 2018_Austria_FP1_messages.parquet
    Aggregated 9 rows for 8 columns
Processing: 2

# Wikipedia

## Circuits

In [8]:
# Establish variables
urls = ["https://en.wikipedia.org/wiki/List_of_Formula_One_circuits"]
total_cols = 11
col_idx_map = {
    'name': 0,
    'type': 2,
    'direction': 3,
    'location': 4,
    'country': 5,
    'length': 6,
    'turns': 7,
    'gp': 8,
    'seasons': 9,
    'gps_held': 10}

# Scrape 2018+ results
df = scrape_url_table(
    urls,
    total_cols,
    col_idx_map)
df.to_csv('../data/raw/circuits_raw.csv', encoding='utf-8', index=False)

# Photon

## Location

In [None]:
circuits = pd.read_csv('../data/raw/circuits_raw.csv')

# Create a new dataframe to store location data
location_data = []

# Iterate through each circuit
for index, row in circuits.iterrows():
    # Get original name before stripping
    original_name = row['name']
    
    # Clean place name by stripping non-text and excess white space
    place = re.sub(r'[^A-Za-z0-9\s]', '', row['name']).strip()
    city = row['location']
    country = row['country']
    
    # Get location data
    location = get_location_data(place, city, country)
    
    # Add to results
    location_info = {
        'original_name': original_name,
        'cleaned_name': place,
        'city': city,
        'country': country,
        'latitude': location['latitude'] if location else None,
        'longitude': location['longitude'] if location else None,
        'elevation': location['elevation'] if location else None
    }
    location_data.append(location_info)

# Create final dataframe
df = pd.DataFrame(location_data)
df.to_csv('../data/raw/locations_raw.csv', encoding='utf-8', index=False)

No results found
No results found
No results found
