# Introduction
Jack Wilson
10/26/2025

This notebook outlines the cleaning and merging of raw data into intermediate and final dataframes

# Import Modules

In [1]:
import os, sys
import re

import pandas as pd

In [2]:
# Put the notebook's parent directory on sys.path so the 'src' package can be imported
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

from src.data_functions import load_id_map, save_id_map

# DataFrame Display Options

In [3]:
# Lift every pandas display limit (columns, column width, rows) for this notebook
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

# Prep for Merge

## ID Map

### Load

In [102]:
# Load the name -> circuit ID map through the project helper
# NOTE(review): the '.pkl' extension suggests a pickle file — only load it from a trusted source
id_map = load_id_map('../data/raw/circuit_id_map.pkl')

### Fix Circuit ID Map
Austria & Styria (2020, 2021) are held at the same circuit, so they must share one circuit ID

Great Britain & 70th Anniversary (2020) are held at the same circuit, so they must share one circuit ID

In [103]:
# Remember the IDs as they were before the fix. Using .get() keeps the report
# below from raising a NameError when one of the keys is missing from the map
# (the values are then reported as None).
id_map_styria = id_map.get('Styria')
id_map_anniversary = id_map.get('70th Anniversary')

# Ensure Austria and Styria have the same circuit ID
if 'Styria' in id_map and 'Austria' in id_map:
    id_map['Styria'] = id_map['Austria']

# Ensure Great Britain and 70th Anniversary have the same circuit ID
if '70th Anniversary' in id_map and 'Great Britain' in id_map:
    id_map['70th Anniversary'] = id_map['Great Britain']

# Report "old -> new" for the aliased entries next to their canonical entries
print(f"Styria: {id_map_styria} -> {id_map.get('Styria')}")
print(f"Austria: {id_map.get('Austria')}")
print(f"70th Anniversary: {id_map_anniversary} -> {id_map.get('70th Anniversary')}")
print(f"Great Britain: {id_map.get('Great Britain')}")

Styria: 9 -> 9
Austria: 9
70th Anniversary: 10 -> 10
Great Britain: 10


### Save

In [104]:
# Write the patched map back to the same path it was loaded from
# NOTE(review): this presumably overwrites the raw file, so a later re-run loads the already-patched map — confirm this is intended
save_id_map('../data/raw/circuit_id_map.pkl', id_map)

## Race Results 2001-2017

### Load

In [None]:
# Read the raw 2001-2017 race results
# NOTE(review): a later cell reads '../data/raw/races_results_raw_2001-2017.csv' ("races_" prefix) — confirm which filename exists
races_2001 = pd.read_csv('../data/raw/race_results_raw_2001-2017.csv')

### Save

In [None]:
# Write the frame to the clean folder without the index column
# (no transformation cell sits between the load and this save in this section)
races_2001.to_csv('../data/clean/race_results_clean_2001-2017.csv', encoding='utf-8', index=False)

## Race Results 2018+

### Load

In [114]:
# Read the raw race results for the 2018+ seasons
races_2018 = pd.read_csv('../data/raw/race_results_raw_2018+.csv')

### Fix circuit_id in Results

In [115]:
# Remap the duplicate circuit IDs in the results: 23 becomes 10 and 22 becomes 9
circuit_id_fixes = {23: 10, 22: 9}

# Count the affected rows before the values are overwritten
rows_replaced = sum((races_2018['circuit_id'] == old_id).sum() for old_id in circuit_id_fixes)
races_2018['circuit_id'] = races_2018['circuit_id'].replace(circuit_id_fixes)

print(f"Rows replaced: {rows_replaced}")

Rows replaced: 60


### Save

In [116]:
# Write the results with corrected circuit IDs to the clean folder, without the index column
races_2018.to_csv('../data/clean/race_results_clean_2018+.csv', encoding='utf-8', index=False)

## Practice Results

### Load

In [None]:
# Read the raw practice results
# NOTE(review): 'pratice' in the filename may be a typo for 'practice' — confirm against the file on disk
practices = pd.read_csv('../data/raw/pratice_results_raw.csv')

### Save

In [None]:
# Write the practice results to the clean folder without the index column
# (no transformation cell sits between the load and this save in this section)
practices.to_csv('../data/clean/practice_results_clean.csv', encoding='utf-8', index=False)

## Qualifying Results

## Starting Grid

## Pit Stops

## Fastest Laps

## Rounds

## Lap Results

## Weather

## Flag Results

## Circuits

### Load

In [105]:
# Read the raw circuits table
circuits = pd.read_csv('../data/raw/circuits_raw.csv')

### Separate Bahrain Outer Circuit

In [106]:
# Create a dictionary with Sakhir Outer Circuit (2020) data.
# Values are kept as strings, like the text columns of the raw circuits table.
new_circuit_data = {
    'name': 'Bahrain International Outer Circuit',
    'type': 'Race circuit',
    'direction': 'Clockwise',
    'location': 'Sakhir',
    'country': 'Bahrain',
    'length': '3.543 km (2.202 mi)',
    'turns': '11',
    'gp': 'Sakhir Grand Prix',
    'seasons': '2020',
    'gps_held': '1'
}

# Convert to DataFrame and append to existing data.
# Only append when the circuit is not present yet, so re-running this cell
# does not add the same row a second time.
new_row = pd.DataFrame([new_circuit_data])
if not circuits['name'].eq(new_circuit_data['name']).any():
    circuits = pd.concat([circuits, new_row], ignore_index=True)

circuits.tail(3)

Unnamed: 0,name,type,direction,location,country,length,turns,gp,seasons,gps_held
76,Yas Marina Circuit *,Race circuit,Anti-clockwise,Abu Dhabi,United Arab Emirates,5.281 km (3.281 mi),15,Abu Dhabi Grand Prix,2009–2024,16
77,Zeltweg Airfield,Road circuit,Clockwise,Zeltweg,Austria,3.186 km (1.980 mi),4,Austrian Grand Prix,1964,1
78,Bahrain International Outer Circuit,Race circuit,Clockwise,Sakhir,Bahrain,3.543 km (2.202 mi),11,Sakhir Grand Prix,2020,1


### Filter for 2018+ Seasons

In [107]:
# Drop rows without a 'seasons' value; the year filter below calls string methods on it
circuits = circuits.dropna(subset=['seasons'])

def has_year_after_2018(season_str, target_start=2018):
    """Return True if any season range in ``season_str`` ends in or after ``target_start``.

    Args:
        season_str: Comma-separated seasons, each a single year ("2020") or a
            range ("2004–2010"). Bracketed footnote markers such as "[c]" are
            ignored.
        target_start: First year that counts as a match (inclusive).

    Returns:
        True when at least one part's last year is >= ``target_start``,
        otherwise False. Parts that contain no four-digit year are skipped.
    """
    # Drop footnote markers as whole bracketed groups. Deleting single letters
    # only covered "[c]" and "[e]"; any other marker made int() fail.
    without_footnotes = re.sub(r'\[[^\]]*\]', '', season_str)

    for range_part in without_footnotes.split(','):
        # Pull the years out directly so the dash style (en dash, hyphen)
        # and stray whitespace do not matter
        years = re.findall(r'\d{4}', range_part)
        if not years:
            continue

        # The last year is the end of a range, or the single year itself
        if int(years[-1]) >= target_start:
            return True

    return False

# Keep only circuits with a season in 2018 or later.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
circuits = circuits[circuits['seasons'].apply(has_year_after_2018)].copy()
print(f"{circuits.shape[0]} rows left in the filtered dataframe")

32 rows left in the filtered dataframe


### Add circuit_id Column

In [108]:
# Load circuit_id mapping dictionary (same file the ID Map section loads and saves)
circuit_id_map = load_id_map('../data/raw/circuit_id_map.pkl')

def find_circuit_info(gp_str, country_str, id_map):
    """Find the ``id_map`` entry that matches a circuit's Grand Prix name or country.

    Matching is a case-insensitive substring test in either direction. Each
    comma-separated Grand Prix name is tried first, in order, and the country
    is the fallback.

    Args:
        gp_str: Grand Prix name(s), e.g. "Bahrain Grand Prix,\\nSakhir Grand Prix".
        country_str: Country of the circuit.
        id_map: Dict mapping a name (string key) to a circuit ID.

    Returns:
        ``(circuit_id, key)`` for the first matching key, or
        ``("no match", "no match")`` when nothing matches.
    """
    def _first_match(text):
        # An empty string is a substring of every string, so an empty needle
        # or key would "match" the first entry; skip both.
        needle = text.lower()
        if not needle:
            return None
        for key in id_map.keys():
            candidate = key.lower()
            if candidate and (needle in candidate or candidate in needle):
                return id_map[key], key
        return None

    # Missing values arrive as NaN (a float); treat them as empty text
    gp_text = gp_str if isinstance(gp_str, str) else ""
    country_text = country_str if isinstance(country_str, str) else ""

    # Clean the GP string and split it into the individual GP names
    cleaned_gp = gp_text.replace("Grand Prix", "").replace("\n", "").strip()
    gp_parts = [part.strip() for part in cleaned_gp.split(',')]

    # GP names take precedence; the country is tried last
    for text in gp_parts + [country_text.strip()]:
        match = _first_match(text)
        if match is not None:
            return match

    # If still no match found
    return "no match", "no match"

# Resolve every circuit row to an (ID, map key) pair and store the two values
# as the circuit_id and circuit_name columns
def _lookup_circuit(circuit_row):
    return pd.Series(find_circuit_info(circuit_row['gp'], circuit_row['country'], circuit_id_map))

circuits[['circuit_id', 'circuit_name']] = circuits.apply(_lookup_circuit, axis=1)

circuits

Unnamed: 0,name,type,direction,location,country,length,turns,gp,seasons,gps_held,circuit_id,circuit_name
3,Albert Park Circuit *,Street circuit,Clockwise,Melbourne,Australia,5.278 km (3.280 mi),14,Australian Grand Prix,"1996–2019, 2022–2025",28,1,Australia
4,Algarve International Circuit,Race circuit,Clockwise,Portimão,Portugal,4.653 km (2.891 mi),15,Portuguese Grand Prix,2020–2021,2,26,Portugal
6,Autódromo Hermanos Rodríguez *,Race circuit,Clockwise,Mexico City,Mexico,4.304 km (2.674 mi),17,"Mexican Grand Prix,\nMexico City Grand Prix","1963–1970, 1986–1992, 2015–2019, 2021–2024",24,19,Mexico
8,Autodromo Internazionale del Mugello,Race circuit,Clockwise,Scarperia e San Piero,Italy,5.245 km (3.259 mi),15,Tuscan Grand Prix,2020,1,24,Tuscany
9,Autodromo Internazionale Enzo e Dino Ferrari *,Race circuit,Anti-clockwise,Imola,Italy,4.909 km (3.050 mi),19,"Italian Grand Prix,\nSan Marino Grand Prix,\nEmilia Romagna Grand Prix","1980–2006, 2020–2022, 2024–2025",32,14,Italy
10,Autodromo José Carlos Pace *,Race circuit,Anti-clockwise,São Paulo,Brazil,4.309 km (2.677 mi),15,"Brazilian Grand Prix,\nSão Paulo Grand Prix","1973–1977, 1979–1980, 1990–2019, 2021–2024",41,20,Brazil
11,Autodromo Nazionale di Monza *,Race circuit,Clockwise,Monza,Italy,5.793 km (3.600 mi),11,Italian Grand Prix,"1950–1979, 1981–2025",75,14,Italy
14,Bahrain International Circuit *,Race circuit,Clockwise,Sakhir,Bahrain,5.412 km (3.363 mi),15,"Bahrain Grand Prix,\nSakhir Grand Prix","2004–2010, 2012–2025[c]",22,2,Bahrain
15,Baku City Circuit *,Street circuit,Anti-clockwise,Baku,Azerbaijan,6.003 km (3.730 mi),20,"European Grand Prix,\nAzerbaijan Grand Prix","2016–2019, 2021–2025",9,4,Azerbaijan
22,Circuit de Barcelona-Catalunya *,Race circuit,Clockwise,Montmeló,Spain,4.657 km (2.894 mi),14,Spanish Grand Prix,1991–2025,35,5,Spain


In [109]:
# Manually correct the rows that the automatic lookup resolved to the wrong map entry
circuit_id_map = load_id_map('../data/raw/circuit_id_map.pkl')

# Index label of the row -> GP name to look up instead
manual_gp_names = {9: "Emilia-Romagna", 59: "Eifel"}
manual_matches = {}
for row_label, gp_name in manual_gp_names.items():
    matched_id, matched_name = find_circuit_info(gp_name, circuits.loc[row_label, 'country'], circuit_id_map)
    circuits.loc[row_label, 'circuit_name'] = matched_name
    circuits.loc[row_label, 'circuit_id'] = matched_id
    manual_matches[row_label] = (matched_id, matched_name)

new_circuit_id_9, new_circuit_name_9 = manual_matches[9]
new_circuit_id_59, new_circuit_name_59 = manual_matches[59]

# After the overrides, no two rows should share a circuit_id or a circuit_name
duplicate_circuit_ids = circuits[circuits['circuit_id'].duplicated(keep=False)]
duplicate_circuit_names = circuits[circuits['circuit_name'].duplicated(keep=False)]

print(f"Rows with duplicate circuit_id: {len(duplicate_circuit_ids)}")
print(f"Rows with duplicate circuit_name: {len(duplicate_circuit_names)}")

print(f"\nUpdated row 9: circuit_name = '{new_circuit_name_9}', circuit_id = {new_circuit_id_9}")
print(f"Updated row 59: circuit_name = '{new_circuit_name_59}', circuit_id = {new_circuit_id_59}")

Rows with duplicate circuit_id: 0
Rows with duplicate circuit_name: 0

Updated row 9: circuit_name = 'Emilia-Romagna', circuit_id = 27
Updated row 59: circuit_name = 'Eifel', circuit_id = 25


### Remove Unnecessary Data

In [110]:
# Keep only the columns needed downstream, ordered from the highest circuit_id down
kept_columns = ['type', 'direction', 'length', 'turns', 'circuit_id', 'circuit_name']
circuits = circuits[kept_columns].sort_values('circuit_id', ascending=False)

# 'length' looks like "5.278 km (3.280 mi)"; keep only the text before the first space
def _leading_token(value):
    if isinstance(value, str):
        return value.split(' ')[0]
    return value

circuits['length'] = circuits['length'].apply(_leading_token)
circuits

Unnamed: 0,type,direction,length,turns,circuit_id,circuit_name
50,Street circuit,Anti-clockwise,6.201,17,34,Las Vegas
55,Street circuit,Anti-clockwise,5.412,19,33,Miami
47,Street circuit,Anti-clockwise,6.174,27,32,Saudi Arabia
52,Race circuit,Clockwise,5.419,16,31,Qatar
33,Race circuit,Clockwise,4.259,14,30,Netherlands
78,Race circuit,Clockwise,3.543,11,29,Sakhir
46,Race circuit,Anti-clockwise,5.338,14,28,Turkey
9,Race circuit,Anti-clockwise,4.909,19,27,Emilia-Romagna
4,Race circuit,Clockwise,4.653,15,26,Portugal
59,Race circuit,Clockwise,5.148,15,25,Eifel


### Fix Japan Direction

In [111]:
# Override the direction of the Japan row and show the result
is_japan = circuits['circuit_name'] == 'Japan'
circuits.loc[is_japan, 'direction'] = 'Figure eight'
circuits[is_japan]

Unnamed: 0,type,direction,length,turns,circuit_id,circuit_name
72,Race circuit,Figure eight,5.807,18,17,Japan


### Correct DataTypes

In [112]:
print(f"Previous DataTypes:\n{circuits.dtypes}")

# Cast the numeric columns in one call: length to float, turns and circuit_id to int
circuits = circuits.astype({'length': float, 'turns': int, 'circuit_id': int})

print(f"\nNew DataTypes:\n{circuits.dtypes}")

Previous DataTypes:
type            object
direction       object
length          object
turns           object
circuit_id       int64
circuit_name    object
dtype: object

New DataTypes:
type             object
direction        object
length          float64
turns             int32
circuit_id        int32
circuit_name     object
dtype: object


### Save

In [113]:
# Write the cleaned circuits table to the clean folder without the index column
circuits.to_csv('../data/clean/circuits_clean.csv', encoding='utf-8', index=False)

# Clean F1 Website [Race Results 2001-2017]

## Load Data

In [3]:
# Read the raw 2001-2017 race results and preview the first rows
# NOTE(review): an earlier cell reads '../data/raw/race_results_raw_2001-2017.csv' ("race_" prefix) — confirm which filename exists
races_2001 = pd.read_csv('../data/raw/races_results_raw_2001-2017.csv', encoding='utf-8')
races_2001.head()

Unnamed: 0,race_url,driverid,driver_name,position,points
0,https://www.formula1.com/en/results/2001/races/703/australia/race-result,1,Michael Schumacher,1,10.0
1,https://www.formula1.com/en/results/2001/races/703/australia/race-result,2,David Coulthard,2,6.0
2,https://www.formula1.com/en/results/2001/races/703/australia/race-result,3,Rubens Barrichello,3,4.0
3,https://www.formula1.com/en/results/2001/races/703/australia/race-result,4,Nick Heidfeld,4,3.0
4,https://www.formula1.com/en/results/2001/races/703/australia/race-result,5,Heinz-Harald Frentzen,5,2.0


## Separate Position and Status

In [5]:
# List the distinct raw 'position' values to see which are not plain numbers
races_2001['position'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', 'NC', '15', '16', '17', '18', '19', '20', '21', 'DQ',
       '22', '23', '24', 'EX'], dtype=object)

In [6]:
# Split the raw 'position' column into a numeric end_position and a position_status.
# A numeric value is kept as-is with status "CLAS". A non-numeric value gets the
# next position after the previous row of the same race, and keeps its own code
# as status when it is a known one ("DNF" otherwise).

# After the rename at the end of this cell the raw column is called
# 'raw_end_position'; reading from whichever name exists keeps the cell re-runnable.
raw_position_col = "position" if "position" in races_2001.columns else "raw_end_position"

known_statuses = {"NC", "DQ", "DNS", "EX"}

# Initialize new columns
end_positions = []
statuses = []
current_position = 1
current_race = None

# Walk the rows in file order; a change of race_url marks the start of a new race.
# NOTE(review): this assumes the rows of one race are contiguous and in result order — confirm
for race_url, pos in zip(races_2001["race_url"], races_2001[raw_position_col]):
    # Check if its a new race
    if current_race != race_url:
        current_race = race_url
        current_position = 1

    try:
        # Try converting to int to valid position
        numeric_pos = int(pos)
    except ValueError:
        # Not a number so need to assign position and keep status
        end_positions.append(current_position)
        statuses.append(pos if pos in known_statuses else "DNF")
        current_position += 1
    else:
        end_positions.append(numeric_pos)
        statuses.append("CLAS")
        current_position = numeric_pos + 1

# Assign back to dataframe
races_2001["end_position"] = end_positions
races_2001["position_status"] = statuses

# Keep the original values under a name that says they are unprocessed
races_2001 = races_2001.rename(columns={"position": "raw_end_position"})
races_2001.head()

Unnamed: 0,race_url,driverid,driver_name,raw_end_position,points,end_position,position_status
0,https://www.formula1.com/en/results/2001/races/703/australia/race-result,1,Michael Schumacher,1,10.0,1,CLAS
1,https://www.formula1.com/en/results/2001/races/703/australia/race-result,2,David Coulthard,2,6.0,2,CLAS
2,https://www.formula1.com/en/results/2001/races/703/australia/race-result,3,Rubens Barrichello,3,4.0,3,CLAS
3,https://www.formula1.com/en/results/2001/races/703/australia/race-result,4,Nick Heidfeld,4,3.0,4,CLAS
4,https://www.formula1.com/en/results/2001/races/703/australia/race-result,5,Heinz-Harald Frentzen,5,2.0,5,CLAS


## Consolidate Statuses

In [8]:
# List the distinct statuses produced by the split above
races_2001['position_status'].unique()

array(['CLAS', 'NC', 'DQ', 'EX'], dtype=object)

In [9]:
# Convert 'EX' values in position_status to 'DQ'
races_2001['position_status'] = races_2001['position_status'].replace('EX', 'DQ')

In [10]:
# Confirm that 'EX' no longer appears among the statuses
races_2001['position_status'].unique()

array(['CLAS', 'NC', 'DQ'], dtype=object)

## Aggregate Results

In [11]:
# Cast end_position to int and points to float before aggregating
races_2001 = races_2001.astype({'end_position': int, 'points': float})

In [12]:
# Aggregate the races_2001 dataframe to one row per driver.
# The statistics and the status rates use the same grouping keys, so the rates
# can be attached by index. Attaching them by position would misalign or fail
# whenever one driverid appears with more than one driver_name.
group_keys = ['driverid', 'driver_name']
per_driver = races_2001.groupby(group_keys)

races_2001_agg = per_driver.agg({
    'end_position': ['mean', 'std'],
    'points': ['sum', 'mean', 'std']
}).round(2)

# Flatten the multi-level column names
races_2001_agg.columns = ['avg_position', 'std_position', 'total_points', 'avg_points', 'std_points']

# Calculate CLAS, NC and DQ rates: the share of a driver's rows with that status
clas_rate = per_driver['position_status'].apply(lambda x: (x == 'CLAS').mean()).round(2)
nc_rate = per_driver['position_status'].apply(lambda x: (x == 'NC').mean()).round(2)
dq_rate = per_driver['position_status'].apply(lambda x: (x == 'DQ').mean()).round(2)

# Add the rates while the frame is still indexed by the grouping keys, so
# pandas aligns each rate with its driver
races_2001_agg['CLAS_rate'] = clas_rate
races_2001_agg['NC_rate'] = nc_rate
races_2001_agg['DQ_rate'] = dq_rate

races_2001_agg = races_2001_agg.reset_index()

races_2001_agg.head()

Unnamed: 0,driverid,driver_name,avg_position,std_position,total_points,avg_points,std_points,CLAS_rate,NC_rate,DQ_rate
0,1,Michael Schumacher,7.0,6.85,888.0,5.45,4.24,0.84,0.16,0.0
1,2,David Coulthard,10.21,6.04,241.0,1.72,2.44,0.72,0.28,0.0
2,3,Rubens Barrichello,9.15,6.01,519.0,2.68,3.3,0.84,0.16,0.0
3,4,Nick Heidfeld,10.19,5.22,259.0,1.55,2.45,0.79,0.21,0.0
4,5,Heinz-Harald Frentzen,12.6,5.28,21.0,0.5,1.27,0.6,0.4,0.0


## Save Clean DataFrame

In [13]:
# Write the per-driver aggregate without the index column
# NOTE(review): this path is relative to the working directory, while the other saves in this notebook write under '../data/clean/' — confirm the intended location
races_2001_agg.to_csv('races_results_clean_2001-2017.csv', encoding='utf-8', index=False)

# Clean F1 Website [Race Results 2018-2025]

In [None]:
# Read the raw 2018-2025 race results and preview the first rows
# NOTE(review): an earlier cell reads '../data/raw/race_results_raw_2018+.csv' into the same variable — confirm which file is intended
races_2018 = pd.read_csv('../data/raw/races_results_raw_2018-2025.csv', encoding='utf-8')
races_2018.head()

In [None]:
# For the first row, save that lap_time as the base time, add gaps to that time
                        if row == rows[0]:
                            # Find raw lap time
                            lap_time = cells[4].text
                            
                            # Check if lap time is blank
                            if not lap_time.strip():
                                lap_times.append('NULL')
                            
                            # Distinguish between times over and under a minute
                            if ':' in lap_time:
                                # Time in "min:sec.millisec" format
                                time_parts = re.split(r"[:.]", lap_time)
                                minutes = int(time_parts[0])
                                seconds = int(time_parts[1])
                                milliseconds = int(time_parts[2])
                            else:
                                # Time in "sec.millisec" format
                                time_parts = lap_time.split('.')
                                minutes = 0
                                seconds = int(time_parts[0])
                                milliseconds = int(time_parts[1])
                            
                            # Convert that into timedelta so it can be added later
                            base_time = timedelta(minutes=minutes, seconds=seconds, milliseconds=milliseconds)
                            
                            # Append it to the list
                            lap_times.append(base_time)
                        else:
                            # Find raw lap time
                            lap_time = cells[4].text

                            # Check if lap time is blank
                            if not lap_time.strip():
                                lap_times.append('NULL')
                            else:
                                # Get rid of the + and s
                                time_clean = lap_time.strip('+s')
                                
                                # Distinguish between times over and under a minute
                                if ':' in time_clean:
                                    # Gap time in "min:sec.millisec" format
                                    time_parts = re.split(r"[:.]", time_clean)
                                    gap_minutes = int(time_parts[0])
                                    gap_seconds = int(time_parts[1])
                                    gap_milliseconds = int(time_parts[2])
                                else:
                                    # Gap time in "sec.millisec" format
                                    time_parts = time_clean.split('.')
                                    gap_minutes = 0
                                    gap_seconds = int(time_parts[0])
                                    gap_milliseconds = int(time_parts[1])
                                
                                # Convert that into timedelta so it can be added
                                gap = timedelta(minutes=gap_minutes, seconds=gap_seconds, milliseconds=gap_milliseconds)

                                # Add the time gap to the base time
                                new_time = base_time + gap
                                lap_times.append(new_time)
                    