In [1]:
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Directory holding the raw historic CSV exports. Change it here (and only
# here) if the data lives somewhere else.
DATA_DIR = Path('historic_data')

# Load every raw table into its own DataFrame. The variable names mirror the
# file names so later cells can refer to each table directly.
circuits = pd.read_csv(DATA_DIR / 'circuits.csv')
constructor_results = pd.read_csv(DATA_DIR / 'constructor_results.csv')
constructor_standings = pd.read_csv(DATA_DIR / 'constructor_standings.csv')
constructors = pd.read_csv(DATA_DIR / 'constructors.csv')
driver_standings = pd.read_csv(DATA_DIR / 'driver_standings.csv')
drivers = pd.read_csv(DATA_DIR / 'drivers.csv')
pit_stops = pd.read_csv(DATA_DIR / 'pit_stops.csv')
qualifying = pd.read_csv(DATA_DIR / 'qualifying.csv')
races = pd.read_csv(DATA_DIR / 'races.csv')
results = pd.read_csv(DATA_DIR / 'results.csv')
seasons = pd.read_csv(DATA_DIR / 'seasons.csv')
sprint_results = pd.read_csv(DATA_DIR / 'sprint_results.csv')
status = pd.read_csv(DATA_DIR / 'status.csv')

In [10]:
# Races: keep only the identifying columns, order them chronologically
# (season, then round) and restrict to the 2021 season onwards.
race_columns = ["raceId", "year", "round", "circuitId"]
races_cleaned = (
    races.loc[:, race_columns]
    .sort_values(by=["year", "round"])
    .loc[lambda frame: frame["year"] >= 2021]
)

# Results: keep the join keys, the starting grid slot and the finishing order.
result_columns = ["raceId", "driverId", "constructorId", "grid", "positionOrder"]
results_cleaned = results[result_columns].copy()

# One row per (race, driver): inner join of the two trimmed tables on raceId,
# so results from races before 2021 are discarded here.
race_data = races_cleaned.merge(results_cleaned, on="raceId")

# Binary target: 1 when the driver finished in the first three places, else 0.
race_data["Top 3 Finish"] = (race_data["positionOrder"] <= 3).astype(int)

print(race_data)

      raceId  year  round  circuitId  driverId  constructorId  grid  \
0       1052  2021      1          3         1            131     2   
1       1052  2021      1          3       830              9     1   
2       1052  2021      1          3       822            131     3   
3       1052  2021      1          3       846              1     7   
4       1052  2021      1          3       815              9     0   
...      ...   ...    ...        ...       ...            ...   ...   
1115    1110  2023     12         13       817            213    19   
1116    1110  2023     12         13       858              3    18   
1117    1110  2023     12         13       807            210     0   
1118    1110  2023     12         13       832              6     4   
1119    1110  2023     12         13       857              1     5   

      positionOrder  Top 3 Finish  
0                 1             1  
1                 2             1  
2                 3             1  
3  

In [11]:
def _group_mean(frame, key, value, feature_name):
    """Return one row per `key` with the mean of `value`, stored as `feature_name`.

    `key` may be a single column name or a list of column names; the key
    column(s) are kept as regular columns so the result can be merged directly.
    """
    return (
        frame.groupby(key)[value]
        .mean()
        .reset_index()
        .rename(columns={value: feature_name})
    )


# Win / podium indicator columns on the raw results table (safe to re-run:
# the assignment simply overwrites the same values).
results['win'] = (results['positionOrder'] == 1).astype(int)
results['podium'] = (results['positionOrder'] <= 3).astype(int)

# NOTE(review): every aggregate below is computed over ALL rows of `results`,
# which includes the very races that make up `race_data`. The features
# therefore contain information about the outcome being predicted (target
# leakage). Consider restricting them to races before each row's race.

# Driver and constructor average finishing position
driver_avg_finish = _group_mean(results, 'driverId', 'positionOrder', 'driverAvgFinish')
constructor_avg_finish = _group_mean(results, 'constructorId', 'positionOrder', 'constructorAvgFinish')

# Driver win and podium rates
driver_win_rate = _group_mean(results, 'driverId', 'win', 'driverWinRate')
driver_podium_rate = _group_mean(results, 'driverId', 'podium', 'driverPodiumRate')

# Constructor win and podium rates
constructor_win_rate = _group_mean(results, 'constructorId', 'win', 'constructorWinRate')
constructor_podium_rate = _group_mean(results, 'constructorId', 'podium', 'constructorPodiumRate')

# Average pit stop duration (milliseconds) per race and driver
average_pit_stop_duration = _group_mean(
    pit_stops, ['raceId', 'driverId'], 'milliseconds', 'averagePitStopDuration'
)

# Relevant columns from constructor standings df
# NOTE(review): presumably these are the standings recorded for the same
# raceId, i.e. they may already include this race's result - confirm.
constructor_standings_relevant = constructor_standings[['raceId', 'constructorId', 'points', 'position', 'wins']]

# Feature tables paired with their join keys, in the order their columns
# should appear in race_data.
feature_tables = [
    (driver_avg_finish, ['driverId']),
    (constructor_avg_finish, ['constructorId']),
    (driver_win_rate, ['driverId']),
    (driver_podium_rate, ['driverId']),
    (constructor_win_rate, ['constructorId']),
    (constructor_podium_rate, ['constructorId']),
    (average_pit_stop_duration, ['raceId', 'driverId']),
    (constructor_standings_relevant, ['raceId', 'constructorId']),
]

# Make the cell idempotent: if it has been run before, race_data already holds
# these feature columns and merging again would create _x/_y duplicates.
# Dropping them first means a re-run rebuilds the same frame.
engineered_columns = [
    column
    for table, keys in feature_tables
    for column in table.columns
    if column not in keys
]
rows_before_merge = len(race_data)
race_data = race_data.drop(columns=engineered_columns, errors='ignore')

# Merge new columns into race_data df (left joins keep every race/driver row)
for table, keys in feature_tables:
    race_data = race_data.merge(table, on=keys, how='left')

# A left join only adds rows when the right-hand keys are not unique; fail
# loudly rather than silently duplicating race entries.
assert len(race_data) == rows_before_merge, (
    f"feature merges changed the row count from {rows_before_merge} to {len(race_data)}"
)

print(race_data)

      raceId  year  round  circuitId  driverId  constructorId  grid  \
0       1052  2021      1          3         1            131     2   
1       1052  2021      1          3       830              9     1   
2       1052  2021      1          3       822            131     3   
3       1052  2021      1          3       846              1     7   
4       1052  2021      1          3       815              9     0   
...      ...   ...    ...        ...       ...            ...   ...   
1115    1110  2023     12         13       817            213    19   
1116    1110  2023     12         13       858              3    18   
1117    1110  2023     12         13       807            210     0   
1118    1110  2023     12         13       832              6     4   
1119    1110  2023     12         13       857              1     5   

      positionOrder  Top 3 Finish  driverAvgFinish  constructorAvgFinish  \
0                 1             1         4.770186              5.81335

In [15]:
# Remove suffixed standings columns if they are present; errors='ignore'
# makes this a no-op when none of them exist.
suffixed_standings = ['points_const', 'position_const', 'wins_const']
race_data = race_data.drop(columns=suffixed_standings, errors='ignore')

# Build the imputed versions of each column first, then write them back in
# a single assign call.
pit_col = 'averagePitStopDuration'
imputed = {
    # Missing pit stop durations take the column median.
    pit_col: race_data[pit_col].fillna(race_data[pit_col].median()),
}
# Missing points / position / wins become 0.
# NOTE(review): if `position` is a rank where lower is better, 0 reads as
# "better than first" - confirm this sentinel is what the model should see.
imputed.update(
    {name: race_data[name].fillna(0) for name in ('points', 'position', 'wins')}
)
race_data = race_data.assign(**imputed)

print(race_data)

      raceId  year  round  circuitId  driverId  constructorId  grid  \
0       1052  2021      1          3         1            131     2   
1       1052  2021      1          3       830              9     1   
2       1052  2021      1          3       822            131     3   
3       1052  2021      1          3       846              1     7   
4       1052  2021      1          3       815              9     0   
...      ...   ...    ...        ...       ...            ...   ...   
1115    1110  2023     12         13       817            213    19   
1116    1110  2023     12         13       858              3    18   
1117    1110  2023     12         13       807            210     0   
1118    1110  2023     12         13       832              6     4   
1119    1110  2023     12         13       857              1     5   

      positionOrder  Top 3 Finish  driverAvgFinish  constructorAvgFinish  \
0                 1             1         4.770186              5.81335

In [14]:
# Convert cleaned data to csv
# Run this cell only after the missing-value cleanup cell above, otherwise
# the file on disk will not contain the imputed values.
output_path = Path('cleaned_data') / 'cleaned_races.csv'

# to_csv does not create missing directories, so make sure the target exists
# (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the DataFrame index is written as an unnamed first column;
# pass index=False if downstream readers do not rely on it.
race_data.to_csv(output_path)