In [76]:
import pandas as pd
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

circuits = pd.read_csv('data/circuits.csv')
constructor_results = pd.read_csv('data/constructor_results.csv')
constructor_standings = pd.read_csv('data/constructor_standings.csv')
constructors = pd.read_csv('data/constructors.csv')
driver_standings = pd.read_csv('data/driver_standings.csv')
drivers = pd.read_csv('data/drivers.csv')
lap_times = pd.read_csv('data/lap_times.csv')
pit_stops = pd.read_csv('data/pit_stops.csv')
qualifying = pd.read_csv('data/qualifying.csv')
races = pd.read_csv('data/races.csv')
results = pd.read_csv('data/results.csv')
seasons = pd.read_csv('data/seasons.csv')
sprint_results = pd.read_csv('data/sprint_results.csv')
status = pd.read_csv('data/status.csv')

In [77]:
races_cleaned = races[["raceId", "year", "round", "circuitId"]].copy()
# print(races_cleaned)

races_cleaned = races_cleaned.sort_values(by=['year', 'round'])

races_cleaned = races_cleaned[races_cleaned["year"] >= 2000]

results_cleaned = results[["raceId", "driverId", "constructorId", "grid", "positionOrder"]].copy()
# print(results_cleaned)

df = pd.merge(races_cleaned, results_cleaned, on='raceId')
df['Top 3 Finish'] = df['positionOrder'].le(3).astype(int)
 
# print(results_cleaned)
# print(pit_stops)
print(constructor_standings)

       constructorStandingsId  raceId  constructorId  points  position  \
0                           1      18              1    14.0         1   
1                           2      18              2     8.0         3   
2                           3      18              3     9.0         2   
3                           4      18              4     5.0         4   
4                           5      18              5     2.0         5   
...                       ...     ...            ...     ...       ...   
13046                   28568    1110            214    57.0         6   
13047                   28569    1110              3    11.0         7   
13048                   28570    1110            213     3.0        10   
13049                   28571    1110            210    11.0         8   
13050                   28572    1110              1   103.0         5   

      positionText  wins  
0                1     1  
1                3     0  
2                2     0  
3  

In [38]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
raceId,9400.0,598.320426,431.981674,1.0,114.0,866.0,986.0,1110.0
year,9400.0,2011.647447,6.725949,2000.0,2006.0,2012.0,2017.0,2023.0
round,9400.0,9.877872,5.538708,1.0,5.0,10.0,14.0,22.0
circuitId,9400.0,18.957234,20.474229,1.0,6.0,13.0,21.0,79.0
driverId,9400.0,323.793191,388.235245,1.0,14.75,35.0,821.0,858.0
constructorId,9400.0,45.602553,72.399018,1.0,4.0,9.0,21.0,214.0
grid,9400.0,10.970319,6.207779,0.0,6.0,11.0,16.0,24.0
positionOrder,9400.0,11.111702,6.179882,1.0,6.0,11.0,16.0,24.0


In [82]:
# Calculate driver and constructor averages
driver_avg_finish = results.groupby('driverId')['positionOrder'].mean().reset_index().rename(columns={'positionOrder': 'driverAvgFinish'})
constructor_avg_finish = results.groupby('constructorId')['positionOrder'].mean().reset_index().rename(columns={'positionOrder': 'constructorAvgFinish'})

# Calculate win rates and podium finishes
results['win'] = (results['positionOrder'] == 1).astype(int)
results['podium'] = (results['positionOrder'] <= 3).astype(int)

driver_win_rate = results.groupby('driverId')['win'].mean().reset_index().rename(columns={'win': 'driverWinRate'})
driver_podium_rate = results.groupby('driverId')['podium'].mean().reset_index().rename(columns={'podium': 'driverPodiumRate'})

constructor_win_rate = results.groupby('constructorId')['win'].mean().reset_index().rename(columns={'win': 'constructorWinRate'})
constructor_podium_rate = results.groupby('constructorId')['podium'].mean().reset_index().rename(columns={'podium': 'constructorPodiumRate'})

# Assuming pit_stop_df is your DataFrame containing the pit stop data
average_pit_stop_duration = pit_stops.groupby(['raceId', 'driverId'])['milliseconds'].mean().reset_index().rename(columns={'milliseconds': 'averagePitStopDuration'})

constructor_standings_relevant = constructor_standings[['raceId', 'constructorId', 'points', 'position', 'wins']]

# Merge these features back into the main DataFrame
df = df.merge(driver_avg_finish, on='driverId', how='left')
df = df.merge(constructor_avg_finish, on='constructorId', how='left')
df = df.merge(driver_win_rate, on='driverId', how='left')
df = df.merge(driver_podium_rate, on='driverId', how='left')
df = df.merge(constructor_win_rate, on='constructorId', how='left')
df = df.merge(constructor_podium_rate, on='constructorId', how='left')
df = pd.merge(df, average_pit_stop_duration, on=['raceId', 'driverId'], how='left')

df = pd.merge(df, constructor_standings_relevant, on=['raceId', 'constructorId'], how='left', suffixes=('', '_const'))


print(df)

MergeError: Passing 'suffixes' which cause duplicate columns {'driverAvgFinish_x'} is not allowed.

In [47]:
# Assuming 'races' DataFrame contains 'raceId' and 'circuitId'
# Merge 'results' with 'races' to get 'circuitId' in 'results'
results_with_circuit = results_cleaned.merge(races[['raceId', 'circuitId']], on='raceId', how='left')

# Now, you can proceed with the calculations that require 'circuitId'
# Calculate the circuit-specific average finishing position for drivers
driver_circuit_avg_finish = results_with_circuit.groupby(['driverId', 'circuitId'])['positionOrder'].mean().reset_index().rename(columns={'positionOrder': 'driverCircuitAvgFinish'})

# Then, for constructors
constructor_circuit_avg_finish = results_with_circuit.groupby(['constructorId', 'circuitId'])['positionOrder'].mean().reset_index().rename(columns={'positionOrder': 'constructorCircuitAvgFinish'})

# Merge these new features back into your main DataFrame 'df'
df = df.merge(driver_circuit_avg_finish, on=['driverId', 'circuitId'], how='left')
df = df.merge(constructor_circuit_avg_finish, on=['constructorId', 'circuitId'], how='left')

print(df)

      raceId  year  round  circuitId  driverId  constructorId  grid  \
0        158  2000      1          1        30              6     3   
1        158  2000      1          1        22              6     4   
2        158  2000      1          1        23              3    11   
3        158  2000      1          1        35             16     8   
4        158  2000      1          1        21             22     9   
...      ...   ...    ...        ...       ...            ...   ...   
9395    1110  2023     12         13       817            213    19   
9396    1110  2023     12         13       858              3    18   
9397    1110  2023     12         13       807            210     0   
9398    1110  2023     12         13       832              6     4   
9399    1110  2023     12         13       857              1     5   

      positionOrder  Top 3 Finish  driverCircuitAvgFinish  \
0                 1             1                9.428571   
1                 2      

In [49]:
# Assuming 'races' DataFrame has 'raceId', 'year', and 'round' columns to determine the season's midpoint dynamically
# For simplicity, let's proceed with a static approach

# Add a column to indicate if the race is in the first or second half of the season
# Assuming a standard 22-race format, rounds 1-11 are in the first half, and 12-22 are in the second half
df['seasonHalf'] = df['round'].apply(lambda x: 'firstHalf' if x <= 11 else 'secondHalf')

# Calculate average finishing position for each driver in each half of the season
driver_season_performance = df.groupby(['driverId', 'year', 'seasonHalf'])['positionOrder'].mean().reset_index().pivot_table(index=['driverId', 'year'], columns='seasonHalf', values='positionOrder').reset_index().rename(columns={'firstHalf': 'driverFirstHalfAvgFinish', 'secondHalf': 'driverSecondHalfAvgFinish'})

# Calculate average finishing position for each constructor in each half of the season
constructor_season_performance = df.groupby(['constructorId', 'year', 'seasonHalf'])['positionOrder'].mean().reset_index().pivot_table(index=['constructorId', 'year'], columns='seasonHalf', values='positionOrder').reset_index().rename(columns={'firstHalf': 'constructorFirstHalfAvgFinish', 'secondHalf': 'constructorSecondHalfAvgFinish'})

# Merge these new features back into your main DataFrame 'df'
# Merge driver performance
df = pd.merge(df, driver_season_performance, on=['driverId', 'year'], how='left')

# Merge constructor performance
df = pd.merge(df, constructor_season_performance, on=['constructorId', 'year'], how='left')

print(df)

      raceId  year  round  circuitId  driverId  constructorId  grid  \
0        158  2000      1          1        30              6     3   
1        158  2000      1          1        22              6     4   
2        158  2000      1          1        23              3    11   
3        158  2000      1          1        35             16     8   
4        158  2000      1          1        21             22     9   
...      ...   ...    ...        ...       ...            ...   ...   
9395    1110  2023     12         13       817            213    19   
9396    1110  2023     12         13       858              3    18   
9397    1110  2023     12         13       807            210     0   
9398    1110  2023     12         13       832              6     4   
9399    1110  2023     12         13       857              1     5   

      positionOrder  Top 3 Finish  driverCircuitAvgFinish  \
0                 1             1                9.428571   
1                 2      

In [56]:
# Assuming 'qualifying' DataFrame has 'raceId', 'driverId', and 'qualifyingPosition'
# First, ensure that 'qualifyingPosition' is named correctly and represents the position in which the driver will start the race
qualifying_cleaned = qualifying[['raceId', 'driverId', 'position']].rename(columns={'position': 'qualifyingPosition'})

# Now, merge this qualifying information with your main DataFrame 'df'
df = pd.merge(df, qualifying_cleaned, on=['raceId', 'driverId'], how='left')

print(df)

      raceId  year  round  circuitId  driverId  constructorId  grid  \
0        158  2000      1          1        30              6     3   
1        158  2000      1          1        22              6     4   
2        158  2000      1          1        23              3    11   
3        158  2000      1          1        35             16     8   
4        158  2000      1          1        21             22     9   
...      ...   ...    ...        ...       ...            ...   ...   
9395    1110  2023     12         13       817            213    19   
9396    1110  2023     12         13       858              3    18   
9397    1110  2023     12         13       807            210     0   
9398    1110  2023     12         13       832              6     4   
9399    1110  2023     12         13       857              1     5   

      positionOrder  Top 3 Finish  driverCircuitAvgFinish  \
0                 1             1                9.428571   
1                 2      

In [59]:
df.to_csv('cleaned_data.csv', index=False)