In [25]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Load dataset
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# Convert 'LapTime' and sector times to seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
for col in time_columns:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# Keep the original 'Driver' and 'Circuit' for EDA, Preprocessing be for one hot encoding
df['Original_Driver'] = df['Driver']
df['Original_Circuit'] = df['Circuit']

# Encode Rainfall, FreshTyre, IsAccurate, and IsPersonalBest to integer
df['Rainfall'] = df['Rainfall'].astype(int)
df['FreshTyre'] = df['FreshTyre'].astype(int)
df['IsAccurate'] = df['IsAccurate'].astype(int)

# Drop columns not related to lap time prediction
columns_to_drop = ['Time', 'LapStartTime', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
                   'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated','IsPersonalBest', 'Sector3Time','Sector2Time','Sector1Time']
df.drop(columns=columns_to_drop, inplace=True) # Sector 3 times are deleted since it could shadow impact of weather features

# One-hot encoding
df = pd.get_dummies(df, columns=['Driver', 'Circuit', 'Compound', 'Team'])

# Feature Engineering: 

# Categorize weather and return numerical labels for models Initial: 0 , 25 , 19 
def categorize_weather(row):
    if row['Rainfall'] > 0:
        return 'Rainy'
    elif row['AirTemp'] > 27:
        return 'Hot'
    elif row['AirTemp'] > 20:
        return 'Warm'
    else:
        return 'Cool'
df['Weather_Category'] = df.apply(categorize_weather, axis=1)

# Keep for further steps 
df['Original_Weather_Category'] = df['Weather_Category']
# Apply one-hot encoding to the 'Weather_Category' column
df = pd.get_dummies(df, columns=['Weather_Category'])

# Create Track temperature category based on the result of clustering 
df['TrackTemp_Cat'] = pd.cut(df['TrackTemp'], bins=[18, 27, 34, 41, 50, np.inf], labels=['VERY_LOW', 'Low', 'Medium', 'High', 'VERY_HIGH'])
# Initial values : [10, 20, 30, 40, 45, np.inf], labels=['VERY_LOW', 'Low', 'Medium', 'High', 'VERY_HIGH'])
df['Original_TrackTemp_Cat'] = df['TrackTemp_Cat']

# One hot encoding for TrackTemp_cat
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])

# Tyre Age Interaction with TrackTemp 
df['TyreAge_TrackTemp'] = df['TyreLife'] * df['TrackTemp']

df.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82418 entries, 0 to 82417
Data columns (total 119 columns):
 #    Column                               Dtype   
---   ------                               -----   
 0    DriverNumber                         int64   
 1    LapTime                              float64 
 2    LapNumber                            float64 
 3    Stint                                float64 
 4    SpeedI1                              float64 
 5    SpeedI2                              float64 
 6    SpeedFL                              float64 
 7    SpeedST                              float64 
 8    TyreLife                             float64 
 9    FreshTyre                            int64   
 10   TrackStatus                          float64 
 11   Position                             float64 
 12   IsAccurate                           int64   
 13   AirTemp                              float64 
 14   Humidity                             float64 
 15   

In [26]:
# Check for NaN values in each column
nan_counts = df.isna().sum()

# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])

df['SpeedI1'].fillna(df['SpeedI1'].mean(), inplace=True)
df['SpeedI2'].fillna(df['SpeedI2'].mean(), inplace=True)
df['SpeedFL'].fillna(df['SpeedFL'].mean(), inplace=True)
df['SpeedST'].fillna(df['SpeedST'].mean(), inplace=True)

df = df.drop(['Original_Driver', 'Original_Circuit','Original_Weather_Category','Original_TrackTemp_Cat'], axis= 1)

# Standardize numeric features
numeric_features = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed','TrackStatus','Position']
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Check for NaN values in each column
nan_counts = df.isna().sum()

# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])


LapTime                    1610
SpeedI1                   11803
SpeedI2                     212
SpeedFL                    2900
SpeedST                    6483
TrackStatus                  96
Position                     96
Original_TrackTemp_Cat     3497
dtype: int64
LapTime        1610
TrackStatus      96
Position         96
dtype: int64


In [27]:
# 1. Separate LapTime as dry or wet(rainy) condition ( since lapTime of rainy day would be recognized as outliers)
# 2. Remove Outliers for dry condition LapTime
# 3. Build Combined LapTime df (Outliers for dry days are deleted)

# Flag for rainy conditions
df['IsRainy'] = df['Rainfall'].apply(lambda x: 1 if x > 0 else 0)

df_dry = df[df['IsRainy'] == 0]
#df for rainy days 
df_wet = df[df['IsRainy'] == 1]

# Remove Outliers for dry days
Q1_dry = df_dry['LapTime'].quantile(0.25)
Q3_dry = df_dry['LapTime'].quantile(0.75)
IQR_dry = Q3_dry - Q1_dry
lower_bound_dry = Q1_dry - 1.5 * IQR_dry
upper_bound_dry = Q3_dry + 1.5 * IQR_dry
#Only dry days (Outliers Removed)
df_dry_filtered = df_dry[(df_dry['LapTime'] >= lower_bound_dry) & (df_dry['LapTime'] <= upper_bound_dry)]



# Build combined df for both dry-rainy days (Outliers are removed with rainy days lapTimes) 
df_combined = pd.concat([df_dry_filtered, df_wet], ignore_index=True)
df_combined.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79375 entries, 0 to 79374
Data columns (total 116 columns):
 #    Column                               Dtype  
---   ------                               -----  
 0    DriverNumber                         int64  
 1    LapTime                              float64
 2    LapNumber                            float64
 3    Stint                                float64
 4    SpeedI1                              float64
 5    SpeedI2                              float64
 6    SpeedFL                              float64
 7    SpeedST                              float64
 8    TyreLife                             float64
 9    FreshTyre                            int64  
 10   TrackStatus                          float64
 11   Position                             float64
 12   IsAccurate                           int64  
 13   AirTemp                              float64
 14   Humidity                             float64
 15   Pressure         

In [28]:
nan_counts = df_wet.isna().sum()
# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])

df_wet['Position'].fillna(df_wet['Position'].mean(), inplace=True)
df_wet['TrackStatus'].fillna(df_wet['TrackStatus'].mean(), inplace=True)
df_wet['LapTime'].fillna(df_wet['LapTime'].mean(), inplace=True)


nan_counts = df_wet.isna().sum()
# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])

LapTime        181
TrackStatus      8
Position         8
dtype: int64
Series([], dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wet['Position'].fillna(df_wet['Position'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wet['TrackStatus'].fillna(df_wet['TrackStatus'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wet['LapTime'].fillna(df_wet['LapTime'].mean(), inplace=True)


In [29]:
nan_counts = df_dry_filtered.isna().sum()
# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])

Series([], dtype: int64)


In [30]:
nan_counts = df_combined.isna().sum()
# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])

df_combined['Position'].fillna(df_combined['Position'].mean(), inplace=True)
df_combined['TrackStatus'].fillna(df_combined['TrackStatus'].mean(), inplace=True)
df_combined['LapTime'].fillna(df_combined['LapTime'].mean(), inplace=True)

nan_counts = df_combined.isna().sum()
# Print columns with NaN count more than 0
print(nan_counts[nan_counts >0])

LapTime        181
TrackStatus      8
Position         8
dtype: int64
Series([], dtype: int64)


In [31]:
# 1. df_dry_filtered lapTime

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = df_dry_filtered.drop(['LapTime'], axis=1)
y = df_dry_filtered['LapTime']

# Split Train and test set for a model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Fitting a RandomForest model
model = RandomForestRegressor(n_estimators=100,max_depth=20,min_samples_split=5,random_state=42)
# Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}

model.fit(X_train, y_train)

# Print RMSE value of the model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")


RMSE: 1.797974331153758


In [32]:
# Shows Important features

importance = model.feature_importances_

# Sort the feature importance
sorted_indices = np.argsort(importance)[::-1]

for index in sorted_indices:
    print(f"{X_train.columns[index]}: {importance[index]}")

SpeedFL: 0.2751141475455859
Pressure: 0.1960144459326944
Circuit_Sakhir Grand Prix: 0.09611604549319025
IsAccurate: 0.07737938873446054
Circuit_Azerbaijan Grand Prix: 0.07048865487138345
Circuit_Singapore Grand Prix: 0.045793245608361816
SpeedI2: 0.041281144309500424
Circuit_Bahrain Grand Prix: 0.03188243272449703
Circuit_Chinese Grand Prix: 0.023877734416708613
AirTemp: 0.011719110935374154
SpeedST: 0.011710089768427412
Year: 0.011316537213222552
TyreAge_TrackTemp: 0.010975371142734202
Humidity: 0.009586986875808442
SpeedI1: 0.008930158936092449
LapNumber: 0.008125119268291698
Circuit_British Grand Prix: 0.007251699869424733
Circuit_Japanese Grand Prix: 0.007193113705375357
Circuit_Saudi Arabian Grand Prix: 0.006261117925667921
TyreLife: 0.004720454313833979
TrackTemp: 0.004336732004475724
Circuit_United States Grand Prix: 0.00421462458365223
Circuit_Dutch Grand Prix: 0.004197822153643112
TrackStatus: 0.0037790206537921998
Position: 0.0036838697521864036
Circuit_Belgian Grand Prix: 0.

In [33]:
features = X_train.columns

# List of weather-related feature patterns
weather_features = [
    'Pressure', 'AirTemp', 'Humidity', 'TrackTemp',
    'WindDirection', 'WindSpeed', 'Rainfall',
    'Weather_Category_','Rainfall','TrackTemp_Cat_VERY_LOW','TrackTemp_Cat_Low','Weather_Category_Warm','TrackTemp_Cat_High','TrackTemp_Cat_VERY_HIGH'
]

# Calculate the total importance of weather-related features
total_weather_importance = sum(
    importance for feature, importance in zip(features, importance)
    if any(weather_feature in feature for weather_feature in weather_features)
)

print(f"Total importance of weather-related features: {total_weather_importance}")

# Calculate the importance for the circuit feature as a whole
circuit_importance_sum = sum(
    importance for feature, importance in zip(features, importance)
    if 'Circuit_' in feature
)

print(f"Total importance of circuit features: {circuit_importance_sum}")


Total importance of weather-related features: 0.238096446549956
Total importance of circuit features: 0.3098318175578247


In [34]:
# 2. wet days lapTime

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


X = df_wet.drop(['LapTime'], axis=1)
y = df_wet['LapTime']

# Correctly filling NaN values
y.fillna(y.mean(), inplace=True)


# Split Train and test set for a model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Fitting a RandomForest model
model = RandomForestRegressor(n_estimators=100,max_depth=20,min_samples_split=5,random_state=42)
# Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}

model.fit(X_train, y_train)

# Print RMSE value of the model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

# Shows Important features

importance = model.feature_importances_

# Sort the feature importance
sorted_indices = np.argsort(importance)[::-1]

for index in sorted_indices:
    print(f"{X_train.columns[index]}: {importance[index]}")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.fillna(y.mean(), inplace=True)


RMSE: 4.272941557186386
SpeedST: 0.2755509237191408
IsAccurate: 0.16404821923098256
SpeedI2: 0.07824355086239612
Pressure: 0.07640751005426687
TrackStatus: 0.07496298816457315
Compound_INTERMEDIATE: 0.0706788781542228
Humidity: 0.060145423539473185
TrackTemp: 0.05687180877606787
SpeedFL: 0.03010819036513285
AirTemp: 0.021281488374344328
LapNumber: 0.019757409486592034
TyreAge_TrackTemp: 0.01596775959211551
SpeedI1: 0.011161182894233606
Position: 0.0080496351203119
TyreLife: 0.007321197994517331
Circuit_German Grand Prix: 0.005075947532603328
WindDirection: 0.004247231761447668
WindSpeed: 0.003962950327994655
Year: 0.0028618542037424067
Circuit_Russian Grand Prix: 0.0017170495374979272
Circuit_Monaco Grand Prix: 0.0016882848045119375
DriverNumber: 0.0011208642823990317
Stint: 0.0009240775014502661
Compound_WET: 0.0008473503318722411
Driver_MSC: 0.0005668127495934385
Circuit_Hungarian Grand Prix: 0.0004018636167167504
Compound_MEDIUM: 0.00037128890511300923
Driver_SAI: 0.0003037910257141

In [35]:
# 3. Combined LapTime

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = df_combined.drop(['LapTime'], axis=1)
y = df_combined['LapTime']

y.fillna(y.mean(), inplace=True)

# Split Train and test set for a model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Fitting a RandomForest model
model = RandomForestRegressor(n_estimators=100,max_depth=20,min_samples_split=5,random_state=42)
# Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}

model.fit(X_train, y_train)

# Print RMSE value of the model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

# Shows Important features

importance = model.feature_importances_

# Sort the feature importance
sorted_indices = np.argsort(importance)[::-1]

for index in sorted_indices:
    print(f"{X_train.columns[index]}: {importance[index]}")

RMSE: 2.001176030880632
SpeedFL: 0.27000953893709706
Pressure: 0.18788253733499227
Circuit_Sakhir Grand Prix: 0.09034377843017127
IsAccurate: 0.08085909490483757
Circuit_Azerbaijan Grand Prix: 0.06585391823814511
SpeedI2: 0.047396880109146826
Circuit_Singapore Grand Prix: 0.04156138096464013
Circuit_Bahrain Grand Prix: 0.0303737821144244
Circuit_Japanese Grand Prix: 0.019163115053170376
Circuit_Chinese Grand Prix: 0.01408163247034092
TyreAge_TrackTemp: 0.013168399790024302
SpeedST: 0.013026816354258853
Humidity: 0.011937917232944852
SpeedI1: 0.011111891612220866
AirTemp: 0.010754878313322882
LapNumber: 0.007442660525599711
WindDirection: 0.007253557578146803
Circuit_British Grand Prix: 0.0067478344623211756
Circuit_United States Grand Prix: 0.006641946221281023
Year: 0.006209769206655495
Compound_INTERMEDIATE: 0.0061950077947974
Circuit_Saudi Arabian Grand Prix: 0.005970818463042796
TyreLife: 0.005115119717028664
TrackStatus: 0.004755732319481224
TrackTemp: 0.0044748290319908636
Positi