In [14]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
import seaborn as sns

# Read the F1 2019-2022 lap data from CSV
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# Lap/sector durations: parse as timedeltas, then express them as float seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
df = df.assign(**{
    name: pd.to_timedelta(df[name]).dt.total_seconds() for name in time_columns
})

# Binary flag columns are cast to integers
df = df.astype({'Rainfall': int, 'FreshTyre': int, 'IsAccurate': int})


# Feature Engineering: 

# Weather bucketing used for the 'Weather_Category' feature.
# (Original note on earlier settings: "Initial: 0 , 25 , 19" - presumably the thresholds first tried.)
def categorize_weather(row):
    """Return the weather label for one row.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Rainfall' and, when Rainfall is not positive, 'AirTemp'.

    Returns
    -------
    str
        'Rainy' if Rainfall > 0; otherwise 'Hot' for AirTemp > 27,
        'Warm' for AirTemp > 20, and 'Cool' for everything else.
    """
    # Any rainfall wins over the temperature buckets
    if row['Rainfall'] > 0:
        return 'Rainy'

    air_temp = row['AirTemp']
    if air_temp > 27:
        return 'Hot'
    return 'Warm' if air_temp > 20 else 'Cool'
# Label every row with its weather bucket, then expand the label into indicator columns
df['Weather_Category'] = df.apply(categorize_weather, axis=1)
df = pd.get_dummies(df, columns=['Weather_Category'])

# Track-temperature category, with bin edges taken from the clustering result.
# (Edges tried initially: [10, 20, 30, 40, 45, np.inf] with the same labels.)
# NOTE(review): pd.cut leaves values at or below the first edge (18) uncategorised, so such
# rows end up with all-zero TrackTemp_Cat indicators - confirm this is intended.
track_temp_edges = [18, 27, 34, 41, 50, np.inf]
track_temp_labels = ['VERY_LOW', 'Low', 'Medium', 'High', 'VERY_HIGH']
df['TrackTemp_Cat'] = pd.cut(
    df['TrackTemp'],
    bins=track_temp_edges,
    labels=track_temp_labels,
)
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])

# Interaction term: tyre age multiplied by track temperature
df['TyreAge_TrackTemp'] = df['TyreLife'].mul(df['TrackTemp'])

# One-hot encode the remaining categorical columns
df = pd.get_dummies(
    df,
    columns=['Driver', 'Circuit', 'Compound', 'Team', 'TrackStatus'],
)

# Columns removed before modelling
columns_to_drop = [
    'Time',
    'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
    'PitOutTime', 'PitInTime',
    'LapStartDate',
    'Deleted', 'DeletedReason',
    'FastF1Generated', 'IsPersonalBest',
    'Sector3Time', 'Sector2Time',
    'LapStartTime',
]
df = df.drop(columns=columns_to_drop)

df.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82418 entries, 0 to 82417
Data columns (total 136 columns):
 #    Column                               Dtype  
---   ------                               -----  
 0    DriverNumber                         int64  
 1    LapTime                              float64
 2    LapNumber                            float64
 3    Stint                                float64
 4    Sector1Time                          float64
 5    SpeedI1                              float64
 6    SpeedI2                              float64
 7    SpeedFL                              float64
 8    SpeedST                              float64
 9    TyreLife                             float64
 10   FreshTyre                            int64  
 11   Position                             float64
 12   IsAccurate                           int64  
 13   AirTemp                              float64
 14   Humidity                             float64
 15   Pressure         

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression


# Numeric columns that get imputed and then scaled; the target 'LapTime' is left out on purpose
numeric_features = [
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
    'AirTemp', 'Humidity', 'Pressure', 'TrackTemp',
    'WindDirection', 'WindSpeed',
]

# NOTE(review): both imputers and the scaler below are fit on the whole frame, before any
# train/test split, and 'LapTime' (the later regression target) is itself imputed -
# confirm this is acceptable for the reported metrics.

# Step 1: fill missing values in the selected columns with an iterative imputer
# that uses a linear-regression estimator
imputer = IterativeImputer(estimator=LinearRegression(), random_state=42)
features_imputed = imputer.fit_transform(df[numeric_features])
df[numeric_features] = features_imputed

# Step 2: robust-scale the same columns
robust_scaler = RobustScaler()
features_scaled = robust_scaler.fit_transform(df[numeric_features])
df[numeric_features] = features_scaled

# Step 3: 'LapTime' and 'Sector1Time' are imputed together with a 5-neighbour KNN imputer
lap_time_columns = ['LapTime', 'Sector1Time']
imputer = KNNImputer(n_neighbors=5)
df[lap_time_columns] = imputer.fit_transform(df[lap_time_columns])

# Alternatives considered for LapTime (not used):
#   df['LapTime'] = df['LapTime'].fillna(method='ffill')
#   df['LapTime'] = df['LapTime'].interpolate(method='linear')

In [16]:
# Count missing values per column and show only the columns that still have some
nan_counts = df.isnull().sum()
print(nan_counts.loc[nan_counts.gt(0)])

Position    96
dtype: int64


In [17]:
# Forward-fill the remaining gaps in 'Position': each missing entry takes the value of the
# closest preceding row. Series.ffill() is used instead of fillna(method='ffill') because
# the `method` argument of fillna is deprecated in pandas 2.1+.
df['Position'] = df['Position'].ffill()

# Alternatives considered (not used):
#   KNN imputation:
#     imputer = KNNImputer(n_neighbors=5)
#     df['Position'] = imputer.fit_transform(df[['Position']])  # reshape if necessary
#   Linear interpolation:
#     df['Position'] = df['Position'].interpolate(method='linear')



In [18]:
# Re-check: list any columns that still contain missing values (an empty Series means none)
nan_counts = df.isnull().sum()
print(nan_counts.loc[nan_counts.gt(0)])

Series([], dtype: int64)


In [19]:
# Split the laps by weather: any positive rainfall marks the row as rainy (1), otherwise 0
df['IsRainy'] = df['Rainfall'].gt(0).astype('int64')
df_dry = df[df['IsRainy'].eq(0)]
df_wet = df[df['IsRainy'].eq(1)]

# IQR fences for LapTime in dry conditions (1.5 x IQR beyond the quartiles).
# NOTE(review): the quartiles are computed over all dry laps pooled together, not per
# circuit - confirm this does not discard valid laps from circuits with long lap times.
Q1_dry, Q3_dry = df_dry['LapTime'].quantile([0.25, 0.75])
IQR_dry = Q3_dry - Q1_dry
lower_bound_dry = Q1_dry - 1.5 * IQR_dry
upper_bound_dry = Q3_dry + 1.5 * IQR_dry

# Keep only dry laps whose LapTime lies inside the fences (both ends inclusive)
within_fences = df_dry['LapTime'].between(lower_bound_dry, upper_bound_dry)
df_dry_filtered = df_dry[within_fences]

# Filtered dry laps and the (unfiltered) wet laps stacked back into one frame
df_combined = pd.concat([df_dry_filtered, df_wet], ignore_index=True)


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

## Baseline XGBoost model on df_dry_filtered ##

# Target is LapTime; every other column is used as a predictor
y = df_dry_filtered['LapTime']
X = df_dry_filtered.drop(columns=['LapTime'])

# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the baseline regressor on the training portion
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
)
xgb_model.fit(X_train, y_train)

# Score the held-out rows and report the root-mean-squared error
predictions = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Baseline RMSE for Dry_filtered df: {rmse}")

########

# 5-fold cross-validation with shuffling and a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Estimator whose hyperparameters are searched
xgb = XGBRegressor(random_state=42)

# Hyperparameter grid explored by GridSearchCV (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}


def _rmse(y_true, y_pred):
    """Root-mean-squared error between true and predicted values."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Custom scorer for RMSE. The RMSE is computed explicitly instead of passing
# `squared=False` through to mean_squared_error: that keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6, so the previous form fails on current versions.
# greater_is_better=False makes the scorer return the negated RMSE.
neg_rmse_scorer = make_scorer(_rmse, greater_is_better=False)

# Setup GridSearchCV with multiple scoring metrics; the model is refit using the RMSE ranking
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=kf,
                           scoring={'RMSE': neg_rmse_scorer, 'MAE': 'neg_mean_absolute_error', 'R2': 'r2'},
                           refit='RMSE', n_jobs=-1)
# Fit on the full dry, filtered set; the CV folds provide the held-out estimates.
# No scaling happens here - X carries whatever scaling was applied to df in earlier cells.
grid_search.fit(X, y)

# Get the best model and its cross-validated metrics.
# RMSE and MAE are stored negated by the scorers, so the sign is flipped back for reporting.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_rmse = -grid_search.cv_results_['mean_test_RMSE'][grid_search.best_index_]
best_mae = -grid_search.cv_results_['mean_test_MAE'][grid_search.best_index_]
best_r2 = grid_search.cv_results_['mean_test_R2'][grid_search.best_index_]

print("Best model parameters:", best_params)
print(f"Best CV RMSE: {best_rmse:.3f}")
print(f"Best CV MAE: {best_mae:.3f}")
print(f"Best CV R²: {best_r2:.3f}")


Baseline RMSE for Dry_filtered df: 1.8627849430574932




Best model parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Best CV RMSE: 1.492
Best CV MAE: 0.650
Best CV R²: 0.988


In [21]:
# Print every feature with its importance score, highest first.
# NOTE(review): this reads the baseline `xgb_model`, not the grid-searched `best_model`.
importance = xgb_model.feature_importances_

# Indices of the features ordered from most to least important
sorted_indices = np.argsort(importance)[::-1]

ranked_names = X_train.columns[sorted_indices]
ranked_scores = importance[sorted_indices]
for feature_name, score in zip(ranked_names, ranked_scores):
    print(f"{feature_name}: {score}")

Circuit_Belgian Grand Prix: 0.15193714201450348
Circuit_French Grand Prix: 0.12637251615524292
Circuit_Singapore Grand Prix: 0.10123765468597412
Sector1Time: 0.08551418036222458
Circuit_Monaco Grand Prix: 0.05540865659713745
Circuit_Dutch Grand Prix: 0.04967232421040535
Circuit_Chinese Grand Prix: 0.04555055499076843
SpeedFL: 0.039821840822696686
Circuit_Sakhir Grand Prix: 0.03620833903551102
Circuit_Miami Grand Prix: 0.03073704056441784
Circuit_Hungarian Grand Prix: 0.029318347573280334
Pressure: 0.02522863820195198
Circuit_Abu Dhabi Grand Prix: 0.02059454470872879
Circuit_Tuscan Grand Prix: 0.019809070974588394
Circuit_Turkish Grand Prix: 0.016631267964839935
WindDirection: 0.01429319940507412
IsAccurate: 0.013537080958485603
Circuit_British Grand Prix: 0.012072146870195866
SpeedI2: 0.009114294312894344
Weather_Category_Warm: 0.007737122941762209
TrackTemp: 0.00599433621391654
AirTemp: 0.005727218464016914
Circuit_Emilia Romagna Grand Prix: 0.005543639417737722
Circuit_Bahrain Grand 

In [22]:
# 1. df_dry_filtered lapTime

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Target is LapTime; every other column of the dry, filtered frame is a predictor
y = df_dry_filtered['LapTime']
X = df_dry_filtered.drop(columns=['LapTime'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest settings.
# Best parameters noted from an earlier search:
#   {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}
# This run uses n_estimators=100.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# Evaluate on the held-out rows and print the RMSE
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")



RMSE: 1.5964180390043854


In [23]:
# Print every feature of the random forest `model` with its importance score, highest first
importance = model.feature_importances_

# Indices of the features ordered from most to least important
sorted_indices = np.argsort(importance)[::-1]

ranked_names = X_train.columns[sorted_indices]
ranked_scores = importance[sorted_indices]
for feature_name, score in zip(ranked_names, ranked_scores):
    print(f"{feature_name}: {score}")

Sector1Time: 0.5824521531616155
SpeedFL: 0.12550396098647515
Circuit_Singapore Grand Prix: 0.057600536465465306
Circuit_French Grand Prix: 0.03387450965141271
SpeedI2: 0.02315890950697895
Circuit_Sakhir Grand Prix: 0.019129332496717326
Circuit_Dutch Grand Prix: 0.015315267279818143
Circuit_Belgian Grand Prix: 0.014052030032760388
WindDirection: 0.013234102981165173
IsAccurate: 0.012826975551864827
Circuit_Chinese Grand Prix: 0.012466930072454136
Pressure: 0.010581030001337881
Circuit_Bahrain Grand Prix: 0.008968422550668126
SpeedI1: 0.008766761108603282
Circuit_British Grand Prix: 0.007681909428438528
AirTemp: 0.007473670075131867
TrackTemp: 0.0053242792313164845
Circuit_Abu Dhabi Grand Prix: 0.005036546599478918
LapNumber: 0.004305895882594836
Circuit_70th Anniversary Grand Prix: 0.003962029051165042
Humidity: 0.0037614545798883843
Circuit_Miami Grand Prix: 0.0035189903502621544
SpeedST: 0.0029525149171634726
Position: 0.0017464906226164114
Year: 0.0014471166583754848
TyreAge_TrackTem