In [2]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# Convert time columns to seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
for col in time_columns:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# Convert binary columns to integer type
df['Rainfall'] = df['Rainfall'].astype(int)
df['FreshTyre'] = df['FreshTyre'].astype(int)
df['IsAccurate'] = df['IsAccurate'].astype(int)


# Feature Engineering: 

# Categorize weather and return numerical labels for models Initial: 0 , 25 , 19 
def categorize_weather(row):
    if row['Rainfall'] > 0:
        return 'Rainy'
    elif row['AirTemp'] > 27:
        return 'Hot'
    elif row['AirTemp'] > 20:
        return 'Warm'
    else:
        return 'Cool'
df['Weather_Category'] = df.apply(categorize_weather, axis=1)

df = pd.get_dummies(df, columns=['Weather_Category'])

# Create Track temperature category based on the result of clustering 
df['TrackTemp_Cat'] = pd.cut(df['TrackTemp'], bins=[18, 27, 34, 41, 50, np.inf], labels=['VERY_LOW', 'Low', 'Medium', 'High', 'VERY_HIGH'])

# Initial values : [10, 20, 30, 40, 45, np.inf], labels=['VERY_LOW', 'Low', 'Medium', 'High', 'VERY_HIGH'])
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])

# Tyre Age Interaction with TrackTemp 
df['TyreAge_TrackTemp'] = df['TyreLife'] * df['TrackTemp']


# One-hot encoding
df = pd.get_dummies(df, columns=['Driver', 'Circuit', 'Compound', 'Team'])


# Drop irrelevant columns
columns_to_drop = ['Time', 'LapStartTime', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
                   'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated',
                   'IsPersonalBest', 'Sector3Time', 'Sector2Time', 'Sector1Time']
df.drop(columns=columns_to_drop, inplace=True)


In [3]:
from sklearn.linear_model import BayesianRidge

# Initialize Iterative Imputer with a simpler model for numeric features
iterative_imputer = IterativeImputer(estimator=BayesianRidge(), random_state=42)
numeric_features = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed']

# Impute and scale numeric features
df[numeric_features] = iterative_imputer.fit_transform(df[numeric_features])


features_to_scale = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp']
robust_scaler = RobustScaler()
df[features_to_scale] = robust_scaler.fit_transform(df[features_to_scale])

iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1),max_iter=20, random_state=42)

# Interpolate LapTime separately considering its sequential nature
# df['LapTime'] = df['LapTime'].interpolate(method='Ran')

df['LapTime'] = iterative_imputer.fit_transform(df['LapTime'].values)

In [4]:
"""# Initialize Iterative Imputer

iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1),max_iter=20, random_state=42)
numeric_features = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp','LapTime']

df[numeric_features] = iterative_imputer.fit_transform(df[numeric_features])

features_to_scale = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp']

# Initialize the RobustScaler
robust_scaler = RobustScaler()

# Apply scaling to the selected features
df[features_to_scale] = robust_scaler.fit_transform(df[features_to_scale])
"""


"# Initialize Iterative Imputer\n\niterative_imputer = IterativeImputer(estimator=RandomForestRegressor(n_jobs=-1),max_iter=20, random_state=42)\nnumeric_features = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp','LapTime']\n\ndf[numeric_features] = iterative_imputer.fit_transform(df[numeric_features])\n\nfeatures_to_scale = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp']\n\n# Initialize the RobustScaler\nrobust_scaler = RobustScaler()\n\n# Apply scaling to the selected features\ndf[features_to_scale] = robust_scaler.fit_transform(df[features_to_scale])\n\n\n"

In [5]:
# Handling LapTime based on weather conditions
df['IsRainy'] = df['Rainfall'].apply(lambda x: 1 if x > 0 else 0)
df_dry = df[df['IsRainy'] == 0]
df_wet = df[df['IsRainy'] == 1]

# Remove outliers for dry conditions using IQR
Q1_dry = df_dry['LapTime'].quantile(0.25)
Q3_dry = df_dry['LapTime'].quantile(0.75)
IQR_dry = Q3_dry - Q1_dry
lower_bound_dry = Q1_dry - 1.5 * IQR_dry
upper_bound_dry = Q3_dry + 1.5 * IQR_dry
df_dry_filtered = df_dry[(df_dry['LapTime'] >= lower_bound_dry) & (df_dry['LapTime'] <= upper_bound_dry)]

# Re-combine the dry and wet dataframes
df_combined = pd.concat([df_dry_filtered, df_wet], ignore_index=True)


In [6]:

# Splitting data into features and target
X = df_combined.drop(['LapTime'], axis=1)
y = df_combined['LapTime']



# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training with XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)

# Predictions and evaluation
predictions = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")


RMSE: 2.5370010859272427


In [11]:
from sklearn.model_selection import cross_val_score, KFold

# Prepare the data
X = df_dry_filtered.drop(['LapTime'], axis=1)
y = df_dry_filtered['LapTime']

# Initialize the cross-validation method
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Perform cross-validation
cv_rmse = cross_val_score(xgb_model, X, y, cv=kf, scoring='neg_root_mean_squared_error')
cv_mae = cross_val_score(xgb_model, X, y, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(xgb_model, X, y, cv=kf, scoring='r2')

print(f"CV RMSE: {-cv_rmse.mean():.3f}, Std: {cv_rmse.std():.3f}")
print(f"CV MAE: {-cv_mae.mean():.3f}, Std: {cv_mae.std():.3f}")
print(f"CV R²: {cv_r2.mean():.3f}, Std: {cv_r2.std():.3f}")



# Shows Important features
importance = xgb_model.feature_importances_

# Sort the feature importance
sorted_indices = np.argsort(importance)[::-1]

for index in sorted_indices:
    print(f"{X_train.columns[index]}: {importance[index]}")

CV RMSE: 2.367, Std: 0.049
CV MAE: 1.243, Std: 0.020
CV R²: 0.970, Std: 0.001


NotFittedError: need to call fit or load_model beforehand

In [7]:
from sklearn.model_selection import cross_val_score, KFold

# Prepare the data
X = df_dry_filtered.drop(['LapTime'], axis=1)
y = df_dry_filtered['LapTime']

# Initialize the cross-validation method
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Perform cross-validation
cv_rmse = cross_val_score(xgb_model, X, y, cv=kf, scoring='neg_root_mean_squared_error')
cv_mae = cross_val_score(xgb_model, X, y, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(xgb_model, X, y, cv=kf, scoring='r2')

print(f"CV RMSE: {-cv_rmse.mean():.3f}, Std: {cv_rmse.std():.3f}")
print(f"CV MAE: {-cv_mae.mean():.3f}, Std: {cv_mae.std():.3f}")
print(f"CV R²: {cv_r2.mean():.3f}, Std: {cv_r2.std():.3f}")


In [10]:
# Shows Important features
importance = xgb_model.feature_importances_

# Sort the feature importance
sorted_indices = np.argsort(importance)[::-1]

for index in sorted_indices:
    print(f"{X_train.columns[index]}: {importance[index]}")

NotFittedError: need to call fit or load_model beforehand

In [12]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import numpy as np

# Prepare the data
X = df_dry_filtered.drop(['LapTime'], axis=1)
y = df_dry_filtered['LapTime']

# Initialize the cross-validation method
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the model
xgb = XGBRegressor(random_state=42)

# Set up the hyperparameters to test
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Custom scorer for negative RMSE
neg_rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# Setup the grid search with multiple scorings
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=kf,
                           scoring={'RMSE': neg_rmse_scorer, 'MAE': 'neg_mean_absolute_error', 'R2': 'r2'},
                           refit='RMSE')
grid_search.fit(X, y)

# Get the best model and print results
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)
print("Best CV RMSE:", -grid_search.cv_results_['mean_test_RMSE'].max())
print("Best CV MAE:", -grid_search.cv_results_['mean_test_MAE'].max())
print("Best CV R²:", grid_search.cv_results_['mean_test_R2'].max())


Best model parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Best CV RMSE: 1.7646982067074881
Best CV MAE: 0.8009456437675271
Best CV R²: 0.9833476224736895
