In [9]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
import pandas as pd


# Load dataset
df = pd.read_csv('f1_2019_to_2023_all_drivers_all_data.csv', low_memory=False)

# Convert time columns to seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
for col in time_columns:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# Convert binary columns to integer type
df['Rainfall'] = df['Rainfall'].astype(int)
df['FreshTyre'] = df['FreshTyre'].astype(int)
df['IsAccurate'] = df['IsAccurate'].astype(int)


# Categorize weather condition based on centroid values of Kmeans clustering
def categorize_weather(row):
    if row['Rainfall'] > 0:
        return 'Rainy'
    elif row['AirTemp'] > 28.43213126:
        return 'high'
    elif row['AirTemp'] > 21.31279265:
        return 'medium'
    elif row['AirTemp'] > 12.84901403:
        return 'low'
    else:
        return 'very_low'

df['Weather_Category'] = df.apply(categorize_weather, axis=1)
df['Original_Weather_Category'] = df['Weather_Category']
df = pd.get_dummies(df, columns=['Weather_Category'])


# Keep the original Driver and Circuit for EDA
df['Original_Driver'] = df['Driver']

# Create Track temperature category based on the result of Kmeans clustering 
df['TrackTemp_Cat'] = pd.cut(df['TrackTemp'], bins=[0, 18.96764999, 27.87457484, 35.04425766, 41.75142602, 50.51006013], labels=['VERY_LOW', 'Low', 'Medium', 'Warm', 'High'])
df['Original_TrackTemp_Cat'] = df['TrackTemp_Cat']
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])


#Feature Engineering with weather condition features
df['TrackConditionIndex'] = (df['WindDirection'] + df['TrackTemp'] + df['Humidity'] + df['Pressure'] ) / 4


# One-hot encoding
df = pd.get_dummies(df, columns=['Driver', 'Compound', 'Team','TrackStatus'])

# Drop irrelevant columns
columns_to_drop = ['Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
                   'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated',
                   'IsPersonalBest', 'Sector3Time','LapStartTime','Sector2Time','Sector1Time','Circuit']
df.drop(columns=columns_to_drop, inplace=True)


# Select numerical values for scaling and imputation
numeric_features = ['Humidity', 'Pressure', 'WindDirection', 'WindSpeed','TrackTemp','AirTemp','Rainfall','SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

# Example for forward fill
#time_series_features = ['WindDirection', 'WindSpeed', 'TrackTemp', 'AirTemp', 'Rainfall', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

df[numeric_features] = df[numeric_features].fillna(method='ffill')


## Separate Rainy / dry days ##
# 1. Separate LapTime as dry or wet(rainy) condition ( since lapTime of rainy day would be recognized as outliers)
# 2. Remove Outliers for dry condition LapTime
# 3. Build Combined LapTime df (Outliers for dry days are deleted)

# Flag for rainy conditions
df['IsRainy'] = df['Rainfall'].apply(lambda x: 1 if x > 0 else 0)

# Separate dataframes for dry and wet conditions
df_dry = df[df['IsRainy'] == 0]
df_wet = df[df['IsRainy'] == 1]

# Remove Outliers for dry days using the IQR method that effective to removes extreme outliers
Q1_dry = df_dry['LapTime'].quantile(0.25)
Q3_dry = df_dry['LapTime'].quantile(0.75)
IQR_dry = Q3_dry - Q1_dry
lower_bound_dry = Q1_dry - 1.5 * IQR_dry
upper_bound_dry = Q3_dry + 1.5 * IQR_dry
df_dry_filtered = df_dry[(df_dry['LapTime'] >= lower_bound_dry) & (df_dry['LapTime'] <= upper_bound_dry)]
df_dry_filtered = df_dry_filtered.copy()
df_wet = df_wet.copy()


## Scaling to the selected numeric features ##
robust_scaler = RobustScaler()

# Applying scaling to the numeric features
df_dry_filtered[numeric_features] = robust_scaler.fit_transform(df_dry_filtered[numeric_features])
df_wet[numeric_features] = robust_scaler.transform(df_wet[numeric_features])  # Use transform, not fit_transform

# Combining the datasets after scaling
df_combined = pd.concat([df_dry_filtered, df_wet], ignore_index=True)
df_combined.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96556 entries, 0 to 96555
Data columns (total 112 columns):
 #    Column                     Dtype   
---   ------                     -----   
 0    DriverNumber               int64   
 1    LapTime                    float64 
 2    LapNumber                  float64 
 3    Stint                      float64 
 4    SpeedI1                    float64 
 5    SpeedI2                    float64 
 6    SpeedFL                    float64 
 7    SpeedST                    float64 
 8    TyreLife                   float64 
 9    FreshTyre                  int64   
 10   Position                   float64 
 11   IsAccurate                 int64   
 12   AirTemp                    float64 
 13   Humidity                   float64 
 14   Pressure                   float64 
 15   Rainfall                   float64 
 16   TrackTemp                  float64 
 17   WindDirection              float64 
 18   WindSpeed                  float64 
 19   Ye

  df[numeric_features] = df[numeric_features].fillna(method='ffill')


In [8]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
import numpy as np

# Drop Strings columns
df_combined = df_combined.drop(['Original_Driver', 'Original_Weather_Category', 'Original_TrackTemp_Cat'], axis=1)

# Impute missing LapTime values (considering other strategy that best suits data)
df_combined['LapTime'].fillna(method='ffill', inplace=True)
df_combined['Position'].fillna(method='ffill', inplace=True)


# Define features and target
X = df_combined.drop('LapTime', axis=1)
y = df_combined['LapTime']


# stratify with Rainfall to handle imbalance 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model with class weight adjustment
xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.1,random_state=42,n_jobs=-1,max_depth=7)
xgb_model.fit(X_train, y_train)

# Predictions and evaluation
predictions = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Baseline RMSE for Combined df: {rmse}")


  df_combined['LapTime'].fillna(method='ffill', inplace=True)
  df_combined['Position'].fillna(method='ffill', inplace=True)


Baseline RMSE for Combined df: 1.7802496677052293


In [4]:

########

# Initialize KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Defining the model : XGB
xgb = XGBRegressor(random_state=42)

# Initial Set up the hyperparameters to test in GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Custom scorer for RMSE ##gpt used ##
neg_rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# Setup GridSearchCV with multiple scoring metrics
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=kf,
                           scoring={'RMSE': neg_rmse_scorer},
                           refit='RMSE',n_jobs=-1)
grid_search.fit(X, y)  # Make sure to fit on scaled X to maintain consistency

# Get the best model and print results
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

best_rmse = -grid_search.cv_results_['mean_test_RMSE'][grid_search.best_index_]

print("Best model parameters is :", best_params)

print(f"Optimised CV RMSE: {best_rmse:.3f}")


#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer
# Use IterativeImputer for more sophisticated imputation
#iterative_imputer = IterativeImputer(random_state=42)
#df_combined[numeric_features] = iterative_imputer.fit_transform(df_combined[numeric_features])





Best model parameters is : {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Optimised CV RMSE: 1.802


In [5]:
## FOR other year - TEST ##


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
import shap



# Impute missing LapTime values (considering other strategy that best suits data)
df_combined['LapTime'].fillna(method='ffill', inplace=True)
df_combined['Position'].fillna(df_combined['Position'].mean(), inplace=True)

# Assuming df_combined is already loaded and processed
df_combined = df_combined.drop(['Original_Driver', 'Original_Weather_Category', 'Original_TrackTemp_Cat'], axis=1)

train_years = [2019, 2020,2021,2022]
test_year = 2023

# Split data based on year
X_train = df_combined[df_combined['Year'].isin(train_years)].drop(['LapTime', 'Year'], axis=1)
y_train = df_combined[df_combined['Year'].isin(train_years)]['LapTime']
X_test = df_combined[df_combined['Year'] == test_year].drop(['LapTime', 'Year'], axis=1)
y_test = df_combined[df_combined['Year'] == test_year]['LapTime']


# Stage 2 : Initialize KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Defining the model: XGBRegressor
xgb = XGBRegressor(random_state=42)

# Setup the hyperparameters to test in GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Custom scorer for RMSE
neg_rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# Setup GridSearchCV with multiple scoring metrics
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=kf,
                           scoring={'RMSE': neg_rmse_scorer},
                           refit='RMSE', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and print results
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_rmse = -grid_search.cv_results_['mean_test_RMSE'][grid_search.best_index_]

print("Best model parameters:", best_params)
print(f"Optimised CV RMSE: {best_rmse:.3f}")

# Use SHAP to explain feature importance
explainer = shap.Explainer(best_model)
shap_values = explainer(X_train)

# Plot feature importance
shap.summary_plot(shap_values, X_train, feature_names=X_train.columns)


  df_combined['LapTime'].fillna(method='ffill', inplace=True)


KeyError: "['Original_Driver', 'Original_Weather_Category', 'Original_TrackTemp_Cat'] not found in axis"

In [None]:
"""shap_sum = np.abs(shap_values.values).mean(axis=0)
feature_importance = pd.Series(shap_sum, index=X_train.columns)

# Sort the features by their mean absolute SHAP value in descending order
sorted_feature_importance = feature_importance.sort_values(ascending=False)

# Display the sorted features with their importance
print(sorted_feature_importance)"""

In [6]:
import shap
import xgboost as xgb

# Assuming X_train and model are already defined and trained
model = xgb.XGBRegressor().fit(X_train, y_train)

# Explainer
explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(X_train)

# Plotting SHAP summary plot
shap.summary_plot(shap_values, X_train, plot_type="bar")


Found a NULL input array in _cext_dense_tree_update_weights!


TypeError: Cannot cast array data from dtype('O') to dtype('float64') according to the rule 'safe'

In [ ]:
# Assuming df is your DataFrame with 'lap_time', 'temperature', and 'humidity'
plt.figure(figsize=(12, 6))
plt.scatter(df_combined['TrackTemp'], df_combined['lap_time'], c='blue', label='TrackTemp')
plt.scatter(df_combined['humidity'], df_combined['lap_time'], c='red', label='Humidity')
plt.title('Impact of Temperature and Humidity on Lap Times')
plt.xlabel('Weather Condition')
plt.ylabel('Lap Time')
plt.legend()
plt.show()
