In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import RobustScaler

# Load dataset
df = pd.read_csv('f1_2019_to_2023_all_drivers_all_data.csv', low_memory=False)

# Convert time columns to seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
for col in time_columns:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# Convert binary columns to integer type
df['Rainfall'] = df['Rainfall'].astype(int)
df['FreshTyre'] = df['FreshTyre'].astype(int)
df['IsAccurate'] = df['IsAccurate'].astype(int)



# One-hot encoding
df = pd.get_dummies(df, columns=['Driver', 'Compound', 'Team','TrackStatus'])
# Drop irrelevant columns
columns_to_drop = ['Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
                   'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated',
                   'IsPersonalBest', 'Sector3Time','LapStartTime','Sector2Time','Sector1Time','Circuit','Humidity', 'Pressure', 'WindDirection', 'WindSpeed','TrackTemp','AirTemp']
df.drop(columns=columns_to_drop, inplace=True)



## Separate Rainy / dry days ##
# 1. Separate LapTime as dry or wet(rainy) condition ( since lapTime of rainy day would be recognized as outliers)
# 2. Remove Outliers for dry condition LapTime
# 3. Build Combined LapTime df (Outliers for dry days are deleted)

# Flag for rainy conditions
df['IsRainy'] = df['Rainfall'].apply(lambda x: 1 if x > 0 else 0)

# Separate dataframes for dry and wet conditions
df_dry = df[df['IsRainy'] == 0]
df_wet = df[df['IsRainy'] == 1]


def remove_outliers(df, column_name, multiplier=1.5):
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

# Apply standard IQR for dry days
df_dry_filtered = remove_outliers(df_dry, 'LapTime', multiplier=1.5)

# Apply a more lenient IQR for wet days
df_wet_filtered = remove_outliers(df_wet, 'LapTime', multiplier=2.0)


df_combined = pd.concat([df_dry_filtered, df_wet_filtered], ignore_index=True)


# Define features and target
X = df_combined.drop('LapTime', axis=1)
y = df_combined['LapTime']


train_years = [2019,2020,2021,2022]
test_year = 2023
# Split data based on year
X_train = df_combined[df_combined['Year'].isin(train_years)].drop(['LapTime', 'Year'], axis=1)
y_train = df_combined[df_combined['Year'].isin(train_years)]['LapTime']
X_test = df_combined[df_combined['Year'] == test_year].drop(['LapTime', 'Year'], axis=1)
y_test = df_combined[df_combined['Year'] == test_year]['LapTime']




# Drop rows where the target variable is missing in the training set
train_indices = y_train.dropna().index  # Indices of rows where y_train is not NaN
X_train = X_train.loc[train_indices]
y_train = y_train.dropna()  # Drop missing values in y_train

numeric_features = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

X_train[numeric_features] = X_train[numeric_features].fillna(method='ffill')

# Forward fill missing values in the test set
X_test[numeric_features] = X_test[numeric_features].fillna(method='ffill')

# Scale the test set using the same scaler fitted on the training set



scaler = RobustScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])


  X_train[numeric_features] = X_train[numeric_features].fillna(method='ffill')
  X_test[numeric_features] = X_test[numeric_features].fillna(method='ffill')


In [2]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor
import matplotlib.pyplot as plt


##BASE XG BOOST

#Grid value : (n_estimators=700, learning_rate=0.1,random_state=42,n_jobs=-1,max_depth=7)

# Train the model with class weight adjustment
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions and evaluation
predictions = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Baseline RMSE for Combined df: {rmse}")

from sklearn.model_selection import cross_val_score

# Evaluate with K-Fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42,)
scores = cross_val_score(xgb_model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)  # Convert MSE to RMSE
print("Cross-validated RMSE scores:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))
    

Baseline RMSE for Combined df: 5.445294485972317
Cross-validated RMSE scores: [3.32654971 3.32202007 3.30585502 3.32546131 3.20089101]
Mean RMSE: 3.296155425754921


In [3]:

#4. XG boost with Random Search

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import make_scorer, mean_squared_error
kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 5, 7, 9, 11],
    'reg_alpha': [0.1, 1, 10, 100],
    'reg_lambda': [0.1, 1, 10, 100],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}
neg_rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=True)
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=100, cv=kf,
                                   scoring={'RMSE': neg_rmse_scorer}, refit='RMSE', random_state=42, verbose=3, n_jobs=-1)
random_search.fit(X, y)  # X and y must be your preprocessed datasets
best_model = random_search.best_estimator_
best_params = random_search.best_params_
best_rmse = (-random_search.best_score_) ** 0.5  # Converting MSE to RMSE

print("Best model parameters:", best_params)
print(f"Optimized CV RMSE: {best_rmse:.3f}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=600, reg_alpha=1, reg_lambda=100, subsample=0.7; RMSE: (test=-27.573) total time=   6.1s
[CV 4/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=11, n_estimators=500, reg_alpha=0.1, reg_lambda=100, subsample=0.9; RMSE: (test=-6.516) total time=  15.3s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, n_estimators=200, reg_alpha=100, reg_lambda=10, subsample=0.8; RMSE: (test=-11.109) total time=   3.8s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimators=800, reg_alpha=1, reg_lambda=0.1, subsample=0.9; RMSE: (test=-7.777) total time=  12.0s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.15, max_depth=7, n_estimators=800, reg_alpha=0.1, reg_lambda=10, subsample=0.9; RMSE: (test=-6.816) total time=  11.1s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=300, r



[CV 2/5] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=700, reg_alpha=100, reg_lambda=0.1, subsample=1.0; RMSE: (test=-24.897) total time=   5.5s
[CV 2/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=11, n_estimators=500, reg_alpha=0.1, reg_lambda=100, subsample=0.9; RMSE: (test=-6.203) total time=  14.0s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.2, max_depth=7, n_estimators=100, reg_alpha=1, reg_lambda=1, subsample=0.7; RMSE: (test=-11.134) total time=   2.3s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=7, n_estimators=200, reg_alpha=100, reg_lambda=10, subsample=0.8; RMSE: (test=-11.784) total time=   3.8s
[CV 2/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=7, n_estimators=600, reg_alpha=0.1, reg_lambda=10, subsample=0.9; RMSE: (test=-6.974) total time=   9.4s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.15, max_depth=7, n_estimators=800, reg_alpha=0.1, reg_lambda=10, subsample=0.9; RMSE: (test=-7.355) 

In [None]:
import shap
import numpy as np
import pandas as pd

# Use SHAP to explain feature importance
explainer = shap.Explainer(best_model)
shap_values = explainer(X_test)

# Calculate the mean absolute value for each feature to represent importance
shap_sum = np.abs(shap_values.values).mean(axis=0)
feature_importance_shap = pd.Series(shap_sum, index=X_test.columns)

# Scale the importances so that they sum to 100%
feature_importance_shap_scaled = 100 * feature_importance_shap / feature_importance_shap.sum()

# Print all scaled feature importances
print("Feature importances scaled to 100%:")
for feature, importance in feature_importance_shap_scaled.sort_values(ascending=False).items():
    print(f"{feature}: {importance:.2f}%")

# Define feature groups
weather_keywords = ['Pressure', 'Temp', 'Humidity', 'Rainfall', 'WindSpeed', 'WindDirection', 'Weather_Category', 'TrackTemp_Cat']
weather_features = [col for col in X_test.columns if any(keyword in col for keyword in weather_keywords)]
circuit_features = [col for col in X_test.columns if 'Circuit' in col]

# Summarize weather impacts
weather_importance = sum(importance for feature, importance in feature_importance_shap_scaled.items() if feature in weather_features)
print(f"Total importance of weather features scaled to 100%: {weather_importance:.2f}%")

# Summarize circuits impacts
circuit_importance = sum(importance for feature, importance in feature_importance_shap_scaled.items() if feature in circuit_features)
print(f"Total importance of circuit features scaled to 100%: {circuit_importance:.2f}%")

# Plot feature importance using SHAP
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)


  from .autonotebook import tqdm as notebook_tqdm


[CV 4/5] END colsample_bytree=0.8, learning_rate=0.2, max_depth=7, n_estimators=400, reg_alpha=10, reg_lambda=100, subsample=0.7; RMSE: (test=-8.899) total time=   6.5s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.05, max_depth=9, n_estimators=600, reg_alpha=10, reg_lambda=100, subsample=0.9; RMSE: (test=-8.814) total time=  11.9s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.2, max_depth=9, n_estimators=700, reg_alpha=1, reg_lambda=10, subsample=0.8; RMSE: (test=-6.737) total time=  13.6s
[CV 5/5] END colsample_bytree=1.0, learning_rate=0.01, max_depth=5, n_estimators=300, reg_alpha=100, reg_lambda=10, subsample=1.0; RMSE: (test=-36.424) total time=   3.9s
[CV 3/5] END colsample_bytree=1.0, learning_rate=0.01, max_depth=9, n_estimators=100, reg_alpha=10, reg_lambda=10, subsample=0.7; RMSE: (test=-46.217) total time=   3.2s
[CV 2/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=11, n_estimators=300, reg_alpha=1, reg_lambda=100, subsample=0.7; RMSE: (test=-6.728) tot

In [None]:
print(shap_values[0].values)


In [None]:
import pandas as pd

# Convert SHAP values to DataFrame
df_shap = pd.DataFrame(shap_values.values, columns=X_test.columns)

# Now you can view the DataFrame
print(df_shap.head())
df_shap.to_csv('shap_values.csv', index=False)
