In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression


# Read the raw lap data from disk.
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# Lap and sector durations are parsed as timedeltas and stored as float seconds.
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
df = df.assign(**{
    name: pd.to_timedelta(df[name]).dt.total_seconds()
    for name in time_columns
})

# Flag-like columns are cast to integers in a single astype call.
df = df.astype({'Rainfall': int, 'FreshTyre': int, 'IsAccurate': int})


def categorize_weather(row):
    """Return a weather label for one lap record.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Rainfall']``; ``row['AirTemp']`` is only read
        when ``row['Rainfall']`` is not greater than zero.

    Returns
    -------
    str
        ``'Rainy'`` when ``Rainfall`` is positive; otherwise ``'high'``,
        ``'medium'`` or ``'low'`` for the first lower bound that ``AirTemp``
        strictly exceeds, and ``'very_low'`` when it exceeds none of them
        (which is also what a NaN ``AirTemp`` yields, since every
        comparison with NaN is false).
    """
    # Rain takes precedence over any temperature band.
    if row['Rainfall'] > 0:
        return 'Rainy'

    air_temp = row['AirTemp']

    # Ordered warmest to coolest: the first bound strictly exceeded wins.
    temperature_bands = (
        (28.43213126, 'high'),
        (21.31279265, 'medium'),
        (12.84901403, 'low'),
    )
    for lower_bound, label in temperature_bands:
        if air_temp > lower_bound:
            return label

    return 'very_low'

# --- Weather category ---------------------------------------------------
# Label every row with categorize_weather, keep an un-encoded copy of the
# label, then expand the label into indicator columns.
weather_labels = df.apply(categorize_weather, axis=1)
df['Weather_Category'] = weather_labels
df['Original_Weather_Category'] = weather_labels
df = pd.get_dummies(df, columns=['Weather_Category'])

# Un-encoded copies of 'Driver' and 'Circuit', taken before one-hot encoding.
df['Original_Driver'] = df['Driver']
df['Original_Circuit'] = df['Circuit']

# --- Track temperature category -----------------------------------------
# Bin edges come from an earlier clustering result. pd.cut uses right-closed
# intervals, so values at or below 0, or above the last edge, get no category.
track_temp_edges = [0, 18.96764999, 27.87457484, 35.04425766, 41.75142602, 50.51006013]
track_temp_names = ['VERY_LOW', 'Low', 'Medium', 'Warm', 'High']
df['TrackTemp_Cat'] = pd.cut(df['TrackTemp'], bins=track_temp_edges, labels=track_temp_names)
df['Original_TrackTemp_Cat'] = df['TrackTemp_Cat']
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])

# --- Derived numeric features -------------------------------------------
# Plain arithmetic mean of six weather readings (summed left to right).
condition_total = (df['AirTemp'] + df['TrackTemp'] + df['Humidity'] + df['Pressure'] +
                   df['WindSpeed'] + df['Rainfall'])
df['TrackConditionIndex'] = condition_total / 6

# Interaction term: tyre age multiplied by track temperature.
df['TyreAge_TrackTemp'] = df['TyreLife'] * df['TrackTemp']

# --- One-hot encode the remaining categorical columns --------------------
categorical_columns = ['Driver', 'Circuit', 'Compound', 'Team', 'TrackStatus']
df = pd.get_dummies(df, columns=categorical_columns)

# --- Column clean-up -----------------------------------------------------
# Timing/bookkeeping columns that are not used as features.
# ('AirTemp' and 'TrackTemp' are kept; add them here if no longer needed.)
columns_to_drop = [
    'Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
    'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated',
    'IsPersonalBest', 'Sector3Time', 'LapStartTime', 'Sector2Time', 'Sector1Time',
]

# The un-encoded copies are only useful for visualisation; they are removed
# here, so skip this statement when they are needed for plotting.
original_copies = ['Original_Driver', 'Original_Circuit',
                   'Original_Weather_Category', 'Original_TrackTemp_Cat']
df = df.drop(columns=original_copies)
df = df.drop(columns=columns_to_drop)

df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82418 entries, 0 to 82417
Data columns (total 137 columns):
 #    Column                               Dtype  
---   ------                               -----  
 0    DriverNumber                         int64  
 1    LapTime                              float64
 2    LapNumber                            float64
 3    Stint                                float64
 4    SpeedI1                              float64
 5    SpeedI2                              float64
 6    SpeedFL                              float64
 7    SpeedST                              float64
 8    TyreLife                             float64
 9    FreshTyre                            int64  
 10   Position                             float64
 11   IsAccurate                           int64  
 12   AirTemp                              float64
 13   Humidity                             float64
 14   Pressure                             float64
 15   Rainfall         

In [2]:
# Numeric columns that get imputed here and scaled later on.
# 'LapTime' is deliberately left out of this list.
numeric_features = [
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
    'Humidity', 'Pressure', 'WindDirection', 'WindSpeed',
]

# Iterative imputation with a linear-regression estimator. It is fitted on
# every row of df, i.e. before the train/test splits made further down.
imputer = IterativeImputer(estimator=LinearRegression(), random_state=42)
imputed_values = imputer.fit_transform(df[numeric_features])

# Only the selected columns are overwritten; all other columns are untouched.
df[numeric_features] = imputed_values


In [3]:
# What this cell does:
# 1. Split the laps into dry and wet (rainy) groups, because wet lap times
#    would otherwise be flagged as outliers.
# 2. Remove LapTime outliers from the dry group with the IQR rule.
# 3. Scale the numeric features and build a combined dry + wet frame.


# Flag for rainy conditions (vectorised; same 0/1 values as a per-row check)
df['IsRainy'] = (df['Rainfall'] > 0).astype('int64')

# Separate dataframes for dry and wet conditions
is_rainy = df['IsRainy'] == 1
df_dry = df[~is_rainy]
df_wet = df[is_rainy]

# IQR fences for dry-condition lap times
Q1_dry = df_dry['LapTime'].quantile(0.25)
Q3_dry = df_dry['LapTime'].quantile(0.75)
IQR_dry = Q3_dry - Q1_dry
lower_bound_dry = Q1_dry - 1.5 * IQR_dry
upper_bound_dry = Q3_dry + 1.5 * IQR_dry

# Keep dry laps inside the fences (both ends inclusive). Rows whose LapTime
# is NaN fail the comparison and are therefore dropped as well.
# .copy() makes the result an independent frame that is safe to modify.
within_fences = df_dry['LapTime'].between(lower_bound_dry, upper_bound_dry)
df_dry_filtered = df_dry[within_fences].copy()

# Scale the selected numeric features; the scaler is fitted on the dry rows.
robust_scaler = RobustScaler()
df_dry_filtered[numeric_features] = robust_scaler.fit_transform(df_dry_filtered[numeric_features])

# Apply the SAME fitted scaler to a copy of the wet rows, so the combined
# frame does not mix scaled (dry) and raw (wet) values in one column.
# df_wet itself is left unscaled. The guard avoids calling transform on zero
# rows, which scikit-learn rejects.
df_wet_scaled = df_wet.copy()
if not df_wet_scaled.empty:
    df_wet_scaled[numeric_features] = robust_scaler.transform(df_wet_scaled[numeric_features])

# Combined frame: outlier-filtered dry laps followed by all wet laps
df_combined = pd.concat([df_dry_filtered, df_wet_scaled], ignore_index=True)

df_combined.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79375 entries, 0 to 79374
Data columns (total 138 columns):
 #    Column                               Dtype  
---   ------                               -----  
 0    DriverNumber                         int64  
 1    LapTime                              float64
 2    LapNumber                            float64
 3    Stint                                float64
 4    SpeedI1                              float64
 5    SpeedI2                              float64
 6    SpeedFL                              float64
 7    SpeedST                              float64
 8    TyreLife                             float64
 9    FreshTyre                            int64  
 10   Position                             float64
 11   IsAccurate                           int64  
 12   AirTemp                              float64
 13   Humidity                             float64
 14   Pressure                             float64
 15   Rainfall         

In [4]:
"""import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.boxplot(x='Original_TrackTemp_Cat', y='LapTime', data=df_dry_filtered)
plt.title('Lap Times Across Different Track Temperatures')
plt.xlabel('Track Temperature Category')
plt.ylabel('Lap Time (seconds)')
plt.show()
"""

"import matplotlib.pyplot as plt\nimport seaborn as sns\n\nplt.figure(figsize=(10, 6))\nsns.boxplot(x='Original_TrackTemp_Cat', y='LapTime', data=df_dry_filtered)\nplt.title('Lap Times Across Different Track Temperatures')\nplt.xlabel('Track Temperature Category')\nplt.ylabel('Lap Time (seconds)')\nplt.show()\n"

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

##  df_dry_filtered ##




# Separate features and target
X = df_dry_filtered.drop(['LapTime'], axis=1)
y = df_dry_filtered['LapTime']

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: XGBoost with fixed hyperparameters, fitted on the training part
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)

# Baseline predictions and RMSE on the held-out part
predictions = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Baseline RMSE for Dry_filtered df: {rmse}")

########

# 5-fold shuffled cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Estimator whose hyperparameters are searched
xgb = XGBRegressor(random_state=42)

# Hyperparameter grid for GridSearchCV (3 x 3 x 3 = 27 candidates)
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}


def _root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Custom scorer for RMSE. It is built from an explicit helper instead of
# mean_squared_error(..., squared=False): the `squared` argument is
# deprecated in recent scikit-learn releases and later removed.
# greater_is_better=False makes the scorer return the negated RMSE.
neg_rmse_scorer = make_scorer(_root_mean_squared_error, greater_is_better=False)

# GridSearchCV with several metrics; the refit model is chosen by RMSE
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=kf,
                           scoring={'RMSE': neg_rmse_scorer, 'MAE': 'neg_mean_absolute_error', 'R2': 'r2'},
                           refit='RMSE', n_jobs=-1)
# The search runs over the full X / y (the already-scaled dry frame), so the
# rows of X_test also take part in the cross-validation folds.
grid_search.fit(X, y)

# Best model and its mean cross-validated scores
# (RMSE and MAE are stored negated, hence the sign flip)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_rmse = -grid_search.cv_results_['mean_test_RMSE'][grid_search.best_index_]
best_mae = -grid_search.cv_results_['mean_test_MAE'][grid_search.best_index_]
best_r2 = grid_search.cv_results_['mean_test_R2'][grid_search.best_index_]

print("Best model parameters:", best_params)
print(f"Best CV RMSE: {best_rmse:.3f}")
print(f"Best CV MAE: {best_mae:.3f}")
print(f"Best CV R²: {best_r2:.3f}")


Baseline RMSE for Dry_filtered df: 2.2449275334421204
Best model parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Best CV RMSE: 1.600
Best CV MAE: 0.746
Best CV R²: 0.986


In [6]:
# Count missing values per column of the filtered dry-lap frame
nan_counts = df_dry_filtered.isnull().sum(axis=0)

# Show only the columns that contain at least one missing value
has_missing = nan_counts.gt(0)
print(nan_counts[has_missing])

Series([], dtype: int64)


In [7]:
## RANDOM FOREST with df_dry_filtered lapTime ##

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor


# Forward-fill any missing lap times in the target column.
# (No rows are dropped here, and 'Position' is not touched.)
# The result is assigned back rather than using
# fillna(method='ffill', inplace=True) on the column selection: the `method`
# argument is deprecated in recent pandas, and an in-place call on a selected
# column does not write back to the frame under copy-on-write.
df_dry_filtered['LapTime'] = df_dry_filtered['LapTime'].ffill()

# Features and target
X = df_dry_filtered.drop(['LapTime'], axis=1)
y = df_dry_filtered['LapTime']

# Split train and test sets for the model (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a RandomForest model
# Best parameters from an earlier search:
# {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}
# (this run uses n_estimators=100)
model = RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_split=5,
                              random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Print the RMSE of the model on the held-out set
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")



RMSE: 1.8219665780227041


In [8]:
# Print every feature with its importance score, most important first

importance = model.feature_importances_

# Ascending argsort, reversed to get a descending ranking
sorted_indices = np.argsort(importance)[::-1]

ranked_names = X_train.columns[sorted_indices]
ranked_scores = importance[sorted_indices]
for feature_name, score in zip(ranked_names, ranked_scores):
    print(f"{feature_name}: {score}")


SpeedFL: 0.2820022695599968
Pressure: 0.18649755434807028
Circuit_Sakhir Grand Prix: 0.09563307064012112
Circuit_Azerbaijan Grand Prix: 0.07106287885382771
IsAccurate: 0.0700614175136429
Circuit_Singapore Grand Prix: 0.043343200590385905
SpeedI2: 0.03917342530292315
Circuit_Bahrain Grand Prix: 0.03217060599047781
Circuit_Chinese Grand Prix: 0.024354633441152386
TrackConditionIndex: 0.015643282591867883
TyreAge_TrackTemp: 0.012923575983645831
SpeedST: 0.012382113896337643
Year: 0.011509910345620598
SpeedI1: 0.009350428798932975
Circuit_British Grand Prix: 0.008657482690517993
LapNumber: 0.008319055570448694
Circuit_Japanese Grand Prix: 0.008119139678698041
AirTemp: 0.006690705027424871
Humidity: 0.006154589862980163
Circuit_Spanish Grand Prix: 0.0054504526116323695
Circuit_Belgian Grand Prix: 0.004952608920748816
TyreLife: 0.004692071434238452
Circuit_Dutch Grand Prix: 0.0045078220552743535
TrackTemp: 0.00443992697850372
Position: 0.0037718419272223507
WindDirection: 0.00284548837900058