In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Resolve the project root whether the kernel was started from the repo root
# or from its "notebooks" sub-folder.
NB_DIR = Path.cwd()
ROOT = NB_DIR.parent if NB_DIR.name.lower() == "notebooks" else NB_DIR
OUT = ROOT / "outputs"

src_path = OUT / "f1_features_weather_enhanced.csv"
dst_path = OUT / "f1_features_weather_delta.csv"

df = pd.read_csv(src_path)

# Safety check: both columns are required to compute the per-race delta.
assert "raceId" in df.columns, "raceId column not found in the dataframe"
assert "bestLaps_s" in df.columns, "bestLaps_s column not found in the dataframe"

# Fastest lap of each race, broadcast back onto every row of that race,
# then each row's gap to that fastest lap.
fastest_per_race = df.groupby("raceId")["bestLaps_s"].transform("min")
df["delta_s"] = df["bestLaps_s"] - fastest_per_race

# Sanity check: a gap to the per-race minimum should never be negative.
# Report violations *before* clipping so they are not silently hidden.
num_neg = int((df["delta_s"] < -1e-6).sum())
if num_neg:
    print(f"Warning: {num_neg} rows had a negative delta_s and were clipped to 0")

# Clamp any tiny negative residue to exactly zero.
df["delta_s"] = df["delta_s"].clip(lower=0.0)

print("Delta summary (seconds):")
print(df["delta_s"].describe(percentiles=[0.1, 0.25, 0.50, 0.75, 0.9]))

df.to_csv(dst_path, index=False)
print(f"Saved delta-enhanced features to {dst_path}")


Delta summary (seconds):
count    11041.000000
mean         2.753279
std          6.723174
min          0.000000
10%          0.327000
25%          1.032000
50%          1.901000
75%          2.972000
90%          4.430000
max        375.193000
Name: delta_s, dtype: float64
Saved delta-enhanced features to f:\Personal Projects\F1-FastestLap-Predictor\outputs\f1_features_weather_delta.csv


In [2]:
# RF retraining note:
# After generating the delta feature file above, retrain the Random Forest model on it.

from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Locate the project folders relative to where the notebook kernel is running.
NB_DIR = Path.cwd()
if NB_DIR.name.lower() == "notebooks":
    ROOT = NB_DIR.parent
else:
    ROOT = NB_DIR
OUT = ROOT / "outputs"
MODELS = ROOT / "models"

# Load the delta-enhanced feature table.
df = pd.read_csv(OUT / "f1_features_weather_delta.csv")

# Target: the delta_s column (gap to the fastest lap of the race, in seconds).
y = df["delta_s"]

# Columns excluded from the feature matrix: the target itself, the lap time it
# is derived from, and the other columns listed here as forbidden.
drop_cols = [
    "bestLaps_s",
    "delta_s",
    "qual_best_s",
    "bestLaps_ms",
    "drivers_name",
    "gp_name",
    "circuit_name",
    "date",
    "finish_pos",
    "f1_rank",
    "f1_avg_speed_kph",
]
# errors="ignore" skips any listed column that is absent from the file.
X = df.drop(columns=drop_cols, errors="ignore").copy()

# Optional stricter mode: when enabled, the "grid" column is removed as well.
strict_pre_qual = False
if strict_pre_qual:
    X = X.drop(columns=["grid"], errors="ignore")

# Group rows by raceId (falling back to year) so a race is never split
# between the train and test sets.
group_key = "raceId" if "raceId" in df.columns else "year"
groups = df[group_key]

# Missing numeric values are imputed inside the pipeline (see `preprocessor`)
# rather than on the full table here. Computing medians before the split would
# let values from the held-out races leak into the training features.

# Split by group so that every race in the test set is unseen during training.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_idx, te_idx = next(gss.split(X, y, groups=groups))

# Identifier columns are only needed for grouping; they are not features.
id_cols = ["raceId", "driverId", "constructorId"]
X_train = X.iloc[tr_idx].drop(columns=id_cols, errors="ignore")
X_test = X.iloc[te_idx].drop(columns=id_cols, errors="ignore")
y_train, y_test = y.iloc[tr_idx], y.iloc[te_idx]

# "country" is one-hot encoded; every remaining column is treated as numeric.
cat_cols = [c for c in ["country"] if c in X_train.columns]
num_cols = [c for c in X_train.columns if c not in cat_cols]

preprocessor = ColumnTransformer(
    transformers=[
        # Categories that appear only in the test set encode as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Median imputation is fitted on the training rows only and is stored
        # with the model, so the saved pipeline also handles NaNs at inference.
        ("num", SimpleImputer(strategy="median"), num_cols),
    ]
)

rf = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,
    random_state=42,  # fixed seed for reproducible forests
    n_jobs=-1,        # use all available cores
)

model = Pipeline(steps=[("pre", preprocessor), ("rf", rf)])

# Train on the training split and evaluate on the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

print("R^2: ", round(r2_score(y_test, y_pred), 3))
print("MAE: ", round(mean_absolute_error(y_test, y_pred), 3), "s (gap to fastest lap)")
print("RMSE: ", round(rmse, 3), "s (gap to fastest lap)")

# Save the fitted pipeline. Create the target folder first so a fresh checkout
# without a "models" directory does not fail after the training run.
MODELS.mkdir(parents=True, exist_ok=True)
model_path = MODELS / "rf_model_delta.joblib"
joblib.dump(model, model_path)
print(f"Saved retrained Random Forest model to {model_path}")

R^2:  0.816
MAE:  0.825 s (gap to fastest lap)
RMSE:  2.966 s (gap to fastest lap)
Saved retrained Random Forest model to f:\Personal Projects\F1-FastestLap-Predictor\models\rf_model_delta.joblib
