In [8]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib, os

# ---------- Load ----------
ROOT = Path.cwd()
if not (ROOT / "outputs").exists():
    ROOT = ROOT.parent
os.chdir(ROOT)

df = pd.read_csv(Path("outputs") / "f1_features_weather.csv")

# your CSV uses plural names; keep them consistent
df = df.rename(columns={"drivers_name": "driver_name"})

# ---------- Target & groups ----------
y = df["bestLaps_s"]                  # target in seconds
groups = df["raceId"] if "raceId" in df.columns else df["year"]

# ---------- Build features WITHOUT leaks ----------
# NEVER allow target or its proxies into X
leak_cols = {"bestLaps_s", "bestLap_ms", "qual_best_s"}
text_cols = {"driver_name", "gp_name", "circuit_name", "date"}

# KEEP raceId for grouping (exclude from features), drop it later
feature_cols = [c for c in df.columns if c not in leak_cols | text_cols | {"raceId"}]

# Impute numeric NaNs
for c in df[feature_cols].select_dtypes(include=np.number).columns:
    df[c] = df[c].fillna(df[c].median())

# ---------- Grouped train/test split (unseen races in test) ----------
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(df[feature_cols], y, groups=groups))

X_train = df.loc[train_idx, feature_cols].copy()
X_test  = df.loc[test_idx,  feature_cols].copy()
y_train = y.iloc[train_idx]
y_test  = y.iloc[test_idx]

# ---------- Preprocess (infer from X_train) ----------
cat_cols = [c for c in ["country", "year"] if c in X_train.columns]
num_cols = [c for c in X_train.columns if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

model = Pipeline([
    ("pre", preprocess),
    ("rf", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)),
])

# Safety: ensure no leaks in features
for bad in ("bestLaps_s", "bestLap_ms", "qual_best_s"):
    assert bad not in X_train.columns, f"Leak in features: {bad}"

# ---------- Fit & evaluate ----------
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²  :", round(r2_score(y_test, y_pred), 3))
print("MAE :", round(mean_absolute_error(y_test, y_pred), 3), "s")
print("RMSE:", round(rmse, 3), "s")

Path("models").mkdir(exist_ok=True)
joblib.dump(model, "models/f1_rf_baseline.joblib")


R²  : 0.661
MAE : 3.322 s
RMSE: 7.742 s


['models/f1_rf_baseline.joblib']