In [5]:
# 5_History_Features_and_Model.ipynb  —  Step 3: add historical features (no leakage)

from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib
import os

# -------------------------------------------------------------------
# Paths
# -------------------------------------------------------------------
ROOT = Path.cwd()
if not (ROOT / "outputs").exists():
    ROOT = ROOT.parent
DATA_IN  = ROOT / "outputs" / "f1_features_weather_enhanced.csv"   # your current merged table
DATA_OUT = ROOT / "outputs" / "f1_features_history_v1.csv"
MODEL_OUT = ROOT / "models" / "rf_model_delta_hist_v1.joblib"

print(f"Reading: {DATA_IN}")
df = pd.read_csv(DATA_IN)

# Basic sanity
must_have = ["raceId","driverId","circuitId","year","round"]
for c in must_have:
    if c not in df.columns:
        raise ValueError(f"Missing required column '{c}' in {DATA_IN}")

# -------------------------------------------------------------------
# Target construction (NO leakage):
# - Compute per-race fastest lap (seconds) and define delta_s as
#   (driver's best lap - race fastest lap).
# - This target is for *this* race, so OK to compute from current row.
#   We will drop lap-time columns from features later.
# -------------------------------------------------------------------
if "bestLaps_s" not in df.columns:
    raise ValueError("Column 'bestLaps_s' not found. We need it to define the target delta_s.")

race_fast = df.groupby("raceId")["bestLaps_s"].transform("min")
df["delta_s"] = df["bestLaps_s"] - race_fast

# -------------------------------------------------------------------
# Historical (lagged) features
# RULE: Everything must be computed from the PAST.
# We sort chronologically and always .shift(1) before rolling/expanding.
# -------------------------------------------------------------------
df = df.sort_values(["year","round","raceId","driverId"]).reset_index(drop=True)

def rolling_feat(g, src, windows, prefix):
    """
    Build shifted rolling stats for series 'src' in group g.
    Returns DataFrame with new columns (mean/std for each window).
    """
    out = pd.DataFrame(index=g.index)
    s = g[src].shift(1)  # shift to ensure we only use past info
    for w in windows:
        out[f"{prefix}_mean_{w}"] = s.rolling(w, min_periods=1).mean()
        out[f"{prefix}_std_{w}"]  = s.rolling(w, min_periods=1).std()
    # long-term expanding mean (all history)
    out[f"{prefix}_mean_exp"] = s.expanding(min_periods=1).mean()
    return out

# ---- 1) Driver rolling history on delta_s
windows = [3, 5, 10]

drv_hist = df.groupby("driverId", group_keys=False).apply(
    lambda g: rolling_feat(g, src="delta_s", windows=windows, prefix="drv_delta")
)
df = pd.concat([df, drv_hist], axis=1)

# Driver experience = how many prior starts
df["drv_starts"] = (
    df.groupby("driverId").cumcount()
)

# ---- 2) Constructor rolling history on delta_s (needs constructorId in table)
if "constructorId" in df.columns:
    team_hist = df.groupby("constructorId", group_keys=False).apply(
        lambda g: rolling_feat(g, src="delta_s", windows=windows, prefix="team_delta")
    )
    df = pd.concat([df, team_hist], axis=1)

    # driver vs team form: driver's expanding mean minus team's expanding mean (both lagged)
    df["drv_delta_mean_exp_shift1"]  = df.groupby("driverId")["delta_s"].shift(1).expanding().mean().reset_index(level=0, drop=True)
    df["team_delta_mean_exp_shift1"] = df.groupby("constructorId")["delta_s"].shift(1).expanding().mean().reset_index(level=0, drop=True)
    df["drv_minus_team_form"] = df["drv_delta_mean_exp_shift1"] - df["team_delta_mean_exp_shift1"]
    df.drop(columns=["drv_delta_mean_exp_shift1","team_delta_mean_exp_shift1"], inplace=True, errors="ignore")
else:
    print("constructorId not found — skipping team features.")
    df["drv_minus_team_form"] = np.nan

# ---- 3) Track-specific (driver at the same circuit) history on delta_s
track_hist = df.groupby(["driverId","circuitId"], group_keys=False).apply(
    lambda g: rolling_feat(g, src="delta_s", windows=[3,5], prefix="drv_track_delta")
)
df = pd.concat([df, track_hist], axis=1)

# -------------------------------------------------------------------
# Optional: historical features from grid/qualifying (lagged only!)
# These are safe as history; we will *not* use current race's grid/qual directly as-is.
# -------------------------------------------------------------------
for col in [c for c in ["grid","qual_best_s"] if c in df.columns]:
    df[f"{col}_hist_mean_5"] = (
        df.groupby("driverId")[col]
          .shift(1)    # past only
          .rolling(5, min_periods=1).mean()
    )

# -------------------------------------------------------------------
# Save enriched dataset (useful to inspect / iterate)
# -------------------------------------------------------------------
df.to_csv(DATA_OUT, index=False)
print(f"Saved historical feature table -> {DATA_OUT}")
print("Rows:", len(df), "| Cols:", df.shape[1])

# -------------------------------------------------------------------
# Train a clean model:
# - Drop target leak columns from X (any lap-time columns for current race)
# - Keep historical (lagged) columns we just built
# - Simple RF baseline to verify signal emerges (>0 R² ideally)
# -------------------------------------------------------------------
drop_now = [
    # columns that are direct outcomes of the *current* race:
    "bestLaps_s", "bestLap_ms", "f1_rank", "fl_avg_speed_kph", "finish_pos", "fl_rank",
    # identifiers we never want to predict on
    "raceId",
    # obvious text cols if present
    "drivers_name", "gp_name", "circuit_name", "date"
]

X = df.drop(columns=[c for c in drop_now if c in df.columns], errors="ignore")
y = df["delta_s"].astype(float)

# Choose categorical vs numeric
cat_cols = []
if "country" in X.columns:
    cat_cols.append("country")

# safe numerics = all numeric except the target
num_cols = [c for c in X.columns if c != "delta_s" and pd.api.types.is_numeric_dtype(X[c])]

# Remove any remaining columns that equal y (just in case)
num_cols = [c for c in num_cols if c != "delta_s"]

# Build preprocessing
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)

# Split (simple holdout; later we can do GroupKFold per race)
X_model = X[cat_cols + num_cols].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.2, random_state=42
)

rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[("pre", preprocess), ("rf", rf)])
pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
r2   = r2_score(y_test, pred)
mae  = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print("\n=== Historical Model (no leakage) ===")
print(f"R²   : {r2:0.3f}")
print(f"MAE  : {mae:0.3f} s (gap to fastest)")
print(f"RMSE : {rmse:0.3f} s (gap to fastest)")

MODEL_OUT.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, MODEL_OUT)
print(f"\n✅ Saved model -> {MODEL_OUT}")


Reading: f:\Personal Projects\F1-FastestLap-Predictor\outputs\f1_features_weather_enhanced.csv


  drv_hist = df.groupby("driverId", group_keys=False).apply(
  team_hist = df.groupby("constructorId", group_keys=False).apply(
  track_hist = df.groupby(["driverId","circuitId"], group_keys=False).apply(


Saved historical feature table -> f:\Personal Projects\F1-FastestLap-Predictor\outputs\f1_features_history_v1.csv
Rows: 11041 | Cols: 58

=== Historical Model (no leakage) ===
R²   : 0.006
MAE  : 1.945 s (gap to fastest)
RMSE : 9.349 s (gap to fastest)

✅ Saved model -> f:\Personal Projects\F1-FastestLap-Predictor\models\rf_model_delta_hist_v1.joblib
