In [5]:
# 5_History_Features_and_Model.ipynb  —  Step 3: add historical features (no leakage)

from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib
import os

# -------------------------------------------------------------------
# Paths
# -------------------------------------------------------------------
ROOT = Path.cwd()
if not (ROOT / "outputs").exists():
    ROOT = ROOT.parent
DATA_IN  = ROOT / "outputs" / "f1_features_weather_enhanced.csv"   # your current merged table
DATA_OUT = ROOT / "outputs" / "f1_features_history_v1.csv"
MODEL_OUT = ROOT / "models" / "rf_model_delta_hist_v1.joblib"

print(f"Reading: {DATA_IN}")
df = pd.read_csv(DATA_IN)

# Basic sanity
must_have = ["raceId","driverId","circuitId","year","round"]
for c in must_have:
    if c not in df.columns:
        raise ValueError(f"Missing required column '{c}' in {DATA_IN}")

# -------------------------------------------------------------------
# Target construction (NO leakage):
# - Compute per-race fastest lap (seconds) and define delta_s as
#   (driver's best lap - race fastest lap).
# - This target is for *this* race, so OK to compute from current row.
#   We will drop lap-time columns from features later.
# -------------------------------------------------------------------
if "bestLaps_s" not in df.columns:
    raise ValueError("Column 'bestLaps_s' not found. We need it to define the target delta_s.")

race_fast = df.groupby("raceId")["bestLaps_s"].transform("min")
df["delta_s"] = df["bestLaps_s"] - race_fast

# -------------------------------------------------------------------
# Historical (lagged) features
# RULE: Everything must be computed from the PAST.
# We sort chronologically and always .shift(1) before rolling/expanding.
# -------------------------------------------------------------------
df = df.sort_values(["year","round","raceId","driverId"]).reset_index(drop=True)

def rolling_feat(g, src, windows, prefix):
    """
    Build shifted rolling stats for series 'src' in group g.
    Returns DataFrame with new columns (mean/std for each window).
    """
    out = pd.DataFrame(index=g.index)
    s = g[src].shift(1)  # shift to ensure we only use past info
    for w in windows:
        out[f"{prefix}_mean_{w}"] = s.rolling(w, min_periods=1).mean()
        out[f"{prefix}_std_{w}"]  = s.rolling(w, min_periods=1).std()
    # long-term expanding mean (all history)
    out[f"{prefix}_mean_exp"] = s.expanding(min_periods=1).mean()
    return out

# ---- 1) Driver rolling history on delta_s
windows = [3, 5, 10]

drv_hist = df.groupby("driverId", group_keys=False).apply(
    lambda g: rolling_feat(g, src="delta_s", windows=windows, prefix="drv_delta")
)
df = pd.concat([df, drv_hist], axis=1)

# Driver experience = how many prior starts
df["drv_starts"] = (
    df.groupby("driverId").cumcount()
)

# ---- 2) Constructor rolling history on delta_s (needs constructorId in table)
if "constructorId" in df.columns:
    team_hist = df.groupby("constructorId", group_keys=False).apply(
        lambda g: rolling_feat(g, src="delta_s", windows=windows, prefix="team_delta")
    )
    df = pd.concat([df, team_hist], axis=1)

    # driver vs team form: driver's expanding mean minus team's expanding mean (both lagged)
    df["drv_delta_mean_exp_shift1"]  = df.groupby("driverId")["delta_s"].shift(1).expanding().mean().reset_index(level=0, drop=True)
    df["team_delta_mean_exp_shift1"] = df.groupby("constructorId")["delta_s"].shift(1).expanding().mean().reset_index(level=0, drop=True)
    df["drv_minus_team_form"] = df["drv_delta_mean_exp_shift1"] - df["team_delta_mean_exp_shift1"]
    df.drop(columns=["drv_delta_mean_exp_shift1","team_delta_mean_exp_shift1"], inplace=True, errors="ignore")
else:
    print("constructorId not found — skipping team features.")
    df["drv_minus_team_form"] = np.nan

# ---- 3) Track-specific (driver at the same circuit) history on delta_s
track_hist = df.groupby(["driverId","circuitId"], group_keys=False).apply(
    lambda g: rolling_feat(g, src="delta_s", windows=[3,5], prefix="drv_track_delta")
)
df = pd.concat([df, track_hist], axis=1)

# -------------------------------------------------------------------
# Optional: historical features from grid/qualifying (lagged only!)
# These are safe as history; we will *not* use current race's grid/qual directly as-is.
# -------------------------------------------------------------------
for col in [c for c in ["grid","qual_best_s"] if c in df.columns]:
    df[f"{col}_hist_mean_5"] = (
        df.groupby("driverId")[col]
          .shift(1)    # past only
          .rolling(5, min_periods=1).mean()
    )

# -------------------------------------------------------------------
# Save enriched dataset (useful to inspect / iterate)
# -------------------------------------------------------------------
df.to_csv(DATA_OUT, index=False)
print(f"Saved historical feature table -> {DATA_OUT}")
print("Rows:", len(df), "| Cols:", df.shape[1])

# -------------------------------------------------------------------
# Train a clean model:
# - Drop target leak columns from X (any lap-time columns for current race)
# - Keep historical (lagged) columns we just built
# - Simple RF baseline to verify signal emerges (>0 R² ideally)
# -------------------------------------------------------------------
drop_now = [
    # columns that are direct outcomes of the *current* race:
    "bestLaps_s", "bestLap_ms", "f1_rank", "fl_avg_speed_kph", "finish_pos", "fl_rank",
    # identifiers we never want to predict on
    "raceId",
    # obvious text cols if present
    "drivers_name", "gp_name", "circuit_name", "date"
]

X = df.drop(columns=[c for c in drop_now if c in df.columns], errors="ignore")
y = df["delta_s"].astype(float)

# Choose categorical vs numeric
cat_cols = []
if "country" in X.columns:
    cat_cols.append("country")

# safe numerics = all numeric except the target
num_cols = [c for c in X.columns if c != "delta_s" and pd.api.types.is_numeric_dtype(X[c])]

# Remove any remaining columns that equal y (just in case)
num_cols = [c for c in num_cols if c != "delta_s"]

# Build preprocessing
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)

# Split (simple holdout; later we can do GroupKFold per race)
X_model = X[cat_cols + num_cols].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.2, random_state=42
)

rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[("pre", preprocess), ("rf", rf)])
pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
r2   = r2_score(y_test, pred)
mae  = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print("\n=== Historical Model (no leakage) ===")
print(f"R²   : {r2:0.3f}")
print(f"MAE  : {mae:0.3f} s (gap to fastest)")
print(f"RMSE : {rmse:0.3f} s (gap to fastest)")

MODEL_OUT.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, MODEL_OUT)
print(f"\n✅ Saved model -> {MODEL_OUT}")


Reading: f:\Personal Projects\F1-FastestLap-Predictor\outputs\f1_features_weather_enhanced.csv


  drv_hist = df.groupby("driverId", group_keys=False).apply(
  team_hist = df.groupby("constructorId", group_keys=False).apply(
  track_hist = df.groupby(["driverId","circuitId"], group_keys=False).apply(


Saved historical feature table -> f:\Personal Projects\F1-FastestLap-Predictor\outputs\f1_features_history_v1.csv
Rows: 11041 | Cols: 58

=== Historical Model (no leakage) ===
R²   : 0.006
MAE  : 1.945 s (gap to fastest)
RMSE : 9.349 s (gap to fastest)

✅ Saved model -> f:\Personal Projects\F1-FastestLap-Predictor\models\rf_model_delta_hist_v1.joblib


In [6]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path.cwd()
if not (ROOT / "outputs").exists():
    ROOT = ROOT.parent
# 1) pick input/output paths
IN_PATH  = ROOT / "outputs" / "f1_features_history_v1.csv"   # v1 file we already made
OUT_PATH = ROOT / "outputs" / "f1_features_history_v2_temp.csv"  # temp file for this step

# 2) load
df = pd.read_csv(IN_PATH)

# 3) ensure key columns are numeric where needed (helps sort)
for c in ["year", "round", "raceId", "driverId", "constructorId"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="ignore")

# 4) sort into season order (stable)
df = df.sort_values(
    by=["year", "round", "raceId", "driverId"],
    kind="mergesort"
).reset_index(drop=True)

# quick sanity check
print(df[["year","round","raceId","driverId"]].head(8))
print("rows:", len(df), "cols:", len(df.columns))



WINS = (3, 5, 10)

def add_drv_qual_rollups(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Create drv_qual_mean_{w} and drv_qual_std_{w} using past (shifted) qual_best_s
    within each (driverId, year) group.
    """
    if "qual_best_s" not in frame.columns:
        print("WARN: 'qual_best_s' missing; skipping driver qualifying rollups")
        return frame

    # 0) ensure sort is correct inside each group (idempotent if you ran Step 1)
    frame = frame.sort_values(["year","round","raceId","driverId"], kind="mergesort")

    # 1) compute past-only values per (driverId, year)
    past = frame.groupby(["driverId", "year"], sort=False)["qual_best_s"].shift(1)

    # 2) rolling stats per (driverId, year)
    #    We re-group 'past' to keep windows inside each driver-year block.
    g = past.groupby([frame["driverId"], frame["year"]])

    for w in WINS:
        frame[f"drv_qual_mean_{w}"] = g.apply(lambda s: s.rolling(w, min_periods=1).mean()).reset_index(level=[0,1], drop=True)
        frame[f"drv_qual_std_{w}"]  = g.apply(lambda s: s.rolling(w, min_periods=1).std()).reset_index(level=[0,1], drop=True)

    # Optional gentle fill for early-rows std (single prior → NaN). Comment out if you’d rather leave NaNs.
    # std_cols = [f"drv_qual_std_{w}" for w in WINS]
    # frame[std_cols] = frame[std_cols].fillna(0.0)

    return frame

df = add_drv_qual_rollups(df)

# peek: a few rows to confirm values evolve only from *previous* races
cols_to_show = (
    ["year","round","driverId","qual_best_s"]
    + [f"drv_qual_mean_{w}" for w in WINS]
    + [f"drv_qual_std_{w}" for w in WINS]
)
print(df[cols_to_show].head(12))

# (mini-checkpoint) save temp so we don’t lose progress
df.to_csv(OUT_PATH, index=False)
print("saved temp to:", OUT_PATH)

d0 = int(df["driverId"].iloc[0])   # pick any driver id that exists
y0 = int(df["year"].min())         # earliest season in your data
cols = ["year","round","qual_best_s","drv_qual_mean_3","drv_qual_std_3",
        "drv_qual_mean_5","drv_qual_mean_10"]
print(df[(df.driverId==d0) & (df.year==y0)][cols].head(8))



  df[c] = pd.to_numeric(df[c], errors="ignore")


   year  round  raceId  driverId
0  1996      1     224        14
1  1996      1     224        21
2  1996      1     224        22
3  1996      1     224        30
4  1996      1     224        35
5  1996      1     224        44
6  1996      1     224        49
7  1996      1     224        50
rows: 11041 cols: 58
    year  round  driverId  qual_best_s  drv_qual_mean_3  drv_qual_mean_5  \
0   1996      1        14       95.351              NaN              NaN   
1   1996      1        21       95.898              NaN              NaN   
2   1996      1        22       94.474              NaN              NaN   
3   1996      1        30       93.125              NaN              NaN   
4   1996      1        35       92.371              NaN              NaN   
5   1996      1        44       95.330              NaN              NaN   
6   1996      1        49       94.494              NaN              NaN   
7   1996      1        50       95.338              NaN              NaN  

In [7]:
import numpy as np
import pandas as pd

# -----------------------------
# Configurable knobs (you can tune later)
# -----------------------------
SEASON_DECAY       = 0.95   # mild decay at any season boundary
REGULATION_DECAY   = 0.80   # extra decay in big-reg years
TEAM_SWITCH_DECAY  = 0.30   # keep 30% of prior team form when driver changes team
REGULATION_YEARS   = {2014, 2017, 2022}  # major resets; extend if desired

# -----------------------------
# 3.0 Order rows strictly (WHY: history walks row order)
# -----------------------------
df = df.sort_values(["year","round","raceId","driverId"], kind="mergesort").reset_index(drop=True)

# -----------------------------
# 3.1 Normalize quali pace by weekend (delta_to_pole)
# -----------------------------
pole_best = df.groupby(["year","round"])["qual_best_s"].transform("min")     # aligned column
df["delta_to_pole"] = df["qual_best_s"] - pole_best                           # 0 for pole sitter

# -----------------------------
# 3.2 Helper: per-row decay multiplier at season boundaries (for carry-over)
# -----------------------------
def boundary_decay_per_row(year_series: pd.Series) -> pd.Series:
    """
    Returns a multiplicative decay vector aligned to the input series:
    - 1.0 within a season
    - *= SEASON_DECAY at a season rollover
    - *= REGULATION_DECAY extra if new season is a regulation year
    """
    y = year_series.to_numpy()
    b = np.ones_like(y, dtype=float)

    # boundary iff current year != previous row's year (within the grouping we'll apply on)
    boundary = np.zeros_like(y, dtype=bool)
    boundary[1:] = (y[1:] != y[:-1])

    # base season decay
    b[boundary] *= SEASON_DECAY

    # extra reg decay when ENTERING a big-reg year
    reg_boundary = np.zeros_like(y, dtype=bool)
    reg_boundary[1:] = boundary[1:] & np.isin(y[1:], list(REGULATION_YEARS))
    b[reg_boundary] *= REGULATION_DECAY

    return pd.Series(b, index=year_series.index)

# ==========================================================
# DRIVER FORM (carry across seasons; mild boundary decay; no team-switch penalty)
# ==========================================================
# 3.3 Shift BEFORE rolling (no-leak rule)
drv_past = df.groupby("driverId", sort=False)["delta_to_pole"].shift(1)

# 3.4 Season-boundary decay per driver (applied to past values)
drv_decay = df.groupby("driverId", sort=False)["year"].transform(boundary_decay_per_row).astype(float)
drv_past_decayed = drv_past * drv_decay

# 3.5 Rolling stats across seasons (group by driver only)
g_drv = drv_past_decayed.groupby(df["driverId"])

for w in (3, 5, 10):
    df[f"drv_d2p_mean_{w}"] = g_drv.transform(lambda s: s.rolling(w, min_periods=1).mean())
    df[f"drv_d2p_std_{w}"]  = g_drv.transform(lambda s: s.rolling(w, min_periods=2).std())

# EWMA (smooth momentum)
df["drv_d2p_ewm_h3"] = g_drv.transform(lambda s: s.ewm(halflife=3, adjust=False).mean())

# ==========================================================
# TEAM FORM (carry across seasons; boundary decay; STRONG penalty on team switches)
# ==========================================================
# 3.6 Detect team changes per driver (True when constructorId != previous race)
df["driver_team_changed"] = (
    df.groupby("driverId")["constructorId"].transform(lambda s: s != s.shift(1))
)

# 3.7 Shift team pace BEFORE rolling (no leak); carry by constructorId across seasons
team_past = df.groupby("constructorId", sort=False)["delta_to_pole"].shift(1)

# 3.8 Base season/reg decay per constructor
team_decay_base = df.groupby("constructorId", sort=False)["year"].transform(boundary_decay_per_row).astype(float)

# 3.9 Apply extra decay when a driver switches teams (the *team* history becomes less relevant to this row)
# Note: we multiply a row-wise factor; where switch happened, shrink the contribution of previous team form.
team_decay = team_decay_base.copy()
team_decay[df["driver_team_changed"]] *= TEAM_SWITCH_DECAY

team_past_decayed = team_past * team_decay
g_team = team_past_decayed.groupby(df["constructorId"])

for w in (3, 5, 10):
    df[f"team_d2p_mean_{w}"] = g_team.transform(lambda s: s.rolling(w, min_periods=1).mean())
    df[f"team_d2p_std_{w}"]  = g_team.transform(lambda s: s.rolling(w, min_periods=2).std())

df["team_d2p_ewm_h3"] = g_team.transform(lambda s: s.ewm(halflife=3, adjust=False).mean())

# -----------------------------
# 3.10 Quick peek
# -----------------------------
cols = [
    "year","round","driverId","constructorId","qual_best_s","delta_to_pole",
    "driver_team_changed",
    "drv_d2p_mean_3","drv_d2p_ewm_h3","team_d2p_mean_3","team_d2p_ewm_h3"
]
print(df[cols].head(12))


    year  round  driverId  constructorId  qual_best_s  delta_to_pole  \
0   1996      1        14              1       95.351          2.980   
1   1996      1        21             18       95.898          3.527   
2   1996      1        22             17       94.474          2.103   
3   1996      1        30              6       93.125          0.754   
4   1996      1        35              3       92.371          0.000   
5   1996      1        44             27       95.330          2.959   
6   1996      1        49             15       94.494          2.123   
7   1996      1        50             29       95.338          2.967   
8   1996      1        55             22       94.257          1.886   
9   1996      1        56              6       92.889          0.518   
10  1996      1        57              1       94.054          1.683   
11  1996      1        63             25       94.832          2.461   

    driver_team_changed  drv_d2p_mean_3  drv_d2p_ewm_h3  team_d