In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ---------------------
# 1. Data Cleaning
# ---------------------
# Read the winners table and parse the "date" column in one pass;
# values that cannot be parsed as dates become NaT instead of raising.
data = pd.read_csv("winners_f1_1950_2025_v2.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"], errors="coerce")
)

# Convert race time (HH:MM:SS, optionally with fractional seconds) to total seconds
def time_to_seconds(t):
    """Convert an ``HH:MM:SS`` race-time string to a number of seconds.

    Args:
        t: Time string with colon-separated hours, minutes and seconds.
            The seconds field may carry a fractional part ("1:32:14.567").
            Any fields after the third are ignored.

    Returns:
        The total seconds as an ``int`` when the seconds field is whole,
        as a ``float`` when it is fractional, or ``np.nan`` when ``t`` is
        not a string, has fewer than three fields, or has a field that is
        not a finite number.
    """
    try:
        parts = t.split(":")
        hours = int(parts[0])
        minutes = int(parts[1])
        # float() rather than int() so millisecond timings are not discarded.
        seconds = float(parts[2])
    except (AttributeError, TypeError, ValueError, IndexError):
        # Non-string input (None, NaN), too few fields, or non-numeric text.
        return np.nan

    # float() also accepts "nan"/"inf"; those are not valid timings.
    if not np.isfinite(seconds):
        return np.nan

    total = hours * 3600 + minutes * 60 + seconds
    # Keep integer results for whole-second inputs, as before.
    return int(total) if seconds.is_integer() else total

# Derive a numeric duration column by converting each raw "time" entry
data["time_seconds"] = data["time"].map(time_to_seconds)

# ---------------------
# 2. Preprocessing Pipelines
# ---------------------
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array (unknown categories are ignored at transform time).
cat_features = ["continent", "grand_prix", "circuit", "winner_name", "team"]
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Numeric columns: fill gaps with the column mean, then standardize.
# NOTE(review): "time_seconds" is also the column used to build y in
# section 3, so the target is present among the inputs — confirm intended.
num_features = ["laps", "year", "time_seconds"]
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Input (X) preprocessor: numeric block first, then categorical block,
# emitting a pandas DataFrame rather than a NumPy array.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]).set_output(transform="pandas")

# Fit on the full dataset and build the input feature matrix.
# NOTE(review): fitting happens before the train/test split in section 5.
X = preprocessor.fit_transform(data)

# ---------------------
# 3. Output (y)
# ---------------------
# The target is "time_seconds" passed through the same numeric pipeline
# (mean imputation + standardization), returned as a one-column DataFrame.
# NOTE(review): rows with a missing target are mean-filled rather than
# dropped — confirm this is the intended handling.
preprocessor_out = ColumnTransformer([
    ("num", num_transformer, ["time_seconds"]),
]).set_output(transform="pandas")

y = preprocessor_out.fit_transform(data)

# ---------------------
# 4. Feature Engineering
# ---------------------
# Average seconds per lap. Division by zero laps yields +/-inf; treat that
# as missing so it is handled the same way as an unparsed time.
# NOTE(review): this feature is computed from "time_seconds", the column
# y is built from — confirm it belongs in X.
data["time_per_lap"] = (data["time_seconds"] / data["laps"]).replace(
    [np.inf, -np.inf], np.nan
)
# X was imputed before this column existed, so fill the gaps here as well
# (column mean, matching the numeric imputer's strategy) to keep X free of
# NaN/inf values.
X["time_per_lap"] = data["time_per_lap"].fillna(data["time_per_lap"].mean())

# ---------------------
# 5. Train-Test Split
# ---------------------
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the first rows of the training inputs, then of the training target
print(X_train.head(), y_train.head(), sep="\n")


      num__laps  num__year  num__time_seconds  cat__continent_Africa  \
12     1.265567  -2.015220           2.073543                    0.0   
758    0.068939   0.623813          -0.731568                    0.0   
636    0.667253   0.287937           0.124062                    0.0   
1108  -0.429656   1.487497          -0.195414                    0.0   
743    0.268377   0.575831          -0.546284                    0.0   

      cat__continent_Asia  cat__continent_Europe  \
12                    0.0                    1.0   
758                   0.0                    1.0   
636                   0.0                    1.0   
1108                  1.0                    0.0   
743                   0.0                    1.0   

      cat__continent_North America  cat__continent_Oceania  \
12                             0.0                     0.0   
758                            0.0                     0.0   
636                            0.0                     0.0   
1108  