# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the F1 winners CSV (the file name suggests seasons 1950-2025) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="winners_f1_1950_2025_v2.csv")

# Convert race time to seconds
def time_to_seconds(t):
    """Convert an ``"H:M:S"`` duration string to a total number of seconds.

    Parameters
    ----------
    t : str
        Colon-separated duration. Every field must be an integer; the first
        three are read as hours, minutes and seconds, and any further fields
        do not contribute to the result.

    Returns
    -------
    int or float
        ``hours * 3600 + minutes * 60 + seconds``, or ``np.nan`` when ``t``
        cannot be parsed: it has no ``split`` method (e.g. a missing cell),
        has fewer than three fields, or contains a non-integer field such as
        fractional seconds.
    """
    try:
        # Parse every field (so a malformed trailing field still invalidates
        # the value), then keep the first three; too few fields makes the
        # unpacking raise ValueError.
        hours, minutes, seconds = [int(part) for part in t.split(":")][:3]
    except (AttributeError, TypeError, ValueError):
        # Only genuine parse failures map to NaN; unrelated exceptions such
        # as KeyboardInterrupt are allowed to propagate.
        return np.nan
    return hours * 3600 + minutes * 60 + seconds

# Parse each "time" entry into seconds; entries that cannot be parsed become NaN.
data["time_seconds"] = data["time"].map(time_to_seconds)
# Keep only the rows whose race time was parsed successfully.
data = data[data["time_seconds"].notna()]

# ---------------------------
# Classification: predict "continent" from race attributes
# ---------------------------
# NOTE(review): "grand_prix" and "circuit" presumably pin down the continent
# almost directly, so the reported accuracy may mostly reflect a lookup —
# confirm this is the intended exercise.
y_class = data["continent"]
X_class = data[
    [
        "laps",
        "year",
        "grand_prix",
        "circuit",
        "team",
    ]
]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the three text columns; the remaining columns ("laps",
# "year") pass through unchanged. Categories not seen while fitting are
# ignored at predict time instead of raising.
preprocessor_class = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            ["grand_prix", "circuit", "team"],
        ),
    ],
    remainder="passthrough",
)

# Entropy-based decision tree on top of the encoded features.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor_class),
        ("classifier", DecisionTreeClassifier(criterion="entropy", random_state=42)),
    ]
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
yc_pred = clf.fit(Xc_train, yc_train).predict(Xc_test)
accuracy_dt = accuracy_score(y_true=yc_test, y_pred=yc_pred)
print("Decision Tree Accuracy:", accuracy_dt)

# ---------------------------
# Regression: predict "time_seconds" from race attributes
# ---------------------------
y_reg = data["time_seconds"]
X_reg = data[
    [
        "laps",
        "year",
        "grand_prix",
        "circuit",
        "team",
    ]
]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the three text columns; the remaining columns ("laps",
# "year") pass through unchanged. Categories not seen while fitting are
# ignored at predict time instead of raising.
preprocessor_reg = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            ["grand_prix", "circuit", "team"],
        ),
    ],
    remainder="passthrough",
)

# Ordinary least squares on top of the encoded features.
lrr = Pipeline(
    steps=[
        ("preprocessor", preprocessor_reg),
        ("regressor", LinearRegression()),
    ]
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
yr_pred = lrr.fit(Xr_train, yr_train).predict(Xr_test)

# MSE is in squared seconds; R2 is unitless.
mse = mean_squared_error(y_true=yr_test, y_pred=yr_pred)
print("Linear Regression MSE:", mse)
r2 = r2_score(y_true=yr_test, y_pred=yr_pred)
print("Linear Regression R2:", r2)

# Output:
# Decision Tree Accuracy: 0.9956140350877193
# Linear Regression MSE: 837874.4107846403
# Linear Regression R2: 0.711403112118626
