In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the winners table from the CSV file into a DataFrame
data = pd.read_csv("winners_f1_1950_2025_v2.csv")

# ---------------------------
# Preprocessing
# ---------------------------
# Column groups routed to the numeric / categorical branches below
num_features = ["laps", "year"]
cat_features = ["grand_prix", "circuit", "team"]

# Classification target is the "continent" column; the feature frame is
# everything except the target and the date / winner_name / time columns.
y = data["continent"]
X = data.drop(["continent", "date", "winner_name", "time"], axis=1)

# Numeric branch: standardise each numeric column
num_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical branch: dense one-hot encoding; categories not seen while
# fitting are ignored at transform time instead of raising an error
cat_transformer = Pipeline(
    [("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))]
)

# Apply each branch to its own column group. Only the columns named in
# num_features / cat_features are passed to a transformer here.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

# ---------------------------
# Models with preprocessing
# ---------------------------
# Wrap each estimator in a pipeline whose first step is the shared
# `preprocessor`: clf1 is the decision tree, clf2 the logistic regression.
clf1, clf2 = (
    Pipeline([("preprocessor", preprocessor), ("classifier", estimator)])
    for estimator in (
        DecisionTreeClassifier(random_state=42),
        LogisticRegression(max_iter=1000),
    )
)

# ---------------------------
# Cross-validation
# ---------------------------
# 10 shuffled folds with a fixed seed; the same splitter is used for both models
k_folds = KFold(n_splits=10, shuffle=True, random_state=42)

# Score the decision tree first, then the logistic regression
scores1, scores2 = (
    cross_val_score(model, X, y, cv=k_folds) for model in (clf1, clf2)
)

# Report per-fold scores, then the averages, then the number of folds scored
for caption, result in (
    ("Cross Validation Scores (DecisionTree):", scores1),
    ("Cross Validation Scores (LogisticRegression):", scores2),
    ("Average CV Score (DecisionTree):", np.mean(scores1)),
    ("Average CV Score (LogisticRegression):", np.mean(scores2)),
    ("Number of CV Scores used (DecisionTree):", scores1.size),
    ("Number of CV Scores used (LogisticRegression):", scores2.size),
):
    print(caption, result)

Cross Validation Scores (DecisionTree): [0.99130435 1.         0.99122807 0.98245614 0.99122807 0.99122807
 0.99122807 0.99122807 1.         1.        ]
Cross Validation Scores (LogisticRegression): [1.         0.99130435 1.         0.99122807 1.         1.
 1.         1.         1.         1.        ]
Average CV Score (DecisionTree): 0.9929900839054155
Average CV Score (LogisticRegression): 0.9982532418001526
Number of CV Scores used (DecisionTree): 10
Number of CV Scores used (LogisticRegression): 10
