In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [None]:
df = pd.read_csv("dataset_f1_ml_ready.csv")
df["win"] = (df["positionOrder"] == 1).astype(int)

features = ["year","round","gp_name","forename","surname","team",
            "grid","n_pitstops","avg_pit_ms","is_rain"]
target = "win"

X = df[features].copy()
y = df[target].astype(int)

num_cols = ["year","round","grid","n_pitstops","avg_pit_ms"]
cat_cols = ["gp_name","forename","surname","team","is_rain"]


In [4]:
preproc = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
clf = RandomForestClassifier(
    n_estimators=600, max_depth=None, min_samples_leaf=2,
    random_state=42, n_jobs=-1
)

pipe = Pipeline([
    ("prep", preproc),
    ("rf", clf)
])

pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:,1]


Accuracy: 1.0
Report:
               precision    recall  f1-score   support

           0      1.000     1.000     1.000     69278
           1      1.000     0.999     0.999      3646

    accuracy                          1.000     72924
   macro avg      1.000     0.999     1.000     72924
weighted avg      1.000     1.000     1.000     72924

Confusion:
 [[69278     0]
 [    4  3642]]


In [None]:
joblib.dump(pipe, "model_winner_rf.pkl")

✅ Sauvegardé: model_winner_rf.pkl
