Ce notebook est consacré au preprocessing du dataset et à l'entraînement d'un premier modèle simple pour établir une baseline.

---

# Load

In [1]:
# imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Widen pandas' rendering limits so wide feature frames are shown without
# column truncation.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Single seed reused by the train/test split and by the model.
random_state = 42

# Arguments handed to the project loader: directory, file name and seasons.
# The range end is exclusive, so this covers 2019 through 2022.
base_path, file_name = 'data', 'season.csv'
years = range(2019, 2023)

In [2]:
# Load the raw data through the project loader, passing the seasons, directory
# and file name configured in the first cell.
# NOTE(review): the on-disk layout the loader expects (e.g. one `season.csv`
# per year under `data/`) is not visible here -- confirm in f1pitpred.loading.
from f1pitpred.loading import load_from_csv

data = load_from_csv(years, base_path, file_name)

# Preprocessing

In [4]:
# NOTE(review): this cell was executed as In [4] but sits above the In [3]
# cell that assigns X_test, so on a fresh "Restart & Run All" it would raise
# NameError. Move it below the preprocessing cell (and consider X_test.head()
# to keep the rendered output small).
X_test

Unnamed: 0,LapNumber,LapTime,TyreLife,Stint,DistanceToDriverAhead,Position,GapToLeader,IntervalToPositionAhead,LapsToLeader,TotalLaps,Compound_HARD,Compound_MEDIUM,Compound_SOFT,Track_Austin,Track_Baku,Track_Barcelona,Track_Budapest,Track_Imola,Track_Jeddah,Track_Le_Castellet,Track_Lusail,Track_Melbourne,Track_Mexico_City,Track_Miami,Track_Monaco,Track_Monte_Carlo,Track_Montréal,Track_Monza,Track_Nürburgring,Track_Portimão,Track_Sakhir,Track_Shanghai,Track_Silverstone,Track_Singapore,Track_Sochi,Track_Spa-Francorchamps,Track_Spielberg,Track_Suzuka,Track_São_Paulo,Track_Yas_Island,Track_Zandvoort,Green,Yellow,SC,Red,VSC,SC_ending
0,2,89.759003,5,1,100.351944,2,1.242000,1.242,0,58,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False
1,3,89.332001,6,1,116.560837,2,1.344000,1.344,0,58,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False
2,4,88.804001,7,1,127.691391,2,1.451000,1.451,0,58,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False
3,5,88.914001,8,1,137.242218,2,1.580000,1.580,0,58,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False
4,6,89.474998,9,1,154.618607,2,1.987000,1.987,0,58,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13815,57,91.113998,38,2,341.771118,9,82.209000,6.811,0,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,False,False,False,False,False
13816,57,90.742996,27,2,483.675568,15,-1.000000,9.154,1,58,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,False,False,False,False,False
13817,58,90.021004,17,3,1607.539429,7,57.240002,1.006,0,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,False,False,False,False,False
13818,58,91.330002,19,3,989.131653,8,76.931000,19.691,0,58,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,False,False,False,False,False


In [3]:
from f1pitpred.preprocessing import (
    get_preprocessed_train_test_split,
    get_x_y_pit,
)

# Preprocess a copy so the frame loaded into `data` is left as-is.
df = data.copy()

# Seeded split. 0.2 appears to be the test fraction (the printed shapes put
# 13820 of 69437 rows in the test split); return_groups=True makes the helper
# also return the train/test groups next to the frames and the encoder.
(
    train_df,
    test_df,
    encoder,
    train_groups,
    test_groups,
) = get_preprocessed_train_test_split(df, 0.2, return_groups=True, random_state=random_state)

# Separate features from the target for each split (the pit label, judging by
# the helper's name).
(X_train, y_train), (X_test, y_test) = (
    get_x_y_pit(part) for part in (train_df, test_df)
)

# Sanity check: feature/target shapes, printed space-separated on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(55617, 47) (13820, 47) (55617,) (13820,)


# Modélisation

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
def specificity(y_true, y_pred):
    """Return the true-negative rate ``TN / (TN + FP)`` of a binary prediction.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        Share of actual negatives that were predicted negative. The negative
        class is the one occupying the first row of the confusion matrix.

    Raises
    ------
    ValueError
        If the labels found in ``y_true``/``y_pred`` do not produce a 2x2
        confusion matrix (only one class present, or more than two classes).
    """
    cm = confusion_matrix(y_true, y_pred)
    # Fail with an explicit message rather than an opaque tuple-unpacking
    # error when the inputs are not a two-class problem.
    if cm.shape != (2, 2):
        raise ValueError(
            "specificity expects binary labels (2x2 confusion matrix), "
            "got a matrix of shape {}".format(cm.shape)
        )
    # First row of the binary matrix holds (tn, fp); the second row is unused.
    tn, fp = cm[0]
    return tn / (tn + fp)

def balanced_accuracy(y_true, y_pred):
    """Return the mean of specificity and sensitivity for a binary prediction.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        ``(TN / (TN + FP) + TP / (TP + FN)) / 2``.

    Raises
    ------
    ValueError
        If the labels found in ``y_true``/``y_pred`` do not produce a 2x2
        confusion matrix (only one class present, or more than two classes).
    """
    cm = confusion_matrix(y_true, y_pred)
    # Fail with an explicit message rather than an opaque tuple-unpacking
    # error when the inputs are not a two-class problem.
    if cm.shape != (2, 2):
        raise ValueError(
            "balanced_accuracy expects binary labels (2x2 confusion matrix), "
            "got a matrix of shape {}".format(cm.shape)
        )
    # Binary layout: row 0 is (tn, fp), row 1 is (fn, tp).
    (tn, fp), (fn, tp) = cm
    # Local names avoid shadowing the module-level `specificity` function.
    true_negative_rate = tn / (tn + fp)
    true_positive_rate = tp / (tp + fn)
    return (true_negative_rate + true_positive_rate) / 2

In [9]:
# Baseline classifier: a random forest of 2000 shallow trees (depth capped at
# 5) using the entropy split criterion. Class weighting is enabled because the
# positive class is rare (370 of 13820 rows in the test report further down).
# The shared seed keeps the fit reproducible; n_jobs=-1 requests parallel
# training (scikit-learn convention: use all cores).
model = RandomForestClassifier(
    n_estimators=2000,
    max_depth=5, 
    max_features='sqrt',
    criterion='entropy',
    class_weight='balanced_subsample',
    random_state=random_state,
    n_jobs=-1
)

# Train on the training split produced by the preprocessing cell.
model.fit(X_train, y_train)

In [10]:
# Predict labels for the held-out split, then score them with the two
# confusion-matrix based helpers defined earlier in this notebook.
y_pred = model.predict(X_test)

print("Balanced accuracy : ", balanced_accuracy(y_test, y_pred))
print("Specificity : ", specificity(y_test, y_pred))

Balanced accuracy :  0.7299316788907867
Specificity :  0.735539033457249


In [11]:
# Detailed breakdown on the held-out split: the raw confusion matrix, followed
# by per-class precision / recall / F1 and their averages.
from sklearn.metrics import classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[9893 3557]
 [ 102  268]]
              precision    recall  f1-score   support

       False       0.99      0.74      0.84     13450
        True       0.07      0.72      0.13       370

    accuracy                           0.74     13820
   macro avg       0.53      0.73      0.49     13820
weighted avg       0.97      0.74      0.82     13820



In [14]:
# Save model
import pickle

# Output directory for this experiment's artifacts; created if missing.
models_dir = 'models/balanced_accuracy_2'
os.makedirs(models_dir, exist_ok=True)

# Persist the trained classifier and the encoder returned by preprocessing.
# `with` ensures each file is flushed and closed even if pickling raises,
# instead of leaving the handle open until garbage collection.
with open(os.path.join(models_dir, 'model.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file)

with open(os.path.join(models_dir, 'encoder.pkl'), 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)