Ce notebook est consacré au preprocessing du dataset et à l'entraînement d'un premier modèle simple afin d'établir une baseline.

---

# Load

In [2]:
# imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# pandas display options: raise the column and width limits so wide frames
# are shown in full instead of being truncated
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Single seed reused by the train/test split and by the random forest below
random_state = 42

# Arguments handed to load_from_csv in the loading cell
# NOTE(review): presumably `file_name` is looked up per season under `base_path` — confirm in f1pitpred.loading
base_path = 'data'
file_name = 'season.csv'
years = range(2019, 2023)  # 2019 through 2022 inclusive (range end is exclusive)

In [3]:
# Load the raw data for the configured seasons.
# NOTE(review): load_from_csv is project code not visible here — presumably it
# reads `file_name` for each entry of `years` under `base_path`; confirm.
from f1pitpred.loading import load_from_csv

data = load_from_csv(years, base_path, file_name)

# Preprocessing

In [4]:
# Both preprocessing helpers live in the project package.
from f1pitpred.preprocessing import (
    get_preprocessed_train_test_split,
    get_x_y_pit,
)

# Hand a copy to the preprocessing step so the loaded `data` frame is not
# the object being passed around.
df = data.copy()

# NOTE(review): 0.2 is passed positionally — presumably the test fraction
# (the recorded output shows roughly 20% of rows in the test set); confirm
# against the helper's signature.
train_df, test_df, encoder, train_groups, test_groups = get_preprocessed_train_test_split(
    df,
    0.2,
    return_groups=True,
    random_state=random_state,
)

# Separate features from the target for each partition.
X_train, y_train = get_x_y_pit(train_df)
X_test, y_test = get_x_y_pit(test_df)

# Quick sanity check on the resulting shapes.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(55617, 48) (13820, 48) (55617,) (13820,)


# Modélisation

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
def specificity(y_true, y_pred):
    """Return the true-negative rate, TN / (TN + FP), of a binary prediction.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned with ``y_true``.

    Returns:
        The fraction of actual negatives that were predicted negative.

    Raises:
        ValueError: If the confusion matrix is not 2x2 (the four-way
            unpacking below fails), e.g. when only one class is present.
    """
    # For a binary problem the flattened matrix is ordered TN, FP, FN, TP.
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true, y_pred).ravel()
    return true_neg / (true_neg + false_pos)

def balanced_accuracy(y_true, y_pred):
    """Return the mean of specificity and sensitivity for a binary prediction.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned with ``y_true``.

    Returns:
        ``(TN / (TN + FP) + TP / (TP + FN)) / 2``.

    Raises:
        ValueError: If the confusion matrix is not 2x2 (the four-way
            unpacking below fails), e.g. when only one class is present.
    """
    # For a binary problem the flattened matrix is ordered TN, FP, FN, TP.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()
    true_negative_rate = true_neg / (true_neg + false_pos)
    true_positive_rate = true_pos / (true_pos + false_neg)
    return (true_negative_rate + true_positive_rate) / 2

In [6]:
# Baseline random forest: a large ensemble of shallow trees.
# 'balanced_subsample' reweights the classes within each bootstrap sample,
# and n_jobs=-1 trains on all available cores.
forest_params = {
    'n_estimators': 2000,
    'max_depth': 5,
    'max_features': 'sqrt',
    'criterion': 'entropy',
    'class_weight': 'balanced_subsample',
    'random_state': random_state,
    'n_jobs': -1,
}
model = RandomForestClassifier(**forest_params)

# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)

In [7]:
# Score the held-out set with the fitted baseline.
y_pred = model.predict(X_test)

# Compute both custom metrics, then report them.
balanced_acc_test = balanced_accuracy(y_test, y_pred)
specificity_test = specificity(y_test, y_pred)

print("Balanced accuracy : ", balanced_acc_test)
print("Specificity : ", specificity_test)

Balanced accuracy :  0.7299276600020095
Specificity :  0.7517472118959108


In [8]:
from sklearn.metrics import classification_report
# Raw counts first (rows: true class, columns: predicted class) ...
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

# ... then the per-class precision / recall / F1 summary.
test_report = classification_report(y_test, y_pred)
print(test_report)

[[10111  3339]
 [  108   262]]
              precision    recall  f1-score   support

       False       0.99      0.75      0.85     13450
        True       0.07      0.71      0.13       370

    accuracy                           0.75     13820
   macro avg       0.53      0.73      0.49     13820
weighted avg       0.96      0.75      0.84     13820



In [9]:
# Save model
# Persist the fitted classifier together with the encoder returned by the
# preprocessing step, so both can be reloaded as a pair.
import pickle

models_dir = 'models/f1_score_new'
os.makedirs(models_dir, exist_ok=True)

# Context managers guarantee each file is flushed and closed, even if
# pickling fails part-way; the bare open(...) handles were never closed.
# NOTE: only unpickle these files from a trusted location — pickle.load can
# execute arbitrary code.
with open(os.path.join(models_dir, 'model.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file)
with open(os.path.join(models_dir, 'encoder.pkl'), 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)