Ce notebook est consacré au preprocessing du dataset et à l'entraînement d'un premier modèle simple afin d'établir une baseline.

---

# Load

In [1]:
# imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide configuration.

# Location of the season files and the seasons to load
# (the range end is exclusive, so this covers 2019 through 2022).
years = range(2019, 2023)
base_path = 'data'
file_name = 'season.csv'

# Seed handed to the train/test split for reproducibility.
random_state = 42

# Let pandas render wide frames without truncating columns or wrapping early.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Load the raw data for the configured seasons through the project's loader.
# NOTE(review): how `base_path` and `file_name` are combined for each year is
# decided inside load_from_csv, which is not visible from this notebook.
from f1pitpred.loading import load_from_csv

data = load_from_csv(years, base_path, file_name)

# Preprocessing

In [3]:
# Preprocess a copy of the loaded frame so `data` itself is a separate object
# from the one handed to the preprocessing helper.
from f1pitpred.preprocessing import get_preprocessed_train_test_split
df = data.copy()
# Split and preprocess in a single call.
# NOTE(review): the second positional argument (0.2) is presumably the test-set
# fraction — confirm against the helper's signature.
# `encoder` is pickled alongside the model at the end of the notebook;
# `train_groups` / `test_groups` are returned because return_groups=True.
train_df, test_df, encoder, train_groups, test_groups = get_preprocessed_train_test_split(
    df, 
    0.2, 
    return_groups=True, 
    random_state=random_state
)

# Split each frame into the (X, y) pair used for fitting and evaluation,
# then print the four shapes as a quick sanity check.
from f1pitpred.preprocessing import get_x_y
X_train, y_train = get_x_y(train_df)
X_test, y_test = get_x_y(test_df)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(55617, 47) (13820, 47) (55617,) (13820,)


# Modélisation

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
def specificity(y_true, y_pred):
    """Return the true-negative rate, TN / (TN + FP).

    The four counts are read from the flattened confusion matrix, so the
    unpacking only succeeds when that matrix is 2x2 (two classes in total
    across ``y_true`` and ``y_pred``).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    Ratio of true negatives to all actual negatives.
    """
    true_neg, false_pos, _, _ = confusion_matrix(y_true, y_pred).ravel()
    actual_negatives = true_neg + false_pos
    return true_neg / actual_negatives

def balanced_accuracy(y_true, y_pred):
    """Return the mean of the true-negative rate and the true-positive rate.

    The four counts are read from the flattened confusion matrix, so the
    unpacking only succeeds when that matrix is 2x2 (two classes in total
    across ``y_true`` and ``y_pred``).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    (TN / (TN + FP) + TP / (TP + FN)) / 2
    """
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()
    # Local names deliberately differ from the module-level `specificity` function.
    true_negative_rate = true_neg / (true_neg + false_pos)
    true_positive_rate = true_pos / (true_pos + false_neg)
    return (true_negative_rate + true_positive_rate) / 2

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score

# Hyper-parameter grid: 4 * 3 * 2 * 3 * 1 = 72 candidate forests.
param_grid = {
    'n_estimators': [100, 500, 1000, 2000],
    'max_depth': [5, 20, None],
    'class_weight': ['balanced_subsample', 'balanced'],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['entropy']
}

# Base estimator; the fixed seed keeps repeated searches comparable.
# (This seed is 0, not the notebook-level `random_state`.)
model = RandomForestClassifier(random_state=0)

# Rank candidates with the balanced_accuracy helper on hard class predictions.
# `needs_proba` is intentionally not passed: the keyword was deprecated in
# scikit-learn 1.4 and later removed, and scoring on `predict` output is
# already the default behaviour of make_scorer.
scorer = make_scorer(
    balanced_accuracy,
    greater_is_better=True
)

# 4-fold cross-validated exhaustive search, parallelised over all cores.
# NOTE(review): cv=4 does not make use of `train_groups` returned by the split.
# If rows of one group must stay in the same fold, a group-aware splitter
# (with `groups=` passed to fit) would be needed — confirm intent.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=4,
    scoring=scorer,
    verbose=10,
    n_jobs=-1
)

In [9]:
# Run the search. The features are passed as a plain array (`.values`), so
# later predict calls on the fitted estimator should receive `.values` as well.
grid_search.fit(X_train.values, y_train)

Fitting 4 folds for each of 72 candidates, totalling 288 fits


In [10]:
# Best mean cross-validated score, then the parameter set that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

0.13001121179827513
{'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 2000}


In [11]:
# Show the prediction shape next to the target shape. Both go into one tuple
# because a cell only displays its last expression; on separate lines the
# first shape would be computed and thrown away.
# `.values` matches how the estimator was fitted (on a plain array).
grid_search.best_estimator_.predict(X_test.values).shape, y_test.shape



(14276,)

In [12]:
from sklearn.metrics import classification_report
# Evaluate the refitted best estimator on the held-out test split:
# confusion matrix first, then the per-class precision/recall/F1 report.
ypred = grid_search.best_estimator_.predict(X_test.values)
print(
    confusion_matrix(y_test, ypred),
    classification_report(y_test, ypred),
    sep='\n',
)

[[10470  3422]
 [  103   281]]
              precision    recall  f1-score   support

       False       0.99      0.75      0.86     13892
        True       0.08      0.73      0.14       384

    accuracy                           0.75     14276
   macro avg       0.53      0.74      0.50     14276
weighted avg       0.97      0.75      0.84     14276



In [13]:
# Rebind `model` (previously the unfitted base estimator handed to the search)
# to the best estimator found by the search; this is the object that is saved.
model = grid_search.best_estimator_

In [14]:
# Save model
import pickle
# Persist the selected model and the preprocessing encoder side by side.
models_dir = 'models/balanced_accuracy_2'
os.makedirs(models_dir, exist_ok=True)

# Use context managers so each file is flushed and closed as soon as it is
# written, instead of leaving the handle open until garbage collection.
with open(models_dir + '/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open(models_dir + '/encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [15]:
# Flatten the search results into one row per candidate, indexed by rank.
cv_results = grid_search.cv_results_
ranks, params, scores = (
    cv_results[key] for key in ('rank_test_score', 'params', 'mean_test_score')
)

# One column per tuned hyper-parameter, in the order they should be displayed.
param_names = ('class_weight', 'criterion', 'max_depth', 'max_features', 'n_estimators')
columns = {'rank': ranks}
columns.update({
    name: [candidate[name] for candidate in params]
    for name in param_names
})
columns['score'] = scores

# Drop candidates with no score, then order from best to worst.
results = (
    pd.DataFrame(columns)
    .set_index('rank')
    .dropna(subset=['score'])
    .sort_values(by='score', ascending=False)
)

In [16]:
# Lift the row limit so the full ranking is rendered.
# This changes the pandas option for the rest of the session.
pd.options.display.max_rows = None
results

Unnamed: 0_level_0,class_weight,criterion,max_depth,max_features,n_estimators,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,balanced_subsample,entropy,5.0,sqrt,2000,0.130011
2,balanced,entropy,5.0,sqrt,500,0.129973
3,balanced_subsample,entropy,5.0,sqrt,500,0.129856
4,balanced_subsample,entropy,5.0,log2,2000,0.129681
5,balanced,entropy,5.0,log2,2000,0.129667
6,balanced,entropy,5.0,sqrt,2000,0.129547
7,balanced_subsample,entropy,5.0,sqrt,1000,0.129496
8,balanced,entropy,5.0,sqrt,1000,0.129489
9,balanced_subsample,entropy,5.0,log2,1000,0.12894
10,balanced_subsample,entropy,5.0,log2,100,0.128932
