In [1]:
import pandas as pd
import numpy as np
import os 

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import src.features.feature_selection as fsel

In [2]:
input_path = os.path.join('..\data\datasets\csv_files','2016-2020-v2.csv')
df = pd.read_csv(input_path)

In [3]:
df.columns

Index(['period', 'period_type', 'period_time', 'game_seconds', 'gameID',
       'attacking_team_id', 'attacking_team_name', 'home_team', 'shooter',
       'goalie', 'shot_type', 'x_coordinate', 'y_coordinate', 'strength',
       'last_event_type', 'last_event_x', 'last_event_y',
       'time_since_last_event', 'distance_from_last_event',
       'powerplay_duration', 'home_team_players', 'away_team_players',
       'distance_to_net', 'shot_angle', 'is_goal', 'is_empty_net', 'rebound',
       'angle_change', 'speed'],
      dtype='object')

$$ \textbf{Traitement des valeurs manquantes dans la colonne 'strength':} $$

In [6]:
df = fsel.remove_nan_from_strength(df)

$$ \textbf{Encodage des caractéristiques : } $$

In [7]:
# Encodage des caractéristiques de type catégorielle :
# On utilise LabelEncoder() pour les variables pour lesquelles l'ordre n'est pas important

# Colonne pour lesquels l'ordre n'est pas important
categorical_columns_1 = ['period_type', 'attacking_team_name', 'shooter', 'goalie', 'rebound', 'last_event_type', 'home_team']

# Colonne pour laquelle l'ordre est important
# Sachant que certains types de tirs sont plus efficaces en moyenne que d'autre, on encode les 
# types de tirs les plus efficaces avec des valeurs élevées
# (Au Milestone 1, on a vu que les 'Tip-in' sont les plus efficaces et que les 'Wrap-around' sont les moins
# efficaces)

shot_type_classified = [['Wrap-around',0], ['Slap Shot', 1], ['Snap Shot', 2], ['Wrist Shot', 3], ['Backhand', 4], ['Deflected', 5], ['Tip-In',6]]

# La caractéristique 'strength' doit aussi étre encodée de manière ordinale, étant donné
# que lorsqu'une équipe est en 'Power Play', elle a plus de chances de marquer tandis que lorsqu'elle est
# 'Short handed', ses chances de marquer diminuent
strength_classified = [['Short Handed',0], ['Even', 1], ['Power Play', 2]]

In [8]:
df = df.dropna()
df = fsel.encode_categorical_features(df, categorical_columns_1, shot_type_classified, strength_classified)

$$ \textbf{Sélection des caractéristiques + Séparation des données (entrainement, validation, test):} $$

$$\textbf{Méthode de filtrage (K-best) } $$

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import recall_score, f1_score

In [10]:
def get_test_df(df: pd.DataFrame, test_year : int) :
    df = df.copy()
    # Ajout d'une colonne pour l'année
    df['year'] = df['gameID'].apply(lambda x : x//1000000)

    # Récupération du DataFrame de test
    test_df = df[df['year'] == test_year]
    test_df.drop(columns = 'year')

    # Récupération du DataFrame d'entrainement et validation
    train_val_df = df[df['year'] == test_year]
    train_val_df.drop(columns = 'year')

    return test_df, train_val_df

In [11]:
# Récupération des deux DataFrames
test_df, train_val_df = get_test_df(df, 2020)

In [12]:
X = train_val_df.drop(columns=['is_goal', 'period_time'])
Y = train_val_df['is_goal']

# On récupère le dataset avec les K-meilleures caractéristiques
X_Kbest, Kbest_features = fsel.get_features_KBest(X, Y, 10)

  f = msb / msw


Unbalanced dataset

In [15]:
X_Kbest.to_csv('../data/datasets/csv_files/Dataset_train.csv', index=False)

In [16]:
Y.to_csv('../data/datasets/csv_files/test_feature_train.csv', index = False)

In [17]:
# Mise à jour de notre ensemble de test
test_df = test_df[Kbest_features]

In [18]:
test_df.to_csv('../data/datasets/csv_files/Test_dataset.csv')

In [19]:
X_train, X_val, Y_train, Y_val = train_test_split(
    X_Kbest, Y, train_size = 0.8, random_state = 42 
)

$$ \textbf{Réequilibrage des données : }  $$ 

In [24]:
# 1ere approche : Oversampling en utilisant la méthode SMOTE 
X_train_over, Y_train_over = fsel.oversample_dataset(X_train, Y_train)

Execute the following two rows to get a unique dataset on which you will continue to work on for the rest of the experiences

In [95]:
X_train_over.to_csv('../data/datasets/csv_files/Oversampled_train_dataset.csv', index = False)

In [96]:
Y_train_over.to_csv('../data/datasets/csv_files/Oversampled_train_labels.csv', index = False)

In [17]:
# 2e approche : Utilisation de RandomUnderSampler pour réduire la taille de la classe majoritaire
from imblearn.under_sampling import RandomUnderSampler
# https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html#imblearn.under_sampling.RandomUnderSampler

In [18]:
rus = RandomUnderSampler(random_state = 42)
X_train_res, Y_train_res = rus.fit_resample(X_train, Y_train)

$$ \textbf{Entrainement d'un modèle RandomForest} $$ 

In [25]:
RandomForest_model = RandomForestClassifier(random_state = 42)

In [26]:
# Entrainement du modèle sur les données sur-échantillonées par SMOTE
RandomForest_model.fit(X_train_over,Y_train_over)

In [27]:
Y_pred = RandomForest_model.predict(X_val)

In [28]:
confusion_matrix(Y_val,Y_pred)

array([[9130,  611],
       [ 816,  129]], dtype=int64)

In [30]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))

Accuracy_score : 0.8664607898184541
Recall score : 0.17432432432432432
F1-score : 0.15311572700296736


$$ \textbf{Méthode de wrapping : Recursive feature elimination (RFE)} $$

In [58]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

In [59]:
X_1 = train_val_df.drop(columns=['is_goal', 'period_time'])
Y_1 = train_val_df['is_goal']

In [60]:
estimator = SVC(kernel = 'linear')
selector = RFE(estimator, n_features_to_select = 10, step = 1)

selector = selector.fit(X_1,Y_1)

In [61]:
selector.get_feature_names_out()

array(['game_seconds', 'gameID', 'shooter', 'goalie', 'last_event_x',
       'time_since_last_event', 'distance_from_last_event',
       'powerplay_duration', 'distance_to_net', 'speed'], dtype=object)

$$ \textbf{Optimisation des hyperparamètres du modèle : Cross-Validation} $$ 

In [40]:
from sklearn.model_selection import RandomizedSearchCV 

In [41]:
from scipy.stats import randint

In [77]:
param_dist = {'n_estimators' : [100, 150, 200, 250, 300, 400, 500],
              'max_depth' : [5, 10, 15, 20]}

rf = RandomForestClassifier(random_state = 42)

rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, scoring='roc_auc', n_iter = 10, cv = 5)

rand_search.fit(X_train_over, Y_train_over)

best_rf = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'n_estimators': 300, 'max_depth': 30}


In [88]:
rf_opti = RandomForestClassifier(n_estimators = 300, max_depth = 30, random_state = 42)

rf_opti.fit(X_Kbest,Y)

In [89]:
Y_pred = rf_opti.predict(X_val)

In [90]:
from sklearn.metrics import roc_auc_score

In [91]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))
print("Roc_auc :", roc_auc_score(Y_pred, Y_val))

Accuracy_score : 0.9992513569155905
Recall score : 1.0
F1-score : 0.9957492029755579
Roc_auc : 0.999589701507847


In [92]:
confusion_matrix(Y_pred, Y_val)

array([[9741,    8],
       [   0,  937]], dtype=int64)

$$ \textbf{Utilisation d'un autre modèle de classification : AdaBoost} $$

In [31]:
from sklearn.ensemble import AdaBoostClassifier

In [36]:
Ada_clf = AdaBoostClassifier(n_estimators = 100, random_state = 42)

Ada_clf.fit(X_train_over, Y_train_over)

In [37]:
Y_pred = Ada_clf.predict(X_val)

In [38]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))

Accuracy_score : 0.7505146921205316
Recall score : 0.16891111966140823
F1-score : 0.2477426636568849


In [39]:
confusion_matrix(Y_val, Y_pred)

array([[7581, 2160],
       [ 506,  439]], dtype=int64)