In [1]:
import pandas as pd
import numpy as np
import os 

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Build the dataset path one component at a time: the previous single
# backslash-separated literal contained the invalid escape sequence "\d" and
# only resolved on Windows. os.path.join inserts the platform's own separator.
input_path = os.path.join('..', 'data', 'datasets', 'csv_files', '2016-2020-v2.csv')

# Load the CSV located at input_path into the working DataFrame `df`.
df = pd.read_csv(input_path)

In [3]:
# Despite its name, `target_feature` lists the model INPUT columns (X);
# `test_feature` is the name of the label column (y) passed to split_data.
target_feature = [
    'distance_to_net',
    'shot_angle',
]
test_feature = 'is_goal'

In [4]:
def split_data(df : pd.DataFrame, target_feature : list, test_feature : str, test_year: int):
    """Split the data into train / validation / test sets, holding out one season.

    Rows whose season equals ``test_year`` form the test set; all remaining
    rows are randomly split 80 % / 20 % into training and validation sets
    (fixed ``random_state`` of 42). Rows with a missing value in any of the
    selected columns are discarded.

    Parameters
    ----------
    df : pd.DataFrame
        Source data. Must contain a ``gameID`` column plus every column named
        in ``target_feature`` and ``test_feature``. It is not modified.
    target_feature : list
        Names of the input (X) columns.
    test_feature : str
        Name of the label (y) column.
    test_year : int
        Season to hold out as the test set, compared against
        ``gameID // 1000000``.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.
    """
    df = df.copy()

    # The season is the leading part of gameID (e.g. 2016020001 -> 2016);
    # vectorised floor division replaces the slower row-wise apply.
    df['year'] = df['gameID'] // 1000000
    features = [*target_feature, test_feature]

    # Training + validation pool: every season except the held-out one.
    df_train_val = df.loc[df['year'] != test_year, features].dropna()

    X_train, X_val, Y_train, Y_val = train_test_split(
        df_train_val[target_feature], df_train_val[test_feature], test_size = 0.2, random_state = 42
    )

    # Test set: drop incomplete rows on inputs and label TOGETHER. Dropping
    # them separately (as before) left X_test and Y_test with different rows
    # whenever a feature was missing, silently misaligning samples and labels.
    df_test = df.loc[df['year'] == test_year, features].dropna()
    X_test = df_test[target_feature]
    Y_test = df_test[test_feature]

    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [5]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = split_data(df, target_feature, test_feature, 2020)

$$\textbf{Encodage des caractéristiques :}$$

In [6]:
# Encoding of the categorical features.
# Columns whose categories have no meaningful order are meant for LabelEncoder().
categorical_columns_1 = [
    'period_type',
    'attacking_team_name',
    'shooter',
    'goalie',
    'rebound',
    'strength',
    'last_event_type',
]

# 'shot_type' is the one column where order matters: some shot types are more
# effective on average than others, so the most effective types get the highest
# codes. (Per the original author's note, Milestone 1 showed 'Tip-in' to be the
# most effective and 'Wrap-around' the least.)
# Each entry is [shot type, rank], listed from least to most effective.
shot_type_classified = [
    [shot_type, rank]
    for rank, shot_type in enumerate(
        ['Wrap-around', 'Slap Shot', 'Snap Shot', 'Wrist Shot', 'Backhand', 'Deflected', 'Tip-In']
    )
]

In [7]:
def encode_categorical_features(df : pd.DataFrame, categorical_features: list, shot_type_classified : list):
    """Return a copy of ``df`` with its categorical columns turned into codes.

    The ``shot_type`` column is recoded with the explicit ranks given in
    ``shot_type_classified`` (a sequence of ``[label, rank]`` entries); values
    without an entry are left as they are. Every column named in
    ``categorical_features`` is then replaced by the output of a
    ``LabelEncoder`` re-fitted on that column alone.

    The input frame is not modified.
    """
    encoded = df.copy()

    # Ordinal recoding of 'shot_type' from the [label, rank] pairs.
    shot_type_ranks = dict((entry[0], entry[1]) for entry in shot_type_classified)
    encoded['shot_type'] = encoded['shot_type'].replace(shot_type_ranks)

    # One encoder object is reused: fit_transform re-fits it on each column.
    encoder = LabelEncoder()
    for column in categorical_features:
        codes = encoder.fit_transform(encoded[column])
        encoded[column] = codes

    return encoded

In [8]:
# Replace the working frame with its encoded version, then preview the first rows.
df = encode_categorical_features(
    df=df,
    categorical_features=categorical_columns_1,
    shot_type_classified=shot_type_classified,
)
df.head()

Unnamed: 0,period,period_type,period_time,game_seconds,gameID,attacking_team_id,attacking_team_name,shooter,goalie,shot_type,...,powerplay_duration,home_team_players,away_team_players,distance_to_net,shot_angle,is_goal,is_empty_net,rebound,angle_change,speed
0,1,1,01:11,71,2016020001,10,26,983,43,3.0,...,0,5,5,13.0,23.0,0,0,0,0.0,17.088007
1,1,1,02:53,173,2016020001,9,20,268,58,3.0,...,0,5,5,13.0,77.0,0,0,0,0.0,7.343024
2,1,1,04:01,241,2016020001,9,20,295,58,3.0,...,0,5,5,76.0,-30.0,0,0,0,0.0,5.684341
3,1,1,04:46,286,2016020001,9,20,455,58,1.0,...,0,5,5,58.0,-15.0,0,0,0,0.0,2.414752
4,1,1,06:46,406,2016020001,10,26,876,43,3.0,...,0,5,5,62.0,27.0,0,0,0,0.0,5.07637


$$ \textbf{Rééquilibrage des données :} $$

In [None]:
# 1re approche (TODO : non implémentée) : utilisation de SMOTE pour créer des échantillons synthétiques de la classe minoritaire


In [9]:
# 2e approche : Utilisation de RandomUnderSampler pour réduire la taille de la classe majoritaire
from imblearn.under_sampling import RandomUnderSampler
# https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html#imblearn.under_sampling.RandomUnderSampler

In [35]:
# Discard every row that has at least one missing value.
df = df.dropna()
Y = df['is_goal']

# All remaining columns are candidate features, except the label itself and
# the 'period_time' column. `columns=` already selects the axis, so the
# redundant `axis=1` argument was removed.
X = df.drop(columns=['period_time', 'is_goal'])

# NOTE(review): X still contains identifier-like columns (gameID, shooter,
# goalie, ...) and every label-encoded categorical. The validation accuracy of
# exactly 1.0 reported further down strongly suggests that at least one of
# them leaks the label -- identify and remove it before trusting any result.
# NOTE(review): this random split draws from all seasons, including the 2020
# season held out by split_data(...) earlier, and it overwrites the
# X_train / X_val / Y_train / Y_val produced there -- confirm this is intended.

# Stratify on the label so the (rare) goal class keeps the same proportion in
# the training and validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, train_size = 0.7, random_state = 42, stratify = Y
)

In [36]:
# Randomly under-sample the training split only (seeded for reproducibility);
# the validation split keeps its original class distribution.
rus = RandomUnderSampler(random_state=42)
X_train_res, Y_train_res = rus.fit_resample(X=X_train, y=Y_train)

$$\textbf{Sélection de caractéristiques :}$$
$$ \underline{\text{Option 1 : }} \text{RandomForestClassifier} $$

In [37]:
# Seed the forest like every other stochastic step of the notebook
# (random_state = 42) so that a full re-run reproduces the same model.
RandomForest_model = RandomForestClassifier(random_state = 42)

In [30]:
# Show the names of the candidate feature columns held in X.
X.columns

Index(['period', 'period_type', 'game_seconds', 'gameID', 'attacking_team_id',
       'attacking_team_name', 'shooter', 'goalie', 'shot_type', 'x_coordinate',
       'y_coordinate', 'strength', 'last_event_type', 'last_event_x',
       'last_event_y', 'time_since_last_event', 'distance_from_last_event',
       'powerplay_duration', 'home_team_players', 'away_team_players',
       'distance_to_net', 'shot_angle', 'is_empty_net', 'rebound',
       'angle_change', 'speed'],
      dtype='object')

In [38]:
# Train the forest on the under-sampled training data.
RandomForest_model.fit(X_train_res,Y_train_res)

In [39]:
# Predict on the validation split, which was not under-sampled.
Y_pred = RandomForest_model.predict(X_val)

In [40]:
# Confusion matrix called as (y_true, y_pred): rows are the true classes,
# columns the predicted classes.
confusion_matrix(Y_val,Y_pred)

array([[100883,      0],
       [     0,  10317]], dtype=int64)

In [41]:
# accuracy_score expects (y_true, y_pred); the arguments were swapped. The
# value is unchanged because accuracy is symmetric, but the order now matches
# the API and the confusion_matrix call.
# NOTE(review): with such an imbalanced label, accuracy alone is a weak
# indicator -- consider also reporting the already-imported f1_score.
accuracy_score(Y_val, Y_pred)

1.0