In [3]:
from comet_ml import Experiment
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn import metrics
from sklearn.metrics import *
from sklearn.model_selection import train_test_split

In [4]:
DATA_PATH = '../../IFT6758_Data/'
PROJECT_PATH = '../../Milestone2/'

import sys
sys.path.append(PROJECT_PATH)

from features.feature_eng2 import *
from features.tidy_data import *

In [5]:
#get_train_data(DATA_PATH)

In [6]:
# Loading data and pre-processing
X = pd.read_csv(DATA_PATH + '/train_data.csv', index_col=0)
has_nan = X.isna().any().any()

if has_nan:
    print("There are NaN values in the DataFrame 'X'.")
    # Dropping NaNs since these events do not have x and y coordinates
    X.dropna(inplace=True)
    X = X.reset_index(drop=True)
else:
    print("There are no NaN values in the DataFrame 'X'.")

X = X[~X.isin([np.nan, np.inf, -np.inf]).any(axis = 1)]
X = X.reset_index(drop=True)
num_cols = X.select_dtypes([np.number]).columns
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

categorical_cols = X.select_dtypes(exclude=["number", "bool"]).columns
X = pd.get_dummies(data=X, columns=categorical_cols)

boolean_cols = X.select_dtypes([bool]).columns
X[boolean_cols] = X[boolean_cols].astype(int)
X = X.reset_index(drop=True)

There are NaN values in the DataFrame 'X'.


In [7]:
# Target
y = X['is_goal']
# Feature DataFrame
X = X.drop('is_goal', axis=1)

#N = 1000
#y = y.head(N)
#X = X.head(N)
print(X)

        gameSeconds    period  x_coordinate  y_coordinate  shotDistance  \
0         -1.340171 -1.235754      0.996901      0.013153     -1.017819   
1         -1.226231 -1.235754      1.249356      0.013153     -1.280119   
2         -1.189003 -1.235754     -1.085850     -0.826730     -1.024171   
3         -1.176594 -1.235754     -0.991180     -0.616760     -0.974610   
4         -1.106651 -1.235754      1.170463     -0.459281     -1.157283   
...             ...       ...           ...           ...           ...   
318575     1.965211  3.485882     -0.533606     -1.246672     -0.460293   
318576     2.027257  3.485882     -1.085850      1.692919     -0.825422   
318577     2.078022  3.485882      1.296691      1.377963     -1.002641   
318578     2.484144  3.485882     -0.975401     -1.509135     -0.794482   
318579     2.867704  3.485882      1.217799     -0.354296     -1.216307   

        shotAngle  Last_x_coordinate  Last_y_coordinate  timeFromLastEvent  \
0       -0.989162    

In [20]:
# Training/test split 70%/30%, respectively
# Stratification counters imbalanced dataset with respect to target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

regularization = False

if regularization == True:
    # Max_feature adds REGULARIZATION as a form of feature subsampling
    RandomForest_model = RandomForestRegressor(n_estimators=3, max_features='sqrt', random_state=0)
else:
    RandomForest_model = RandomForestRegressor(n_estimators=3, random_state=0)

sel = RFECV(estimator=RandomForest_model, step=1, cv=5, scoring='neg_mean_squared_error')
X_train_sel = sel.fit_transform(X_train, y_train)
X_test_sel = sel.transform(X_test)

# Showing selected features
selected_features = X_train.columns[sel.support_]
print("Selected Features:", selected_features)

Selected Features: Index(['gameSeconds', 'period', 'x_coordinate', 'y_coordinate', 'shotDistance',
       'shotAngle', 'Last_x_coordinate', 'Last_y_coordinate',
       'timeFromLastEvent', 'DistanceLastEvent', 'changeShotAngle', 'speed',
       'time_since_pp', 'no_players_home', 'no_players_away',
       'shotType_Backhand', 'shotType_Deflected', 'shotType_Slap Shot',
       'shotType_Snap Shot', 'shotType_Tip-In', 'shotType_Wrist Shot',
       'LastEventType_Goal'],
      dtype='object')


In [21]:
# Training and predicting with feature selection-cleaned dataset
RandomForest_model.fit(X_train_sel, y_train)

In [23]:
#y_preds is continous, forbidding us of using f1, accuracy, precision, recall
y_preds = RandomForest_model.predict(X_test_sel)
print(y_preds)
# Thresholding y_preds to turn it into a binary classification output
y_preds = (y_preds > 0).astype(int)
y_test = (y_test > 0).astype(int)

f1 = f1_score(y_test, y_preds)
print(f'f1 score: {f1}')
accuracy = accuracy_score(y_test, y_preds)
print(f'accuracy score: {accuracy}')
precision = precision_score(y_test, y_preds)
print(f'precision score: {precision}')
recall = recall_score(y_test, y_preds)
print(f'recall score: {recall}')

[-0.31779379  0.83703568 -0.31779379 ...  0.83703568 -0.31779379
  0.83703568]
f1 score: 0.2141514973576042
accuracy score: 0.7199447548496453
precision score: 0.1441900921203495
recall score: 0.4159917873845101


In [24]:
pickle.dump(RandomForest_model, open("rf_no_reg_model.pkl", "wb"))
experiment = Experiment(
  api_key='M0ld212AYoT5RG6UcLL807o5T',
  project_name="nhl-project-b10",
  workspace="ift6758b-project-b10"
)

evaluation = {"accuracy": accuracy, "f1": f1, "recall": recall, "precision": precision}
params = {
    "model": 'Random Forest',
    "description": 'Random Forest Classifier (3 estimators) without Regularization on Feature Eng2 Cleaned Dataframe',
    **RandomForest_model.get_params()
}
experiment.set_name('Random Forest w/o Regularization')
experiment.log_parameters(params)
experiment.log_metrics(evaluation)

experiment.log_model('Random Forest Classifier without Regularization', 'rf_no_reg_model.pkl') #Edit this
experiment.end() # Important if you are using jupyter

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/Users/tristanmartin/Desktop/UdeM_PhD/Cours/A2023/IFT6758/Project/IFT6758B-Project-B10-main-2/Milestone2/models' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/ift6758b-project-b10/nhl-project-b10/072a1b655d1a4995b8dfe64ff73b18d8

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/ift6758b-project-b10/nhl-project-b10/072a1b655d1a4995b8dfe64ff73b18d8
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;3