In [1]:
from comet_ml import Experiment

In [2]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve, auc

sys.path.append('../ift6758/visualizations/')

from question_5_plots import *

In [3]:
# Load the dataset used by every later cell; the path is relative to the notebook's working directory.
data_path = 'final_df.csv'
df = pd.read_csv(data_path)

In [4]:
# The Comet API key is read from the environment so it never appears in the notebook;
# .get() yields None when COMET_API_KEY is not set.
comet_api_key = os.environ.get('COMET_API_KEY')
experiment = Experiment(api_key=comet_api_key, project_name="random-forest", workspace="kleitoun")

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/kleitoun/random-forest/7a877e900dde4ba0bd282c77f52d5141



## Set the parameters and the proper dataset for the model

In [5]:
# Hyper-parameter grid for the search: a single fixed seed, five forest sizes
# (50..150 in steps of 25) and six tree depths (6..16 in steps of 2).
param_grid = dict(
    random_state=[1],
    n_estimators=list(range(50, 151, 25)),
    max_depth=list(range(6, 17, 2)),
)
# Base estimator; the grid search supplies the hyper-parameters.
model = RandomForestClassifier()

In [6]:
# 5-fold grid search scored on balanced accuracy; the best configuration is
# refit at the end and any failing fit raises instead of being scored.
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=5,
    refit=True,
    error_score='raise',
)

In [7]:
# Training data (later split into train + validation sets for the model):
# regular-season rows from every season except 2019-2020.
# Both conditions are combined in one mask; filtering `df` twice in a row would
# let the second filter overwrite the first and put season 20192020 back in.
is_not_2019_season = df['season'] != 20192020
is_regular_season = df['season_type'] == 'R'
df_train = df[is_not_2019_season & is_regular_season]

## Select the appropriate features

In [8]:
# Columns excluded from the model inputs (identifiers, raw event fields and the
# `goal_ind` target itself).
old_feat = [
    'game_id', 'season', 'season_type', 'event_id', 'home_team',
    'away_team', 'home_team_side_1st_period', 'attacking_team',
    'attacking_player', 'goalie', 'period', 'period_time', 'goal_ind',
    'shot_ind', 'x_coordinates', 'y_coordinates', 'shot_type', 'empty_net',
    'strength', 'gwg', 'previous_event_x_coordinates',
    'previous_event_y_coordinates', 'previous_event_period_time',
    'previous_event_type', 'previous_event_period', 'previous_attacking_team',
    'attacking_team_side',
]

# Candidate columns: the first 60 columns of the frame.
df_feat = df.columns[:60].tolist()
print(len(df_feat))

# Features extracted in question 4: every candidate that is not listed in old_feat.
new_feat = list(filter(lambda feat: feat not in old_feat, df_feat))

60


In [9]:
# Feature matrix and target, then a stratified, shuffled 85/15 train/validation
# split with a fixed seed.
X, y = df_train[new_feat], df_train['goal_ind']
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=10,
    shuffle=True,
    stratify=y,
)

## Search for the best model parameters with grid search, then use them on the validation set

In [None]:
%%time

fit = search.fit(X,y)

COMET ERROR: Failed to extract parameters from estimator


In [None]:
# Keep the refit estimator from the best grid-search configuration and show its settings.
best_model = fit.best_estimator_
model = best_model
print(model)

## Store the model using pickle to avoid having to run it multiple times

In [None]:
# Serialize the fitted model to disk. The `with` block guarantees the file is
# flushed and closed before it is uploaded or re-loaded.
with open("random_forest", 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Upload the pickled model file to the Comet experiment under the name "RandomForest".
experiment.log_model(
    name="RandomForest",
    file_or_folder="random_forest",
)

In [None]:
# loaded_model = pickle.load(open("random_forest", 'rb'))

In [None]:
# Hard class predictions feed the thresholded metrics (accuracy, classification report).
valid_preds = model.predict(X_valid)
# valid_preds = loaded_model.predict(X_valid)

# ROC AUC is a ranking metric: score it with the predicted probability of the
# positive class. Passing 0/1 labels collapses the curve to one operating point.
# NOTE(review): assumes `goal_ind` is binary and the positive class is the last
# entry of model.classes_ (column 1 of predict_proba) — confirm.
valid_probs = model.predict_proba(X_valid)[:, 1]

accuracy = accuracy_score(y_valid, valid_preds)
roc_auc = roc_auc_score(y_valid, valid_probs)
report = classification_report(y_valid, valid_preds, output_dict=True)
metrics = {
    "roc_auc": roc_auc,
    "accuracy": accuracy,
    "classification report": report
}
experiment.log_metrics(metrics)

## Generate the appropriate plots

In [None]:
question_no = 6
perf_eval = Performance_Eval(
    model, 'Random Forest',
    X_train, y_train,
    X_valid, y_valid,
    question_no=question_no,
)
# perf_eval = Performance_Eval(loaded_model,'Random Forest',X_train, y_train, X_valid, y_valid, question_no = question_no)


def _log_plot(figure):
    """Send one figure to the Comet experiment (without overwriting) and hand it back."""
    experiment.log_figure(figure=figure, overwrite=False)
    return figure


# Each plot is logged immediately after it is created, before the next one is
# built, so the create/log ordering per figure is preserved.
roc = _log_plot(perf_eval.get_roc_auc_plot())
gr = _log_plot(perf_eval.get_goal_rate_plot())
cr = _log_plot(perf_eval.get_cum_rate_plot())
cp = _log_plot(perf_eval.get_calibration_plot())

In [None]:
# Close the Comet experiment started earlier in the notebook.
experiment.end()