In [None]:
%load_ext autoreload
%autoreload 2

import pathlib
if pathlib.Path().resolve().name == 'notebooks':
    %cd ..
%pwd

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from src.data import NHLDataDownloader
from src.features import load_df_shots, add_goalie_ratio, add_opponent_concedes, add_shooter_ratio, add_team_goals

In [None]:
# Build a downloader for 2016 (presumably the season year — confirm) and load
# its processed data.
# NOTE(review): `test` is not referenced again in the visible cells.
nhl = NHLDataDownloader(2016)
test = nhl.load_processed_data()

In [None]:
# Column subset kept for reference; the visible cells do not select with it.
columns = ['Shot_distance', 'Shot_angle', 'Goal', 'Empty_net']

# One shots frame per year, 2016 through 2020.
df_2016, df_2017, df_2018, df_2019, df_2020 = (
    load_df_shots(year) for year in range(2016, 2021)
)

# Stack all five years into a single frame with a fresh 0..n-1 index.
df = pd.concat(
    [df_2016, df_2017, df_2018, df_2019, df_2020],
    ignore_index=True,
)

In [None]:
# Apply the four feature builders in order; each takes the frame and returns
# the frame to pass on to the next one.
df = (
    df.pipe(add_shooter_ratio)
    .pipe(add_goalie_ratio)
    .pipe(add_team_goals)
    .pipe(add_opponent_concedes)
)

In [None]:
# Remove the Strength column and fold shot angles to their absolute value.
# Built as a new frame with errors='ignore' so the cell can be re-run: the
# previous in-place drop raised KeyError once Strength was already gone.
df = (
    df.drop(columns='Strength', errors='ignore')
    .assign(Shot_angle=lambda frame: frame.Shot_angle.abs())
)

# Persist the prepared frame next to the current working directory.
df.to_pickle("final.pkl")

In [None]:
# Read back the pickle written by the previous cell and show its column index.
pd.read_pickle('final.pkl').columns

In [None]:
# `dfp` is a second name for the same frame object, not a copy.
# NOTE(review): `dfp` is not referenced again in the visible cells.
dfp = df

In [None]:
from src.client import start_experiment

In [None]:
# Start an experiment through the project helper; `exp` is the handle that the
# following cells call .end() and .log_dataframe_profile() on.
exp = start_experiment()

In [None]:
# NOTE(review): this ends the experiment before the later cell that calls
# exp.log_dataframe_profile(...) — confirm the intended cell order.
exp.end()

In [None]:
# Rows belonging to the single game with Year 2017 and Game_id 1065.
is_target_game = df.Year.eq(2017) & df.Game_id.eq(1065)
subset_df = df[is_target_game]

In [None]:
# Membership test against the frame (for a DataFrame, `in` checks column labels).
# NOTE(review): 'Gam_id' looks like a typo for 'Game_id', which is the spelling
# used elsewhere in this notebook — confirm whether the miss is intentional.
if 'Gam_id' in df:
    print('test')

In [None]:
# Log the single-game subset to the experiment in CSV format.
# NOTE(review): an earlier cell already called exp.end(); confirm the
# experiment is still open when this runs top to bottom.
exp.log_dataframe_profile(subset_df, name='wpg_v_wsh_2017021065', dataframe_format='csv')

In [None]:
# NOTE(review): df_all is not assigned in the visible cells — confirm its origin.
shooter_keys = ['Year', 'Game_id', 'Shooter']

# Mean of Goal per (Year, Game_id, Shooter): the share of that shooter's shots
# in that game that were goals.
goal_df = df_all[shooter_keys + ['Goal']]
goal_ratio_df = (
    goal_df.groupby(shooter_keys)['Goal']
    .mean()
    .reset_index(name='Goal_ratio')
)

# One seed row per shooter (Year 0, Game_id 0, ratio 0.1) placed ahead of the
# real rows, so it is the first value of every shooter's moving average.
seed_rows = [[0, 0, shooter, 0.1] for shooter in goal_ratio_df.Shooter.unique()]
base = pd.DataFrame(seed_rows, columns=goal_ratio_df.columns)
goal_ratio_df = pd.concat([base, goal_ratio_df]).reset_index(drop=True)

# Exponential moving average of the per-game ratio within each shooter.
goal_ratio_df['Ema'] = goal_ratio_df.groupby('Shooter')['Goal_ratio'].transform(
    lambda ratios: ratios.ewm(alpha=0.01, adjust=False).mean()
)

# Shift one row within each shooter: a game receives the average built from
# the rows before it, never from itself.
goal_ratio_df['Shooter_ratio'] = (
    goal_ratio_df.groupby('Shooter')['Ema'].shift().round(4)
)

# Replace any existing Shooter_ratio column with the freshly computed one.
df_all = df_all.drop('Shooter_ratio', axis=1, errors='ignore')
df_all = df_all.merge(
    goal_ratio_df[shooter_keys + ['Shooter_ratio']],
    how='left',
    on=shooter_keys,
)

In [None]:
# NOTE(review): df_all is not assigned in the visible cells — confirm its origin.
goalie_keys = ['Year', 'Game_id', 'Goalie']

# Save ratio per (Year, Game_id, Goalie): one minus the mean of Goal over the
# shots that goalie faced in that game.
save_df = df_all[goalie_keys + ['Goal']]
save_ratio_df = (
    save_df.groupby(goalie_keys)['Goal']
    .mean()
    .rsub(1)
    .reset_index(name='Save_ratio')
)

# One seed row per goalie (Year 0, Game_id 0, ratio 0.9) placed ahead of the
# real rows, so it is the first value of every goalie's moving average.
seed_rows = [[0, 0, goalie, 0.9] for goalie in save_ratio_df.Goalie.unique()]
base = pd.DataFrame(seed_rows, columns=save_ratio_df.columns)
save_ratio_df = pd.concat([base, save_ratio_df]).reset_index(drop=True)

# Exponential moving average of the per-game save ratio within each goalie.
save_ratio_df['Ema'] = save_ratio_df.groupby('Goalie')['Save_ratio'].transform(
    lambda ratios: ratios.ewm(alpha=0.01, adjust=False).mean()
)

# Shift one row within each goalie so a game only sees earlier rows.
save_ratio_df['Goalie_ratio'] = (
    save_ratio_df.groupby('Goalie')['Ema'].shift().round(4)
)

# Replace any existing Goalie_ratio column with the freshly computed one.
df_all = df_all.drop('Goalie_ratio', axis=1, errors='ignore')
df_all = df_all.merge(
    save_ratio_df[goalie_keys + ['Goalie_ratio']],
    how='left',
    on=goalie_keys,
)

# Rows whose Goalie is the empty string get a fixed ratio of 0.5.
df_all.loc[df_all.Goalie == '', 'Goalie_ratio'] = 0.5

In [None]:
# NOTE(review): df_all is not assigned in the visible cells — confirm its origin.
team_keys = ['Year', 'Game_id', 'Team']

# Goals per (Year, Game_id, Team). The Goal column is summed directly instead
# of going through groupby.apply with a lambda: same result, but it avoids the
# per-group Python call and pandas' deprecated apply-on-grouping-columns path.
# The column keeps its original name 'Goal_ratio' although it holds a count.
goal_df = df_all[team_keys + ['Goal']]
goal_ratio_df = (
    goal_df.groupby(team_keys)['Goal']
    .sum()
    .reset_index(name='Goal_ratio')
)

# One seed row per team (Year 0, Game_id 0, 3 goals) placed ahead of the real
# rows, so it is the first value of every team's moving average.
seed_rows = [[0, 0, team, 3] for team in goal_ratio_df.Team.unique()]
base = pd.DataFrame(seed_rows, columns=goal_ratio_df.columns)
goal_ratio_df = pd.concat([base, goal_ratio_df]).reset_index(drop=True)

# Exponential moving average of goals per game within each team.
goal_ratio_df['Ema'] = goal_ratio_df.groupby('Team')['Goal_ratio'].transform(
    lambda goals: goals.ewm(alpha=0.01, adjust=False).mean()
)

# Shift one row within each team so a game only sees earlier rows.
goal_ratio_df['Team_goals'] = goal_ratio_df.groupby('Team')['Ema'].shift().round(4)

# Replace any existing Team_goals column with the freshly computed one.
df_all = df_all.drop('Team_goals', axis=1, errors='ignore')
df_all = df_all.merge(
    goal_ratio_df[team_keys + ['Team_goals']],
    how='left',
    on=team_keys,
)

In [None]:
# NOTE(review): df_all is not assigned in the visible cells — confirm its origin.
opp_keys = ['Year', 'Game_id', 'OppTeam']

# Goals per (Year, Game_id, OppTeam), i.e. summed over the shots recorded
# against that opposing team in that game. The Goal column is summed directly
# instead of going through groupby.apply with a lambda: same result, but it
# avoids the per-group Python call and pandas' deprecated
# apply-on-grouping-columns path.
goal_df = df_all[opp_keys + ['Goal']]
goal_ratio_df = (
    goal_df.groupby(opp_keys)['Goal']
    .sum()
    .reset_index(name='Goals')
)

# One seed row per opposing team (Year 0, Game_id 0, 3 goals) placed ahead of
# the real rows, so it is the first value of every team's moving average.
seed_rows = [[0, 0, opp_team, 3] for opp_team in goal_ratio_df.OppTeam.unique()]
base = pd.DataFrame(seed_rows, columns=goal_ratio_df.columns)
goal_ratio_df = pd.concat([base, goal_ratio_df]).reset_index(drop=True)

# Exponential moving average of conceded goals per game within each team.
goal_ratio_df['Ema'] = goal_ratio_df.groupby('OppTeam')['Goals'].transform(
    lambda goals: goals.ewm(alpha=0.01, adjust=False).mean()
)

# Shift one row within each team so a game only sees earlier rows.
goal_ratio_df['Opp_concedes'] = (
    goal_ratio_df.groupby('OppTeam')['Ema'].shift().round(4)
)

# Replace any existing Opp_concedes column with the freshly computed one.
df_all = df_all.drop('Opp_concedes', axis=1, errors='ignore')
df_all = df_all.merge(
    goal_ratio_df[opp_keys + ['Opp_concedes']],
    how='left',
    on=opp_keys,
)

In [None]:
import plotly.express as px

# AUC obtained for each hyper-parameter combination that was tried; the label
# encodes "pos weight / estimators / max depth".
auc_by_hypers = [
    ["1 / 500 / 4 ", 0.777],
    ["1.5 / 400 / 4 ", 0.781],
    ["2 / 200 / 5 ", 0.78],
    ["3 / 300 / 6 ", 0.776],
]
ddd = pd.DataFrame(auc_by_hypers, columns=['hypers', 'AUC'])

# One bar per combination, averaging AUC within each label.
fig = px.histogram(
    ddd,
    x='AUC',
    y='hypers',
    labels=dict(hypers="Pos Weight / Estimators / Max depth"),
    histfunc='avg',
)

# Zoom the AUC axis so the small differences between runs are visible.
fig.update_xaxes(range=[0.76, 0.79])
fig.show()

# Export the chart as a standalone HTML include.
fig.write_html('../NHL-blog/_includes/hp-tuning.html')

In [None]:
# Columns carried into the modelling frame. The names left as comments are
# currently disabled.
# NOTE(review): 'P_diff' is not created in the visible cells — presumably it
# comes from load_df_shots; confirm.
model_columns = [
    'Game_id',
    'Game_time',
    'Type',
    # 'Empty_net',
    'Previous_distance',
    # 'X_diff',
    'Speed',
    # 'Period',
    # 'Is_rebound',
    'Time_since_powp',
    'Players',
    'Opp_players',
    'P_diff',
    'Shot_distance',
    # 'X_net',
    'Shot_angle',
    'Rebound_angle',
    'Year',
    'Shooter_ratio',
    'Goalie_ratio',
    'Team_goals',
    'Opp_concedes',
    'Previous_event_type',
    'Goal',
]

# Independent copy, so later edits to df_tot leave df untouched.
df_tot = df[model_columns].copy()

In [None]:
# Reload one shots frame per year, 2016 through 2020.
df_2016, df_2017, df_2018, df_2019, df_2020 = (
    load_df_shots(year) for year in range(2016, 2021)
)

# Stack the years with a fresh index, then apply the four feature builders.
df = pd.concat(
    [df_2016, df_2017, df_2018, df_2019, df_2020],
    ignore_index=True,
)
df = (
    df.pipe(add_shooter_ratio)
    .pipe(add_goalie_ratio)
    .pipe(add_team_goals)
    .pipe(add_opponent_concedes)
)

# Shot angles folded to their absolute value.
df['Shot_angle'] = df['Shot_angle'].abs()

# Powp: skater advantage (Players minus Opp_players), floored at zero.
df['Powp'] = (df.Players - df.Opp_players).clip(lower=0)
df

In [None]:
# Show the column index of the rebuilt frame.
df.columns

In [None]:
# Floor P_diff at zero: negative values become 0, everything else is kept.
df_tot['P_diff'] = df_tot['P_diff'].clip(lower=0)

In [None]:
# minmax_scale is used by the scaling cell that follows.
# NOTE(review): `normalize` is not referenced in the visible cells, and `enc`
# is only referenced from commented-out lines.
from sklearn.preprocessing import OrdinalEncoder, normalize, minmax_scale
enc = OrdinalEncoder()

In [None]:
# NOTE(review): every column is rescaled with the min/max of all rows in
# df_tot, i.e. before the later split by Year — confirm that is intended.

# Angles are folded to their absolute value before being rescaled.
df_tot['Shot_angle'] = df_tot['Shot_angle'].abs()

# Columns rescaled by minmax_scale; Players and Opp_players are left as they are.
scaled_columns = [
    'Game_time',
    'Previous_distance',
    'Speed',
    'Time_since_powp',
    'Shot_distance',
    'Shooter_ratio',
    'Goalie_ratio',
    'Team_goals',
    'Opp_concedes',
    'P_diff',
    'Shot_angle',
    'Rebound_angle',
]
for column in scaled_columns:
    df_tot[column] = minmax_scale(df_tot[column].values)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()  # only referenced from disabled code in this notebook


def _apply_type_encoding(frame, encoding):
    """Swap a frame's Type column for the matching Type_enc value in `encoding`."""
    return (
        frame.drop('Type_enc', axis=1, errors='ignore')
        .merge(encoding[['Type', 'Type_enc']], how='left', on=['Type'])
        .drop('Type', axis=1, errors='ignore')
    )


# Split by Year, which is dropped afterwards: rows before 2020 train the model.
# NOTE(review): df_val and df_test use the same Year == 2020 filter, so they
# contain the same rows — confirm this is intended.
df_train = df_tot[df_tot.Year < 2020].drop('Year', axis=1)
df_val = df_tot[df_tot.Year == 2020].drop('Year', axis=1)
df_test = df_tot[df_tot.Year == 2020].drop('Year', axis=1)

if 'Type' in df_tot:
    # Encode Type as the mean of Goal per Type, measured on the training rows,
    # and apply that same table to all three frames. A Type that does not
    # appear in the training rows is left as NaN by the left merge.
    type_enc = df_train.groupby('Type').Goal.mean().reset_index(name='Type_enc')
    df_train, df_val, df_test = (
        _apply_type_encoding(frame, type_enc)
        for frame in (df_train, df_val, df_test)
    )

# The label is stored as float in every split.
for frame in (df_train, df_val, df_test):
    frame['Goal'] = frame['Goal'].astype(float)

# Labels as (n, 1) column arrays.
train_labels = df_train.Goal.values.reshape(-1,1)
val_labels = df_val.Goal.values.reshape(-1,1)
test_labels = df_test.Goal.values.reshape(-1,1)

# Class weights inversely proportional to class frequency in the training
# rows, scaled by half the number of rows.
tot = len(df_train)
pos = df_train.Goal.sum()
neg = tot - pos
class_weight = {0: (1 / neg) * (tot/2.0), 1: (1 / pos) * (tot/2.0)}

In [None]:
def _apply_previous_event_encoding(frame, encoding):
    """Swap a frame's Previous_event_type column for its Pe_enc value in `encoding`."""
    return (
        frame.drop('Pe_enc', axis=1, errors='ignore')
        .merge(encoding[['Previous_event_type', 'Pe_enc']], how='left', on=['Previous_event_type'])
        .drop('Previous_event_type', axis=1, errors='ignore')
    )


# Encode Previous_event_type as the mean of Goal per category, measured on the
# training rows. The table is bound to `type_enc`, as in the original cell.
type_enc = df_train.groupby('Previous_event_type').Goal.mean().reset_index(name='Pe_enc')

df_train = _apply_previous_event_encoding(df_train, type_enc)

# Categories missing from the training table come out of the left merge as
# NaN; they are filled with the mean Pe_enc of the frame itself. The result is
# assigned back to the column: calling fillna(..., inplace=True) on
# `frame.Pe_enc` is chained in-place assignment, which warns in pandas 2.x and
# does not update the frame under Copy-on-Write.
df_val = _apply_previous_event_encoding(df_val, type_enc)
df_val['Pe_enc'] = df_val['Pe_enc'].fillna(df_val['Pe_enc'].mean())

df_test = _apply_previous_event_encoding(df_test, type_enc)
df_test['Pe_enc'] = df_test['Pe_enc'].fillna(df_test['Pe_enc'].mean())

## XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score

# Baseline: Shot_distance is the only feature, Goal is the target.
X_train, y_train = df_train.Shot_distance, df_train.Goal
X_test, y_test = df_test.Shot_distance, df_test.Goal

# Default classifier, fitted on the training rows.
bst = XGBClassifier()
bst.fit(X_train, y_train)

# Class predictions for the test rows.
preds = bst.predict(X_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score

# Every remaining column except the target is used as a feature.
X_train, y_train = df_train.drop(columns='Goal'), df_train.Goal
X_test, y_test = df_test.drop(columns='Goal'), df_test.Goal

# scale_pos_weight=9 weights the positive class more heavily.
bst = XGBClassifier(scale_pos_weight=9)
bst.fit(X_train, y_train)

# Class predictions for the test rows.
preds = bst.predict(X_test)

In [None]:
# Validation targets. Assigned here because the only other assignment of
# y_val is in a later cell, so a bare `y_val` raised NameError when the
# notebook was run top to bottom on a fresh kernel.
y_val = df_val.Goal
y_val

In [None]:
# Accuracy of the most recent `preds` against the test targets.
accuracy_score(y_test, preds)

## TFDF

In [None]:
import tensorflow_decision_forests as tfdf

In [None]:
# Convert each split into a TF dataset with Goal as the label column.
train_ds, val_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label="Goal")
    for frame in (df_train, df_val, df_test)
)

# Random forest with 550 trees of depth at most 30, fitted with the class
# weights computed from the training rows.
model = tfdf.keras.RandomForestModel(num_trees=550, max_depth=30)
model.fit(train_ds, class_weight=class_weight)

In [None]:
# Ratio of 43597 to the number of validation rows with Goal == 0.
# NOTE(review): 43597 is a hard-coded literal — presumably a count copied from
# an evaluation output; confirm and derive it in code.
43597 / df_val[df_val.Goal == 0].Goal.count()

In [None]:
# Ratio of 4487 to the number of validation rows with Goal == 1.
# NOTE(review): 4487 is a hard-coded literal — presumably a count copied from
# an evaluation output; confirm and derive it in code.
4487 / df_val[df_val.Goal == 1].Goal.count()

In [None]:
from keras import metrics

In [None]:
# Metrics to report on the validation set.
mtrcs = [metrics.AUC(), metrics.AUC(curve='PR', name='auc_pr'), metrics.Precision(), metrics.Recall()]
model.compile(metrics=mtrcs)

# The first entry returned by evaluate() is skipped (in Keras it is the loss);
# the remaining values line up with `mtrcs`. The result is no longer bound to
# the name `eval`, which shadowed the Python builtin for the rest of the session.
eval_results = model.evaluate(val_ds)[1:]
mtrs_dir = {mtr.name: value for mtr, value in zip(mtrcs, eval_results)}

In [None]:
# Show the metric-name -> value mapping built in the previous cell.
mtrs_dir

In [None]:
from keras import metrics

# Accuracy, ROC AUC, PR AUC, precision and recall on the validation set.
val_metrics = [
    'accuracy',
    metrics.AUC(),
    metrics.AUC(curve='PR'),
    metrics.Precision(),
    metrics.Recall(),
]
model.compile(metrics=val_metrics)
model.evaluate(val_ds)
# predictions = model.predict(test_ds)

In [None]:
from keras import metrics

# Both metrics are passed as one list through the `metrics` keyword, matching
# the other compile() calls in this notebook. The previous positional form
# depended on compile()'s parameter order and sent 'accuracy' to a different
# parameter than the AUC metric.
model.compile(metrics=[metrics.AUC(), 'accuracy'])
model.evaluate(val_ds)

In [None]:
# Model outputs for the validation dataset; later cells flatten this with
# reshape(-1) before plotting.
predictions = model.predict(val_ds)

In [None]:
# Display the predictions flattened to one dimension (the result is not stored).
predictions.reshape(-1)

In [None]:
# Validation targets: the Goal column of df_val, passed to plots() below.
y_val = df_val.Goal

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.calibration import calibration_curve

def plots(y_valid, y_prob, model_name, n_bins=20):
    """Draw a 2x2 diagnostic figure for a binary classifier.

    Panels: ROC curve (top left), calibration curve (top right), goal rate per
    equal-count bin of the predicted score (bottom left), and cumulative share
    of goals when going from the highest score down (bottom right).

    Args:
        y_valid: true binary labels; any array-like, including a pandas Series
            with an arbitrary index.
        y_prob: predicted positive-class scores, same length as ``y_valid``.
        model_name: text used in the figure title and the calibration legend.
        n_bins: number of equal-count bins in the goal-rate panel.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Flat NumPy arrays make the integer indexing below positional. Indexing a
    # pandas Series with an integer array is label-based and fails or
    # misaligns when the index is not 0..n-1.
    y_valid = np.asarray(y_valid).ravel()
    y_prob = np.asarray(y_prob).ravel()

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    plt.subplots_adjust(wspace=0.4, hspace=0.6)
    plt.suptitle(f'Performance Evaluation of: {model_name}', fontsize=16)

    # --- ROC curve and its AUC ---
    fpr, tpr, thresholds = roc_curve(y_valid, y_prob)
    roc_auc = roc_auc_score(y_valid, y_prob)

    axes[0,0].plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    axes[0,0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axes[0,0].set_xlim([0.0, 1.0])
    axes[0,0].set_ylim([0.0, 1.05])
    axes[0,0].set_xlabel('False Positive Rate')
    axes[0,0].set_ylabel('True Positive Rate')
    axes[0,0].set_title('Receiver Operating Characteristic (ROC)')
    axes[0,0].legend(loc="lower right")

    # --- Calibration curve ---
    # The curve is labelled with the model being evaluated, and the legend is
    # drawn so both labels are actually visible.
    axes[0,1].plot([0, 1], [0, 1], 'k:', label="Perfectly calibrated")
    prob_true, prob_pred = calibration_curve(y_valid, y_prob, n_bins=10)
    axes[0,1].plot(prob_pred, prob_true, 's-', label="%s" % (model_name,))
    axes[0,1].set_ylabel('Fraction of positives')
    axes[0,1].set_xlabel('Mean predicted probability')
    axes[0,1].set_title('Calibration Plot')
    axes[0,1].legend(loc="lower right")

    # --- Goal rate per bin ---
    # Labels ordered from the lowest to the highest predicted score.
    sorted_goals = y_valid[np.argsort(y_prob)]
    n_shots = len(sorted_goals)

    bins = np.linspace(0, 1, n_bins + 1)
    midpoints = (bins[:-1] + bins[1:]) / 2

    goal_rates = []
    for i in range(n_bins):
        start_idx = int(i * n_shots / n_bins)
        end_idx = int((i + 1) * n_shots / n_bins)
        total_shots = end_idx - start_idx
        if total_shots == 0:
            # Fewer samples than bins: an empty bin has no defined rate.
            goal_rates.append(np.nan)
            continue
        goal_rates.append(100 * sorted_goals[start_idx:end_idx].sum() / total_shots)

    axes[1,0].plot(midpoints*100, goal_rates, linestyle='-')
    axes[1,0].set_xlim([100,0])
    axes[1,0].set_ylim([0,100])
    axes[1,0].set_xlabel('Centile of Probability')
    axes[1,0].set_ylabel('Rate of Goals')
    axes[1,0].set_title('Rate of Goals vs. Centile of Probability')

    # --- Cumulative share of goals, from the highest score down ---
    cumulative_goals = np.cumsum(sorted_goals[::-1])
    tot_goals = y_valid.sum()
    if tot_goals:
        cumulative_pct = cumulative_goals * 100 / tot_goals
    else:
        # No positives at all: the share stays at zero instead of dividing by zero.
        cumulative_pct = np.zeros(n_shots)

    axes[1,1].plot(np.arange(n_shots, 0, -1) * 100 / n_shots, cumulative_pct, linestyle='-')
    axes[1,1].set_xlim([100,0])
    axes[1,1].set_ylim([0,100])
    axes[1,1].set_xlabel('Centile of Probability')
    axes[1,1].set_ylabel('Cumulative Proportion of Goals')
    axes[1,1].set_title('Cumulative Proportion of Goals vs. Centile of Probability')

    # Display the 2x2 figure.
    plt.show()

In [None]:
# Diagnostic plots for the validation predictions, flattened to one dimension;
# "test" is only the name shown in the figure title.
plots(y_val, predictions.reshape(-1), "test")

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt

# ROC curve and the area under it.
fpr, tpr, _ = roc_curve(val_labels, predictions)
roc_auc = auc(fpr, tpr)

# Precision-recall curve and the area under it.
precision, recall, _ = precision_recall_curve(val_labels, predictions)
auc_score = auc(recall, precision)

# The original cell drew the precision-recall curve under an ROC title, ROC
# axis labels and the ROC chance diagonal. Both curves are now drawn, each on
# its own panel with matching labels.
lw = 2
curve_fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

ax_roc.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance level
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
ax_roc.legend(loc="lower right")

ax_pr.plot(recall, precision, color='darkorange', lw=lw, label='PR curve (area = %0.2f)' % auc_score)
ax_pr.set_xlim([0.0, 1.0])
ax_pr.set_ylim([0.0, 1.05])
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend(loc="upper right")

plt.show()

In [None]:
# summary() prints the report itself and returns None, so wrapping it in
# print() added a stray "None" line after the report.
model.summary()