In [None]:
import pandas as pd
import numpy as np
import math
import requests
import json
import os
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve, CalibrationDisplay

In [None]:
# Load the tidy shot-level dataset from the current working directory.
df = pd.read_csv(filepath_or_buffer='tidy_df.csv')

In [None]:
# Training (train + validation) and test set creation.
# Season 20192020 is held out as the final test set; every other season is
# used for fitting and validating the models.
# `.copy()` makes each subset an independent frame, so columns added to
# `df_train` later on do not raise SettingWithCopyWarning.

df_train = df[df['season'] != 20192020].copy()
df_test = df[df['season'] == 20192020].copy()

In [None]:
# Class balance of the whole training set: class 0 (no-goal) vs. class 1 (goal).
sns.countplot(
    data=df_train,
    x='goal_ind',
    palette='Set3',
)

In [None]:
# Class balance of the final test set: class 0 (no-goal) vs. class 1 (goal).

sns.countplot(
    data=df_test,
    x='goal_ind',
    palette='Set3',
)

In [None]:
# Test-set shot counts per shot type, split by outcome (goal_ind).
sns.countplot(
    data=df_test,
    x='shot_type',
    hue='goal_ind',
    palette='Set3',
)

In [None]:
# Functions

# Function1: Function to generate ROC curve with AUC metric

def curves(X_train,y_train,X_valid,y_valid,y_pred,predicted_prob,feature_names=None):
    """Plot ROC, goal rate and cumulative goal share for a fitted binary classifier.

    One figure is drawn containing four curves: the ROC curve (with its AUC in
    the legend), the diagonal random baseline, the goal rate per 5%-wide
    percentile bin of the predicted goal probability, and the cumulative share
    of goals accumulated from the lowest bin upward.

    Parameters
    ----------
    X_train, y_train, X_valid : unused
        Accepted only so existing call sites keep working.
    y_valid : array-like of shape (n_samples,)
        True binary labels (1 = goal) of the validation samples.
    y_pred : unused
        Hard class predictions. Kept for backward compatibility; the ROC curve
        is computed from the predicted probabilities instead, because hard
        0/1 labels collapse the ROC to a single operating point.
    predicted_prob : array-like of shape (n_samples, 2)
        Class probabilities as returned by ``predict_proba``; column 0 is the
        no-goal class and column 1 the goal class.
    feature_names : optional
        Description of the features, shown in the plot title. When omitted,
        the module-level variable ``ele`` is used if it exists (this is what
        the function previously relied on implicitly).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    from sklearn.metrics import roc_curve, auc

    if feature_names is None:
        # Backward compatibility with callers that set a global `ele` before calling.
        feature_names = globals().get('ele', '')

#--------------------Determine ROC curve----------------------------------
    proba = np.asarray(predicted_prob)
    goal_prob = proba[:, 1]
    # Score with the continuous goal probability so every threshold contributes a point.
    fpr, tpr, _ = roc_curve(y_valid, goal_prob)
    roc_auc = auc(fpr, tpr)

#--------------------Determine Goal Rate (#goals / (#no_goals + #goals)) & cum_goal_rate ----------------------------------
    # np.asarray drops any pandas index on y_valid so rows align positionally with proba.
    df_new = pd.DataFrame({
        "goal_ind": np.asarray(y_valid),
        "No_Goal_Prob": proba[:, 0],
        "Goal_Prob": goal_prob,
    })
    df_new['shot_count'] = 1

    # Bin edges at every 5th percentile of the predicted goal probability.
    bin_edges = df_new['Goal_Prob'].quantile(np.linspace(0, 1, 21)).to_numpy()
    n_bins = len(bin_edges) - 1
    total_goals = df_new['goal_ind'].sum()

    rows = []
    running_goals = 0
    for k in range(n_bins):
        lower, upper = bin_edges[k], bin_edges[k + 1]
        above_lower = df_new['Goal_Prob'] >= lower
        if k == n_bins - 1:
            # The top bin is closed on the right, otherwise the shots with the
            # maximum predicted probability would belong to no bin at all.
            below_upper = df_new['Goal_Prob'] <= upper
        else:
            below_upper = df_new['Goal_Prob'] < upper
        df_perc = df_new[above_lower & below_upper]

        goals = df_perc['goal_ind'].sum()
        shots = df_perc['shot_count'].sum()
        running_goals = running_goals + goals
        upper_pct = (k + 1) * 5 / 100
        rows.append({
            'goal_count': goals,
            'sum_shot_count': shots,
            # Tied probabilities can leave a bin empty; report NaN instead of dividing by zero.
            'goal_rate': goals / shots if shots else float('nan'),
            'pctile': upper_pct,
            'proportion': running_goals / total_goals if total_goals else float('nan'),
            'pctile_prop': 1 - upper_pct,
        })
    df_perc_prop = pd.DataFrame(
        rows,
        columns=['goal_count', "sum_shot_count", 'goal_rate', 'pctile', 'proportion', 'pctile_prop'],
    )

#--------------------plot figures 1----------------------------------
    plt.figure(figsize=(10,5))
    lw = 2
    plt.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=lw,
        label="ROC curve (area = %0.3f)" % roc_auc,
        linestyle="-"
    )

    plt.plot([0, 1], [0, 1], color="navy", lw=lw, label="Random Baseline", linestyle="--")

    plt.plot(
        df_perc_prop['pctile'],
        df_perc_prop['goal_rate'],
        color="green",
        lw=lw,
        label="goal rate",
        linestyle="-."
    )

    plt.plot(
        df_perc_prop['pctile'],
        df_perc_prop['proportion'],
        color="red",
        lw=lw,
        label="culumative goal rate",
        linestyle=":"
    )

    plt.title(f'Feature {feature_names} trained with Logistic Regression')
    plt.legend()
    plt.show()

In [None]:
# Feature sets to compare: distance only, angle only, and both together.
list_model_iter = [['distance_from_net'],
                   ['angle_from_net'],
                   ['distance_from_net','angle_from_net']
                  ]
# NOTE: the loop variable must stay named `ele`: `curves` reads a global of
# that name for its plot title.
for ele in list_model_iter:
    # Selecting with a list of columns always yields a 2-D (n_samples, n_features)
    # array, so no reshape is needed even for a single feature.
    X = df_train[ele].values
    y = df_train['goal_ind']

    # A fixed random_state makes the split (and therefore every plot below)
    # reproducible, and gives all feature sets the same validation rows.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    cm = sklearn.metrics.confusion_matrix(y_valid, y_pred)

    # Number of validation shots the classifier labels as goals.
    cnt = int((y_pred == 1).sum())

    # Predicted probabilities: column 0 is class 0 (not a goal), column 1 is class 1 (goal).
    predicted_prob = clf.predict_proba(X_valid)

    # Plot ROC curve, goal rate and cumulative goal share for this feature set.
    curves(X_train,y_train,X_valid,y_valid,y_pred,predicted_prob)

In [None]:
# Random Baseline
# Every training shot gets a goal "probability" drawn independently from U(0, 1).
# A single vectorised draw replaces the per-row Python loop; it also avoids
# reusing `ele` as a loop counter, a name `curves` reads for its plot title.
df_train_random = df_train[['goal_ind','shot_ind']].copy()
random_select = np.random.uniform(0, 1, size=len(df_train_random))
df_train_random['Goal_Prob'] = random_select
df_train_random['shot_count'] = 1

In [None]:
# The goal rate (#goals / (#no_goals + #goals)) as a function of the shot probability model percentile
# (computed here for the random baseline stored in df_train_random).
total_goals = df_train_random['goal_ind'].sum()
col = 'Goal_Prob'

goal_count = []
shot_count = []
goal_rate = []
cum_goal_rate = []
pctile = []
pctile_prop = []

# Bin edges at every 5th percentile of the probability column. Taking the
# quantiles of that single column avoids errors on non-numeric columns.
quantile_list = np.linspace(0,1,21).round(4).tolist()
q = df_train_random[col].quantile(quantile_list)
bin_edges = q.to_numpy()
n_bins = len(bin_edges) - 1

temp = 0  # running number of goals, accumulated from the lowest bin upward

for k in range(n_bins):
    lower, upper = bin_edges[k], bin_edges[k + 1]
    above_lower = df_train_random[col] >= lower
    if k == n_bins - 1:
        # Close the top bin on the right so the rows holding the maximum
        # probability are not silently dropped.
        below_upper = df_train_random[col] <= upper
    else:
        below_upper = df_train_random[col] < upper
    df_perc = df_train_random[above_lower & below_upper]

    goals = df_perc.goal_ind.sum()
    shots = df_perc.shot_count.sum()
    upper_pct = (k + 1) * 5 / 100

    goal_count.append(goals)
    shot_count.append(shots)
    # An empty bin yields NaN instead of a division by zero.
    goal_rate.append(goals / shots if shots else float('nan'))
    pctile.append(upper_pct)
    temp = temp + goals
    cum_goal_rate.append(temp / total_goals if total_goals else float('nan'))
    pctile_prop.append(1 - upper_pct)

df_perc_prop = pd.DataFrame(
    list(zip(goal_count, shot_count, goal_rate, pctile, cum_goal_rate, pctile_prop)),
    columns=['goal_count', "sum_shot_count", 'goal_rate', 'pctile', 'proportion', 'pctile_prop'],
)

fig = plt.figure(figsize = (14,7))
ax = sns.lineplot(x = 'pctile', y = 'goal_rate', label='random baseline', data = df_perc_prop, color='b', legend = False, linewidth = 2.5)
# The x axis is reversed so the highest percentiles appear on the left.
ax.set_xlim(left=1.05, right=-.05)
ax.set_ylim(bottom=0, top=1)
plt.title("goal rate (#goals / (#no_goals + #goals)) as a function of the shot probability model percentile")
fig.legend(loc="upper right")
# plt.show() instead of fig.show(): the latter warns under non-GUI (inline) backends.
plt.show()

In [None]:
# Cumulative share of goals per percentile bin for the random baseline
# (the `proportion` column of df_perc_prop).
y1_max = max(df_perc_prop['proportion'])  # not used by this cell

fig = plt.figure(figsize = (14,7))
ax = sns.lineplot(x = 'pctile', y = 'proportion', label='random baseline', data = df_perc_prop, color='b', legend = False, linewidth = 2.5)
# The x axis is reversed so the highest percentiles appear on the left.
ax.set_xlim(left=1.05, right=-.05)
ax.set_ylim(bottom=0, top=1)
plt.title("cumulative proportion of goals as a function of the shot probability model percentile")
# A single figure-level legend; the previous duplicate call drew a second one on top.
fig.legend(loc="upper right")
# plt.show() instead of fig.show(): the latter warns under non-GUI (inline) backends.
plt.show()

In [None]:
from sklearn.calibration import calibration_curve, CalibrationDisplay

# Reliability diagram of the random baseline: observed goal frequency against
# mean predicted probability in 50 bins.
prob_true, prob_pred = calibration_curve(
    y_true=df_train_random['goal_ind'],
    y_prob=df_train_random['Goal_Prob'],
    n_bins=50,
)
disp = CalibrationDisplay(
    prob_true=prob_true,
    prob_pred=prob_pred,
    y_prob=df_train_random['Goal_Prob'],
)
disp.plot()

## Question 3 - Baseline models (alternative version)

In [None]:
# Number of quantile buckets for the absolute shot angle. The original cell
# used an undefined `n_buckets`; 20 matches the default used by `prep_df`.
ANGLE_N_BUCKETS = 20

# `assign` returns a new frame, so the cell is idempotent on re-run and never
# writes into a slice of `df`.
df_train = df_train.assign(
    angle_from_net_abs=lambda d: d['angle_from_net'].abs(),
    # 1-based quantile bucket of the absolute angle.
    angle_from_net_bucket_abs=lambda d: pd.qcut(d['angle_from_net_abs'], ANGLE_N_BUCKETS, labels=False) + 1,
)

In [None]:
def _fit_baseline_logreg(features, random_state=42):
    """Fit a logistic regression on `features` of `df_train` and score a hold-out split.

    Parameters
    ----------
    features : list of str
        Column names of `df_train` used as predictors.
    random_state : int, default 42
        Seed for the 70/30 train/validation split. Using the same seed for
        every model makes the run reproducible and evaluates all models on
        the same validation rows.

    Returns
    -------
    tuple
        ``(clf, y_valid, y_pred, y_pred_prob, cm)``: the fitted classifier,
        the validation labels, the hard predictions, the predicted
        probability of class 1 (goal), and the confusion matrix.
    """
    # A list selection always yields a 2-D (n_samples, n_features) array.
    X = df_train[features].values
    y = df_train['goal_ind']
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    y_pred_prob = clf.predict_proba(X_valid)[:, 1]
    cm = sklearn.metrics.confusion_matrix(y_valid, y_pred)
    return clf, y_valid, y_pred, y_pred_prob, cm


#1st model: distance from net only
clf1, y_valid1, y_pred1, y_pred_prob1, cm1 = _fit_baseline_logreg(['distance_from_net'])

#2nd model: absolute angle from net only
clf2, y_valid2, y_pred2, y_pred_prob2, cm2 = _fit_baseline_logreg(['angle_from_net_abs'])

#3rd model: distance and absolute angle together
clf3, y_valid3, y_pred3, y_pred_prob3, cm3 = _fit_baseline_logreg(['distance_from_net','angle_from_net_abs'])

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve and AUC of each fitted model, scored with the predicted
# probability of the goal class on that model's own validation labels.
fpr1, tpr1, _ = roc_curve(y_true=y_valid1, y_score=y_pred_prob1)
roc_auc1 = auc(fpr1, tpr1)

fpr2, tpr2, _ = roc_curve(y_true=y_valid2, y_score=y_pred_prob2)
roc_auc2 = auc(fpr2, tpr2)

fpr3, tpr3, _ = roc_curve(y_true=y_valid3, y_score=y_pred_prob3)
roc_auc3 = auc(fpr3, tpr3)

# Fourth curve: uniformly random scores evaluated against model 3's validation labels.
fpr4, tpr4, _ = roc_curve(
    y_true=y_valid3,
    y_score=np.random.uniform(0,1,len(y_valid3)),
)
roc_auc4 = auc(fpr4, tpr4)

In [None]:
# Overlay the ROC curves of the three models and of the random scores,
# plus the diagonal reference line.
plt.figure(figsize=(10,5))
lw = 2

for curve_fpr, curve_tpr, curve_color, curve_label in (
    (fpr1, tpr1, "darkorange", "distance model ROC curve (area = %0.3f)" % roc_auc1),
    (fpr2, tpr2, "red", "angle model ROC curve (area = %0.3f)" % roc_auc2),
    (fpr3, tpr3, "blue", "distance and angle model ROC curve (area = %0.3f)" % roc_auc3),
    (fpr4, tpr4, "black", "%0.3f" % roc_auc4),
):
    plt.plot(
        curve_fpr,
        curve_tpr,
        color=curve_color,
        lw=lw,
        label=curve_label,
        linestyle="-"
    )

plt.plot([0, 1], [0, 1], color="navy", lw=lw, label="Random Baseline", linestyle="--")
plt.legend()

In [None]:
# One frame per model pairing the validation labels with that model's
# predicted goal probability.
df1 = pd.DataFrame({'goal_ind': y_valid1, 'Goal_Prob': y_pred_prob1})
# Model 2 must use its own probabilities (was y_pred_prob3, a copy-paste slip).
df2 = pd.DataFrame({'goal_ind': y_valid2, 'Goal_Prob': y_pred_prob2})
df3 = pd.DataFrame({'goal_ind': y_valid3, 'Goal_Prob': y_pred_prob3})

In [None]:
def prep_df(df, n_buckets = 20):
    """Aggregate shots into quantile buckets of predicted goal probability.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per shot with at least the columns ``goal_ind`` (1 = goal)
        and ``Goal_Prob`` (predicted goal probability). The frame is not
        modified; the function works on a copy.
    n_buckets : int, default 20
        Number of equal-frequency buckets passed to ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, sorted from the highest to the lowest bucket,
        with the summed input columns plus ``shot_count``, ``goal_rate``
        (goals / shots in the bucket), ``pred_percentile`` (lower percentile
        bound of the bucket) and ``cumul_goal_pct`` (cumulative share of all
        goals, accumulated from the highest bucket downward).

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the quantile edges are not unique.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    shots = df.copy()
    shots['shot_count'] = 1
    shots['Goal_Prob_bucket'] = pd.qcut(shots['Goal_Prob'], n_buckets, labels = False)

    buckets = shots.groupby(['Goal_Prob_bucket']).sum().reset_index()
    buckets['goal_rate'] = buckets['goal_ind'] / buckets['shot_count']
    buckets['pred_percentile'] = buckets['Goal_Prob_bucket'] * (100 / n_buckets)

    # Highest-probability bucket first, so the cumulative sum runs downward from the top.
    buckets = buckets.sort_values('pred_percentile', ascending = False)
    buckets['cumul_goal_pct'] = buckets['goal_ind'].cumsum() / buckets['goal_ind'].sum()

    return buckets

In [None]:
# Replace each per-shot frame with its per-bucket summary.
df1, df2, df3 = [prep_df(shot_frame) for shot_frame in (df1, df2, df3)]

In [None]:
# Goal rate per predicted-probability percentile bucket, one line per model.
plt.figure(figsize=(10,5))
lw = 2

for bucket_frame, line_color, line_label in (
    (df1, "darkorange", "model 1"),
    (df2, "red", "model 2"),
    (df3, "blue", "model 3"),
):
    plt.plot(
        bucket_frame['pred_percentile'],
        bucket_frame['goal_rate'],
        color=line_color,
        lw=lw,
        label=line_label,
        linestyle="-"
    )

# The x axis is reversed so the highest percentiles appear on the left.
plt.xlim(100, -5)
plt.ylim(0, 1)
plt.title('goal rate')
plt.legend()
plt.show()

In [None]:
# Cumulative share of goals per predicted-probability percentile bucket, one line per model.
plt.figure(figsize=(10,5))
lw = 2
plt.plot(
    df1['pred_percentile'],
    df1['cumul_goal_pct'],
    color="darkorange",
    lw=lw,
    label="model 1",
    linestyle="-"
)

plt.plot(
    df2['pred_percentile'],
    df2['cumul_goal_pct'],
    color="red",
    lw=lw,
    label="model 2",
    linestyle="-"
)

plt.plot(
    df3['pred_percentile'],
    df3['cumul_goal_pct'],
    color="blue",
    lw=lw,
    label="model 3",
    linestyle="-"
)

# The x axis is reversed so the highest percentiles appear on the left.
plt.xlim(100, -5)
plt.ylim(0, 1.1)
plt.xlabel('shot probability model percentile')
plt.ylabel('cumulative proportion of goals')
# The title previously read 'goal rate', copied from the goal-rate figure.
plt.title('cumulative % of goals')
plt.legend()
plt.show()