In [1]:
import numpy as np
import pandas as pd

In [2]:
# Folder that holds the datathon files.
# NOTE(review): this is an absolute, machine-specific location; consider a
# configurable data directory so the notebook runs on other machines.
path = "C:/Users/Ong Jia Yi/Desktop/STUDY/Summer 2021/Rotman MMA Summer 2021 Datathon"

# Referred to below as 'the original dataframe'.
load = pd.read_csv("{}/NWHL.csv".format(path))

### DEFINITIONS
1) An "outcome" is the event of interest that ends a sequence of prior events (e.g. a Shot or a Goal).

## Function - prior_events

In [4]:
def prior_events(outcome_data, window_len=None, events=None):
    """Return a dataframe of the details of events that led to an outcome
    of interest.

    For every row of <outcome_data>, the rows located 1..<window_len>
    index positions earlier in <events> are laid side by side, nearest
    event first.

    Arguments:
    outcome_data -- subset of the original Pandas DataFrame where
                    the outcome(s) of interest occur. Indexes of
                    <outcome_data> preserve those of the original dataframe
                    and must be integers.
    window_len -- (int, optional) the number of previous events to include.
                  Defaults to the notebook-level <window_len>.
    events -- (optional) Pandas DataFrame in which prior events are looked
              up. Defaults to the notebook-level <load>.

    Returns:
    new_frame -- Pandas DataFrame containing info. on prior events, with
                 one row per outcome (index reset to 0..n-1) and columns
                 named "<column>_l<lag>", grouped by lag.

    Raises:
    ValueError -- if <window_len> is smaller than 1.
    KeyError -- if an outcome has fewer than <window_len> preceding rows
                in <events>.
    """
    if events is None:
        events = load
    if window_len is None:
        # The parameter shadows the notebook-level setting of the same name,
        # so the fallback has to go through the module namespace.
        window_len = globals()["window_len"]
    if window_len < 1:
        raise ValueError("window_len must be at least 1")

    columns = list(outcome_data.columns)
    lagged = []
    for lag in range(1, window_len + 1):
        # Labels of the rows sitting <lag> positions before each outcome.
        lag_index = outcome_data.index - lag
        missing = lag_index.difference(events.index)
        if len(missing) > 0:
            raise KeyError(
                "no event found {} row(s) before some outcomes; missing "
                "index labels: {}".format(lag, list(missing))
            )
        block = events.loc[lag_index, columns].reset_index(drop=True)
        block.columns = ["{}_l{}".format(name, lag) for name in columns]
        lagged.append(block)

    # A single concat keeps each column's original dtype.
    new_frame = pd.concat(lagged, axis=1)

    return new_frame

## Function - encode_players

In [5]:
def encode_players(outcome_data, prior_events_data, all_players, window_len):
    """Encode the outcome player and prior event player(s) by their IDs,
    which are their 1-based positions in <all_players>.

    Arguments:
    outcome_data -- subset of the original Pandas DataFrame where
                    the outcome(s) of interest occur.
    prior_events_data -- output from prior_events() function with
                         the same <outcome_data> input.
    all_players -- a Numpy array of all player names in the original
                   dataset (no repetition). If a name does repeat, its
                   first position is used.
    window_len -- (int) the number of previous events to include.
                  Not used by the computation; kept so existing calls
                  keep working.

    Returns:
    players_encoded -- a Pandas DataFrame of the IDs of the player
                       associated to the outcome (column 0) and to each
                       prior event (columns 1, 2, ...).

    Raises:
    IndexError -- if a player name is not present in <all_players>.
    """
    outcome_player = outcome_data.Player.reset_index(drop=True)
    prior_players = prior_events_data.filter(regex="Player").reset_index(drop=True)

    merged = pd.concat([outcome_player, prior_players], ignore_index=True, axis=1)

    # Build the name -> ID lookup once instead of scanning <all_players>
    # for every cell; plus one so ID starts at 1.
    player_ids = {}
    for position, name in enumerate(all_players, start=1):
        player_ids.setdefault(name, position)

    players_encoded = pd.DataFrame(
        {col: merged[col].map(player_ids) for col in merged.columns},
        index=merged.index,
    )

    unknown = players_encoded.isna()
    if unknown.values.any():
        bad_names = sorted(set(merged.values[unknown.values].tolist()), key=str)
        raise IndexError(
            "player(s) not found in all_players: {}".format(bad_names)
        )

    return players_encoded.astype("int64")

## Function - one_hot_encoder

In [6]:
def one_hot_encoder(categories):
    """Creates a one-hot encoding for categorical data
    e.g. name of event or name of player.

    The position of the 1 in each vector is the order of first appearance
    of the category in <categories>.

    Arguments:
    categories -- Numpy array (or other iterable) of the category labels.
                  Repeated labels are collapsed to their first occurrence.

    Returns:
    oh_dict -- dictionary that maps each category to a Numpy uint8 array
               holding its one-hot encoding. Empty input gives an empty
               dictionary.
    """
    # dict.fromkeys removes repeats while keeping first-appearance order.
    labels = list(dict.fromkeys(categories))

    # Building from an identity matrix fixes the dtype explicitly instead of
    # depending on the pandas-version-specific default of get_dummies.
    identity = np.eye(len(labels), dtype=np.uint8)

    oh_dict = dict()
    for position, cat in enumerate(labels):
        oh_dict[cat] = identity[position].copy()

    return oh_dict

## Function - encode_events

In [90]:
def encode_events(outcome_data, prior_events_data, event_oh, window_len):
    """One-hot encode the outcome event and the prior event(s).

    Arguments:
    outcome_data -- subset of the original Pandas DataFrame where
                    the outcome(s) of interest occur.
    prior_events_data -- output from prior_events() function with
                         the same <outcome_data> input.
    event_oh -- one-hot-encoding dictionary for events (event name ->
                Numpy array), e.g. the output of one_hot_encoder().
    window_len -- (int) the number of previous events to include.
                  Not used by the computation; kept so existing calls
                  keep working.

    Returns:
    target_events_encoded -- a Pandas DataFrame with one row per outcome
                             and one column per one-hot position of the
                             outcome event.
    events_encoded -- a Pandas DataFrame with one row per outcome holding
                      the one-hot vectors of every prior event side by
                      side, in the column order of <prior_events_data>.
                      Column labels restart at 0 for each prior event.

    Raises:
    KeyError -- if an event name is missing from <event_oh>.
    """
    def _expand(labels):
        # One row per label, one column per one-hot position.
        return pd.DataFrame([event_oh[label] for label in labels])

    target_events_encoded = _expand(outcome_data.Event)

    prior_labels = prior_events_data.filter(regex="Event").reset_index(drop=True)
    blocks = [_expand(prior_labels.iloc[:, position])
              for position in range(prior_labels.shape[1])]

    if blocks:
        # A single concat instead of growing the frame inside a loop.
        events_encoded = pd.concat(blocks, axis=1)
    else:
        # No prior-event columns: keep the rows, contribute no features.
        events_encoded = pd.DataFrame(index=prior_labels.index)

    return target_events_encoded, events_encoded

## Checking functions

In [8]:
# Number of prior events captured ahead of each outcome.
window_len = 2

# Event types treated as the outcomes of interest.
outcomes_interest = ["Shot", "Goal"]

# Variables used to describe the outcome and each of its prior events.
detail_names = [
    "Period",
    "Home Team Skaters",
    "Away Team Skaters",
    "X Coordinate",
    "Y Coordinate",
    "Player",
    "Event",
]

# Keep only the Shot/Goal rows. The row labels of the original dataframe
# are retained on purpose: prior events are located relative to them.
outcome_data = load.loc[load["Event"].isin(outcomes_interest), detail_names]

In [9]:
# Gather the details of the events immediately preceding each Shot/Goal row.
prior_events_data = prior_events(outcome_data)
prior_events_data.head()

Unnamed: 0,Period_l1,Home Team Skaters_l1,Away Team Skaters_l1,X Coordinate_l1,Y Coordinate_l1,Player_l1,Event_l1,Period_l2,Home Team Skaters_l2,Away Team Skaters_l2,X Coordinate_l2,Y Coordinate_l2,Player_l2,Event_l2
0,1,5,5,125,28,McKenna Brand,Zone Entry,1,5,5,107,40,McKenna Brand,Puck Recovery
1,1,5,5,124,2,Allie Thunstrom,Zone Entry,1,5,5,112,0,Allie Thunstrom,Takeaway
2,1,5,5,168,10,Allie Thunstrom,Play,1,5,5,124,20,Nina Rodgers,Zone Entry
3,1,5,5,135,52,Jillian Dempsey,Play,1,5,5,125,70,Jillian Dempsey,Zone Entry
4,1,5,5,196,20,Lauren Kelly,Puck Recovery,1,5,5,174,82,Jillian Dempsey,Incomplete Play


In [10]:
# Player IDs are 1-based positions in load.Player.unique(), i.e. order of
# first appearance in the original dataframe.
encoded_players = encode_players(outcome_data, prior_events_data, load.Player.unique(), window_len)
encoded_players.head()

Unnamed: 0,0,1,2
0,2,2,2
1,11,11,11
2,13,11,13
3,2,1,1
4,19,19,1


In [12]:
# Lookup from event name to its one-hot vector, built from the distinct
# event names; the second line displays the vector for "Shot" as a check.
event_oh = one_hot_encoder(load.Event.unique())
event_oh["Shot"]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [116]:
# Distinct event names, shown to compare against the one-hot positions above.
load.Event.unique()

array(['Faceoff Win', 'Puck Recovery', 'Zone Entry', 'Shot', 'Play',
       'Takeaway', 'Dump In/Out', 'Incomplete Play', 'Penalty Taken',
       'Goal'], dtype=object)

In [91]:
# One-hot encode the outcome event (target) and each of the prior events.
target_encoded_events, encoded_events = encode_events(outcome_data, prior_events_data, event_oh, window_len)
encoded_events.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1
0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


## Finalize data

### Event data

In [40]:
# Numeric details of the outcome itself: the categorical Player/Event
# columns are encoded separately, and rows are renumbered from 0.
final_event_data_1 = (
    outcome_data
    .drop(columns=["Player", "Event"])
    .reset_index(drop=True)
)
final_event_data_1.head()

Unnamed: 0,Period,Home Team Skaters,Away Team Skaters,X Coordinate,Y Coordinate
0,1,5,5,131,28
1,1,5,5,162,2
2,1,5,5,167,18
3,1,5,5,175,68
4,1,5,5,196,20


In [39]:
# Numeric details of the prior events: every column whose name contains
# "Event" or "Player" is removed, and rows are renumbered from 0.
final_event_data_2 = (
    prior_events_data
    .drop(columns=prior_events_data.filter(regex="Event|Player").columns)
    .reset_index(drop=True)
)
final_event_data_2.head()

Unnamed: 0,Period_l1,Home Team Skaters_l1,Away Team Skaters_l1,X Coordinate_l1,Y Coordinate_l1,Period_l2,Home Team Skaters_l2,Away Team Skaters_l2,X Coordinate_l2,Y Coordinate_l2
0,1,5,5,125,28,1,5,5,107,40
1,1,5,5,124,2,1,5,5,112,0
2,1,5,5,168,10,1,5,5,124,20
3,1,5,5,135,52,1,5,5,125,70
4,1,5,5,196,20,1,5,5,174,82


In [92]:
# Column-wise join of the one-hot prior events, the outcome details and the
# prior-event details; column labels are replaced by 0..k-1.
final_event_data = pd.concat(
    [encoded_events, final_event_data_1, final_event_data_2],
    axis=1,
    ignore_index=True,
)
final_event_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0,0,1,0,0,0,0,0,0,0,...,1,5,5,125,28,1,5,5,107,40
1,0,0,1,0,0,0,0,0,0,0,...,1,5,5,124,2,1,5,5,112,0
2,0,0,0,0,1,0,0,0,0,0,...,1,5,5,168,10,1,5,5,124,20
3,0,0,0,0,1,0,0,0,0,0,...,1,5,5,135,52,1,5,5,125,70
4,0,1,0,0,0,0,0,0,0,0,...,1,5,5,196,20,1,5,5,174,82


### Player data

In [45]:
# The player features are the integer IDs computed earlier; shown for reference.
encoded_players.head()

Unnamed: 0,0,1,2
0,2,2,2
1,11,11,11
2,13,11,13
3,2,1,1
4,19,19,1


## Train-test split

In [70]:
import random

In [87]:
# Shuffle the row positions with a dedicated, seeded generator so that the
# train/test split is identical on every Restart & Run All.
nrow = encoded_players.shape[0]
random_ind = list(range(nrow))
random.Random(2021).shuffle(random_ind)

In [102]:
# Fraction of the rows assigned to the training set.
train_test_split = 0.8
# Number of training rows; int() truncates the fractional part.
split = int(nrow * train_test_split)

In [121]:
# Row positions of each partition, taken from the shuffled order.
train_ind = random_ind[:split]
test_ind = random_ind[split:]

# Position of the "Goal" flag inside the one-hot vector, looked up from the
# encoder rather than assumed to be the last column.
goal_col = int(np.argmax(event_oh["Goal"]))

# All frames are selected by position so features and labels stay aligned.
X_train_player = encoded_players.iloc[train_ind]
X_train_event = final_event_data.iloc[train_ind]
y_train = target_encoded_events.iloc[train_ind, [goal_col]]

X_test_player = encoded_players.iloc[test_ind]
X_test_event = final_event_data.iloc[test_ind]
y_test = target_encoded_events.iloc[test_ind, [goal_col]]

## Keras Hypermodel

In [None]:
from kerastuner import HyperModel

import tensorflow
from tensorflow import keras
from tensorflow.keras.layers import (Embedding, InputLayer, Dense,
                                     Activation)
from tensorflow.keras import optimizers
from tensorflow.keras import metrics

In [None]:
class Hypermodel(HyperModel):
    """Tunable binary classifier: an embedding layer followed by two dense
    hidden layers and a single sigmoid output.

    Tuned hyperparameters: the width and activation of each hidden layer
    ('units_1', 'activation_1', 'units_2', 'activation_2') and the Adam
    learning rate ('learning_rate').
    """

    def __init__(self, input_shape, n_players, emb_dim):
        """Store the fixed (non-tuned) settings of the model.

        Arguments:
        input_shape -- shape passed to the model's InputLayer.
        n_players -- input dimension (vocabulary size) of the Embedding layer.
        emb_dim -- output dimension of the Embedding layer.
        """
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.input_shape = input_shape
        self.n_players = n_players
        self.emb_dim = emb_dim

    def build(self, hp):
        """Build and compile a model for one hyperparameter sample.

        Arguments:
        hp -- hyperparameter container supplied by the tuner.

        Returns:
        model -- compiled keras.Sequential model using binary cross-entropy
                 loss and reporting accuracy.
        """
        model = keras.Sequential()
        model.add(InputLayer(input_shape=self.input_shape))
        # NOTE(review): encode_players() produces IDs starting at 1, so
        # n_players presumably has to be (largest ID + 1) -- verify at the
        # call site.
        model.add(Embedding(self.n_players, self.emb_dim, trainable=True))
        # NOTE(review): the Dense layers below act on the last axis of the
        # embedding output; if one prediction per sample is intended, a
        # flattening step may be needed here -- confirm.

        # Two hidden layers that share the same search space.
        for layer_no in (1, 2):
            model.add(
                Dense(units=hp.Int('units_{}'.format(layer_no),
                                   min_value=16,
                                   max_value=128,
                                   step=16),
                      activation=hp.Choice('activation_{}'.format(layer_no),
                                           values=['relu', 'tanh', 'sigmoid']))
            )
        model.add(Dense(units=1))
        model.add(Activation(activation='sigmoid'))

        # loss and metrics are arguments of compile(), not of the optimizer.
        model.compile(
            optimizer=optimizers.Adam(
                learning_rate=hp.Float('learning_rate',
                                       min_value=1e-4,
                                       max_value=1e-2,
                                       sampling='log')),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        return model

### Hyperparameter Tuning