In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
path = ("C:/Users/Ong Jia Yi/Desktop/STUDY/Summer 2021"
        "/Rotman MMA Summer 2021 Datathon")

load = pd.read_csv(path + "/NWHL.csv")

## Basic Preprocessing

In [3]:
load['Home Team Y/N'] = (load.Team == load.loc[:,'Home Team'])*1

In [4]:
def time_elapsed(dataframe):
    """Return game time elapsed in seconds based on period number"""
    period, clock = dataframe['Period'], dataframe['Clock']
    m, s = clock.split(':')
    seconds = int(m)*60 + int(s)
    
    if period == 4:
        elapsed = 5*60 - seconds
    else:
        elapsed = 20*60 - seconds
    return elapsed

In [5]:
load['Elapsed'] = load.apply(lambda x: time_elapsed(x), axis=1)

In [6]:
load.Event = load.Event.replace('Goal', 'Shot')

## Function - prior_events

In [7]:
def prior_events(outcome_data):
    """Return a dataframe of the details of events that led to an outcome
    of interest.
    
    Arguments:
    outcome_data -- subset of the original Pandas DataFrame where
                    the outcome(s) of interest occur.Indexes of <outcome_data>
                    preserves that of the original dataframe.
    Returns:
    new_frame -- Pandas DataFrame containing info. on prior events.
    """
    new_frame = []
    for i in outcome_data.index:
        
        if i < window_len:
            continue

        new_row = []
        for j in range(1, window_len+1):
            last_event = load.loc[i-j, outcome_data.columns]
            last_event = np.array(last_event)
            new_row.append(last_event)
        new_row = np.concatenate(new_row)

        new_frame.append([new_row])
    
    new_frame = pd.DataFrame(np.concatenate(new_frame))
    colnames = ["{}_l{}".format(name, i) for i in range(1, window_len+1) \
                for name in outcome_data.columns]
    new_frame.columns = colnames
    
    return new_frame

## Function - encode_player

In [8]:
def encode_players(outcome_data, prior_events_data, all_players, window_len):
    """Encode the outcome player and prior event player(s) by their IDs,
    which are their order of appearance in the original dataset.
    
    Arguments:
    outcome_data -- subset of the original Pandas DataFrame where
                    the outcome(s) of interest occur.
    prior_events_data -- output from prior_events() function with 
                         the same <outcome_data> input.
    all_players -- a Numpy array of all player names in the original 
                   dataset (no repetition).
    window_len -- (int) the number of previous events to include.
    
    Returns:
    player_encoded -- a Pandas DataFrame of the IDs of the player associated
                      to the outcome and prior events.
    """
    outcome_player = outcome_data.loc[window_len:,'Player'].reset_index(drop=True)
    prior_players = prior_events_data.filter(regex="Player").reset_index(drop=True)

    merged = pd.concat([outcome_player, prior_players], ignore_index=True, axis=1)
    
    players_encoded = merged.applymap(lambda x: np.where(x == all_players)[0][0])
    
    return players_encoded

## Function - one_hot_encoder

In [9]:
def one_hot_encoder(categories):
    """Creates a one-hot encoding for categorical data
    e.g. name of event or name of player.
    
    Arguments:
    categories -- Numpy array of all instances of the categorical data.
    
    Returns:
    oh_dict -- dictionary that maps instance of categorical data to 
               a Numpy array of one-hot encoding integers.
    """
    oh_mat = pd.get_dummies(categories)
    
    oh_dict = dict()
    for cat in categories:
        oh_dict[cat] = np.array(oh_mat.loc[:, cat])
        
    return oh_dict

## Function - encode_events

In [10]:
def encode_events(outcome_data, prior_events_data, event_oh, window_len):
    """Encode the outcome and prior event(s) by their IDs,
    which are their order of appearance in the original dataset.
    
    Arguments:
    outcome_data -- subset of the original Pandas DataFrame where
                    the outcome(s) of interest occur.
    prior_events_data -- output from prior_events() function with 
                         the same <outcome_data> input.
    event_oh -- one-hot-encoding dictionary for events.
    window_len -- (int) the number of previous events to include.
    
    Returns:
    events_encoded -- a Pandas DataFrame of the IDs of the event associated
                      to the outcome and prior events.
    """
    outcome_events = outcome_data.loc[window_len:, 'Event'].reset_index(drop=True)
    outcome_events = pd.DataFrame(outcome_events).applymap(lambda x: event_oh[x])
    
    prior_events = prior_events_data.filter(regex="Event").reset_index(drop=True)
    prior_events = pd.DataFrame(prior_events).applymap(lambda x: event_oh[x])
    
    target_events_encoded = pd.DataFrame(outcome_events.iloc[:,0].to_list())
    events_encoded = pd.DataFrame(prior_events.iloc[:,0].to_list())
    
    for i in range(1, prior_events.shape[1]):
        prior_encoded_sub = pd.DataFrame(prior_events.iloc[:,i].to_list())
        events_encoded = pd.concat([events_encoded, prior_encoded_sub], axis=1)
    
    
    return target_events_encoded, events_encoded

## Implementing functions

In [11]:
# interested in events leading to outcomes of Shot/Goal
outcomes_interest = load.Event.unique()

# capture two prior events leading to outcome
window_len = 3

#  a list of variable names to describe prior events.
detail_names = ['Period', 'Elapsed', 'Home Team Y/N',
                'Home Team Skaters', 'Away Team Skaters',
                'Home Team Goals', 'Away Team Goals',
                'X Coordinate', 'Y Coordinate', 'Player', 'Event']

# Subset rows only with Event == Shot/Goal
# Indexes of <outcome_data> preserves that of the original dataframe.
outcome_data = load.loc[load.Event.isin(outcomes_interest), detail_names]

In [12]:
prior_events_data = prior_events(outcome_data)
prior_events_data.head()

Unnamed: 0,Period_l1,Elapsed_l1,Home Team Y/N_l1,Home Team Skaters_l1,Away Team Skaters_l1,Home Team Goals_l1,Away Team Goals_l1,X Coordinate_l1,Y Coordinate_l1,Player_l1,...,Elapsed_l3,Home Team Y/N_l3,Home Team Skaters_l3,Away Team Skaters_l3,Home Team Goals_l3,Away Team Goals_l3,X Coordinate_l3,Y Coordinate_l3,Player_l3,Event_l3
0,1,3,0,5,5,0,0,125,28,McKenna Brand,...,0,0,5,5,0,0,100,43,Jillian Dempsey,Faceoff Win
1,1,5,0,5,5,0,0,131,28,McKenna Brand,...,2,0,5,5,0,0,107,40,McKenna Brand,Puck Recovery
2,1,7,0,5,5,0,0,169,21,Tereza Vanisova,...,3,0,5,5,0,0,125,28,McKenna Brand,Zone Entry
3,1,8,0,5,5,0,0,159,26,Samantha Davis,...,5,0,5,5,0,0,131,28,McKenna Brand,Shot
4,1,10,1,5,5,0,0,101,31,Stephanie Anderson,...,7,0,5,5,0,0,169,21,Tereza Vanisova,Faceoff Win


In [13]:
encoded_players = encode_players(outcome_data, prior_events_data, load.Player.unique(), window_len)
encoded_players.head()

Unnamed: 0,0,1,2,3
0,1,1,1,0
1,2,1,1,1
2,3,2,1,1
3,4,3,2,1
4,4,4,3,2


In [14]:
event_oh = one_hot_encoder(load.Event.unique())
event_oh["Shot"]

array([0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=uint8)

In [15]:
target_encoded_events, encoded_events = encode_events(outcome_data, prior_events_data, event_oh, window_len)
encoded_events.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,0.1,...,8.1,0.2,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.2
0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [16]:
target_encoded_events.shape

(26879, 9)

In [17]:
target_encoded_events.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0


In [18]:
# for multi-class
# target = target_encoded_events.loc[:, (target_encoded_events != 0).any(axis=0)]

In [34]:
# for binary-class
target = target_encoded_events.iloc[:,3]

## Finalize data

### Event data

In [20]:
final_event_data_1 = outcome_data.drop(["Player", "Event"], axis=1)
final_event_data_1 = final_event_data_1.reset_index(drop=True)
final_event_data_1.head()

Unnamed: 0,Period,Elapsed,Home Team Y/N,Home Team Skaters,Away Team Skaters,Home Team Goals,Away Team Goals,X Coordinate,Y Coordinate
0,1,0,0,5,5,0,0,100,43
1,1,2,0,5,5,0,0,107,40
2,1,3,0,5,5,0,0,125,28
3,1,5,0,5,5,0,0,131,28
4,1,7,0,5,5,0,0,169,21


In [21]:
final_event_data_2 = prior_events_data.drop(list(prior_events_data.filter(regex="Event|Player")), axis=1)
final_event_data_2 = final_event_data_2.reset_index(drop=True)
final_event_data_2.head()

Unnamed: 0,Period_l1,Elapsed_l1,Home Team Y/N_l1,Home Team Skaters_l1,Away Team Skaters_l1,Home Team Goals_l1,Away Team Goals_l1,X Coordinate_l1,Y Coordinate_l1,Period_l2,...,Y Coordinate_l2,Period_l3,Elapsed_l3,Home Team Y/N_l3,Home Team Skaters_l3,Away Team Skaters_l3,Home Team Goals_l3,Away Team Goals_l3,X Coordinate_l3,Y Coordinate_l3
0,1,3,0,5,5,0,0,125,28,1,...,40,1,0,0,5,5,0,0,100,43
1,1,5,0,5,5,0,0,131,28,1,...,28,1,2,0,5,5,0,0,107,40
2,1,7,0,5,5,0,0,169,21,1,...,28,1,3,0,5,5,0,0,125,28
3,1,8,0,5,5,0,0,159,26,1,...,21,1,5,0,5,5,0,0,131,28
4,1,10,1,5,5,0,0,101,31,1,...,26,1,7,0,5,5,0,0,169,21


In [22]:
final_event_data = pd.concat([encoded_events, final_event_data_1, final_event_data_2], ignore_index=True, axis=1)
final_event_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,40,1,0,0,5,5,0,0,100,43
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28,1,2,0,5,5,0,0,107,40
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28,1,3,0,5,5,0,0,125,28
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,21,1,5,0,5,5,0,0,131,28
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26,1,7,0,5,5,0,0,169,21


### Player data

In [23]:
encoded_players.head()

Unnamed: 0,0,1,2,3
0,1,1,1,0
1,2,1,1,1
2,3,2,1,1
3,4,3,2,1
4,4,4,3,2


## Train-test split

In [24]:
import random

In [25]:
nrow = encoded_players.shape[0]
random_ind = [i for i in range(0, nrow)]
random.shuffle(random_ind)

In [26]:
train_test_split = 0.9
split = int(train_test_split*nrow)

In [27]:
X_train_player = encoded_players.loc[random_ind[:split]]
X_train_event = final_event_data.loc[random_ind[:split]].astype('float32')
y_train = target.iloc[random_ind[:split], ].astype('float32')
# y_train = np.array(y_train).reshape(y_train.shape[0],)

X_test_player = encoded_players.loc[random_ind[split:]]
X_test_event = final_event_data.loc[random_ind[split:]].astype('float32')
y_test = target.iloc[random_ind[split:], ].astype('float32')
# y_test = np.array(y_test).reshape(y_test.shape[0],)

In [28]:
X_train_event.shape

(24191, 63)

In [29]:
y_train.shape

(24191,)

In [30]:
# percentage of positive class in training
np.sum(y_train)/y_train.shape[0]

0.07465586375098177

In [31]:
# percentage of positive class in test
np.sum(y_test)/y_test.shape[0]

0.0665922619047619

In [32]:
data = {'X_train_player': X_train_player, 'X_train_event': X_train_event, 'y_train': y_train,
        'X_test_player': X_test_player, 'X_test_event': X_test_event, 'y_test': y_test}

In [33]:
pickle.dump(data, open("DATA_shot_goal_merged.p", "wb"))