In [None]:
# Version 2


# Version 2

Some things I need to address in this version:

1. Currently, the model has no understanding of empty nets.
2. Currently, strength values such as 5x4 / 4x5 are listed as home x away, so the model doesn't know whether a shot is coming from the power play or the penalty kill.
3. Missed shots should also be included (moving from shots on goal to Fenwick; Corsi isn't possible because blocked shots don't record the shot's origin).


Note (4/26): the model currently doesn't understand what a 5x3 is, but I'm working on it!

In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import seaborn as sns
import matplotlib.pyplot as plt

import data_prep

In [16]:
def create_input_data_from_csv(path_to_csv):
    """Build model-ready shot features and goal labels from a play-by-play CSV.

    Every GOAL / SHOT / MISS event is paired with the event on the row
    directly above it in the file (columns prefixed with ``prev_``), rows with
    ``Period`` 0 or 5 are discarded, and the result is encoded into a fixed
    77-column float matrix.

    Parameters
    ----------
    path_to_csv : str or path-like
        Location of the play-by-play CSV. The file must provide the columns
        ``Event``, ``Period``, ``Seconds_Elapsed``, ``Strength``, ``Type``,
        ``xC``, ``yC``, ``Ev_Team``, ``Home_Team``, ``Home_Goalie`` and
        ``Away_Goalie``.

    Returns
    -------
    tuple
        ``(features, labels, goal_shot_rows)`` where ``features`` is a float
        DataFrame with exactly the columns listed in ``columns`` below (NaN
        replaced by 0), ``labels`` is an integer Series that is 1 for GOAL
        events and 0 otherwise, and ``goal_shot_rows`` is the un-encoded
        frame (original columns, ``prev_*`` columns and the derived
        ``home_or_away``, ``true_strength`` and ``on_empty_net`` columns).

    Side effects
    ------------
    Prints one line per encoded column that is not part of the fixed layout
    (those columns are dropped) and prints the shape of the feature matrix.
    """
    df = pd.read_csv(path_to_csv)

    # Keep only the rows whose 'Event' is GOAL, SHOT or MISS.
    goal_shot_rows = df.loc[df['Event'].isin(['GOAL', 'SHOT', 'MISS'])]

    # Shift the whole frame down one row so each event can see its predecessor.
    shifted_df = df.shift(1)
    shifted_df.columns = ['prev_' + name for name in df.columns]

    # Attach the predecessor's columns to every shot row (aligned on index).
    goal_shot_rows = pd.concat([goal_shot_rows, shifted_df.loc[goal_shot_rows.index]], axis=1)

    # Drop periods 0 and 5; .copy() so the column assignments below act on an
    # independent frame instead of a slice of the concat result.
    goal_shot_rows = goal_shot_rows[~goal_shot_rows['Period'].isin([0, 5])].copy()

    # "home" when the event team is the home team, otherwise "away".
    is_home = goal_shot_rows['Ev_Team'] == goal_shot_rows['Home_Team']
    goal_shot_rows['home_or_away'] = np.where(is_home, 'home', 'away')

    # 'Strength' is compared as home x away: the shooter is on a 5-on-4 power
    # play when home shoots at '5x4' or away shoots at '4x5'. Every other
    # situation is labelled "5v5".
    strength = goal_shot_rows['Strength']
    on_power_play = (is_home & (strength == '5x4')) | (~is_home & (strength == '4x5'))
    goal_shot_rows['true_strength'] = np.where(on_power_play, '5v4', '5v5')

    # A shot is on an empty net when the *defending* team has no goalie listed.
    # pd.read_csv loads blank cells as NaN, so comparing against '' never
    # matches; test for missing values (and blank / 'None' strings) instead.
    defending_goalie = goal_shot_rows['Away_Goalie'].where(is_home, goal_shot_rows['Home_Goalie'])
    goalie_absent = (
        defending_goalie.isna()
        | defending_goalie.astype(str).str.strip().isin(['', 'None'])
    )
    goal_shot_rows['on_empty_net'] = goalie_absent.astype('int64')

    # Label: 1 for a goal, 0 for any other shot attempt.
    val_result_data = (goal_shot_rows['Event'] == 'GOAL').astype('int64')

    # Numeric values are carried forward unchanged.
    numeric_columns = ['Period', 'Seconds_Elapsed', 'xC', 'yC', 'on_empty_net',
                       'prev_Seconds_Elapsed', 'prev_xC', 'prev_yC']
    val_input_data_numeric = goal_shot_rows[numeric_columns]

    # Categorical values become one-hot columns. 'prev_Event' is deliberately
    # encoded twice (once without and once with the 'prev_' prefix) because the
    # fixed layout below contains both sets of names.
    # NOTE(review): the raw 'Strength' column is never encoded here, so the
    # unprefixed '0x0' ... '6x5' columns appear to always be zero-filled --
    # confirm that this is intended.
    dummy_frames = [
        pd.get_dummies(goal_shot_rows[column], dtype=float)
        for column in ['Type', 'prev_Event', 'true_strength', 'home_or_away']
    ]
    dummy_frames += [
        pd.get_dummies(goal_shot_rows[column], dtype=float).add_prefix('prev_')
        for column in ['prev_Event', 'prev_Strength', 'prev_Type']
    ]

    # Combine numeric and one-hot parts into a single all-float frame.
    val_input_data_combined = pd.concat([val_input_data_numeric] + dummy_frames, axis=1).astype(float)

    # Fixed column layout (names and order) of the feature matrix.
    columns = ['Period', 'Seconds_Elapsed', 'xC', 'yC', 'prev_Seconds_Elapsed',
            'prev_xC', 'prev_yC', 'BLOCK', 'CHL', 'DELPEN', 'FAC', 'GIVE', 'HIT',
            'MISS', 'PENL', 'SHOT', 'STOP', 'TAKE', 'BACKHAND', 'DEFLECTED',
            'SLAP SHOT', 'SNAP SHOT', 'TIP-IN', 'WRAP-AROUND', 'WRIST SHOT', '0x0',
            '3x3', '3x4', '3x5', '4x3', '4x4', '4x5', '5x3', '5x4', '5x5', '6x5',
            'on_empty_net', 'away', 'home', '5v4', '5v5',
            'prev_BLOCK', 'prev_CHL', 'prev_DELPEN', 'prev_FAC', 'prev_GIVE',
            'prev_HIT', 'prev_MISS', 'prev_PENL', 'prev_SHOT', 'prev_STOP',
            'prev_TAKE', 'prev_0x5', 'prev_3x3', 'prev_3x4', 'prev_3x5', 'prev_4x3',
            'prev_4x4', 'prev_4x5', 'prev_5x3', 'prev_5x4', 'prev_5x5', 'prev_5x6',
            'prev_BACKHAND', 'prev_DEFLECTED',
            'prev_PS-Covering puck in crease(0 min)',
            'prev_PS-Goalkeeper displaced net(0 min)',
            'prev_PS-Holding on breakaway(0 min)',
            'prev_PS-Hooking on breakaway(0 min)',
            'prev_PS-Slash on breakaway(0 min)',
            'prev_PS-Throw object at puck(0 min)',
            'prev_PS-Tripping on breakaway(0 min)', 'prev_SLAP SHOT',
            'prev_SNAP SHOT', 'prev_TIP-IN', 'prev_WRAP-AROUND', 'prev_WRIST SHOT']

    # Report every encoded column that is not part of the layout; these are
    # dropped by the column selection below.
    expected = set(columns)
    for column in dict.fromkeys(val_input_data_combined.columns):
        if column not in expected:
            print(f"Column {column} not in columns")

    # Add every layout column this file did not produce as all zeros, in one
    # concat rather than one insertion per column, then select the layout.
    missing = [column for column in columns if column not in val_input_data_combined.columns]
    zero_fill = pd.DataFrame(0.0, index=val_input_data_combined.index, columns=missing)
    val_input_data_combined = pd.concat([val_input_data_combined, zero_fill], axis=1)[columns]

    # Change all NaN numbers to 0.
    val_input_data_combined = val_input_data_combined.fillna(0)

    print(val_input_data_combined.shape)

    return val_input_data_combined, val_result_data, goal_shot_rows

In [17]:
# Build the feature matrix, the goal labels and the un-encoded shot rows from
# the 2022-2023 play-by-play CSV.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where that file exists -- consider a configurable data directory.
input_data, result_data, original_input = create_input_data_from_csv(r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20222023.csv")

  val_input_data_categorical = val_input_data_categorical.applymap(lambda x: 1 if x == True else 0)


Column prev_6x5 not in columns
Column prev_GY #8 TANEV PS-Hooking on breakaway(0 min) not in columns
(121992, 77)


In [19]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Tensor.to() returns a new tensor instead of moving the existing one in
# place, so its result has to be kept; calling it as a bare statement leaves
# the data where it was.
x = torch.tensor(input_data.values, dtype=torch.float32).to(device)
y = torch.tensor(result_data.values, dtype=torch.float32).to(device)

# Fully connected binary classifier: features -> 120 -> 60 -> 1, with a sigmoid
# so the output is a probability. The input width is taken from the feature
# matrix so the network stays in sync with the number of feature columns.
model = nn.Sequential(
    nn.Linear(x.shape[1], 120),
    nn.ReLU(),
    nn.Linear(120, 60),
    nn.ReLU(),
    nn.Linear(60, 1),
    nn.Sigmoid()
)

print(model)

# Module.to() moves the parameters of the module itself, so no reassignment
# is needed here.
model.to(device)


loss_fn = nn.BCELoss()  # binary cross entropy on the sigmoid output
loss_fn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


Using device: cuda

Sequential(
  (0): Linear(in_features=77, out_features=120, bias=True)
  (1): ReLU()
  (2): Linear(in_features=120, out_features=60, bias=True)
  (3): ReLU()
  (4): Linear(in_features=60, out_features=1, bias=True)
  (5): Sigmoid()
)


In [None]:
# NOTE(review): one million full passes over the data will not finish in any
# practical amount of time, and the evaluation and save below only run after
# the loop completes -- confirm the intended epoch count.
n_epochs = 1000000
batch_size = 10000

for epoch in range(n_epochs):
    # Mini-batch gradient descent over consecutive slices of the data.
    for i in range(0, len(x), batch_size):
        # Use the notebook's `device` rather than a hard-coded .cuda() so the
        # CPU fallback chosen during setup keeps working.
        Xbatch = x[i:i+batch_size].to(device)
        ybatch = y[i:i+batch_size].to(device)
        # The model outputs shape (batch, 1) while the labels are (batch,).
        # squeeze(1) removes only the feature axis, so a final batch holding a
        # single sample is not collapsed to a 0-d tensor.
        y_pred = model(Xbatch).squeeze(1)
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch.
    print(f'{epoch}, {loss}')


# Compute accuracy on the training data. Gradients are not needed here, and
# the predictions must be flattened to (N,) before comparing with the labels:
# comparing (N, 1) with (N,) broadcasts to an N x N matrix, which gives a
# meaningless accuracy and an enormous allocation.
with torch.no_grad():
    y_pred = model(x.to(device)).squeeze(1)

accuracy = (y_pred.round() == y.to(device)).float().mean()
print(f"Accuracy {accuracy}")

# Persist only the learned weights.
torch.save(model.state_dict(), '0426Model_1000k.pt')