In [None]:
# Version 2


# Version 3

Very minor updates:

- Standardize on using 2018, 2020, 2022 seasons for training
- Turn Periods into a classification statistic
- Fix some of the columns so that Penalty Shots make sense?


In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import seaborn as sns
import matplotlib.pyplot as plt

import data_prep

In [20]:
def create_input_data_from_csv(path_to_csv):
    df = pd.read_csv(path_to_csv)

    # get list of ilocs of rows where the 'Event' column is 'GOAL or SHOT or MISS'
    goal_shot_rows = df.loc[df['Event'].isin(['GOAL', 'SHOT', 'MISS'])]

    # shift the dataframe by 1 row
    shifted_df = df.shift(1)
    shifted_df.columns = ['prev_' + name for name in df.columns]

    # combine goal_shot_rows with shifted_df, but using rows from goal_shot_rows
    goal_shot_rows = pd.concat([goal_shot_rows, shifted_df.loc[goal_shot_rows.index]], axis=1)

    goal_shot_rows = goal_shot_rows[goal_shot_rows['Period'] != 0]
    goal_shot_rows = goal_shot_rows[goal_shot_rows['Period'] != 5]

    # Create "home_or_away" column based on if "Ev_team" is the same as "Home_Team" or "Away_Team"
    goal_shot_rows['home_or_away'] = goal_shot_rows.apply(lambda x: 'home' if x['Ev_Team'] == x['Home_Team'] else 'away', axis=1)
    
    # check to see if on power play if "home_or_away" is equal to "home" and "Strength" is equal to "5x4" or "home_or_away" is equal to "away" and "Strength" is equal to "4x5"
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "5x4" if (x['home_or_away'] == 'home' and x['Strength'] == '5x4' ) or (x['home_or_away'] == 'away' and x['Strength'] == '4x5') else "5x5", axis=1)

    # similar as above, but for 5x3
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "5x3" if (x['home_or_away'] == 'home' and x['Strength'] == '5x3' ) or (x['home_or_away'] == 'away' and x['Strength'] == '3x5') else x['true_strength'], axis=1)

    # similar as above, but for 6x5
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "6x5" if (x['home_or_away'] == 'home' and x['Strength'] == '6x5' ) or (x['home_or_away'] == 'away' and x['Strength'] == '5x6') else x['true_strength'], axis=1)

    # similar as above, but for 4x3
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "4x3" if (x['home_or_away'] == 'home' and x['Strength'] == '4x3' ) or (x['home_or_away'] == 'away' and x['Strength'] == '3x4') else x['true_strength'], axis=1)

    # similar as above,but for 3x3
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "3x3" if (x['home_or_away'] == 'home' and x['Strength'] == '3x3' ) or (x['home_or_away'] == 'away' and x['Strength'] == '3x3') else x['true_strength'], axis=1)

    # check to see if a shot is on an empty net by checking if home_or_away is equal to "home" and "Away_Goalie" is equal to "None" or home_or_away is equal to "away" and "Home_Goalie" is equal to "None"
    goal_shot_rows['on_empty_net'] = goal_shot_rows.apply(lambda x: 1 if (x['home_or_away'] == 'home' and x['Away_Goalie'] == '') or (x['home_or_away'] == 'away' and x['Home_Goalie'] == '') else 0, axis=1)


    val_input_data = goal_shot_rows[['Event', 'Period', 'Seconds_Elapsed', 'Strength', 'Type', 'xC', 'yC', 'home_or_away', 'true_strength', 'on_empty_net', 'prev_Event', 'prev_Period', 'prev_Seconds_Elapsed', 'prev_Strength', 'prev_Type', 'prev_xC', 'prev_yC']]
    val_result_data = goal_shot_rows['Event']
    val_result_data = val_result_data.apply(lambda x: 1 if x == "GOAL" else 0)

    # Create a new column "is_PS" if prev_Type starts with "PS"
    val_input_data['prev_Type'] = val_input_data['prev_Type'].fillna("")
    val_input_data['is_PS'] = val_input_data['prev_Type'].str.startswith('PS').astype(int)

    # For "prev_Type" column, if "min)" or "maj)", then convert it to just "PENL"
    val_input_data['prev_Type'] = val_input_data['prev_Type'].apply(lambda x: 'PENL' if x.endswith('min)') or x.endswith('maj)') else x)
    
    # For "prev_Event" column, if it start with "PS", then convert it to just "PS"
    # val_input_data['prev_Event'] = val_input_data['prev_Event'].apply(lambda x: 'PS' if x.startswith('PS') else x)
    print(val_input_data['prev_Type'].unique())

    val_input_data = val_input_data.drop(columns=['Event'])

    # Split out the data between numeric values (can carry forward) and categorical values (need to be turned into binary columns)
    val_input_data_numeric = val_input_data[['Seconds_Elapsed', 'xC', 'yC', 'on_empty_net','prev_Seconds_Elapsed', 'prev_xC', 'prev_yC', 'is_PS']]
    val_input_data_categorical = pd.DataFrame()



    for column in ['Period', 'Type', 'true_strength', 'home_or_away']:
        dummy_columns = pd.get_dummies(val_input_data[column])
        val_input_data_categorical = pd.concat([dummy_columns, val_input_data_categorical], axis=1)

    for column in ['prev_Event', 'prev_Strength', 'prev_Type']:
        dummy_columns = pd.get_dummies(val_input_data[column])

        # rename all columns with a prefix of "prev_"
        dummy_columns.columns = ['prev_' + str(col) for col in dummy_columns.columns]
        val_input_data_categorical = pd.concat([val_input_data_categorical, dummy_columns], axis=1)
        
    # change all True/False to 1/0
    val_input_data_categorical = val_input_data_categorical.applymap(lambda x: 1 if x == True else 0)

    # combine the two dataframes
    val_input_data_combined = pd.concat([val_input_data_numeric, val_input_data_categorical], axis=1)

    #set val_input_data2 to be all floats
    val_input_data_combined = val_input_data_combined.astype(float)

    print(val_input_data_combined.columns)

    columns = ['Period', 'Seconds_Elapsed', 'xC', 'yC', 'prev_Seconds_Elapsed',
            'prev_xC', 'prev_yC', 'BLOCK', 'CHL', 'DELPEN', 'FAC', 'GIVE', 'HIT',
            'MISS', 'PENL', 'SHOT', 'STOP', 'TAKE', 'BACKHAND', 'DEFLECTED',
            'SLAP SHOT', 'SNAP SHOT', 'TIP-IN', 'WRAP-AROUND', 'WRIST SHOT', '0x0',
            '3x3', '3x4', '3x5', '4x3', '4x4', '4x5', '5x3', '5x4', '5x5', '6x5',
            'on_empty_net', 'away', 'home', '5v4', '5v5',
            'prev_BLOCK', 'prev_CHL', 'prev_DELPEN', 'prev_FAC', 'prev_GIVE',
            'prev_HIT', 'prev_MISS', 'prev_PENL', 'prev_SHOT', 'prev_STOP',
            'prev_TAKE', 'prev_0x5', 'prev_3x3', 'prev_3x4', 'prev_3x5', 'prev_4x3',
            'prev_4x4', 'prev_4x5', 'prev_5x3', 'prev_5x4', 'prev_5x5', 'prev_5x6', 'prev_6x5',
            'prev_BACKHAND', 'prev_DEFLECTED',
            'prev_PS-Covering puck in crease(0 min)',
            'prev_PS-Goalkeeper displaced net(0 min)',
            'prev_PS-Holding on breakaway(0 min)',
            'prev_PS-Hooking on breakaway(0 min)',
            'prev_PS-Slash on breakaway(0 min)',
            'prev_PS-Throw object at puck(0 min)',
            'prev_PS-Tripping on breakaway(0 min)', 'prev_SLAP SHOT',
            'prev_SNAP SHOT', 'prev_TIP-IN', 'prev_WRAP-AROUND', 'prev_WRIST SHOT']

    # For each column in input_data_combined, check if it exists in val_input_data_combined, if not add it as all 0s
    for column in columns:
        if column not in val_input_data_combined.columns:
                val_input_data_combined[column] = 0

    for column in val_input_data_combined.columns:
        if column not in columns:
                if column in val_input_data_combined.columns:
                    print(f"Column {column} not in columns")
                    val_input_data_combined = val_input_data_combined.drop(column, axis=1)


    # reduce val_input_data_combined to have the same columns as input_data_combined
    val_input_data_combined = val_input_data_combined[columns]

    # change all NaN numbers to 0
    val_input_data_combined = val_input_data_combined.fillna(0)

    print(val_input_data_combined.shape)

    return val_input_data_combined, val_result_data, goal_shot_rows

In [21]:
input_data, result_data, original_input = create_input_data_from_csv(r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20182019.csv")

# input_data2, result_data2, original_input2 = create_input_data_from_csv(r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20202021.csv")

# input_data3, result_data3, original_input3 = create_input_data_from_csv(r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20222023.csv")

# combine the two dataframes
input_data = pd.concat([input_data, input_data2])
result_data = pd.concat([result_data, result_data2])
original_input = pd.concat([original_input, original_input2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['prev_Type'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['is_PS'] = val_input_data['prev_Type'].str.startswith('PS').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Event'] = val_input_data['pr

['' 'BACKHAND' 'SNAP SHOT' 'TIP-IN' 'WRIST SHOT' 'SLAP SHOT' 'WRAP-AROUND'
 'DEFLECTED' 'Holding(2 min)' 'PS-Slash on breakaway(0 min)'
 'Hooking(2 min)' 'PS-Hooking on breakaway(0 min)'
 'Unsportsmanlike conduct(2 min)' 'Misconduct (10 min)' 'Slashing(2 min)'
 'PS-Holding on breakaway(0 min)' 'PS-Tripping on breakaway(0 min)'
 'Hi-sticking(2 min)' 'Too many men/ice - bench(2 min)'
 'PS-Throw object at puck(0 min)' 'Tripping(2 min)'
 'DM #15 PETROVIC\xa0PS-Holding on breakaway(0 min)' 'Roughing(2 min)'
 'Fighting (maj)' 'Interference(2 min)' 'Game Misconduct(10 min)'
 'SH #76 SUBBAN\xa0PS-Slash on breakaway(0 min)']
Index([                                    'Seconds_Elapsed',
                                                        'xC',
                                                        'yC',
                                              'on_empty_net',
                                      'prev_Seconds_Elapsed',
                                                   'prev_xC',
    

In [30]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

x = torch.tensor(input_data.values, dtype=torch.float32)
y = torch.tensor(result_data.values, dtype=torch.float32)

x.to(device)
y.to(device)

model = nn.Sequential(
    nn.Linear(78, 240),
    nn.ReLU(),
    nn.Linear(240, 60),
    nn.ReLU(),
    nn.Linear(60, 1),
    nn.Sigmoid()
)

model.to(device)


loss_fn = nn.BCELoss()  # binary cross entropy
loss_fn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


Using device: cuda

Sequential(
  (0): Linear(in_features=78, out_features=240, bias=True)
  (1): ReLU()
  (2): Linear(in_features=240, out_features=60, bias=True)
  (3): ReLU()
  (4): Linear(in_features=60, out_features=1, bias=True)
  (5): Sigmoid()
)


In [35]:
n_epochs = 1000000
batch_size = 20000

optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

for epoch in range(n_epochs):
    for i in range(0, len(x), batch_size):
        Xbatch = x[i:i+batch_size]
        y_pred = model(Xbatch.cuda())
        ybatch = y[i:i+batch_size]
        #loss = loss_fn(y_pred, ybatch)
        loss = loss_fn(y_pred.squeeze(), ybatch.cuda())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch % 1000 == 0):
        print(f'{epoch}, {loss}')


# compute accuracy (no_grad is optional)
# y_pred = model(x.cuda()).cuda()
 

torch.save(model.state_dict(), '0427Model_2M.pt')

0, 0.06435119360685349
1000, 0.029508115723729134
2000, 0.02843782864511013
3000, 0.027066782116889954
4000, 0.031083621084690094
5000, 0.02637661062180996
6000, 0.024489188566803932
7000, 0.0261500496417284
8000, 0.024999871850013733
9000, 0.03033571131527424
10000, 0.027510501444339752
11000, 0.02336697094142437
12000, 0.02637491747736931
13000, 0.022671477869153023
14000, 0.023110779002308846
15000, 0.02338496409356594
16000, 0.024180704727768898
17000, 0.022812016308307648
18000, 0.022526882588863373
19000, 0.022776436060667038
20000, 0.024123717099428177
21000, 0.021103115752339363
22000, 0.022420629858970642
23000, 0.02945699915289879
24000, 0.02226332388818264
25000, 0.021382685750722885
26000, 0.05597173050045967
27000, 0.021139968186616898
28000, 0.021723579615354538
29000, 0.021278994157910347
30000, 0.021835947409272194
31000, 0.021502727642655373
32000, 0.02030188962817192
33000, 0.021840710192918777
34000, 0.02101147174835205
35000, 0.020563242956995964
36000, 0.0231709722

: 

In [11]:
torch.save(model.state_dict(), '0426Model_1M.pt')

In [7]:
filepath = '0426Model_100k.pt'

model.load_state_dict(torch.load(filepath))
model.eval()
model.cpu()



with torch.no_grad():
    y_pred = model(x.cpu())

# print dimensions of y_pred and y
print(y_pred.shape)
print(y.shape)

y_rounded = y_pred.round()
y_rounded.to('cpu')
y.to('cpu')

accuracy = (y_rounded.cpu() == y.cpu()).float().mean()

torch.save(model.state_dict(), '0426Model_100k.pt')

torch.Size([121992, 1])
torch.Size([121992])
