In [None]:
# Version 2


# Version 3

Very minor updates:

- Standardize on the 2018-19, 2020-21, and 2022-23 seasons for training
- Turn Periods into a classification statistic
- Fix some of the columns so that Penalty Shots make sense?


In [18]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import seaborn as sns
import matplotlib.pyplot as plt

import data_prep

In [19]:
def create_input_data_from_csv(path_to_csv):
    df = pd.read_csv(path_to_csv)

    # get list of ilocs of rows where the 'Event' column is 'GOAL or SHOT or MISS'
    goal_shot_rows = df.loc[df['Event'].isin(['GOAL', 'SHOT', 'MISS'])]

    # shift the dataframe by 1 row
    shifted_df = df.shift(1)
    shifted_df.columns = ['prev_' + name for name in df.columns]

    # combine goal_shot_rows with shifted_df, but using rows from goal_shot_rows
    goal_shot_rows = pd.concat([goal_shot_rows, shifted_df.loc[goal_shot_rows.index]], axis=1)

    goal_shot_rows = goal_shot_rows[goal_shot_rows['Period'] != 0]
    goal_shot_rows = goal_shot_rows[goal_shot_rows['Period'] != 5]

    # Create "home_or_away" column based on if "Ev_team" is the same as "Home_Team" or "Away_Team"
    goal_shot_rows['home_or_away'] = goal_shot_rows.apply(lambda x: 'home' if x['Ev_Team'] == x['Home_Team'] else 'away', axis=1)
    
    # check to see if on power play if "home_or_away" is equal to "home" and "Strength" is equal to "5x4" or "home_or_away" is equal to "away" and "Strength" is equal to "4x5"
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "5x4" if (x['home_or_away'] == 'home' and x['Strength'] == '5x4' ) or (x['home_or_away'] == 'away' and x['Strength'] == '4x5') else "5x5", axis=1)

    # similar as above, but for 5x3
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "5x3" if (x['home_or_away'] == 'home' and x['Strength'] == '5x3' ) or (x['home_or_away'] == 'away' and x['Strength'] == '3x5') else x['true_strength'], axis=1)

    # similar as above, but for 6x5
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "6x5" if (x['home_or_away'] == 'home' and x['Strength'] == '6x5' ) or (x['home_or_away'] == 'away' and x['Strength'] == '5x6') else x['true_strength'], axis=1)

    # similar as above, but for 4x3
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "4x3" if (x['home_or_away'] == 'home' and x['Strength'] == '4x3' ) or (x['home_or_away'] == 'away' and x['Strength'] == '3x4') else x['true_strength'], axis=1)

    # similar as above,but for 3x3
    goal_shot_rows['true_strength'] = goal_shot_rows.apply(lambda x: "3x3" if (x['home_or_away'] == 'home' and x['Strength'] == '3x3' ) or (x['home_or_away'] == 'away' and x['Strength'] == '3x3') else x['true_strength'], axis=1)

    # check to see if a shot is on an empty net by checking if home_or_away is equal to "home" and "Away_Goalie" is equal to "None" or home_or_away is equal to "away" and "Home_Goalie" is equal to "None"
    goal_shot_rows['on_empty_net'] = goal_shot_rows.apply(lambda x: 1 if (x['home_or_away'] == 'home' and x['Away_Goalie'] == '') or (x['home_or_away'] == 'away' and x['Home_Goalie'] == '') else 0, axis=1)


    val_input_data = goal_shot_rows[['Event', 'Period', 'Seconds_Elapsed', 'Strength', 'Type', 'xC', 'yC', 'home_or_away', 'true_strength', 'on_empty_net', 'prev_Event', 'prev_Period', 'prev_Seconds_Elapsed', 'prev_Strength', 'prev_Type', 'prev_xC', 'prev_yC']]
    val_result_data = goal_shot_rows['Event']
    val_result_data = val_result_data.apply(lambda x: 1 if x == "GOAL" else 0)

    # Create a new column "is_PS" if prev_Type starts with "PS"
    val_input_data['prev_Type'] = val_input_data['prev_Type'].fillna("")
    val_input_data['is_PS'] = val_input_data['prev_Type'].str.startswith('PS').astype(int)

    # For "prev_Type" column, if "min)" or "maj)", then convert it to just "PENL"
    val_input_data['prev_Type'] = val_input_data['prev_Type'].apply(lambda x: 'PENL' if x.endswith('min)') else x)
    
    # For "prev_Event" column, if it start with "PS", then convert it to just "PS"
    # val_input_data['prev_Event'] = val_input_data['prev_Event'].apply(lambda x: 'PS' if x.startswith('PS') else x)
    print(val_input_data['prev_Type'].unique())

    val_input_data = val_input_data.drop(columns=['Event'])

    # Split out the data between numeric values (can carry forward) and categorical values (need to be turned into binary columns)
    val_input_data_numeric = val_input_data[['Seconds_Elapsed', 'xC', 'yC', 'on_empty_net','prev_Seconds_Elapsed', 'prev_xC', 'prev_yC', 'is_PS']]
    val_input_data_categorical = pd.DataFrame()



    for column in ['Period', 'Type', 'true_strength', 'home_or_away']:
        dummy_columns = pd.get_dummies(val_input_data[column])
        val_input_data_categorical = pd.concat([dummy_columns, val_input_data_categorical], axis=1)

    for column in ['prev_Event', 'prev_Strength', 'prev_Type']:
        dummy_columns = pd.get_dummies(val_input_data[column])

        # rename all columns with a prefix of "prev_"
        dummy_columns.columns = ['prev_' + str(col) for col in dummy_columns.columns]
        val_input_data_categorical = pd.concat([val_input_data_categorical, dummy_columns], axis=1)
        
    # change all True/False to 1/0
    val_input_data_categorical = val_input_data_categorical.applymap(lambda x: 1 if x == True else 0)

    # combine the two dataframes
    val_input_data_combined = pd.concat([val_input_data_numeric, val_input_data_categorical], axis=1)

    #set val_input_data2 to be all floats
    val_input_data_combined = val_input_data_combined.astype(float)

    print(val_input_data_combined.columns)

    columns = ['Seconds_Elapsed',                   'xC',                   'yC',
               'on_empty_net', 'prev_Seconds_Elapsed',              'prev_xC',
                    'prev_yC',                'is_PS',                 'away',
                       'home',                  '3x3',                  '4x3',
                        '5x3',                  '5x4',                  '5x5',
                        '6x5',             'BACKHAND',            'DEFLECTED',
                  'SLAP SHOT',            'SNAP SHOT',               'TIP-IN',
                'WRAP-AROUND',           'WRIST SHOT',                      1,
                            2,                      3,                      4,
                 'prev_BLOCK',             'prev_CHL',             'prev_FAC',
                  'prev_GEND',            'prev_GIVE',            'prev_GOAL',
                   'prev_HIT',            'prev_MISS',            'prev_PENL',
                  'prev_SHOT',            'prev_STOP',            'prev_TAKE',
                  'prev_-1x3',            'prev_-1x5',             'prev_0x1',
                   'prev_0x5',             'prev_3x3',             'prev_3x4',
                   'prev_3x5',             'prev_4x3',             'prev_4x4',
                   'prev_4x5',             'prev_5x0',             'prev_5x3',
                   'prev_5x4',             'prev_5x5',             'prev_5x6',
                   'prev_6x5',                'prev_',        'prev_BACKHAND',
             'prev_DEFLECTED',       'prev_SLAP SHOT',       'prev_WRIST SHOT',
             'prev_SNAP SHOT',          'prev_TIP-IN',     'prev_WRAP-AROUND',
            ]

    # For each column in input_data_combined, check if it exists in val_input_data_combined, if not add it as all 0s
    for column in columns:
        if column not in val_input_data_combined.columns:
                val_input_data_combined[column] = 0

    for column in val_input_data_combined.columns:
        if column not in columns:
                if column in val_input_data_combined.columns:
                    print(f"Column {column} not in columns")
                    val_input_data_combined = val_input_data_combined.drop(column, axis=1)


    # reduce val_input_data_combined to have the same columns as input_data_combined
    val_input_data_combined = val_input_data_combined[columns]

    # change all NaN numbers to 0
    val_input_data_combined = val_input_data_combined.fillna(0)

    print(val_input_data_combined.shape)
    print(val_input_data_combined.columns)

    return val_input_data_combined, val_result_data, goal_shot_rows

In [20]:
# Play-by-play exports for the three training seasons, in load order.
season_csv_paths = [
    r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20182019.csv",
    r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20202021.csv",
    r"C:\Users\JoshG\hockey_scraper_data\csvs\nhl_pbp_20222023.csv",
]

# Each entry is the (features, labels, raw shot rows) triple for one season.
per_season = [create_input_data_from_csv(csv_path) for csv_path in season_csv_paths]

# Keep the per-season names for the second and third seasons available.
_, (input_data2, result_data2, original_input2), (input_data3, result_data3, original_input3) = per_season

# Stack the three seasons: all features together, all labels together,
# all raw rows together.
input_data, result_data, original_input = (
    pd.concat(list(season_parts)) for season_parts in zip(*per_season)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['prev_Type'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['is_PS'] = val_input_data['prev_Type'].str.startswith('PS').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['pre

['' 'BACKHAND' 'SNAP SHOT' 'TIP-IN' 'WRIST SHOT' 'SLAP SHOT' 'WRAP-AROUND'
 'DEFLECTED' 'PENL' 'Fighting (maj)']
Index([     'Seconds_Elapsed',                   'xC',                   'yC',
               'on_empty_net', 'prev_Seconds_Elapsed',              'prev_xC',
                    'prev_yC',                'is_PS',                 'away',
                       'home',                  '3x3',                  '4x3',
                        '5x3',                  '5x4',                  '5x5',
                        '6x5',             'BACKHAND',            'DEFLECTED',
                  'SLAP SHOT',            'SNAP SHOT',               'TIP-IN',
                'WRAP-AROUND',           'WRIST SHOT',                      1,
                            2,                      3,                      4,
                 'prev_BLOCK',             'prev_CHL',             'prev_FAC',
                  'prev_GEND',            'prev_GIVE',            'prev_GOAL',
                  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['prev_Type'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['is_PS'] = val_input_data['prev_Type'].str.startswith('PS').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['pre

['' 'SNAP SHOT' 'SLAP SHOT' 'WRIST SHOT' 'BACKHAND' 'TIP-IN' 'WRAP-AROUND'
 'DEFLECTED' 'PENL']
Index([     'Seconds_Elapsed',                   'xC',                   'yC',
               'on_empty_net', 'prev_Seconds_Elapsed',              'prev_xC',
                    'prev_yC',                'is_PS',                 'away',
                       'home',                  '3x3',                  '4x3',
                        '5x3',                  '5x4',                  '5x5',
                        '6x5',             'BACKHAND',            'DEFLECTED',
                  'SLAP SHOT',            'SNAP SHOT',               'TIP-IN',
                'WRAP-AROUND',           'WRIST SHOT',                      1,
                            2,                      3,                      4,
                            6,           'prev_BLOCK',             'prev_CHL',
                'prev_DELPEN',             'prev_FAC',            'prev_GEND',
                  'prev_GIVE',     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['prev_Type'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['is_PS'] = val_input_data['prev_Type'].str.startswith('PS').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_input_data['prev_Type'] = val_input_data['pre

['' 'WRIST SHOT' 'SLAP SHOT' 'BACKHAND' 'TIP-IN' 'SNAP SHOT' 'WRAP-AROUND'
 'PENL' 'DEFLECTED']
Index([     'Seconds_Elapsed',                   'xC',                   'yC',
               'on_empty_net', 'prev_Seconds_Elapsed',              'prev_xC',
                    'prev_yC',                'is_PS',                 'away',
                       'home',                  '3x3',                  '4x3',
                        '5x3',                  '5x4',                  '5x5',
                        '6x5',             'BACKHAND',            'DEFLECTED',
                  'SLAP SHOT',            'SNAP SHOT',               'TIP-IN',
                'WRAP-AROUND',           'WRIST SHOT',                      1,
                            2,                      3,                      4,
                            6,                      7,           'prev_BLOCK',
                   'prev_CHL',          'prev_DELPEN',             'prev_FAC',
                  'prev_GIVE',     

In [21]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()


# Tensor.to() returns a new tensor instead of moving the existing one, so the
# result has to be bound back to the name for the data to end up on `device`.
x = torch.tensor(input_data.values, dtype=torch.float32).to(device)
y = torch.tensor(result_data.values, dtype=torch.float32).to(device)

# Fully connected binary classifier; the input width follows the feature
# matrix so the first layer always matches the prepared data.
model = nn.Sequential(
    nn.Linear(x.shape[1], 240),
    nn.ReLU(),
    nn.Linear(240, 100),
    nn.ReLU(),
    nn.Linear(100, 1),
    nn.Sigmoid()
)

# nn.Module.to() moves the parameters in place, so no re-binding is needed here.
model.to(device)


loss_fn = nn.BCELoss()  # binary cross entropy
loss_fn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


Using device: cuda



In [22]:
n_epochs = 1500000
batch_size = 16384

optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

# Mini-batch training over the full data set, in file order, once per epoch.
# Batches are moved with .to(device) so the loop also runs when no GPU is
# available (a no-op when the tensors already live on `device`).
model.train()
try:
    for epoch in range(n_epochs):
        for i in range(0, len(x), batch_size):
            Xbatch = x[i:i+batch_size].to(device)
            ybatch = y[i:i+batch_size].to(device)
            # squeeze(-1) drops only the output dimension, so a final batch
            # holding a single row keeps shape (1,) and still matches ybatch.
            y_pred = model(Xbatch).squeeze(-1)
            loss = loss_fn(y_pred, ybatch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if (epoch % 1000 == 0):
            # Loss of the last batch of this epoch.
            print(f'{epoch}, {loss.item()}')
except KeyboardInterrupt:
    # The run is long enough that it is usually stopped by hand; fall through
    # so the weights trained so far are still written to disk.
    print(f'Training interrupted during epoch {epoch}; saving current weights')


torch.save(model.state_dict(), '0502_xG_Model_1500K.pt')

0, 5.781966209411621
1000, 0.1983824521303177
2000, 0.18026137351989746
3000, 0.17135003209114075
4000, 0.1625477820634842
5000, 0.16341452300548553
6000, 0.15771806240081787
7000, 0.15439613163471222
8000, 0.15275517106056213
9000, 0.15104830265045166
10000, 0.14905911684036255
11000, 0.1498469114303589
12000, 0.14819978177547455
13000, 0.14753302931785583
14000, 0.14657320082187653
15000, 0.14396199584007263
16000, 0.1452147215604782
17000, 0.14606447517871857
18000, 0.14020782709121704
19000, 0.14548298716545105
20000, 0.14245696365833282
21000, 0.14258472621440887
22000, 0.13930466771125793
23000, 0.13882975280284882
24000, 0.13739077746868134
25000, 0.1386507898569107
26000, 0.13623517751693726
27000, 0.13720868527889252
28000, 0.13779503107070923
29000, 0.13688500225543976
30000, 0.1365344226360321
31000, 0.136291965842247
32000, 0.13677605986595154
33000, 0.1374916434288025
34000, 0.13595855236053467
35000, 0.13818129897117615
36000, 0.13539732992649078
37000, 0.1330936104059219

In [7]:
# Writes the weights of whatever `model` currently is to '0430_xG_Model_1500K.pt'.
# NOTE(review): this cell's execution count (7) is out of sequence with the
# surrounding cells and the file name differs from the '0502_xG_Model_1500K.pt'
# checkpoint used elsewhere in this notebook - looks like a leftover from an
# earlier session; confirm whether it is still needed.
torch.save(model.state_dict(), '0430_xG_Model_1500K.pt')

In [24]:
filepath = '0502_xG_Model_1500K.pt'

# Must mirror the architecture that produced the checkpoint.
# NOTE(review): the error output saved with this cell reports that the file on
# disk holds 66 -> 180 -> 60 -> 1 weights, which does not match this
# 64 -> 240 -> 100 -> 1 network; re-save from the current training cell (or
# point `filepath` at a matching checkpoint) before running.
model = nn.Sequential(
    nn.Linear(64, 240),
    nn.ReLU(),
    nn.Linear(240, 100),
    nn.ReLU(),
    nn.Linear(100, 1),
    nn.Sigmoid()
)

# map_location lets a checkpoint saved from the GPU load on a CPU-only machine.
model.load_state_dict(torch.load(filepath, map_location='cpu'))
model.eval()
model.cpu()

years = ["20172018", "20182019", "20192020", "20202021", "20212022", "20222023", "20232024"]
# years = ["20172018", "20192020", "20212022", "20232024"]

meta_training_predictions = []
meta_results = []
meta_predictions = []

# Binary cross entropy between predicted probabilities and 0/1 outcomes.
log_loss = nn.BCELoss()

for year in years:
    csv_path = f"C:\\Users\\JoshG\\hockey_scraper_data\\csvs\\nhl_pbp_{year}.csv"
    val_input_data, val_result_data, original_input = create_input_data_from_csv(csv_path)

    x = torch.tensor(val_input_data.values, dtype=torch.float32)
    y = torch.tensor(val_result_data.values, dtype=torch.float32)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        y_pred = model(x)

    # convert the tensors to numpy arrays
    y_pred = y_pred.detach().numpy()
    y = y.detach().numpy()

    y_pred_flat = y_pred.flatten()
    y_flat = y.flatten()

    # A prediction counts as correct when its rounded probability equals the outcome.
    predictions_rounded = np.round(y_pred_flat)
    accuracy_results = (predictions_rounded == y_flat).astype(int)
    accuracy = float(accuracy_results.mean())

    # The targets are the actual outcomes (y_flat), not the predictions.
    results_tensor = torch.tensor(y_flat, dtype=torch.float32)
    predictions_tensor = torch.tensor(y_pred_flat, dtype=torch.float32)
    predictions_rounded_tensor = torch.tensor(predictions_rounded, dtype=torch.float32)

    # calculate log loss of predictions against the actual outcomes
    log_loss_results = log_loss(predictions_tensor, results_tensor).item()

    print(f"year: {year} Accuracy {accuracy} loss {log_loss_results}")

RuntimeError: Error(s) in loading state_dict for Sequential:
	size mismatch for 0.weight: copying a param with shape torch.Size([180, 66]) from checkpoint, the shape in current model is torch.Size([240, 64]).
	size mismatch for 0.bias: copying a param with shape torch.Size([180]) from checkpoint, the shape in current model is torch.Size([240]).
	size mismatch for 2.weight: copying a param with shape torch.Size([60, 180]) from checkpoint, the shape in current model is torch.Size([100, 240]).
	size mismatch for 2.bias: copying a param with shape torch.Size([60]) from checkpoint, the shape in current model is torch.Size([100]).
	size mismatch for 4.weight: copying a param with shape torch.Size([1, 60]) from checkpoint, the shape in current model is torch.Size([1, 100]).