In [1]:
# Apply linear regression to predict the parameters used for each Yahoo fantasy stat
# Linearly dependant features to remove from data for modelling:
# points ( = goals + assists)
# saves ( = evenSaves + powerPlaySaves + shortHandedSaves)
# timeOnIce ( = evenTimeOnIce + powerPlayTimeOnIce + shortHandedTimeOnIce)
# overTimeGoals ( = gameWinningGoals)
# *shots ( = saves + goalsAgainst)

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import Ridge
import datetime
import time
import os

In [3]:
def toi_to_min(toi):
    t = toi.split(":")
    t = list(map(int, t)) 
    if(len(t) < 2):
        return t[0]
    else:
        return t[0] * 60 + t[1]

In [4]:
data = pd.read_csv('/Users/joelhoward/repos/NHLFantasyPy/data/game_logs/game_logs_2010_2020.csv')

In [5]:
data.shape

(479245, 45)

In [6]:
data[0:5]

Unnamed: 0,season,date,game_id,player_id,position,team_id,opp_id,isHome,isWin,isOT,...,shortHandedShots,evenShots,powerPlayShots,savePercentage,shotsAgainst,goalsAgainst,powerPlaySavePercentage,shortHandedSavePercentage,evenStrengthSavePercentage,fanpts
0,20192020,2020-03-11,2019021081,8471686,D,3,21,False,False,True,...,0,0,0,0.0,0,0,0.0,0.0,0.0,1.7
1,20192020,2020-03-11,2019021081,8474090,D,3,21,False,False,True,...,0,0,0,0.0,0,0,0.0,0.0,0.0,3.0
2,20192020,2020-03-11,2019021081,8475735,C,3,21,False,False,True,...,0,0,0,0.0,0,0,0.0,0.0,0.0,0.4
3,20192020,2020-03-11,2019021081,8475855,RW,3,21,False,False,True,...,0,0,0,0.0,0,0,0.0,0.0,0.0,1.15
4,20192020,2020-03-11,2019021081,8476458,C,3,21,False,False,True,...,0,0,0,0.0,0,0,0.0,0.0,0.0,2.3


In [7]:
list(data.keys())

['season',
 'date',
 'game_id',
 'player_id',
 'position',
 'team_id',
 'opp_id',
 'isHome',
 'isWin',
 'isOT',
 'fws',
 'timeOnIce',
 'assists',
 'goals',
 'pim',
 'shots',
 'hits',
 'powerPlayGoals',
 'powerPlayPoints',
 'powerPlayTimeOnIce',
 'evenTimeOnIce',
 'gameWinningGoals',
 'overTimeGoals',
 'shortHandedGoals',
 'shortHandedPoints',
 'shortHandedTimeOnIce',
 'blocked',
 'plusMinus',
 'points',
 'shifts',
 'shutouts',
 'saves',
 'powerPlaySaves',
 'shortHandedSaves',
 'evenSaves',
 'shortHandedShots',
 'evenShots',
 'powerPlayShots',
 'savePercentage',
 'shotsAgainst',
 'goalsAgainst',
 'powerPlaySavePercentage',
 'shortHandedSavePercentage',
 'evenStrengthSavePercentage',
 'fanpts']

In [8]:
data = data[data['position'] != 'G']
data = data.drop(columns=['points'])
data = data.loc[:, (data != 0).any(axis=0)]
data_yahoo = data[list(data.keys())[7:]]

In [9]:
data_yahoo['timeOnIce'] = data_yahoo['timeOnIce'].map(toi_to_min)
data_yahoo['powerPlayTimeOnIce'] = data_yahoo['powerPlayTimeOnIce'].map(toi_to_min)
data_yahoo['evenTimeOnIce'] = data_yahoo['evenTimeOnIce'].map(toi_to_min)
data_yahoo['shortHandedTimeOnIce'] = data_yahoo['shortHandedTimeOnIce'].map(toi_to_min)

data_yahoo['isHome'] = data_yahoo['isHome'].map(int)
data_yahoo['isWin'] = data_yahoo['isWin'].map(int)
data_yahoo['isOT'] = data_yahoo['isOT'].map(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_yahoo['timeOnIce'] = data_yahoo['timeOnIce'].map(toi_to_min)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_yahoo['powerPlayTimeOnIce'] = data_yahoo['powerPlayTimeOnIce'].map(toi_to_min)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_yahoo['evenTimeOnIce'] = data_yahoo['evenTimeOnIce

In [10]:
data_yahoo

Unnamed: 0,isHome,isWin,isOT,fws,timeOnIce,assists,goals,pim,shots,hits,...,evenTimeOnIce,gameWinningGoals,overTimeGoals,shortHandedGoals,shortHandedPoints,shortHandedTimeOnIce,blocked,plusMinus,shifts,fanpts
0,0,0,1,0,932,0,0,0,2,1,...,736,0,0,0,0,196,0,1,21,1.70
1,0,0,1,0,1124,0,0,4,0,3,...,932,0,0,0,0,177,2,0,26,3.00
2,0,0,1,2,594,0,0,0,0,1,...,592,0,0,0,0,2,0,0,15,0.40
3,0,0,1,0,986,0,0,0,3,1,...,835,0,0,0,0,136,1,0,24,1.15
4,0,0,1,2,1307,0,0,0,2,0,...,766,0,0,0,0,170,3,1,25,2.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479240,0,0,0,0,936,0,0,0,0,0,...,793,0,0,0,0,143,0,0,21,0.00
479241,1,0,1,0,1026,0,0,4,2,0,...,958,0,0,0,0,68,3,1,21,4.10
479242,0,1,0,0,732,0,0,0,2,1,...,732,0,0,0,0,0,0,1,19,1.70
479243,0,0,0,0,831,0,0,0,2,2,...,831,0,0,0,0,0,1,0,23,1.10


In [11]:
data_yahoo.dtypes

isHome                    int64
isWin                     int64
isOT                      int64
fws                       int64
timeOnIce                 int64
assists                   int64
goals                     int64
pim                       int64
shots                     int64
hits                      int64
powerPlayGoals            int64
powerPlayPoints           int64
powerPlayTimeOnIce        int64
evenTimeOnIce             int64
gameWinningGoals          int64
overTimeGoals             int64
shortHandedGoals          int64
shortHandedPoints         int64
shortHandedTimeOnIce      int64
blocked                   int64
plusMinus                 int64
shifts                    int64
fanpts                  float64
dtype: object

In [12]:
data_np = np.array(data_yahoo)

In [13]:
np.random.shuffle(data_np)

In [14]:
nrow = data_np.shape[0]
n_train = int(nrow * 0.8)
n_cv = int((nrow - n_train) / 2)
n_test = nrow - n_train - n_cv



In [15]:
Y_train = data_np[0:n_train, -1]
X_train = data_np[0:n_train, 0:-1]

Y_cv = data_np[(n_train + 1):(n_train + n_cv), -1]
X_cv = data_np[(n_train + 1):(n_train + n_cv), 0:-1]

Y_test = data_np[(n_train + n_cv + 1):, -1]
X_test = data_np[(n_train + n_cv + 1):, 0:-1]

In [16]:
mean = np.mean(X_train, axis = 0)
sigma = np.std(X_train, axis = 0)

X_train_norm = (X_train - mean)/sigma

X_cv_norm = (X_cv - mean)/sigma

X_test_norm = (X_test - mean)/sigma

In [17]:
clf = Ridge(alpha=1, fit_intercept = False)
clf.fit(X_train, Y_train)

Ridge(alpha=1, fit_intercept=False)

In [18]:
Y_cv_pred = clf.score(X_cv, Y_cv)

In [19]:
Y_cv_pred

0.9999999993863935

In [20]:
Y_test_pred = clf.score(X_test, Y_test)

In [21]:
Y_test_pred

0.9999999993356493

In [22]:
pd.DataFrame(data = {'param': list(data_yahoo.keys())[0:-1], 'value_pred': list(np.round(clf.coef_, 2)), 'value_true': [0,0,0,0.1,0,3,3,0.5,0.25,0.2,0,1.5,0,0,1,0,0,2,0,0.2,1,0]})

Unnamed: 0,param,value_pred,value_true
0,isHome,0.0,0.0
1,isWin,0.0,0.0
2,isOT,-0.0,0.0
3,fws,0.1,0.1
4,timeOnIce,0.0,0.0
5,assists,3.0,3.0
6,goals,3.0,3.0
7,pim,0.5,0.5
8,shots,0.25,0.25
9,hits,0.2,0.2
