<a href="https://colab.research.google.com/github/LiamMcFall/NHL-Skater-Position-Classification/blob/master/NHL_Skater_positionClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair instead of the raw version string:
# a string comparison is lexicographic and would accept e.g. "0.9" as >= "0.20".
assert tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2]) >= (0, 20)

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

NHL Skater stats 1940-2018

In [107]:
# Raw CSV of skater statistics, fetched from the project's GitHub repository.
url = "https://raw.githubusercontent.com/LiamMcFall/NHL-Skater-Position-Classification/master/skater_stats.csv"

# - index_col=0: the first CSV column becomes the row index.
# - na_values="-": a lone dash is read as a missing value.
# - TOI, ATOI and Pos are forced to str so pandas does not guess their dtype.
skater_stats = pd.read_csv(
    url,
    encoding="unicode_escape",
    index_col=0,
    na_values="-",
    dtype={"TOI": str, "ATOI": str, "Pos": str},
)

# Column names, non-null counts and dtypes of the loaded frame.
# (A bare `skater_stats.head(10)` used to sit above this line; because it was
# not the last expression of the cell its result was discarded, so it was removed.)
skater_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37825 entries, 2018JustinAbdelkader to 1940MiltSchmidt*
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Season  37825 non-null  int64  
 1   Player  37825 non-null  object 
 2   Age     37823 non-null  float64
 3   Tm      37825 non-null  object 
 4   Pos     37825 non-null  object 
 5   GP      37825 non-null  int64  
 6   G       29889 non-null  float64
 7   GPG     37825 non-null  float64
 8   A       32402 non-null  float64
 9   PTS     33417 non-null  float64
 10  +/-     30889 non-null  float64
 11  PIM     34007 non-null  float64
 12  EVG     26622 non-null  float64
 13  PPG     16189 non-null  float64
 14  SHG     6013 non-null   float64
 15  GWG     16634 non-null  float64
 16  EVA     28704 non-null  float64
 17  PPA     18536 non-null  float64
 18  SHA     6792 non-null   float64
 19  S       32723 non-null  float64
 20  S%      26819 non-null  float64
 21  TOI     16

In [0]:
# Remove the original ATOI string column together with the player-name and
# team columns.
skater_stats = skater_stats.drop(columns=["ATOI", "Player", "Tm"])

In [0]:
def clean_toi(x):
  '''Strip every comma from a TOI value.

  A string comes back with all ',' characters removed (it is still a
  string; casting to float is left to the caller). Any non-string value
  is returned unchanged.
  '''
  return x.replace(',', '') if isinstance(x, str) else x

In [0]:
# Strip the commas from each TOI string first, then cast the column to float.
toi_without_commas = skater_stats['TOI'].apply(clean_toi)
skater_stats['TOI'] = toi_without_commas.astype('float')

In [0]:
# Rebuild ATOI as a plain float (TOI divided by games played) so the
# colon-formatted string version is no longer needed.
skater_stats["ATOI"] = skater_stats["TOI"].div(skater_stats["GP"])

In [112]:
# Preview the first rows to check the numeric TOI and the recomputed ATOI column.
skater_stats.head()

Unnamed: 0_level_0,Season,Age,Pos,GP,G,GPG,A,PTS,+/-,PIM,EVG,PPG,SHG,GWG,EVA,PPA,SHA,S,S%,TOI,BLK,HIT,FOwin,FOloss,FO%,ATOI
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2018JustinAbdelkader,2018,30.0,LW,75,13.0,0.1733,22.0,35.0,-11.0,78.0,9.0,4.0,,,17.0,5.0,,110.0,12.0,1241.0,40.0,174.0,47.0,50.0,48.5,16.546667
2018PontusAberg,2018,24.0,LW,53,4.0,0.0755,12.0,16.0,9.0,10.0,4.0,,,3.0,11.0,1.0,,70.0,6.0,645.0,8.0,24.0,4.0,8.0,33.3,12.169811
2018NoelAcciari,2018,26.0,C,60,10.0,0.1667,1.0,11.0,-6.0,9.0,9.0,,1.0,,1.0,,,66.0,15.0,775.0,41.0,152.0,42.0,51.0,45.2,12.916667
2018KennyAgostino,2018,25.0,LW,5,,0.0,1.0,1.0,-1.0,4.0,,,,,,1.0,,11.0,,60.0,1.0,4.0,0.0,1.0,0.0,12.0
2018SebastianAho,2018,20.0,RW,78,29.0,0.3718,36.0,65.0,4.0,24.0,21.0,8.0,,4.0,28.0,8.0,,200.0,15.0,1398.0,17.0,65.0,78.0,94.0,45.3,17.923077


FO% (faceoff win percentage) was not recorded before 2008, so I use only the 2008–2018 seasons to train this model.

In [0]:
# Keep only the seasons from 2008 onward (Season is an integer column).
skater_stats0818 = skater_stats.loc[skater_stats["Season"] >= 2008]

In [114]:
# Inspect the last rows of the season-filtered frame.
skater_stats0818.tail()

Unnamed: 0_level_0,Season,Age,Pos,GP,G,GPG,A,PTS,+/-,PIM,EVG,PPG,SHG,GWG,EVA,PPA,SHA,S,S%,TOI,BLK,HIT,FOwin,FOloss,FO%,ATOI
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2008PeterVandermeer,2008,32.0,LW,2,,0.0,,,,,,,,,,,,,,15.0,1.0,2.0,0.0,0.0,,7.5
2008NoahWelch,2008,25.0,D,4,,0.0,,,1.0,7.0,,,,,,,,,,32.0,2.0,6.0,0.0,0.0,,8.0
2008JesseWinchester,2008,24.0,C,1,,0.0,,,,2.0,,,,,,,,1.0,,14.0,0.0,1.0,0.0,0.0,,14.0
2008BryanYoung,2008,21.0,D,2,,0.0,,,-1.0,,,,,,,,,,,4.0,0.0,1.0,0.0,0.0,,2.0
2008IlyaZubov,2008,20.0,C,1,,0.0,,,,,,,,,,,,,,15.0,0.0,3.0,2.0,3.0,40.0,15.0


In [115]:
# Inspect the first rows of the season-filtered frame.
skater_stats0818.head()

Unnamed: 0_level_0,Season,Age,Pos,GP,G,GPG,A,PTS,+/-,PIM,EVG,PPG,SHG,GWG,EVA,PPA,SHA,S,S%,TOI,BLK,HIT,FOwin,FOloss,FO%,ATOI
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2018JustinAbdelkader,2018,30.0,LW,75,13.0,0.1733,22.0,35.0,-11.0,78.0,9.0,4.0,,,17.0,5.0,,110.0,12.0,1241.0,40.0,174.0,47.0,50.0,48.5,16.546667
2018PontusAberg,2018,24.0,LW,53,4.0,0.0755,12.0,16.0,9.0,10.0,4.0,,,3.0,11.0,1.0,,70.0,6.0,645.0,8.0,24.0,4.0,8.0,33.3,12.169811
2018NoelAcciari,2018,26.0,C,60,10.0,0.1667,1.0,11.0,-6.0,9.0,9.0,,1.0,,1.0,,,66.0,15.0,775.0,41.0,152.0,42.0,51.0,45.2,12.916667
2018KennyAgostino,2018,25.0,LW,5,,0.0,1.0,1.0,-1.0,4.0,,,,,,1.0,,11.0,,60.0,1.0,4.0,0.0,1.0,0.0,12.0
2018SebastianAho,2018,20.0,RW,78,29.0,0.3718,36.0,65.0,4.0,24.0,21.0,8.0,,4.0,28.0,8.0,,200.0,15.0,1398.0,17.0,65.0,78.0,94.0,45.3,17.923077


In [116]:
# Substitute 0 for every missing value in the frame, then preview the result.
skater_stats0818 = skater_stats0818.fillna(value=0)
skater_stats0818.head()

Unnamed: 0_level_0,Season,Age,Pos,GP,G,GPG,A,PTS,+/-,PIM,EVG,PPG,SHG,GWG,EVA,PPA,SHA,S,S%,TOI,BLK,HIT,FOwin,FOloss,FO%,ATOI
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2018JustinAbdelkader,2018,30.0,LW,75,13.0,0.1733,22.0,35.0,-11.0,78.0,9.0,4.0,0.0,0.0,17.0,5.0,0.0,110.0,12.0,1241.0,40.0,174.0,47.0,50.0,48.5,16.546667
2018PontusAberg,2018,24.0,LW,53,4.0,0.0755,12.0,16.0,9.0,10.0,4.0,0.0,0.0,3.0,11.0,1.0,0.0,70.0,6.0,645.0,8.0,24.0,4.0,8.0,33.3,12.169811
2018NoelAcciari,2018,26.0,C,60,10.0,0.1667,1.0,11.0,-6.0,9.0,9.0,0.0,1.0,0.0,1.0,0.0,0.0,66.0,15.0,775.0,41.0,152.0,42.0,51.0,45.2,12.916667
2018KennyAgostino,2018,25.0,LW,5,0.0,0.0,1.0,1.0,-1.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,0.0,60.0,1.0,4.0,0.0,1.0,0.0,12.0
2018SebastianAho,2018,20.0,RW,78,29.0,0.3718,36.0,65.0,4.0,24.0,21.0,8.0,0.0,4.0,28.0,8.0,0.0,200.0,15.0,1398.0,17.0,65.0,78.0,94.0,45.3,17.923077


In [117]:
# Count the rows for each distinct value of Pos to see which position labels occur.
skater_stats0818["Pos"].value_counts()

D        3315
C        2786
LW       1830
RW       1742
C/LW        4
LW/RW       3
RW/LW       2
LW/C        1
RW/C        1
Name: Pos, dtype: int64

In [0]:
# Hybrid positions are very rare, so keep only rows whose Pos is exactly one of
# the four single positions.

skater_statstest = skater_stats0818[skater_stats0818["Pos"].isin(["D", "C", "LW", "RW"])]

In [120]:
# Check which position labels remain in the filtered copy and how many rows each has.
skater_statstest["Pos"].value_counts()

D     3315
C     2786
LW    1830
RW    1742
Name: Pos, dtype: int64

In [0]:
# Apply the single-position filter to the working frame itself.
skater_stats0818 = skater_stats0818[skater_stats0818["Pos"].isin(["D", "C", "LW", "RW"])]

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set. The split is stratified on Pos and
# uses a fixed random_state so it is repeatable.
train, test = train_test_split(
    skater_stats0818,
    test_size=0.25,
    stratify=skater_stats0818['Pos'],
    random_state=7,
)

In [0]:
# Training features: everything except the target (Pos) and the Season column.
train_skater_stats0818 = train.drop(columns=["Pos", "Season"])

In [0]:
# Test features: everything except the target (Pos) and the Season column.
test_skater_stats0818 = test.drop(columns=["Pos", "Season"])

In [0]:
# Target labels for the training rows.
train_labels = train.loc[:, "Pos"]

In [0]:
# Target labels for the held-out test rows.
test_labels = test.loc[:, "Pos"]

In [189]:
# Random forest test

from sklearn.ensemble import RandomForestClassifier

# 100 trees with depth capped at 10; the fixed random_state keeps runs repeatable.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=7,
)

# Fit on the training features and position labels.
rf.fit(train_skater_stats0818, train_labels)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=7, verbose=0,
                       warm_start=False)

In [198]:
# Class labels known to the fitted forest.
rf.classes_

array(['C', 'D', 'LW', 'RW'], dtype=object)

In [0]:
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions for the training rows, using 3 folds.
train_pred = cross_val_predict(
    rf,
    train_skater_stats0818,
    train_labels,
    cv=3,
)

In [199]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true training labels against the cross-validated predictions.
conf_mx = confusion_matrix(y_true=train_labels, y_pred=train_pred)
conf_mx

array([[1726,   29,  182,  152],
       [   1, 2409,   60,   16],
       [ 255,   92,  588,  437],
       [ 230,   69,  518,  490]])

In [0]:
from sklearn.model_selection import cross_val_score

# Accuracy of the forest on the training data under 10-fold cross-validation.
scores = cross_val_score(
    rf,
    train_skater_stats0818,
    train_labels,
    cv=10,
    scoring="accuracy",
)

In [203]:
# One accuracy value per cross-validation fold.
scores

array([0.7231405 , 0.71900826, 0.71349862, 0.72727273, 0.69931034,
       0.72137931, 0.71448276, 0.7337931 , 0.72      , 0.72551724])

In [204]:
# Test accuracy

from sklearn.metrics import accuracy_score

# Predict positions for the held-out rows with the forest fitted on the training set.
# The result is bound to `test_pred`, the name the scoring code reads; previously
# only `test_preds` was assigned, so scoring raised NameError on a fresh kernel.
test_pred = rf.predict(test_skater_stats0818)
test_preds = test_pred  # alias kept in case other cells still refer to the old name

# Fraction of held-out rows whose predicted position matches the true label.
accuracy_score(test_labels, test_pred)

0.729226953286482

In [205]:
# Confusion matrix of the true test labels against the test-set predictions.
test_conf_mx = confusion_matrix(y_true=test_labels, y_pred=test_pred)
test_conf_mx

array([[581,  16,  53,  47],
       [  0, 808,  19,   2],
       [ 82,  27, 205, 144],
       [ 76,  32, 157, 170]])