# Taking a look at our Standard Player data #

Here we will take a quick look at the standard player metrics collected. This data includes things much of the traditional, and likely most important data we have. It includes metrics like goals, assists, points, +/-, and more.

In [24]:
import pandas as pd
from per_60_functions import per_60_to_totals, per_60_to_GP
import os

In [25]:
# Load bio data from directory "../../Data/NHL_PlayerData_NatrualStatTrick/oi" using pandas dataframe

directory = "../../Data/NHL_PlayerData_NaturalStatTrick/std"

std_df = pd.DataFrame()

for year in range(2007, 2025):
    filename = f'regSeason_allStrengths_allScores_std_{year}.csv'
    df = pd.read_csv(f'{directory}/{filename}')
    # Concatenate dataframes with bio_df
    std_df = pd.concat([std_df, df])

std_df.head(-5)

Unnamed: 0,Year,Player,Team,Position,GP,TOI,TOI/GP,Goals/60,Total Assists/60,First Assists/60,...,Misconduct/60,Penalties Drawn/60,Giveaways/60,Takeaways/60,Hits/60,Hits Taken/60,Shots Blocked/60,Faceoffs Won/60,Faceoffs Lost/60,Faceoffs %
0,2007,Aaron Downey,DET,R,56,256.933333,4.588095,0.00,0.70,0.70,...,0.47,4.90,0.93,0.70,18.45,11.21,1.40,0.00,0.00,-
1,2007,Aaron Johnson,NYI,D,30,415.850000,13.861667,0.00,0.29,0.00,...,0.00,0.58,2.31,2.60,5.92,5.63,4.04,0.00,0.00,-
2,2007,Aaron Miller,VAN,D,57,988.183333,17.336550,0.06,0.49,0.06,...,0.06,0.24,1.76,0.91,2.25,4.49,4.86,0.00,0.00,-
3,2007,Aaron Rome,CBJ,D,17,309.033333,18.178431,0.19,0.19,0.00,...,0.19,0.97,0.78,0.39,6.41,4.27,3.30,0.00,0.00,-
4,2007,Aaron Voros,MIN,L,55,504.600000,9.174545,0.83,0.83,0.71,...,0.24,3.80,1.19,0.95,11.41,7.97,0.59,0.36,1.43,2.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876,2024,Zach Dean,STL,C,1,8.683333,8.683333,0.00,0.00,0.00,...,0.00,0.00,0.00,6.91,0.00,6.91,0.00,6.91,13.82,230.33
877,2024,Zach Hyman,EDM,L,66,1292.750000,19.587121,2.23,0.88,0.46,...,0.00,0.88,0.74,1.53,2.27,5.80,1.58,0.32,0.42,2.03
878,2024,Zach Parise,COL,L,17,234.533333,13.796078,1.02,0.77,0.51,...,0.00,0.26,0.26,1.53,3.33,4.86,3.33,2.56,4.86,8.82
879,2024,Zach Sanford,"ARI, CHI",L,29,314.266667,10.836782,0.00,1.15,0.95,...,0.00,0.57,1.15,3.05,3.63,5.73,3.63,8.02,14.70,6.74


In [26]:
std_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16113 entries, 0 to 885
Data columns (total 36 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 16113 non-null  int64  
 1   Player               16113 non-null  object 
 2   Team                 16113 non-null  object 
 3   Position             16113 non-null  object 
 4   GP                   16113 non-null  int64  
 5   TOI                  16113 non-null  float64
 6   TOI/GP               16113 non-null  float64
 7   Goals/60             16113 non-null  float64
 8   Total Assists/60     16113 non-null  float64
 9   First Assists/60     16113 non-null  float64
 10  Second Assists/60    16113 non-null  float64
 11  Total Points/60      16113 non-null  float64
 12  IPP                  16113 non-null  object 
 13  Shots/60             16113 non-null  float64
 14  SH%                  16113 non-null  object 
 15  ixG/60               16113 non-null  float6

Here we see that given 15227 rows, not a single column has a null entry, which is again fantastic.n Let's take a look at each of the columns to try to recognize if the values make sense.

In [27]:
# Use .describe() to get summary statistics of the data in a way that we view all columns

std_df.describe()

Unnamed: 0,Year,GP,TOI,TOI/GP,Goals/60,Total Assists/60,First Assists/60,Second Assists/60,Total Points/60,Shots/60,...,Major/60,Misconduct/60,Penalties Drawn/60,Giveaways/60,Takeaways/60,Hits/60,Hits Taken/60,Shots Blocked/60,Faceoffs Won/60,Faceoffs Lost/60
count,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,...,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0
mean,2015.604419,47.177807,778.430285,14.98249,0.496651,0.816885,0.450256,0.366641,1.313563,5.849258,...,0.149402,0.035031,0.877195,1.522993,1.343703,5.550889,5.091776,2.721521,5.319401,5.569171
std,5.183898,27.806134,553.378814,4.554586,0.527633,0.637831,0.438511,0.380833,0.941622,2.721122,...,0.626152,0.272973,1.138037,1.070373,1.009584,4.690641,2.808683,1.947884,9.294357,9.120115
min,2007.0,1.0,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,20.0,237.9,11.667521,0.09,0.42,0.14,0.13,0.66,3.99,...,0.0,0.0,0.34,0.95,0.73,2.35,3.38,1.35,0.0,0.0
50%,2016.0,53.0,784.85,15.010833,0.39,0.76,0.39,0.33,1.2,5.69,...,0.0,0.0,0.65,1.42,1.25,4.28,4.69,2.3,0.2,0.44
75%,2020.0,72.0,1228.883333,18.246753,0.78,1.14,0.66,0.52,1.86,7.52,...,0.06,0.0,1.07,1.98,1.81,7.36,6.28,3.81,6.18,7.62
max,2024.0,85.0,2411.95,29.414024,15.84,12.37,9.89,12.37,15.84,35.29,...,22.09,13.28,41.86,21.18,22.43,76.6,83.72,34.29,51.18,80.72


In [28]:
std_df.columns

Index(['Year', 'Player', 'Team', 'Position', 'GP', 'TOI', 'TOI/GP', 'Goals/60',
       'Total Assists/60', 'First Assists/60', 'Second Assists/60',
       'Total Points/60', 'IPP', 'Shots/60', 'SH%', 'ixG/60', 'iCF/60',
       'iFF/60', 'iSCF/60', 'iHDCF/60', 'Rush Attempts/60',
       'Rebounds Created/60', 'PIM/60', 'Total Penalties/60', 'Minor/60',
       'Major/60', 'Misconduct/60', 'Penalties Drawn/60', 'Giveaways/60',
       'Takeaways/60', 'Hits/60', 'Hits Taken/60', 'Shots Blocked/60',
       'Faceoffs Won/60', 'Faceoffs Lost/60', 'Faceoffs %'],
      dtype='object')

In [29]:
std_df['Position'].unique()

array(['R', 'D', 'L', 'C', 'C, L', 'L, R', 'C, R', 'D, L'], dtype=object)

# Converting Rates to Totals

The features of our data are all in rate form, and it occurs to me that it may be useful to utilize the totals for our features, instead of the rates. Here I will convert all features that are in a rate format "feature/60" back to simply "feature".

Something convenient that I noticed is that TOI is already in the correct format. As opposed to minutes:seconds it has already been converted to total minutes

In [30]:
per_60_columns = std_df.filter(like='/60').columns

totals_df = per_60_to_totals(std_df, per_60_columns)

# Merge std_df with new_df on axis = 1
std_df = pd.concat([std_df, totals_df], axis=1)


In [31]:
gp_df = per_60_to_GP(std_df, per_60_columns)

std_df = pd.concat([std_df, gp_df], axis=1)

In [32]:
# Extract all columns that end in '/60'

per_60_columns = std_df.filter(like='/60')

per_GP_columns = std_df.filter(like='/GP')

percentage_columns = std_df.filter(like='%')

# Extract all columns that are not per 60 or per GP

non_per_columns = std_df.drop(per_60_columns.columns, axis=1).drop(per_GP_columns.columns, axis=1).drop(percentage_columns.columns, axis=1)

final_data = pd.concat([non_per_columns, per_GP_columns, per_60_columns, percentage_columns], axis=1)
final_data.columns.tolist()

['Year',
 'Player',
 'Team',
 'Position',
 'GP',
 'TOI',
 'IPP',
 'Goals',
 'Total Assists',
 'First Assists',
 'Second Assists',
 'Total Points',
 'Shots',
 'ixG',
 'iCF',
 'iFF',
 'iSCF',
 'iHDCF',
 'Rush Attempts',
 'Rebounds Created',
 'PIM',
 'Total Penalties',
 'Minor',
 'Major',
 'Misconduct',
 'Penalties Drawn',
 'Giveaways',
 'Takeaways',
 'Hits',
 'Hits Taken',
 'Shots Blocked',
 'Faceoffs Won',
 'Faceoffs Lost',
 'TOI/GP',
 'Goals/GP',
 'Total Assists/GP',
 'First Assists/GP',
 'Second Assists/GP',
 'Total Points/GP',
 'Shots/GP',
 'ixG/GP',
 'iCF/GP',
 'iFF/GP',
 'iSCF/GP',
 'iHDCF/GP',
 'Rush Attempts/GP',
 'Rebounds Created/GP',
 'PIM/GP',
 'Total Penalties/GP',
 'Minor/GP',
 'Major/GP',
 'Misconduct/GP',
 'Penalties Drawn/GP',
 'Giveaways/GP',
 'Takeaways/GP',
 'Hits/GP',
 'Hits Taken/GP',
 'Shots Blocked/GP',
 'Faceoffs Won/GP',
 'Faceoffs Lost/GP',
 'Goals/60',
 'Total Assists/60',
 'First Assists/60',
 'Second Assists/60',
 'Total Points/60',
 'Shots/60',
 'ixG/60',
 

# Save this data to disk

In [33]:
dir_path = '../../Data/Warehouse/PlayerData/'
filename = 'std.csv'

if not os.path.exists(dir_path):
    os.makedirs(dir_path)

final_data.to_csv(f'{dir_path}/{filename}', index=False)