# Taking a look at our On Ice data #

Here we will take a quick look at the player metrics collected

In [98]:
import pandas as pd
import numpy as np
from per_60_functions import per_60_to_totals, per_60_to_GP
import os

In [99]:
# Load bio data from directory "../../Data/NHL_PlayerData_NatrualStatTrick/oi" using pandas dataframe

directory = "../../Data/NHL_PlayerData_NaturalStatTrick/oi"

oi_df = pd.DataFrame()

for year in range(2007, 2025):
    filename = f'regSeason_allStrengths_allScores_oi_{year}.csv'
    df = pd.read_csv(f'{directory}/{filename}')
    # Concatenate dataframes with bio_df
    oi_df = pd.concat([oi_df, df])

oi_df.head(-5)

Unnamed: 0,Year,Player,Team,Position,GP,TOI,TOI/GP,CF/60,CA/60,CF%,...,PDO,Off. Zone Starts/60,Neu. Zone Starts/60,Def. Zone Starts/60,On The Fly Starts/60,Off. Zone Start %,Off. Zone Faceoffs/60,Neu. Zone Faceoffs/60,Def. Zone Faceoffs/60,Off. Zone Faceoff %
0,2007,Aaron Downey,DET,R,56,256.933333,4.588095,45.77,44.60,50.65,...,0.991,18.21,12.14,2.34,65.62,88.64,24.29,15.65,5.14,82.54
1,2007,Aaron Johnson,NYI,D,30,415.850000,13.861667,53.38,50.35,51.46,...,1.02,7.79,11.54,4.04,50.21,65.85,16.59,20.92,13.13,55.83
2,2007,Aaron Miller,VAN,D,57,988.183333,17.336550,38.80,57.99,40.09,...,1.0,4.37,11.72,9.96,53.43,30.51,9.71,18.58,18.09,34.93
3,2007,Aaron Rome,CBJ,D,17,309.033333,18.178431,43.30,46.60,48.16,...,0.944,5.63,11.65,8.54,53.20,39.73,11.26,18.83,15.92,41.43
4,2007,Aaron Voros,MIN,L,55,504.600000,9.174545,49.11,51.37,48.88,...,1.013,13.67,18.07,8.44,59.10,61.83,18.79,24.38,17.12,52.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876,2024,Zach Dean,STL,C,1,8.683333,8.683333,0.00,62.19,0.0,...,1.00,6.91,13.82,0.00,55.28,100.00,6.91,13.82,0.00,100.00
877,2024,Zach Hyman,EDM,L,66,1292.750000,19.587121,84.98,45.99,64.88,...,1.025,15.04,9.98,5.62,31.75,72.81,30.82,14.06,13.37,69.75
878,2024,Zach Parise,COL,L,17,234.533333,13.796078,57.05,68.56,45.42,...,0.978,8.44,12.28,11.77,53.72,41.77,13.81,17.65,22.77,37.76
879,2024,Zach Sanford,"ARI, CHI",L,29,314.266667,10.836782,55.18,59.76,48.01,...,1.009,9.55,12.22,5.35,50.21,64.10,17.95,15.66,12.98,58.02


## The importance of our On Ice Data ##

This data is likely to be composed of very valuable features we will likely want to use for training our model, so we should do a fairly thorough job of cleaning and ensuring it is ready for proper analysis.

This data is composed of what are called "advanced metrics".

In [100]:
oi_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16113 entries, 0 to 885
Data columns (total 55 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   16113 non-null  int64  
 1   Player                 16113 non-null  object 
 2   Team                   16113 non-null  object 
 3   Position               16113 non-null  object 
 4   GP                     16113 non-null  int64  
 5   TOI                    16113 non-null  float64
 6   TOI/GP                 16113 non-null  float64
 7   CF/60                  16113 non-null  float64
 8   CA/60                  16113 non-null  float64
 9   CF%                    16113 non-null  object 
 10  FF/60                  16113 non-null  float64
 11  FA/60                  16113 non-null  float64
 12  FF%                    16113 non-null  object 
 13  SF/60                  16113 non-null  float64
 14  SA/60                  16113 non-null  float64
 15  SF%      

Here we see that given 15227 rows, not a single column has a null entry, which is fantastic.

In [101]:
# Use .describe() to get summary statistics of the data in a way that we view all columns

oi_df.describe()

Unnamed: 0,Year,GP,TOI,TOI/GP,CF/60,CA/60,FF/60,FA/60,SF/60,SA/60,...,LDCA/60,LDGF/60,LDGA/60,Off. Zone Starts/60,Neu. Zone Starts/60,Def. Zone Starts/60,On The Fly Starts/60,Off. Zone Faceoffs/60,Neu. Zone Faceoffs/60,Def. Zone Faceoffs/60
count,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,...,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0,16113.0
mean,2015.604419,47.177807,778.430285,14.98249,53.927255,55.339922,40.188855,41.235117,28.882536,29.689641,...,37.651246,0.623821,0.668877,10.207633,12.808478,9.315112,47.719222,18.545348,18.620042,17.886219
std,5.183898,27.806134,553.378814,4.554586,10.862991,9.659843,8.272385,7.350659,6.228328,5.726489,...,9.41126,0.781456,0.761003,3.881856,3.266758,4.291803,8.950246,6.016696,4.031421,6.084187
min,2007.0,1.0,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,20.0,237.9,11.667521,46.96,49.55,35.02,37.13,25.14,26.68,...,31.94,0.26,0.34,7.6,11.16,6.57,41.64,14.47,16.69,13.97
50%,2016.0,53.0,784.85,15.010833,53.76,54.8,40.15,40.86,28.89,29.42,...,36.38,0.57,0.61,9.96,12.65,8.72,47.05,18.08,18.57,17.16
75%,2020.0,72.0,1228.883333,18.246753,61.08,60.56,45.59,44.9,32.88,32.3,...,41.85,0.87,0.86,12.56,14.22,11.56,52.88,22.43,20.38,21.28
max,2024.0,85.0,2411.95,29.414024,141.39,193.72,135.85,129.15,101.89,128.57,...,148.57,60.47,20.55,60.0,80.0,45.3,153.19,81.57,160.0,62.62


# Converting Rates to Totals

The features of our data are all in rate form, and it occurs to me that it may be useful to utilize the totals for our features, instead of the rates. Here I will convert all features that are in a rate format "feature/60" back to simply "feature".

Something convenient that I noticed is that TOI is already in the correct format. As opposed to minutes:seconds it has already been converted to total minutes

In [102]:
per_60_columns = oi_df.filter(like='/60').columns

totals_df = per_60_to_totals(oi_df, per_60_columns)

# Merge oi_df with new_df on axis = 1
oi_df = pd.concat([oi_df, totals_df], axis=1)


In [103]:
gp_df = per_60_to_GP(oi_df, per_60_columns)

oi_df = pd.concat([oi_df, gp_df], axis=1)

In [104]:
# Extract all columns that end in '/60'

per_60_columns = oi_df.filter(like='/60')

per_GP_columns = oi_df.filter(like='/GP')

percentage_columns = oi_df.filter(like='%')

# Extract all columns that are not per 60 or per GP

non_per_columns = oi_df.drop(per_60_columns.columns, axis=1).drop(per_GP_columns.columns, axis=1).drop(percentage_columns.columns, axis=1)

final_data = pd.concat([non_per_columns, per_GP_columns, per_60_columns, percentage_columns], axis=1)
final_data.columns.tolist()

['Year',
 'Player',
 'Team',
 'Position',
 'GP',
 'TOI',
 'PDO',
 'CF',
 'CA',
 'FF',
 'FA',
 'SF',
 'SA',
 'GF',
 'GA',
 'xGF',
 'xGA',
 'SCF',
 'SCA',
 'HDCF',
 'HDCA',
 'HDGF',
 'HDGA',
 'MDCF',
 'MDCA',
 'MDGF',
 'MDGA',
 'LDCF',
 'LDCA',
 'LDGF',
 'LDGA',
 'Off.\xa0Zone Starts',
 'Neu.\xa0Zone Starts',
 'Def.\xa0Zone Starts',
 'On\xa0The\xa0Fly Starts',
 'Off.\xa0Zone Faceoffs',
 'Neu.\xa0Zone Faceoffs',
 'Def.\xa0Zone Faceoffs',
 'TOI/GP',
 'CF/GP',
 'CA/GP',
 'FF/GP',
 'FA/GP',
 'SF/GP',
 'SA/GP',
 'GF/GP',
 'GA/GP',
 'xGF/GP',
 'xGA/GP',
 'SCF/GP',
 'SCA/GP',
 'HDCF/GP',
 'HDCA/GP',
 'HDGF/GP',
 'HDGA/GP',
 'MDCF/GP',
 'MDCA/GP',
 'MDGF/GP',
 'MDGA/GP',
 'LDCF/GP',
 'LDCA/GP',
 'LDGF/GP',
 'LDGA/GP',
 'Off.\xa0Zone Starts/GP',
 'Neu.\xa0Zone Starts/GP',
 'Def.\xa0Zone Starts/GP',
 'On\xa0The\xa0Fly Starts/GP',
 'Off.\xa0Zone Faceoffs/GP',
 'Neu.\xa0Zone Faceoffs/GP',
 'Def.\xa0Zone Faceoffs/GP',
 'CF/60',
 'CA/60',
 'FF/60',
 'FA/60',
 'SF/60',
 'SA/60',
 'GF/60',
 'GA/60',
 'x

# Save this data to disk

In [105]:
dir_path = '../../Data/Warehouse/PlayerData/'
filename = 'oi.csv'

if not os.path.exists(dir_path):
    os.makedirs(dir_path)

final_data.to_csv(f'{dir_path}/{filename}', index=False)