# Taking a look at our Bio data #

Here we will take a quick look at the player metrics collected

In [202]:
import pandas as pd
import numpy as np
import dataprep as dp
from dataprep.eda import plot

In [203]:
# Load bio data from directory "../../Data/NHL_PlayerData_NatrualStatTrick/bio" using pandas dataframe

directory = "../../Data/NHL_PlayerData_NaturalStatTrick/bio"

bio_df = pd.DataFrame()

for year in range(2007, 2025):
    filename = f'regSeason_allStrengths_allScores_bio_{year}.csv'
    df = pd.read_csv(f'{directory}/{filename}')
    # Concatenate dataframes with bio_df
    bio_df = pd.concat([bio_df, df])

bio_df.shape

(16113, 17)

In [204]:
unique_rows = bio_df.drop_duplicates(subset=['Player', 'Year']).shape
print(unique_rows)
print(bio_df.shape)

(16104, 17)
(16113, 17)


## Cleaning Required ##

One thing that I noticed right away was that when I take a look at the Nationality column, some of the values are integers!

In [205]:
# View the column 'Nationality'
print(bio_df['Nationality'].head(20))
print(bio_df['Nationality'].sort_values().unique())

0     CAN
1     CAN
2     USA
3     CAN
4     CAN
5     CAN
6     CAN
7     USA
8     CAN
9     USA
10    CAN
11    USA
12    CAN
13    CAN
14     72
15     73
16    USA
17     74
18    USA
19     74
Name: Nationality, dtype: object
['-' '185' '187' '189' '196' '198' '2019' '2020' '2021' '2022' '2023'
 '207' '220' '68' '69' '70' '71' '72' '73' '74' '75' '76' '77' '78' '79'
 '80' '81' 'CAN' 'CHE' 'SWE' 'USA']


The values here are rather telling. CAN, USA, CHE, SWE and - make sense. Otherwise, whereas the rest do not. 

Digging into this a little further I can see that:
1. Values in range 68-81 are inches (height).
2. Values from 185 - 220 are lbs (weight)
3. Values from 2019 - 2023 are Draft Years

So I suspect what has happened is that when procuring the data, certain values were missing and those rows were truncated, with the incorrect values being extracted and placed in those columns. So, working from Nationality to the end of the rows, I will try to correct our bio data.

In [206]:
# Write a function that checks if a string is an integer
def is_integer(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# Create a mask that filters 'Nationality' column for rows that contain integers
mask = bio_df['Nationality'].apply(lambda x: is_integer(x))

In [207]:
mask

0      False
1      False
2      False
3      False
4      False
       ...  
881    False
882    False
883    False
884    False
885     True
Name: Nationality, Length: 16113, dtype: bool

In [208]:
# Get the columns to shift
columns_to_shift = bio_df.columns[9:-1]
columns_to_shift

Index(['Nationality', 'Height (in)', 'Weight (lbs)', 'Draft Year',
       'Draft Team', 'Draft Round', 'Round Pick'],
      dtype='object')

In [209]:
# Shift data to the right by 1 for the rows that returned True
bio_df_copy1 = bio_df.copy()

# Step 1: Extract the relevant data into a numpy array
data_to_shift = bio_df_copy1.loc[mask, columns_to_shift].to_numpy()

# Step 2: Create a new array for the shifted data, initialized with NaN
shifted_data = np.full(data_to_shift.shape, np.nan, dtype=object)

# Perform the horizontal shift by moving data one column to the right
shifted_data[:, 1:] = data_to_shift[:, :-1]

# Step 3: Update the original DataFrame with the shifted data
bio_df_copy1.loc[mask, columns_to_shift] = shifted_data

# Verify the changes
bio_df_copy1.loc[mask, columns_to_shift].head(20)

Unnamed: 0,Nationality,Height (in),Weight (lbs),Draft Year,Draft Team,Draft Round,Round Pick
14,,72,177,2001,EDM,1,13
15,,73,225,1998,BUF,6,19
17,,74,210,2000,L.A,1,20
19,,74,222,1991,NYR,1,15
20,,75,238,2004,WSH,1,1
22,,76,210,2004,VAN,3,26
23,,73,198,2004,OTT,4,24
24,,73,200,2004,NSH,1,15
25,,74,211,2002,WSH,1,13
30,,76,225,1998,TOR,4,1


## Checking Height (in) Column ##

Now lets do an analysis on the Height (in) column to make sure it is appropriate.

In [212]:
# View all the values in column Height (in)
print(bio_df_copy1['Height (in)'].unique())

[73 75 74 '72' '73' 72 '74' 71 '75' '76' '78' '71' 76 77 70 69 78 '79' 67
 68 '77' '70' 79 '69' '68' '81' 80 64 '80' 66 '189' '66' '67' '64' '187'
 '185' '220' '2023' '2020' '-' '2019' '2021' '198' '2022' '207' '196']


In [213]:
# Replace '-' with 0
bio_df_copy1['Height (in)'] = bio_df_copy1['Height (in)'].replace('-', 0)

# Convert the column 'Height (in)' to integer
bio_df_copy1['Height (in)'] = bio_df_copy1['Height (in)'].astype(int)

Here again we see values that do not make sense. Values from 185 - 220 are weights, while 2019 - 2023 are draft years. So lets again push the everything that doesn't belong one column to the right.

In [90]:
# Function that checks if a rows value is a float
def is_large(num):
    if pd.isnull(num) or num < 90:
        return False
    else:
        return True

In [214]:
# Create a mask that filters 'Height (in)' column for rows that contain large numbers
mask2 = bio_df_copy1['Height (in)'].apply(lambda x: is_large(x))

# Get the columns to shift
columns_to_shift2 = bio_df_copy1.columns[10:-1]

# Step 1: Extract the relevant data into a numpy array
data_to_shift2 = bio_df_copy1.loc[mask2, columns_to_shift2].to_numpy()

# Step 2: Create a new array for the shifted data, initialized with NaN
shifted_data2 = np.full(data_to_shift2.shape, np.nan, dtype=object)

# Perform the horizontal shift by moving data one column to the right
shifted_data2[:, 1:] = data_to_shift2[:, :-1]

# Step 3: Update the original DataFrame with the shifted data
bio_df_copy1.loc[mask2, columns_to_shift2] = shifted_data2

# Verify the changes
bio_df_copy1.loc[mask2, columns_to_shift2].head(20)

Unnamed: 0,Height (in),Weight (lbs),Draft Year,Draft Team,Draft Round,Round Pick
470,,189,-,-,-,-
458,,189,-,-,-,-
670,,187,-,-,-,-
461,,189,-,-,-,-
686,,187,-,-,-,-
512,,189,-,-,-,-
961,,185,2019,OTT,4,1
4,,220,2019,NYR,6,6
6,,2023,CBJ,1,3,3
110,,2020,VGK,1,29,29


In [219]:
# Replace NaN with 0
bio_df_copy1['Height (in)'] = bio_df_copy1['Height (in)'].fillna(0)

# Convert the column 'Height (in)' to integer
bio_df_copy1['Height (in)'] = bio_df_copy1['Height (in)'].astype(int)

# View the column 'Height (in)'
print(bio_df_copy1['Height (in)'].head(20))

0     73
1     73
2     75
3     73
4     74
5     74
6     75
7     73
8     74
9     74
10    73
11    73
12    74
13    73
14    72
15    73
16    72
17    74
18    71
19    74
Name: Height (in), dtype: int64


Now these values make sense. So we can go on to check the weight column

## Checking Weight (lbs) Column ##

Now lets do an analysis on the Weight (lbs) column to make sure it is appropriate.

In [222]:
# Replace all '-' values in Weight (lbs) column with 0
bio_df_copy1['Weight (lbs)'] = bio_df_copy1['Weight (lbs)'].replace('-', 0)

# Convert the Weight (lbs) column to int numeric data type
bio_df_copy1['Weight (lbs)'] = bio_df_copy1['Weight (lbs)'].fillna(0).astype('int64')

# View the column 'Height (in)'
print(bio_df_copy1['Weight (lbs)'].head(20))
print(bio_df_copy1['Weight (lbs)'].sort_values().unique())

0     215
1     204
2     210
3     220
4     210
5     209
6     205
7     195
8     220
9     212
10    208
11    201
12    215
13    204
14    177
15    225
16    200
17    210
18    173
19    222
Name: Weight (lbs), dtype: int64
[   0  140  153  155  156  160  161  162  163  164  165  166  167  168
  169  170  171  172  173  174  175  176  177  178  179  180  181  182
  183  184  185  186  187  188  189  190  191  192  193  194  195  196
  197  198  199  200  201  202  203  204  205  206  207  208  209  210
  211  212  213  214  215  216  217  218  219  220  221  222  223  224
  225  226  227  228  229  230  231  232  233  234  235  236  237  238
  239  240  242  243  244  245  246  247  248  249  250  253  255  257
  258  260  265 2019 2020 2021 2022 2023]


Most of these make sense but we still have Draft year data. So again lets push these columns over.

In [223]:
# Function that checks if a rows value is a float
def is_larger(num):
    if pd.isnull(num) or num < 500:
        return False
    else:
        return True

In [224]:
# Create a mask that filters 'Weight (lbs)' column for rows that contain large numbers
mask3 = bio_df_copy1['Weight (lbs)'].apply(lambda x: is_larger(x))

# Get the columns to shift
columns_to_shift3 = bio_df_copy1.columns[11:-1]

# Step 1: Extract the relevant data into a numpy array
data_to_shift3 = bio_df_copy1.loc[mask3, columns_to_shift3].to_numpy()

# Step 2: Create a new array for the shifted data, initialized with NaN
shifted_data3 = np.full(data_to_shift3.shape, np.nan, dtype=object)

# Perform the horizontal shift by moving data one column to the right
shifted_data3[:, 1:] = data_to_shift3[:, :-1]

# Step 3: Update the original DataFrame with the shifted data
bio_df_copy1.loc[mask3, columns_to_shift3] = shifted_data3

# Verify the changes
bio_df_copy1.loc[mask3, columns_to_shift3].head(20)

Unnamed: 0,Weight (lbs),Draft Year,Draft Team,Draft Round,Round Pick
6,,2023,CBJ,1,3
110,,2020,VGK,1,29
177,,2023,CHI,1,1
192,,2020,MIN,3,3
220,,2019,CBJ,4,21
247,,2021,ARI,4,11
275,,2020,T.B,2,31
303,,2022,WSH,1,20
318,,2020,T.B,3,31
363,,2019,MTL,2,15


In [226]:
# Convert the Weight (lbs) column to int numeric data type
bio_df_copy1['Weight (lbs)'] = bio_df_copy1['Weight (lbs)'].fillna(0).astype('int64')

# View the column 'Height (in)'
print(bio_df_copy1['Weight (lbs)'].head(20))
print(bio_df_copy1['Weight (lbs)'].sort_values().unique())

0     215
1     204
2     210
3     220
4     210
5     209
6     205
7     195
8     220
9     212
10    208
11    201
12    215
13    204
14    177
15    225
16    200
17    210
18    173
19    222
Name: Weight (lbs), dtype: int64
[  0 140 153 155 156 160 161 162 163 164 165 166 167 168 169 170 171 172
 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
 227 228 229 230 231 232 233 234 235 236 237 238 239 240 242 243 244 245
 246 247 248 249 250 253 255 257 258 260 265]


## Assessing the rest of the columns ##

Hopefully most of the hard work is done. Lets take a look at the remaining columns and their values

In [227]:
# Replace all '-' values in 'Draft Year' column with 0
bio_df_copy1['Draft Year'] = bio_df_copy1['Draft Year'].replace('-', 0)

# Use astype to modify the 'Draft Year' column to numeric
bio_df_copy1['Draft Year'] = bio_df_copy1['Draft Year'].astype(int)

# View the column 'Draft Year'
print(bio_df_copy1['Draft Year'].head(20))
print(bio_df_copy1['Draft Year'].sort_values().unique())

0        0
1     2001
2     1989
3     2002
4     2001
5     1991
6     2004
7     2002
8     1989
9     1999
10    1997
11    2004
12    1992
13    1993
14    2001
15    1998
16       0
17    2000
18    2004
19    1991
Name: Draft Year, dtype: int64
[   0 1981 1983 1984 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023]


This looks pretty good. Lets do the same with Draft Team

In [228]:
# View the column 'Draft Team'
print(bio_df_copy1['Draft Team'].head(20))
print(bio_df_copy1['Draft Team'].sort_values().unique())

0       -
1     CBJ
2     NYR
3     L.A
4     N.J
5     WIN
6     CHI
7     CHI
8     QUE
9     NSH
10    TOR
11    CBJ
12    VAN
13    FLA
14    EDM
15    BUF
16      -
17    L.A
18    PIT
19    NYR
Name: Draft Team, dtype: object
['-' '6' '7' 'ANA' 'ARI' 'ATL' 'BOS' 'BUF' 'CAR' 'CBJ' 'CGY' 'CHI' 'COL'
 'DAL' 'DET' 'EDM' 'FLA' 'HFD' 'L.A' 'MIN' 'MNS' 'MTL' 'N.J' 'NSH' 'NYI'
 'NYR' 'OTT' 'PHI' 'PHX' 'PIT' 'QUE' 'S.J' 'SEA' 'STL' 'T.B' 'TOR' 'VAN'
 'VGK' 'WIN' 'WPG' 'WSH']


'6' and '7' are not expected. Lets see what's going on here.

In [231]:
# Select all rows where 'Draft Team' is '6' or '7'
mask4 = bio_df_copy1['Draft Team'].isin(['6', '7'])

# Get the columns to shift
columns_to_shift4 = bio_df_copy1.columns[12:-1]

# Step 1: Extract the relevant data into a numpy array
data_to_shift4 = bio_df_copy1.loc[mask4, columns_to_shift4].to_numpy()

# Step 2: Create a new array for the shifted data, initialized with NaN
shifted_data4 = np.full(data_to_shift4.shape, np.nan, dtype=object)

# Perform the horizontal shift by moving data one column to the right
shifted_data4[:, 1:] = data_to_shift4[:, :-1]

# Step 3: Update the original DataFrame with the shifted data
bio_df_copy1.loc[mask4, columns_to_shift4] = shifted_data4

# Verify the changes
bio_df_copy1.loc[mask4, columns_to_shift4].head(20)


Unnamed: 0,Draft Year,Draft Team,Draft Round,Round Pick
312,,2002,7,0
317,,2002,7,0
427,,2008,6,19


In [232]:
# View the column 'Draft Round'
print(bio_df_copy1['Draft Round'].head(20))
print(bio_df_copy1['Draft Round'].sort_values().unique())

0     -
1     3
2     5
3     4
4     8
5     1
6     3
7     9
8     2
9     2
10    4
11    2
12    5
13    6
14    1
15    6
16    -
17    1
18    2
19    1
Name: Draft Round, dtype: object
['-' '1' '10' '11' '2' '3' '4' '5' '6' '7' '8' '9' nan]


In [101]:
# View the column 'Round Pick'
print(bio_df['Round Pick'].head(20))
print(bio_df['Round Pick'].sort_values().unique())

0      -
1     22
2     20
3      8
4      6
5      4
6      3
7     19
8      4
9     19
10     4
11     -
12     4
13    16
14    29
15    15
16    16
17    29
18     9
19     2
Name: Round Pick, dtype: object
['-' '1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21'
 '22' '23' '24' '25' '26' '27' '28' '29' '3' '30' '31' '32' '33' '36' '4'
 '5' '6' '7' '8' '9' nan]


In [233]:
# View the column 'Overall Draft Position'
print(bio_df_copy1['Overall Draft Position'].head(20))
print(bio_df_copy1['Overall Draft Position'].sort_values().unique())

0       -
1      85
2      88
3     104
4     229
5       5
6      68
7     282
8      22
9      52
10     84
11     46
12    117
13    135
14    NaN
15    NaN
16      -
17    NaN
18     61
19    NaN
Name: Overall Draft Position, dtype: object
['-' '1' '10' '100' '101' '102' '103' '104' '105' '106' '107' '108' '109'
 '11' '110' '111' '112' '113' '114' '115' '116' '117' '118' '119' '12'
 '120' '121' '122' '123' '124' '126' '127' '128' '129' '13' '130' '131'
 '132' '133' '134' '135' '136' '137' '138' '139' '14' '140' '141' '142'
 '143' '144' '145' '146' '147' '148' '149' '15' '150' '151' '152' '153'
 '154' '155' '156' '157' '158' '159' '16' '160' '161' '162' '163' '164'
 '165' '166' '167' '168' '169' '17' '170' '171' '172' '173' '174' '175'
 '176' '177' '178' '179' '18' '180' '181' '182' '183' '184' '185' '186'
 '187' '188' '189' '19' '190' '191' '192' '193' '194' '195' '196' '197'
 '198' '199' '2' '20' '200' '201' '202' '203' '204' '205' '206' '207'
 '208' '209' '21' '210' '211' '212' '

It looks like our data has now been properly cleaned, so lets save the cleaned files.

In [237]:
# Confirm that the shape has been maintained and no strange things have happened:
unique_rows = bio_df_copy1.drop_duplicates(subset=['Player', 'Year']).shape
print(unique_rows)
unique_rows_original = bio_df.drop_duplicates(subset=['Player', 'Year']).shape
print(unique_rows_original)

(16104, 17)
(16104, 17)


In [238]:
# Sort the data by Player and then by Year
bio_df_copy1 = bio_df_copy1.sort_values(by=['Player', 'Year'])

# Save the cleaned bio_df to a csv file
bio_df_copy1.to_csv('../../Data/NHL_PlayerData_NaturalStatTrick/bio/cleaned_bio.csv', index=False)