In [26]:
import os
import re
from os import listdir

import pandas as pd

In [23]:
def read_advanced_season_stats(filepath):
    '''
    Read one season's advanced stats csv and tag every row with its season.

    Inputs: filepath--a path to the file (*advanced.csv).  The path must contain
            the season years as four-digit numbers,
            e.g. '2006-2007_player_advanced.csv'
    Returns: dataframe of per season advanced stats, with an added integer
             'Season' column holding the last four-digit number found in the
             path (the ending year, 2007 for the example above)
    Raises: ValueError if no four-digit year can be found in filepath
    '''
    df = pd.read_csv(filepath)
    #pull the seasonal years from the filepath
    years = re.findall(r"\d\d\d\d", filepath)
    if not years:
        #fail with a readable message instead of an IndexError on years[-1]
        raise ValueError(
            "Cannot determine the season: no four-digit year in '{}'".format(filepath)
        )
    #remove the "blank" columns that were added via the data mining recipe;
    #errors='ignore' lets files that never had those columns load as well
    df = df.drop(['Blank', 'Blank.1'], axis=1, errors='ignore')
    #Get the ending year for the season (the last year that appears in the path)
    df['Season'] = int(years[-1])

    return df

In [24]:
# Load a single season (2006-2007) so the data can be inspected on a small scale
df0607 = read_advanced_season_stats('../data/advanced/2006-2007_player_advanced.csv')

Some rows contain NaNs (in `TS%`, `3PAr`, `FTr`, and `TOV%`), probably for players who logged very few minutes.

In [25]:
# Column dtypes and non-null counts first, then every row with at least one missing value
df0607.info()
df0607.loc[df0607.isnull().any(axis='columns')]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 28 columns):
Rk        516 non-null int64
Player    516 non-null object
Pos       516 non-null object
Age       516 non-null int64
Tm        516 non-null object
G         516 non-null int64
MP        516 non-null int64
PER       516 non-null float64
TS%       513 non-null float64
3PAr      513 non-null float64
FTr       513 non-null float64
ORB%      516 non-null float64
DRB%      516 non-null float64
TRB%      516 non-null float64
AST%      516 non-null float64
STL%      516 non-null float64
BLK%      516 non-null float64
TOV%      513 non-null float64
USG%      516 non-null float64
OWS       516 non-null float64
DWS       516 non-null float64
WS        516 non-null float64
WSp48     516 non-null float64
OBPM      516 non-null float64
DBPM      516 non-null float64
BPM       516 non-null float64
VORP      516 non-null float64
Season    516 non-null int64
dtypes: float64(20), int64(5), object(3)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,USG%,OWS,DWS,WS,WSp48,OBPM,DBPM,BPM,VORP,Season
9,9,Lou Amundson,PF,24,UTA,1,2,0.0,,,...,0.0,0.0,0.0,0.0,-0.004,-5.3,0.2,-5.1,0.0,2007
94,88,Will Conroy,SG,24,MEM,3,17,4.2,,,...,0.0,0.0,0.0,0.0,0.035,-2.9,-0.1,-3.1,0.0,2007
398,351,Jeremy Richardson,SF,22,POR,1,1,0.0,,,...,0.0,0.0,0.0,0.0,-0.021,-5.7,-0.2,-5.8,0.0,2007


In [14]:
# Same incomplete rows, transposed so each stat is a row and every column stays visible
df0607.loc[df0607.isnull().any(axis='columns')].T

Unnamed: 0,9,94,398
Rk,9,88,351
Player,Lou Amundson,Will Conroy,Jeremy Richardson
Pos,PF,SG,SF
Age,24,24,22
Tm,UTA,MEM,POR
G,1,3,1
MP,2,17,1
PER,0,4.2,0
TS%,,,
3PAr,,,


In [28]:
# List the per-season csv files that are available; listdir returns names in arbitrary order
listdir('../data/advanced')

['1998-1999_player_advanced.csv',
 '2006-2007_player_advanced.csv',
 '2007-2008_player_advanced.csv',
 '2015-2016_player_advanced.csv',
 '2011-2012_player_advanced.csv',
 '2003-2003_player_advanced.csv',
 '2010-2011_player_advanced.csv',
 '1999-2000_player_advanced.csv',
 '2014-2015_player_advanced.csv',
 '2003-2004_player_advanced.csv',
 '2012-2013_player_advanced.csv',
 '2001-2002_player_advanced.csv',
 '2017-2018_player_advanced.csv',
 '1997-1998_player_advanced.csv',
 '2016-2017_player_advanced.csv',
 '2008-2009_player_advanced.csv',
 '2005-2006_player_advanced.csv',
 '2004-2005_player_advanced.csv',
 '2009-2010_player_advanced.csv',
 '2013-2014_player_advanced.csv',
 '2000-2001_player_advanced.csv']

In [36]:
def read_all_advanced(pathtodir, ignore_index=False):
    '''
    Read every advanced stat csv in a directory into separate dataframes
    Concats the per_season dataframes together, and returns a big dataframe

    Input: pathtodir -- the path to the directory that contains the per_season advanced stat csv files
           ignore_index -- if True, renumber the rows 0..n-1; the default (False) keeps each
                           file's own row labels, so index values repeat across seasons
    NOTE: entries whose name does not end in .csv are skipped, but every csv in the
          directory must be an advanced stat file.  Do not mix file types.
    Returns: dataframe of all advanced stats, uncleaned, with seasons in file-name order
    Raises: ValueError if the directory contains no csv files
    '''
    #get the individual files; listdir returns names in arbitrary order, so sort
    #them to make the row order reproducible, and skip non-csv entries
    #(e.g. .ipynb_checkpoints or .DS_Store) that read_csv cannot parse
    files = sorted(name for name in listdir(pathtodir) if name.lower().endswith('.csv'))
    if not files:
        raise ValueError("No csv files found in '{}'".format(pathtodir))
    #get a list of dataframes, one for every file
    dataframes = [read_advanced_season_stats(os.path.join(pathtodir, name)) for name in files]
    #assemble the dataframes together, and return them.
    return pd.concat(dataframes, ignore_index=ignore_index)

In [33]:
# Read every season's file and stack them into one uncleaned dataframe
advanced = read_all_advanced('../data/advanced')

In [34]:
# Average every numeric stat per season.  numeric_only=True skips the text columns
# (Player, Pos, Tm), which would otherwise raise a TypeError on pandas >= 2.0.
advanced.groupby("Season").mean(numeric_only=True)

Unnamed: 0_level_0,Rk,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,...,TOV%,USG%,OWS,DWS,WS,WSp48,OBPM,DBPM,BPM,VORP
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998,226.605119,27.471664,49.091408,1164.248629,12.505667,0.496919,0.157774,0.33991,6.51755,13.295978,...,15.637064,19.206399,1.231079,1.168556,2.400548,0.065322,-1.805484,-0.637112,-2.442413,0.53894
1999,215.287968,27.481262,31.406312,731.723866,12.024063,0.47875,0.155952,0.354135,6.56213,13.199408,...,15.595238,19.544379,0.812032,0.741223,1.556805,0.055609,-2.066272,-0.957396,-3.023866,0.362327
2000,216.266129,27.620968,51.340726,1200.512097,12.353831,0.494147,0.155715,0.310613,6.159274,13.896573,...,14.716566,18.678427,1.283065,1.221774,2.507258,0.067054,-1.805645,-0.564113,-2.36875,0.58246
2001,221.061453,27.86406,49.126629,1161.080074,11.996834,0.485656,0.14981,0.303987,6.026071,13.990503,...,15.613109,18.441341,1.245251,1.177095,2.423091,0.060337,-2.079516,-0.529236,-2.609125,0.548417
2002,219.704,27.24,50.964,1221.54,12.677,0.492022,0.164268,0.304308,5.974,13.8276,...,14.048893,18.7788,1.3418,1.2222,2.5654,0.070872,-1.6888,-0.5174,-2.2078,0.6014
2003,212.923395,27.140787,52.440994,1259.111801,12.212836,0.480056,0.156915,0.299722,6.159627,13.750518,...,14.76639,18.75176,1.380538,1.264803,2.640787,0.062516,-1.961905,-0.484886,-2.44472,0.614286
2004,225.576068,27.242735,46.953846,1116.661538,11.980684,0.483508,0.158675,0.307187,5.85812,13.918803,...,15.419316,18.683419,1.208205,1.122222,2.332137,0.061056,-2.141709,-0.627692,-2.77094,0.517265
2005,233.553846,27.206838,47.998291,1131.010256,12.378291,0.49763,0.181257,0.31618,6.077949,13.777436,...,14.582192,18.517094,1.195214,1.141538,2.337778,0.069586,-1.765812,-0.471966,-2.237607,0.53812
2006,228.639432,26.657194,48.539964,1147.493783,12.004635,0.504821,0.181578,0.345953,5.827094,14.363993,...,14.854562,18.556863,1.225933,1.152043,2.38206,0.062248,-2.067673,-0.533215,-2.600355,0.538721
2007,228.893411,26.544574,51.397287,1220.99031,12.18469,0.508273,0.198981,0.352622,5.846318,14.12655,...,15.157895,18.563953,1.317829,1.23876,2.557558,0.065671,-1.777713,-0.507752,-2.283915,0.595155


In [35]:
# Check dtypes and non-null counts of the combined frame.  Index labels repeat across
# seasons because the concat keeps each file's own row numbers.
advanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11922 entries, 0 to 536
Data columns (total 28 columns):
Rk        11922 non-null int64
Player    11922 non-null object
Pos       11922 non-null object
Age       11922 non-null int64
Tm        11922 non-null object
G         11922 non-null int64
MP        11922 non-null int64
PER       11917 non-null float64
TS%       11862 non-null float64
3PAr      11858 non-null float64
FTr       11858 non-null float64
ORB%      11917 non-null float64
DRB%      11917 non-null float64
TRB%      11917 non-null float64
AST%      11917 non-null float64
STL%      11917 non-null float64
BLK%      11917 non-null float64
TOV%      11873 non-null float64
USG%      11917 non-null float64
OWS       11922 non-null float64
DWS       11922 non-null float64
WS        11922 non-null float64
WSp48     11917 non-null float64
OBPM      11922 non-null float64
DBPM      11922 non-null float64
BPM       11922 non-null float64
VORP      11922 non-null float64
Season    119