# NBA Data Wrangling

## Import relevant packages


In [56]:
import pandas as pd
import datetime as dt
import numpy as np
import json
import pickle
import time
import nba_py
from nba_py.player import PlayerList, PlayerGameLogs
from nba_py.game import Boxscore

## Clean Data
  
Cleaning the data scraped from NBA.com is going to be much simpler. The stats are split up nicely and the player names are already formatted the way we want them, so it's just a matter of unpacking the JSON and indexing by datetime.

### Import NBA Packages and Data

In [57]:
#bring in the list of players from the NBA API; the .info() result is used below as a table
#with PERSON_ID and DISPLAY_FIRST_LAST columns
#NOTE(review): only_current=0 presumably includes former players as well - confirm against nba_py
players = PlayerList(league_id='00', only_current=0).info()

#from Data Acquisition
def getlogs(player, season):
    """Fetch one player's game logs for one season from the NBA stats API.

    Parameters
    ----------
    player : str
        Display name, matched against ``players.DISPLAY_FIRST_LAST``.
    season : str
        Season label passed through to ``PlayerGameLogs`` (e.g. '2015-16').

    Returns
    -------
    The value of ``PlayerGameLogs(...).info()`` on success, or the placeholder
    list ``['ERROR']`` if building the request raised an exception. Later cells
    detect failures by checking for that list.
    """
    try:
        playerlogs = PlayerGameLogs(
            players.PERSON_ID.loc[players.DISPLAY_FIRST_LAST == player],
            season=season,
        )
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long scraping run.
        return ['ERROR']
    print(season, player)
    # Pause between requests (half a second) before returning the result.
    time.sleep(.5)
    return playerlogs.info()
    
#Load the raw game logs from disk (presumably written by the Data Acquisition step - confirm);
#the with-block makes sure the file handle is closed after loading
#NOTE: pickle.load can execute arbitrary code, so only load a gamelogsraw.p file you created yourself
with open('gamelogsraw.p', 'rb') as rawfile:
    gamelogs = pickle.load(rawfile)

### Clean Up Errors and Unpack JSON

It doesn't make sense that the NBA API would list players as active in a season and then not return values for them. First, let's identify the players that returned errors.

In [58]:
#build a set of (season, player) pairs whose entry is a list, i.e. the ['ERROR'] placeholder
#returned by getlogs, rather than a dataframe
{(s, player)
 for s, season_logs in gamelogs.items()
 for player, logs in season_logs.items()
 if isinstance(logs, list)}

{('2007-08', 'Channing Frye'),
 ('2007-08', 'Mehmet Okur'),
 ('2008-09', 'Alan Anderson'),
 ('2008-09', 'Jason Hart'),
 ('2008-09', 'Josh Smith'),
 ('2009-10', 'Jannero Pargo'),
 ('2010-11', 'Devin Ebanks'),
 ('2010-11', 'Kenyon Martin'),
 ('2010-11', 'Paul Pierce'),
 ('2011-12', 'Gerald Wallace'),
 ('2011-12', 'Tony Allen'),
 ('2011-12', 'Walker Russell'),
 ('2012-13', 'Matt Carroll'),
 ('2017-18', 'Ian Clark')}

A quick check on NBA.com/stats shows that these players did play games in those years. I will just run the scraping function again for these seasons and players to fill out the data.

In [59]:
#For every item in the dictionary of dataframes that is a list (['ERROR']), pull the data for that
#player from the NBA API again using the getlogs function; every other entry is kept as is
#This will run for about 15-20 seconds
gamelogs = {s: {player: (getlogs(player, s) if isinstance(logs, list) else logs)
                for player, logs in season_logs.items()}
            for s, season_logs in gamelogs.items()}

#List every player that still failed, grouped by season. A dict keyed only by season with a single
#player as the value would keep just one failure per season, so collect the names in a list.
#An empty dict means every retry succeeded
still_failed = {s: [player for player, logs in season_logs.items() if isinstance(logs, list)]
                for s, season_logs in gamelogs.items()}
{s: names for s, names in still_failed.items() if names}

2007-08 Channing Frye
2007-08 Mehmet Okur
2008-09 Alan Anderson
2008-09 Jason Hart
2008-09 Josh Smith
2009-10 Jannero Pargo
2010-11 Devin Ebanks
2010-11 Kenyon Martin
2010-11 Paul Pierce
2011-12 Gerald Wallace
2011-12 Tony Allen
2011-12 Walker Russell
2012-13 Matt Carroll
2017-18 Ian Clark


{}

### Unpack Data into 1 Table

First, let's take a look at one of the dataframes in the dictionary to see what steps to take in order to unpack it.

In [60]:
#peek at one player's game log for one season to see the layout of each per-player dataframe
gamelogs['2015-16']['Stephen Curry'].head()

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22015,201939,21501227,"APR 13, 2016",GSW vs. MEM,W,30,15,24,0.625,...,3,4,6,2,0,2,2,46,19,1
1,22015,201939,21501201,"APR 10, 2016",GSW @ SAS,W,35,13,22,0.591,...,5,5,5,2,0,4,4,37,7,1
2,22015,201939,21501190,"APR 09, 2016",GSW @ MEM,W,34,7,22,0.318,...,9,9,8,1,0,2,1,17,10,1
3,22015,201939,21501177,"APR 07, 2016",GSW vs. SAS,W,36,11,19,0.579,...,5,5,9,2,0,3,2,27,14,1
4,22015,201939,21501163,"APR 05, 2016",GSW vs. MIN,L,43,7,25,0.28,...,4,6,15,3,0,3,4,21,3,1


- The goal here is to stack each of these DataFrames on top of each other into one table with a datetime index and the player name as its own column

In [61]:
#Every dataframe for each player is combined into one table per season; pd.concat on a dict
#keeps the keys (player names) as the outer index level
season_tables = {season: pd.concat(gamelogs[season]) for season in gamelogs}

#Now concatenate all the season tables into one big table. pd.concat keeps the keys as index levels
#(season, then player) on top of each frame's own generic range index; drop that innermost level
#to keep just the season and player names
gamelogsdf = pd.concat(season_tables).reset_index(level=2, drop=True)

#Preview the first and last rows (DataFrame.append was removed in pandas 2.0, so use pd.concat)
pd.concat([gamelogsdf.head(), gamelogsdf.tail()])

Unnamed: 0,Unnamed: 1,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
2007-08,Aaron Brooks,22007,201166,20701226,"APR 16, 2008",HOU vs. LAC,W,17,3,8,0.375,...,1,1,1,2,0,0,2,8,-7,0
2007-08,Aaron Brooks,22007,201166,20701208,"APR 14, 2008",HOU @ UTA,L,14,2,5,0.4,...,0,0,3,0,0,1,4,8,-5,0
2007-08,Aaron Brooks,22007,201166,20701201,"APR 13, 2008",HOU @ DEN,L,17,2,8,0.25,...,0,0,3,0,0,1,3,7,-5,0
2007-08,Aaron Brooks,22007,201166,20701184,"APR 11, 2008",HOU vs. PHX,W,18,3,4,0.75,...,2,2,3,0,0,1,4,7,7,0
2007-08,Aaron Brooks,22007,201166,20701171,"APR 09, 2008",HOU vs. SEA,W,17,4,8,0.5,...,4,4,4,0,1,4,1,11,10,0
2017-18,Zhou Qi,22017,1627753,21700140,"NOV 05, 2017",HOU vs. UTA,W,5,1,2,0.5,...,1,1,0,0,0,1,0,4,4,1
2017-18,Zhou Qi,22017,1627753,21700123,"NOV 03, 2017",HOU @ ATL,W,7,0,3,0.0,...,1,1,0,0,0,0,1,2,-15,1
2017-18,Zhou Qi,22017,1627753,21700111,"NOV 01, 2017",HOU @ NYK,W,5,1,3,0.333,...,0,0,0,0,0,1,2,3,-4,2
2017-18,Zhou Qi,22017,1627753,21700080,"OCT 28, 2017",HOU @ MEM,L,3,0,1,0.0,...,1,1,0,0,0,0,0,0,5,1
2017-18,Zhou Qi,22017,1627753,21700032,"OCT 21, 2017",HOU vs. DAL,W,7,0,1,0.0,...,3,3,0,0,1,1,1,0,-7,1


### Create a datetime index

In [62]:
#Name the two index levels, then move them out of the index so that Season and Player
#become ordinary columns
gamelogsdf.index.names = ['Season', 'Player']
gamelogsdf = gamelogsdf.reset_index()
gamelogsdf.head()

Unnamed: 0,Season,Player,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,2007-08,Aaron Brooks,22007,201166,20701226,"APR 16, 2008",HOU vs. LAC,W,17,3,...,1,1,1,2,0,0,2,8,-7,0
1,2007-08,Aaron Brooks,22007,201166,20701208,"APR 14, 2008",HOU @ UTA,L,14,2,...,0,0,3,0,0,1,4,8,-5,0
2,2007-08,Aaron Brooks,22007,201166,20701201,"APR 13, 2008",HOU @ DEN,L,17,2,...,0,0,3,0,0,1,3,7,-5,0
3,2007-08,Aaron Brooks,22007,201166,20701184,"APR 11, 2008",HOU vs. PHX,W,18,3,...,2,2,3,0,0,1,4,7,7,0
4,2007-08,Aaron Brooks,22007,201166,20701171,"APR 09, 2008",HOU vs. SEA,W,17,4,...,4,4,4,0,1,4,1,11,10,0


In [63]:
#Now that we have season and name columns, let's make the GAME_DATE column a datetime index
#First parse the date strings (e.g. "APR 16, 2008") into datetime values
gamelogsdf['GAME_DATE'] = pd.to_datetime(gamelogsdf['GAME_DATE'], format="%b %d, %Y")

#Set the datetime column as the index. No sort is applied, so the rows keep their existing order,
#and the index is not unique because many rows share a game date. The parsed column already
#gives a DatetimeIndex, so a second pd.to_datetime call on the index is not needed
gamelogsdf = gamelogsdf.set_index('GAME_DATE')
gamelogsdf.head()

Unnamed: 0_level_0,Season,Player,SEASON_ID,Player_ID,Game_ID,MATCHUP,WL,MIN,FGM,FGA,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-04-16,2007-08,Aaron Brooks,22007,201166,20701226,HOU vs. LAC,W,17,3,8,...,1,1,1,2,0,0,2,8,-7,0
2008-04-14,2007-08,Aaron Brooks,22007,201166,20701208,HOU @ UTA,L,14,2,5,...,0,0,3,0,0,1,4,8,-5,0
2008-04-13,2007-08,Aaron Brooks,22007,201166,20701201,HOU @ DEN,L,17,2,8,...,0,0,3,0,0,1,3,7,-5,0
2008-04-11,2007-08,Aaron Brooks,22007,201166,20701184,HOU vs. PHX,W,18,3,4,...,2,2,3,0,0,1,4,7,7,0
2008-04-09,2007-08,Aaron Brooks,22007,201166,20701171,HOU vs. SEA,W,17,4,8,...,4,4,4,0,1,4,1,11,10,0


### Make sure stat data are numeric type

In [64]:
#pair every column name with the set of distinct Python types found among its values
[(name, {type(value) for value in gamelogsdf[name]}) for name in gamelogsdf.columns]

[('Season', {str}),
 ('Player', {str}),
 ('SEASON_ID', {str}),
 ('Player_ID', {int}),
 ('Game_ID', {str}),
 ('MATCHUP', {str}),
 ('WL', {str}),
 ('MIN', {int}),
 ('FGM', {int}),
 ('FGA', {int}),
 ('FG_PCT', {numpy.float64}),
 ('FG3M', {int}),
 ('FG3A', {int}),
 ('FG3_PCT', {numpy.float64}),
 ('FTM', {int}),
 ('FTA', {int}),
 ('FT_PCT', {numpy.float64}),
 ('OREB', {int}),
 ('DREB', {int}),
 ('REB', {int, float}),
 ('AST', {int}),
 ('STL', {int}),
 ('BLK', {int}),
 ('TOV', {int}),
 ('PF', {int}),
 ('PTS', {int}),
 ('PLUS_MINUS', {int}),
 ('VIDEO_AVAILABLE', {int})]

In [70]:
#convert all of the stat columns with pd.to_numeric so that averages and similar can be computed
cols = [
    'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
    'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS',
]

#replace each listed column with its numeric version in a single step
gamelogsdf = gamelogsdf.assign(**{name: pd.to_numeric(gamelogsdf[name]) for name in cols})

#re-inspect the value types of every column after the conversion
[(name, {type(value) for value in gamelogsdf[name]}) for name in gamelogsdf.columns]

[('Season', {str}),
 ('Player', {str}),
 ('SEASON_ID', {str}),
 ('Player_ID', {int}),
 ('Game_ID', {str}),
 ('MATCHUP', {str}),
 ('WL', {str}),
 ('MIN', {numpy.int64}),
 ('FGM', {numpy.int64}),
 ('FGA', {numpy.int64}),
 ('FG_PCT', {numpy.float64}),
 ('FG3M', {numpy.int64}),
 ('FG3A', {numpy.int64}),
 ('FG3_PCT', {numpy.float64}),
 ('FTM', {numpy.int64}),
 ('FTA', {numpy.int64}),
 ('FT_PCT', {numpy.float64}),
 ('OREB', {numpy.int64}),
 ('DREB', {numpy.int64}),
 ('REB', {numpy.float64}),
 ('AST', {numpy.int64}),
 ('STL', {numpy.int64}),
 ('BLK', {numpy.int64}),
 ('TOV', {numpy.int64}),
 ('PF', {numpy.int64}),
 ('PTS', {numpy.int64}),
 ('PLUS_MINUS', {numpy.int64}),
 ('VIDEO_AVAILABLE', {int})]

### Check for nulls


In [71]:
#show the number of null values in each column
gamelogsdf.isnull().sum()

Season              0
Player              0
SEASON_ID           0
Player_ID           0
Game_ID             0
MATCHUP             0
WL                  0
MIN                 0
FGM                 0
FGA                 0
FG_PCT              0
FG3M                0
FG3A                0
FG3_PCT             0
FTM                 0
FTA                 0
FT_PCT              0
OREB                0
DREB                0
REB                19
AST                 0
STL                 0
BLK                 0
TOV                 0
PF                  0
PTS                 0
PLUS_MINUS          0
VIDEO_AVAILABLE     0
dtype: int64

### Fill in nulls
Looks like we only have 19 null values in the entire dataset, all in the REB column. This is pretty good news, but we should fill in those values. Instead of filling in the null values with 0, let's fill them in with the number of rebounds each player was averaging that season.

In [99]:
#show the first few rows where the REB value is missing
gamelogsdf.loc[gamelogsdf.REB.isnull()].head()

Unnamed: 0_level_0,Season,Player,SEASON_ID,Player_ID,Game_ID,MATCHUP,WL,MIN,FGM,FGA,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-08,2010-11,Alexis Ajinca,22010,201582,21000542,DAL vs. ORL,L,2,2,2,...,1,,1,1,0,0,1,5,7,0
2011-01-08,2010-11,Brandon Bass,22010,101138,21000542,ORL @ DAL,W,26,5,7,...,3,,1,0,0,1,0,11,1,0
2011-01-08,2010-11,Brendan Haywood,22010,2217,21000542,DAL vs. ORL,L,22,2,3,...,6,,1,0,4,2,3,5,-8,0
2011-01-08,2010-11,Brian Cardinal,22010,2073,21000542,DAL vs. ORL,L,19,3,5,...,0,,1,2,0,1,1,9,-1,0
2011-01-08,2010-11,DeShawn Stevenson,22010,2052,21000542,DAL vs. ORL,L,35,8,13,...,1,,2,0,0,2,3,24,-7,0


In [139]:
#keep the rows whose REB value is missing
nullreb = gamelogsdf[gamelogsdf['REB'].isnull()]

#distinct dates on which rebounds were missing
set(nullreb.index)

{Timestamp('2011-01-08 00:00:00')}

In [157]:
#Group the data by season and player and fill each missing REB with that player's season average,
#rounded to a whole number (you can't have half a rebound).
#np.round is used instead of the builtin round: if a group has no REB values at all its mean is NaN,
#which np.round passes through (fillna then leaves the gap alone) while the builtin round can raise
#NOTE(review): REB is presumably OREB + DREB, and neither of those columns has nulls, so the missing
#totals could likely be reconstructed exactly instead of imputed - confirm before relying on the averages
gamelogsdf['REB'] = (gamelogsdf.groupby(['Season','Player'])['REB']
                     .transform(lambda x: x.fillna(np.round(x.mean()))))

#The date index has duplicates (one row per player per game) and is not sorted, so pick the rows
#for the affected dates with a boolean mask built from nullreb instead of a hard-coded date slice
fillrebdate = gamelogsdf.loc[gamelogsdf.index.isin(nullreb.index)]

#show the rebound data for the players that had null values before
fillrebdate.loc[fillrebdate.Player.isin(list(nullreb.Player)), ['Player','REB']]

Unnamed: 0_level_0,Player,REB
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-08,Alexis Ajinca,2.333333
2011-01-08,Brandon Bass,5.586667
2011-01-08,Brendan Haywood,5.197183
2011-01-08,Brian Cardinal,1.072727
2011-01-08,DeShawn Stevenson,1.478873
2011-01-08,Dominique Jones,1.352941
2011-01-08,Dwight Howard,14.090909
2011-01-08,Gilbert Arenas,2.695652
2011-01-08,Hedo Turkoglu,4.4125
2011-01-08,Ian Mahinmi,2.181818


In [161]:
#re-count the null values per column to confirm the REB gaps were filled
gamelogsdf.isnull().sum()

Season             0
Player             0
SEASON_ID          0
Player_ID          0
Game_ID            0
MATCHUP            0
WL                 0
MIN                0
FGM                0
FGA                0
FG_PCT             0
FG3M               0
FG3A               0
FG3_PCT            0
FTM                0
FTA                0
FT_PCT             0
OREB               0
DREB               0
REB                0
AST                0
STL                0
BLK                0
TOV                0
PF                 0
PTS                0
PLUS_MINUS         0
VIDEO_AVAILABLE    0
dtype: int64

In [163]:
#write the cleaned table to CSV; the GAME_DATE index is written out as the first column (to_csv default)
gamelogsdf.to_csv('gamelogsdf.csv')

### Done

The dataset is now cleaned up and ready to go.

## Merge gamelogs and roster data

The gamelogs data is pretty clean at this point, and in order to fill