# NBA Data Wrangling

## Import relevant packages


In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import time
import nba_py
import re
from nba_py.player import PlayerList
from nba_py import team
pd.set_option('display.max_columns', 50)

In [2]:
plyr_logs = pd.read_csv('player_logs.csv', index_col = 0).drop_duplicates()

plyr_logs.head()

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22000,711,Jerry Stackhouse,1610612765,DET,Detroit Pistons,20001068,2001-04-03,DET @ CHI,W,42,21,36,0.583,4,11,0.364,11,13,0.846,0,4,4.0,5,1,0,4,2,57,34,0
1,22000,947,Allen Iverson,1610612755,PHI,Philadelphia 76ers,20000477,2001-01-06,PHI @ CLE,W,44,20,30,0.667,4,7,0.571,10,13,0.769,0,3,3.0,3,3,1,4,3,54,5,0
2,22000,960,Tony Delk,1610612756,PHX,Phoenix Suns,20000449,2001-01-02,PHX @ SAC,L,50,20,27,0.741,0,1,0.0,13,15,0.867,3,3,6.0,0,1,0,2,1,53,7,0
3,22000,947,Allen Iverson,1610612755,PHI,Philadelphia 76ers,20000579,2001-01-21,PHI vs. TOR,L,51,20,40,0.5,4,8,0.5,7,8,0.875,1,2,3.0,4,1,0,4,4,51,-8,0
4,22000,185,Chris Webber,1610612758,SAC,Sacramento Kings,20000471,2001-01-05,SAC vs. IND,L,50,24,47,0.511,0,1,0.0,3,6,0.5,10,16,26.0,5,3,2,4,4,51,-1,0


In [3]:
#change all column names to lower case
plyr_logs.columns = [x.lower() for x in plyr_logs.columns]

#rename min to mp ('minutes played) so there is no confusion with the minimum function
plyr_logs.rename(columns = {'min' : 'mp'}, inplace = True)

#drop the video_available column ,it is unnecessary for our purposes
plyr_logs = plyr_logs.drop('video_available',1)
plyr_logs.columns

Index(['season_id', 'player_id', 'player_name', 'team_id', 'team_abbreviation',
       'team_name', 'game_id', 'game_date', 'matchup', 'wl', 'mp', 'fgm',
       'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct',
       'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
       'plus_minus'],
      dtype='object')

In [4]:
#Now that we have season and name columns, lets make the GAME_DATE column a data time index
#First convert the column to datetime format, only keeping the date
plyr_logs.GAME_DATE = pd.to_datetime(plyr_logs.game_date)

#Set the datetime column as the index and sort it
plyr_logs = plyr_logs.set_index('game_date')
plyr_logs.index = pd.to_datetime(plyr_logs.index)
plyr_logs = plyr_logs.sort_index()
plyr_logs.head()

Unnamed: 0_level_0,season_id,player_id,player_name,team_id,team_abbreviation,team_name,game_id,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2000-10-31,22000,672,Matt Bullard,1610612745,HOU,Houston Rockets,20000008,HOU vs. MIN,L,8,0,1,0.0,0,1,0.0,0,0,,0,1,1.0,0,0,0,1,1,0,-2
2000-10-31,22000,951,Ray Allen,1610612749,MIL,Milwaukee Bucks,20000007,MIL @ DAL,L,40,8,17,0.471,2,6,0.333,8,8,1.0,2,2,4.0,2,0,0,1,1,26,-9
2000-10-31,22000,714,Michael Finley,1610612742,DAL,Dallas Mavericks,20000007,DAL vs. MIL,W,42,10,21,0.476,0,5,0.0,6,8,0.75,3,6,9.0,8,2,0,2,2,26,13
2000-10-31,22000,361,Clifford Robinson,1610612756,PHX,Phoenix Suns,20000011,PHX @ GSW,L,42,10,25,0.4,2,6,0.333,4,6,0.667,1,4,5.0,3,2,1,1,2,26,-3
2000-10-31,22000,739,Rasheed Wallace,1610612757,POR,Portland Trail Blazers,20000012,POR vs. LAL,L,47,11,17,0.647,2,2,1.0,2,2,1.0,3,4,7.0,2,0,1,3,1,26,-8


In [5]:
#create year month and day, and season columns
plyr_logs.insert(0,'day',plyr_logs.index.day)

plyr_logs.insert(0,'month',plyr_logs.index.month)

plyr_logs.insert(0,'year',plyr_logs.index.year)

season = [int(str(x)[-4:]) for x in plyr_logs.season_id]
plyr_logs.insert(4,'season',season)

plyr_logs.reset_index(inplace = True)

plyr_logs.head()

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,game_id,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
0,2000-10-31,2000,10,31,22000,2000,672,Matt Bullard,1610612745,HOU,Houston Rockets,20000008,HOU vs. MIN,L,8,0,1,0.0,0,1,0.0,0,0,,0,1,1.0,0,0,0,1,1,0,-2
1,2000-10-31,2000,10,31,22000,2000,951,Ray Allen,1610612749,MIL,Milwaukee Bucks,20000007,MIL @ DAL,L,40,8,17,0.471,2,6,0.333,8,8,1.0,2,2,4.0,2,0,0,1,1,26,-9
2,2000-10-31,2000,10,31,22000,2000,714,Michael Finley,1610612742,DAL,Dallas Mavericks,20000007,DAL vs. MIL,W,42,10,21,0.476,0,5,0.0,6,8,0.75,3,6,9.0,8,2,0,2,2,26,13
3,2000-10-31,2000,10,31,22000,2000,361,Clifford Robinson,1610612756,PHX,Phoenix Suns,20000011,PHX @ GSW,L,42,10,25,0.4,2,6,0.333,4,6,0.667,1,4,5.0,3,2,1,1,2,26,-3
4,2000-10-31,2000,10,31,22000,2000,739,Rasheed Wallace,1610612757,POR,Portland Trail Blazers,20000012,POR vs. LAL,L,47,11,17,0.647,2,2,1.0,2,2,1.0,3,4,7.0,2,0,1,3,1,26,-8


### Make sure stat data are numeric type

In [6]:
#create tuples of column name and the unique types of the values in each column
[(col,set([type(x) for x in plyr_logs[col]])) for col in list(plyr_logs.columns)]

[('game_date', {pandas._libs.tslib.Timestamp}),
 ('year', {numpy.int64}),
 ('month', {numpy.int64}),
 ('day', {numpy.int64}),
 ('season_id', {numpy.int64}),
 ('season', {numpy.int64}),
 ('player_id', {numpy.int64}),
 ('player_name', {str}),
 ('team_id', {numpy.int64}),
 ('team_abbreviation', {str}),
 ('team_name', {str}),
 ('game_id', {numpy.int64}),
 ('matchup', {str}),
 ('wl', {str}),
 ('mp', {numpy.int64}),
 ('fgm', {numpy.int64}),
 ('fga', {numpy.int64}),
 ('fg_pct', {numpy.float64}),
 ('fg3m', {numpy.int64}),
 ('fg3a', {numpy.int64}),
 ('fg3_pct', {numpy.float64}),
 ('ftm', {numpy.int64}),
 ('fta', {numpy.int64}),
 ('ft_pct', {numpy.float64}),
 ('oreb', {numpy.int64}),
 ('dreb', {numpy.int64}),
 ('reb', {numpy.float64}),
 ('ast', {numpy.int64}),
 ('stl', {numpy.int64}),
 ('blk', {numpy.int64}),
 ('tov', {numpy.int64}),
 ('pf', {numpy.int64}),
 ('pts', {numpy.int64}),
 ('plus_minus', {numpy.int64})]

### Check for nulls


In [7]:
#print the amount of null values in each column
plyr_logs.isnull().sum()

game_date                 0
year                      0
month                     0
day                       0
season_id                 0
season                    0
player_id                 0
player_name               0
team_id                   0
team_abbreviation         0
team_name                 0
game_id                   0
matchup                   0
wl                        0
mp                        0
fgm                       0
fga                       0
fg_pct                23028
fg3m                      0
fg3a                      0
fg3_pct              195816
ftm                       0
fta                       0
ft_pct               182287
oreb                      0
dreb                      0
reb                      10
ast                       0
stl                       0
blk                       0
tov                       0
pf                        0
pts                       0
plus_minus                0
dtype: int64

### Fill in nulls
Looks like we only have 19 values in the entire dataset that are null values. This is pretty good news, but we should fill in those values. Instead of filling in the null values with 0, lets fill them in with the amount of rebounds each player was averaging that season.

In [8]:
plyr_logs.loc[plyr_logs.reb.isnull()].head()

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,game_id,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
31248,2001-12-20,2001,12,20,22001,2001,2226,Will Solomon,1610612763,MEM,Memphis Grizzlies,20100358,MEM @ NYK,W,0,0,0,,0,0,,0,0,,0,0,,0,0,0,0,0,0,0
31275,2001-12-20,2001,12,20,22001,2001,1510,Brevin Knight,1610612763,MEM,Memphis Grizzlies,20100358,MEM @ NYK,W,12,2,5,0.4,0,0,,3,4,0.75,0,0,,2,4,0,2,3,7,-8
32718,2001-12-31,2001,12,31,22001,2001,954,Kerry Kittles,1610612751,NJN,New Jersey Nets,20100432,NJN @ WAS,L,27,6,10,0.6,0,1,0.0,2,2,1.0,0,0,,2,2,0,0,0,14,-15
32732,2001-12-31,2001,12,31,22001,2001,2220,Brandon Armstrong,1610612751,NJN,New Jersey Nets,20100432,NJN @ WAS,L,3,0,2,0.0,0,0,,0,0,,0,0,,0,0,0,0,0,0,-1
72274,2003-10-31,2003,10,31,22003,2003,1762,Tremaine Fowlkes,1610612765,DET,Detroit Pistons,20300020,DET @ MIA,W,1,0,0,,0,0,,0,0,,0,0,,0,0,0,0,0,0,-3


In [9]:
#save dataframe of null rebounds
nullreb = plyr_logs.loc[plyr_logs.reb.isnull()]
nullreb

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,game_id,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
31248,2001-12-20,2001,12,20,22001,2001,2226,Will Solomon,1610612763,MEM,Memphis Grizzlies,20100358,MEM @ NYK,W,0,0,0,,0,0,,0,0,,0,0,,0,0,0,0,0,0,0
31275,2001-12-20,2001,12,20,22001,2001,1510,Brevin Knight,1610612763,MEM,Memphis Grizzlies,20100358,MEM @ NYK,W,12,2,5,0.4,0,0,,3,4,0.75,0,0,,2,4,0,2,3,7,-8
32718,2001-12-31,2001,12,31,22001,2001,954,Kerry Kittles,1610612751,NJN,New Jersey Nets,20100432,NJN @ WAS,L,27,6,10,0.6,0,1,0.0,2,2,1.0,0,0,,2,2,0,0,0,14,-15
32732,2001-12-31,2001,12,31,22001,2001,2220,Brandon Armstrong,1610612751,NJN,New Jersey Nets,20100432,NJN @ WAS,L,3,0,2,0.0,0,0,,0,0,,0,0,,0,0,0,0,0,0,-1
72274,2003-10-31,2003,10,31,22003,2003,1762,Tremaine Fowlkes,1610612765,DET,Detroit Pistons,20300020,DET @ MIA,W,1,0,0,,0,0,,0,0,,0,0,,0,0,0,0,0,0,-3
72286,2003-10-31,2003,10,31,22003,2003,961,John Wallace,1610612748,MIA,Miami Heat,20300020,MIA vs. DET,L,6,1,4,0.25,0,0,,1,2,0.5,0,0,,0,1,0,0,2,3,2
72300,2003-10-31,2003,10,31,22003,2003,1088,Chucky Atkins,1610612765,DET,Detroit Pistons,20300020,DET @ MIA,W,16,1,5,0.2,0,2,0.0,3,4,0.75,0,0,,0,0,0,1,3,5,-11
73403,2003-11-08,2003,11,8,22003,2003,1709,Michael Olowokandi,1610612750,MIN,Minnesota Timberwolves,20300077,MIN @ MIA,W,10,0,2,0.0,0,0,,0,0,,0,0,,0,0,0,2,2,0,-2
256045,2011-01-08,2011,1,8,22010,2010,2073,Brian Cardinal,1610612742,DAL,Dallas Mavericks,21000542,DAL vs. ORL,L,19,3,5,0.6,3,4,0.75,0,0,,0,0,,1,2,0,1,1,9,-1
256145,2011-01-08,2011,1,8,22010,2010,101133,Ian Mahinmi,1610612742,DAL,Dallas Mavericks,21000542,DAL vs. ORL,L,2,2,2,1.0,0,0,,1,1,1.0,0,0,,0,0,0,0,0,5,7


All players with null rebound values had 0 offensive and 0 defensive rebounds, so we can fill the null values with 0's

In [10]:
plyr_logs.reb = plyr_logs.reb.fillna(0)
plyr_logs.reb.isnull().sum()

0

In [11]:
plyr_logs.columns

Index(['game_date', 'year', 'month', 'day', 'season_id', 'season', 'player_id',
       'player_name', 'team_id', 'team_abbreviation', 'team_name', 'game_id',
       'matchup', 'wl', 'mp', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a',
       'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'plus_minus'],
      dtype='object')

### Split Matchup Column

We should have a column showing the team of the player in that row and the opponent team as well. The MATCHUP column always has the team of the player first and the oppenent last.


In [12]:
#create a list for the players team by taking the first 3 letter of the string
plyrTeam = [m[:3] for m in plyr_logs.matchup]

#insert that list right before the MATCHUP column
plyr_logs.insert(11,'team',plyrTeam)

#do the same with the opponenet team, taking the last 3 characters
plyrOpp = [m[-3:] for m in plyr_logs.matchup]
plyr_logs.insert(12,'opp',plyrOpp)

In [13]:
plyr_logs.head()

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,team,opp,game_id,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
0,2000-10-31,2000,10,31,22000,2000,672,Matt Bullard,1610612745,HOU,Houston Rockets,HOU,MIN,20000008,HOU vs. MIN,L,8,0,1,0.0,0,1,0.0,0,0,,0,1,1.0,0,0,0,1,1,0,-2
1,2000-10-31,2000,10,31,22000,2000,951,Ray Allen,1610612749,MIL,Milwaukee Bucks,MIL,DAL,20000007,MIL @ DAL,L,40,8,17,0.471,2,6,0.333,8,8,1.0,2,2,4.0,2,0,0,1,1,26,-9
2,2000-10-31,2000,10,31,22000,2000,714,Michael Finley,1610612742,DAL,Dallas Mavericks,DAL,MIL,20000007,DAL vs. MIL,W,42,10,21,0.476,0,5,0.0,6,8,0.75,3,6,9.0,8,2,0,2,2,26,13
3,2000-10-31,2000,10,31,22000,2000,361,Clifford Robinson,1610612756,PHX,Phoenix Suns,PHX,GSW,20000011,PHX @ GSW,L,42,10,25,0.4,2,6,0.333,4,6,0.667,1,4,5.0,3,2,1,1,2,26,-3
4,2000-10-31,2000,10,31,22000,2000,739,Rasheed Wallace,1610612757,POR,Portland Trail Blazers,POR,LAL,20000012,POR vs. LAL,L,47,11,17,0.647,2,2,1.0,2,2,1.0,3,4,7.0,2,0,1,3,1,26,-8


### Add Home or Away Column (1 home, 0 away)

In [14]:
#do the same with the opponenet team, taking the last 3 characters
home_away = [0 if '@' in m else 1 for m in plyr_logs.matchup]
plyr_logs.insert(plyr_logs.columns.get_loc('matchup'),'home',home_away)
plyr_logs.head(10)

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,team,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
0,2000-10-31,2000,10,31,22000,2000,672,Matt Bullard,1610612745,HOU,Houston Rockets,HOU,MIN,20000008,1,HOU vs. MIN,L,8,0,1,0.0,0,1,0.0,0,0,,0,1,1.0,0,0,0,1,1,0,-2
1,2000-10-31,2000,10,31,22000,2000,951,Ray Allen,1610612749,MIL,Milwaukee Bucks,MIL,DAL,20000007,0,MIL @ DAL,L,40,8,17,0.471,2,6,0.333,8,8,1.0,2,2,4.0,2,0,0,1,1,26,-9
2,2000-10-31,2000,10,31,22000,2000,714,Michael Finley,1610612742,DAL,Dallas Mavericks,DAL,MIL,20000007,1,DAL vs. MIL,W,42,10,21,0.476,0,5,0.0,6,8,0.75,3,6,9.0,8,2,0,2,2,26,13
3,2000-10-31,2000,10,31,22000,2000,361,Clifford Robinson,1610612756,PHX,Phoenix Suns,PHX,GSW,20000011,0,PHX @ GSW,L,42,10,25,0.4,2,6,0.333,4,6,0.667,1,4,5.0,3,2,1,1,2,26,-3
4,2000-10-31,2000,10,31,22000,2000,739,Rasheed Wallace,1610612757,POR,Portland Trail Blazers,POR,LAL,20000012,1,POR vs. LAL,L,47,11,17,0.647,2,2,1.0,2,2,1.0,3,4,7.0,2,0,1,3,1,26,-8
5,2000-10-31,2000,10,31,22000,2000,1713,Vince Carter,1610612761,TOR,Toronto Raptors,TOR,DET,20000005,1,TOR vs. DET,L,40,9,24,0.375,1,4,0.25,7,10,0.7,1,2,3.0,2,4,0,1,6,26,-7
6,2000-10-31,2000,10,31,22000,2000,120,Steven Smith,1610612757,POR,Portland Trail Blazers,POR,LAL,20000012,1,POR vs. LAL,L,36,8,18,0.444,0,4,0.0,6,7,0.857,1,1,2.0,5,0,0,1,3,22,-24
7,2000-10-31,2000,10,31,22000,2000,764,David Robinson,1610612759,SAS,San Antonio Spurs,SAS,IND,20000009,1,SAS vs. IND,W,31,5,9,0.556,0,0,,12,12,1.0,1,8,9.0,0,1,3,1,2,22,18
8,2000-10-31,2000,10,31,22000,2000,210,Terrell Brandon,1610612750,MIN,Minnesota Timberwolves,MIN,HOU,20000008,0,MIN @ HOU,W,38,6,12,0.5,2,2,1.0,8,9,0.889,0,4,4.0,9,3,0,3,5,22,9
9,2000-10-31,2000,10,31,22000,2000,935,Bryon Russell,1610612762,UTA,Utah Jazz,UTA,LAC,20000010,1,UTA vs. LAC,W,31,6,10,0.6,3,3,1.0,4,5,0.8,1,3,4.0,1,1,0,0,3,19,12


### Convert WL (win-loss) to binary

In [15]:
#For a win the WL column will be 1, and a loss will be 0
wl = [1 if x == 'W' else 0 if x == 'L' else None for x in plyr_logs.wl]
plyr_logs.wl = wl
print(plyr_logs.isnull().sum())

game_date                 0
year                      0
month                     0
day                       0
season_id                 0
season                    0
player_id                 0
player_name               0
team_id                   0
team_abbreviation         0
team_name                 0
team                      0
opp                       0
game_id                   0
home                      0
matchup                   0
wl                        0
mp                        0
fgm                       0
fga                       0
fg_pct                23028
fg3m                      0
fg3a                      0
fg3_pct              195816
ftm                       0
fta                       0
ft_pct               182287
oreb                      0
dreb                      0
reb                       0
ast                       0
stl                       0
blk                       0
tov                       0
pf                        0
pts                 

### Fill in null shooting % values

In [16]:
fg_pct = [0 if x == 0 else y for x,y in zip(plyr_logs.fga,plyr_logs.fg_pct)]
plyr_logs.fg_pct = fg_pct

fg3_pct = [0 if x == 0 else y for x,y in zip(plyr_logs.fg3a,plyr_logs.fg3_pct)]
plyr_logs.fg3_pct = fg3_pct

ft_pct = [0 if x == 0 else y for x,y in zip(plyr_logs.fta,plyr_logs.ft_pct)]
plyr_logs.ft_pct = ft_pct


plyr_logs.head()

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,team,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus
0,2000-10-31,2000,10,31,22000,2000,672,Matt Bullard,1610612745,HOU,Houston Rockets,HOU,MIN,20000008,1,HOU vs. MIN,0,8,0,1,0.0,0,1,0.0,0,0,0.0,0,1,1.0,0,0,0,1,1,0,-2
1,2000-10-31,2000,10,31,22000,2000,951,Ray Allen,1610612749,MIL,Milwaukee Bucks,MIL,DAL,20000007,0,MIL @ DAL,0,40,8,17,0.471,2,6,0.333,8,8,1.0,2,2,4.0,2,0,0,1,1,26,-9
2,2000-10-31,2000,10,31,22000,2000,714,Michael Finley,1610612742,DAL,Dallas Mavericks,DAL,MIL,20000007,1,DAL vs. MIL,1,42,10,21,0.476,0,5,0.0,6,8,0.75,3,6,9.0,8,2,0,2,2,26,13
3,2000-10-31,2000,10,31,22000,2000,361,Clifford Robinson,1610612756,PHX,Phoenix Suns,PHX,GSW,20000011,0,PHX @ GSW,0,42,10,25,0.4,2,6,0.333,4,6,0.667,1,4,5.0,3,2,1,1,2,26,-3
4,2000-10-31,2000,10,31,22000,2000,739,Rasheed Wallace,1610612757,POR,Portland Trail Blazers,POR,LAL,20000012,1,POR vs. LAL,0,47,11,17,0.647,2,2,1.0,2,2,1.0,3,4,7.0,2,0,1,3,1,26,-8


In [17]:
plyr_logs.isnull().sum()

game_date            0
year                 0
month                0
day                  0
season_id            0
season               0
player_id            0
player_name          0
team_id              0
team_abbreviation    0
team_name            0
team                 0
opp                  0
game_id              0
home                 0
matchup              0
wl                   0
mp                   0
fgm                  0
fga                  0
fg_pct               0
fg3m                 0
fg3a                 0
fg3_pct              0
ftm                  0
fta                  0
ft_pct               0
oreb                 0
dreb                 0
reb                  0
ast                  0
stl                  0
blk                  0
tov                  0
pf                   0
pts                  0
plus_minus           0
dtype: int64

In [18]:
#change the names of old teams, New Orleans Hornets, Seattle Supersonics, New Jersey Nets, to their current teams
#the New Oreans Pelicans, OKC Thunder, and Brooklyn Nets
new_teams = {'NOH':'NOP','SEA':'OKC','NJN':'BKN', 'VAN':'MEM','CHH':'CHA'}

plyr_logs.team = [new_teams[team] if team in new_teams else team for team in plyr_logs.team]
plyr_logs.opp = [new_teams[team] if team in new_teams else team for team in plyr_logs.opp]


### Create Fantasy Score Column

Create this column based on DraftKings daily fantasy scoring system.

In [19]:
#create a funstions that returns the point values for a double double or triple double by aa player
def dblcount(pts,reb,ast,stl,blk):
    #counts the number of core stats that are greater than 10
    dbls = sum(x>=10 for x in [pts,ast,reb,stl,blk])
    
    #if double double, ad 1.5pts
    if dbls == 2:
        return 1.5
    #if triple double or more, 4.5pts
    elif dbls >= 3:
        return 4.5
    else:
        return 0

def add_fscore(data):
    #create an array with all the fantasy scores based on only the counting stats
    fscorebase = np.array(data.pts + .5*data.fg3m + 1.25*data.reb 
                          + 1.5*data.ast + 2*data.stl + 2*data.blk - .5*data.tov)

    #make an array with the points added by double or triple doubles
    dblcnt = np.array([dblcount(p,a,r,s,b) for p,a,r,s,b in zip(data.pts, data.ast, 
                                                                data.reb, data.stl, data.blk)])
    data['fscore'] = fscorebase + dblcnt
    return data['fscore'] 




In [20]:
#add base and dblcount arrays together to get final fantasy scores, assign to fscore column in dataframe
plyr_logs['fscore']  = add_fscore(plyr_logs)
plyr_logs[['player_name','pts','fg3m','ast','reb','stl','blk','tov','fscore']].head(10)

Unnamed: 0,player_name,pts,fg3m,ast,reb,stl,blk,tov,fscore
0,Matt Bullard,0,0,0,1.0,0,0,1,0.75
1,Ray Allen,26,2,2,4.0,0,0,1,34.5
2,Michael Finley,26,0,8,9.0,2,0,2,52.25
3,Clifford Robinson,26,2,3,5.0,2,1,1,43.25
4,Rasheed Wallace,26,2,2,7.0,0,1,3,39.25
5,Vince Carter,26,1,2,3.0,4,0,1,40.75
6,Steven Smith,22,0,5,2.0,0,0,1,31.5
7,David Robinson,22,0,0,9.0,1,3,1,40.75
8,Terrell Brandon,22,2,9,4.0,3,0,3,46.0
9,Bryon Russell,19,3,1,4.0,1,0,0,29.0


### Add Advanced Stats

All formulas have been pulled from basketball-reference.com. 

In [21]:
#get the relevant team totals for each gamelog
tm_mp = plyr_logs.groupby(['season','game_id','team']).mp.transform('sum')
tm_fgm = plyr_logs.groupby(['season','game_id','team']).fgm.transform('sum')
tm_fga = plyr_logs.groupby(['season','game_id','team']).fga.transform('sum')
tm_fta = plyr_logs.groupby(['season','game_id','team']).fta.transform('sum')
tm_dreb = plyr_logs.groupby(['season','game_id','team']).dreb.transform('sum')
tm_oreb = plyr_logs.groupby(['season','game_id','team']).oreb.transform('sum')
tm_reb = plyr_logs.groupby(['season','game_id','team']).reb.transform('sum')
tm_tov = plyr_logs.groupby(['season','game_id','team']).tov.transform('sum')


#get the relevant oppononent totals for each gamelog
opp_fgm = plyr_logs.groupby(['season','game_id','opp']).fgm.transform('sum')
opp_fga = plyr_logs.groupby(['season','game_id','opp']).fga.transform('sum')
opp_fg3a = plyr_logs.groupby(['season','game_id','opp']).fg3a.transform('sum')
opp_fta = plyr_logs.groupby(['season','game_id','opp']).fta.transform('sum')
opp_dreb = plyr_logs.groupby(['season','game_id','opp']).dreb.transform('sum')
opp_oreb = plyr_logs.groupby(['season','game_id','opp']).oreb.transform('sum')
opp_reb = plyr_logs.groupby(['season','game_id','opp']).reb.transform('sum')
opp_tov = plyr_logs.groupby(['season','game_id','opp']).tov.transform('sum')

tm_poss = tm_fga - (tm_oreb/(tm_oreb + opp_dreb))*(tm_fga - tm_fgm)*1.07 + tm_tov+ .4*tm_fta
opp_poss = opp_fga - (opp_oreb/(opp_oreb + tm_dreb))*(opp_fga - opp_fgm)*1.07 + opp_tov+ .4*opp_fta

In [22]:
def ast_pct(data):
    """Assist percentage is an estimate of the percentage of teammate 
    field goals a player assisted while he was on the floor"""
    return 100* data.ast / (((data.mp/(tm_mp/5))*tm_fgm) - data.fgm)
 
def blk_pct(data):
    """Block percentage is an estimate of the percentage of opponent 
    two-point field goal attempts blocked by the player while he was on the floor"""
    return 100*(data.blk*(tm_mp/5)) / (data.mp*(opp_fga - opp_fg3a))

def dreb_pct(data):
    """Defensive rebound percentage is an estimate of the percentage of available 
    defensive rebounds a player grabbed while he was on the floor"""
    return 100*(data.dreb*(tm_mp/5)) / (data.mp*(tm_dreb + opp_oreb))

def oreb_pct(data):
    """Offensive rebound percentage is an estimate of the percentage of available 
    offensive rebounds a player grabbed while he was on the floor."""
    return 100*(data.oreb*(tm_mp/5)) / (data.mp*(tm_oreb + opp_dreb))

def reb_pct(data):
    """Offensive rebound percentage is an estimate of the percentage of available 
    offensive rebounds a player grabbed while he was on the floor."""
    return 100*(data.reb*(tm_mp/5)) / (data.mp*(tm_reb + opp_reb))

def stl_pct(data):
    """Steal Percentage is an estimate of the percentage of opponent possessions 
    that end with a steal by the player while he was on the floor"""
    return 100*(data.stl*(tm_mp/5)) / (data.mp*(opp_poss))

def tov_pct(data):
    """Turnover percentage is an estimate of turnovers per 100 plays"""
    return 100*data.tov / (data.fga + .44*data.fta + data.tov)

def efg_pct(data):
    """adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal"""
    return (data.fgm + .5*data.fg3m) / data.fga

def ts_pct(data):
    """True shooting percentage is a measure of shooting efficiency 
    that takes into account field goals, 3-point field goals, and free throws"""
    return data.pts / (2 * (data.fga + .44*data.fta))

def usg_pct(data):
    """Usage percentage is an estimate of the percentage of team plays used by a player while he was on the floor"""
    return 100 * ((data.fga + .44*data.fta + data.tov)*(tm_mp/5)) / (data.mp*(tm_fga + .44*tm_fta + tm_tov))

In [23]:
adv_stat_cols = ['ast_pct','blk_pct','dreb_pct','oreb_pct','reb_pct','stl_pct','tov_pct','efg_pct','ts_pct','usg_pct']
adv_stat_funcs = [ast_pct, blk_pct, dreb_pct, oreb_pct, reb_pct, stl_pct, tov_pct, efg_pct, ts_pct, usg_pct]

for stat in adv_stat_cols:
    plyr_logs[stat] = adv_stat_funcs[adv_stat_cols.index(stat)](plyr_logs)

In [24]:
plyr_logs.head()

Unnamed: 0,game_date,year,month,day,season_id,season,player_id,player_name,team_id,team_abbreviation,team_name,team,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus,fscore,ast_pct,blk_pct,dreb_pct,oreb_pct,reb_pct,stl_pct,tov_pct,efg_pct,ts_pct,usg_pct
0,2000-10-31,2000,10,31,22000,2000,672,Matt Bullard,1610612745,HOU,Houston Rockets,HOU,MIN,20000008,1,HOU vs. MIN,0,8,0,1,0.0,0,1,0.0,0,0,0.0,0,1,1.0,0,0,0,1,1,0,-2,0.75,0.0,0.0,15.789474,0.0,7.894737,0.0,50.0,0.0,0.0,10.948905
1,2000-10-31,2000,10,31,22000,2000,951,Ray Allen,1610612749,MIL,Milwaukee Bucks,MIL,DAL,20000007,0,MIL @ DAL,0,40,8,17,0.471,2,6,0.333,8,8,1.0,2,2,4.0,2,0,0,1,1,26,-9,34.5,10.25641,0.0,4.705882,4.705882,4.705882,0.0,4.64684,0.529412,0.633528,22.966916
2,2000-10-31,2000,10,31,22000,2000,714,Michael Finley,1610612742,DAL,Dallas Mavericks,DAL,MIL,20000007,1,DAL vs. MIL,1,42,10,21,0.476,0,5,0.0,6,8,0.75,3,6,9.0,8,2,0,2,2,26,13,52.25,38.787879,0.0,13.186813,6.593407,9.89011,2.431909,7.541478,0.47619,0.530179,25.931358
3,2000-10-31,2000,10,31,22000,2000,361,Clifford Robinson,1610612756,PHX,Phoenix Suns,PHX,GSW,20000011,0,PHX @ GSW,0,42,10,25,0.4,2,6,0.333,4,6,0.667,1,4,5.0,3,2,1,1,2,26,-3,43.25,13.953488,1.632653,10.38961,2.597403,6.493506,2.279162,3.49162,0.44,0.470333,28.265482
4,2000-10-31,2000,10,31,22000,2000,739,Rasheed Wallace,1610612757,POR,Portland Trail Blazers,POR,LAL,20000012,1,POR vs. LAL,0,47,11,17,0.647,2,2,1.0,2,2,1.0,3,4,7.0,2,0,1,3,1,26,-8,39.25,8.916247,1.374353,12.712766,9.534574,11.12367,0.0,14.367816,0.705882,0.727069,20.810863


In [25]:
#write clean dataframe to a csv file
plyr_logs.to_csv('player_logs_clean.csv')

## Team Logs

In [26]:
team_logs = pd.read_csv('team_logs.csv', index_col = 0)
team_logs.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22000,1610612758,SAC,Sacramento Kings,20000598,2001-01-23,SAC vs. NJN,W,240,48,86,0.558,14,28,0.5,20,29,0.69,17,34,51,30,7,7,14,15,130,26,0
1,22000,1610612765,DET,Detroit Pistons,20001149,2001-04-14,DET @ ATL,W,265,43,94,0.457,3,13,0.231,40,50,0.8,15,38,53,21,8,6,18,28,129,5,0
2,22000,1610612745,HOU,Houston Rockets,20000923,2001-03-13,HOU vs. IND,W,315,46,106,0.434,5,18,0.278,30,35,0.857,9,49,58,18,7,4,14,34,127,9,0
3,22000,1610612760,SEA,Seattle SuperSonics,20000778,2001-02-21,SEA vs. DEN,W,240,51,86,0.593,8,20,0.4,17,25,0.68,10,31,41,32,7,9,9,21,127,29,0
4,22000,1610612749,MIL,Milwaukee Bucks,20000847,2001-03-03,MIL vs. CHI,W,290,52,98,0.531,5,18,0.278,17,25,0.68,20,28,48,21,10,6,15,28,126,4,0


In [27]:
#repeat data wrangling steps done for the players data in one step
def cleanlogs(data):    
    """Performs all gamelog cleaning steps in one function"""
    data.columns = [x.lower() for x in data.columns]

    data.GAME_DATE = pd.to_datetime(data.game_date)

    data = data.set_index('game_date')
    data.index = pd.to_datetime(data.index)
    data = data.sort_index()
    
    #rename min to mp ('minutes played) so there is no confusion with the minimum function
    data.rename(columns = {'min' : 'mp'}, inplace = True)
    
    data = data.drop('video_available',1)
    
    data.insert(0,'day',data.index.day)

    data.insert(0,'month',data.index.month)

    data.insert(0,'year',data.index.year)

    season = [int(str(x)[-4:]) for x in data.season_id]
    data.insert(4,'season',season)

    data = data.rename(columns={'team_abbreviation':'team'})
    
    #do the same with the opponenet team, taking the last 3 characters
    opp = [m[-3:] for m in data.matchup]
    data.insert(8,'opp',opp)
    
    #For a win the WL column will be 1, and a loss will be 0
    wl = [1 if x == 'W' else 0 if x == 'L' else None for x in data.wl]
    data.wl = wl
    
    #insert home and away binary column, 1 for home, 0 for away
    home_away = [0 if '@' in m else 1 for m in data.matchup]
    data.insert(data.columns.get_loc('matchup'),'home',home_away)
    
    #clean up null shooting percentages by filling them with 0
    fg_pct = [0 if x == 0 else y for x,y in zip(data.fga,data.fg_pct)]
    data.fg_pct = fg_pct

    fg3_pct = [0 if x == 0 else y for x,y in zip(data.fg3a,data.fg3_pct)]
    data.fg3_pct = fg3_pct

    ft_pct = [0 if x == 0 else y for x,y in zip(data.fta,data.ft_pct)]
    data.ft_pct = ft_pct
    
    #change the names of old teams, New Orleans Hornets, Seattle Supersonics, New Jersey Nets, to their current teams
    #the New Oreans Pelicans, OKC Thunder, and Brooklyn Nets
    new_teams = {'NOH':'NOP','SEA':'OKC','NJN':'BKN', 'VAN':'MEM','CHH':'CHA'}
    
    data.team = [new_teams[team] if team in new_teams else team for team in data.team]
    data.opp = [new_teams[team] if team in new_teams else team for team in data.opp]

    data['fscore'] = add_fscore(data)
    return(data)

In [28]:
team_logs_clean = cleanlogs(team_logs).reset_index()
team_logs_clean.head()

Unnamed: 0,game_date,year,month,day,season_id,season,team_id,team,team_name,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus,fscore
0,2000-10-31,2000,10,31,22000,2000,1610612762,UTA,Utah Jazz,LAC,20000010,1,UTA vs. LAC,1.0,240,43,85,0.506,5,10,0.5,16,31,0.516,11,22,33,33,11,4,12,30,107,13,228.75
1,2000-10-31,2000,10,31,22000,2000,1610612747,LAL,Los Angeles Lakers,POR,20000012,0,LAL @ POR,1.0,240,36,63,0.571,5,11,0.455,19,30,0.633,8,31,39,28,5,8,20,18,96,10,209.75
2,2000-10-31,2000,10,31,22000,2000,1610612763,MEM,Vancouver Grizzlies,OKC,20000013,1,VAN vs. SEA,1.0,240,39,90,0.433,4,14,0.286,12,26,0.462,14,37,51,28,12,6,11,21,94,6,236.75
3,2000-10-31,2000,10,31,22000,2000,1610612757,POR,Portland Trail Blazers,LAL,20000012,1,POR vs. LAL,0.0,240,34,85,0.4,4,11,0.364,14,16,0.875,13,19,32,18,13,1,10,28,86,-10,182.5
4,2000-10-31,2000,10,31,22000,2000,1610612756,PHX,Phoenix Suns,GSW,20000011,0,PHX @ GSW,0.0,240,36,91,0.396,6,21,0.286,16,20,0.8,11,33,44,25,12,3,16,28,94,-2,216.0


In [29]:
team_logs_clean.isnull().sum()

game_date     0
year          0
month         0
day           0
season_id     0
season        0
team_id       0
team          0
team_name     0
opp           0
game_id       0
home          0
matchup       0
wl            2
mp            0
fgm           0
fga           0
fg_pct        0
fg3m          0
fg3a          0
fg3_pct       0
ftm           0
fta           0
ft_pct        0
oreb          0
dreb          0
reb           0
ast           0
stl           0
blk           0
tov           0
pf            0
pts           0
plus_minus    0
fscore        0
dtype: int64

#### Add opponent stats to each row

In [35]:
def get_opp_stats(data, idx):
    """Return the opponent stats for each game using the game id to index the opponent row"""
    data_idx = data.iloc[[idx]]
    opp_log = data.loc[(data.game_id == data_idx.game_id[idx]) & (data.opp == data_idx.team[idx])]
    return list(opp_log.iloc[0])

stat_cols = ['mp', 'fgm', 'fga','fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 
             'ftm', 'fta', 'ft_pct', 'oreb','dreb', 'reb', 'ast', 'stl', 
             'blk', 'tov', 'pf', 'pts', 'plus_minus', 'fscore']

In [31]:
#create a data set with the same index as the original but with the opponent stats
opp_logs = [get_opp_stats(team_logs_clean,i) for i in team_logs_clean.index]
opp_logs_df = pd.DataFrame(opp_logs, columns = team_logs_clean.columns)


In [32]:
opp_logs_df.head()

Unnamed: 0,game_date,year,month,day,season_id,season,team_id,team,team_name,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,plus_minus,fscore
0,2000-10-31,2000,10,31,22000,2000,1610612746,LAC,Los Angeles Clippers,UTA,20000010,0,LAC @ UTA,0.0,240,34,69,0.493,4,13,0.308,22,34,0.647,12,32,44,18,3,6,24,32,94,-13,188.5
1,2000-10-31,2000,10,31,22000,2000,1610612757,POR,Portland Trail Blazers,LAL,20000012,1,POR vs. LAL,0.0,240,34,85,0.4,4,11,0.364,14,16,0.875,13,19,32,18,13,1,10,28,86,-10,182.5
2,2000-10-31,2000,10,31,22000,2000,1610612760,OKC,Seattle SuperSonics,MEM,20000013,0,SEA @ VAN,0.0,240,32,84,0.381,8,23,0.348,16,22,0.727,14,38,52,20,5,9,19,25,88,-6,210.0
3,2000-10-31,2000,10,31,22000,2000,1610612747,LAL,Los Angeles Lakers,POR,20000012,0,LAL @ POR,1.0,240,36,63,0.571,5,11,0.455,19,30,0.633,8,31,39,28,5,8,20,18,96,10,209.75
4,2000-10-31,2000,10,31,22000,2000,1610612744,GSW,Golden State Warriors,PHX,20000011,1,GSW vs. PHX,1.0,240,32,79,0.405,4,8,0.5,28,38,0.737,14,41,55,18,11,5,21,22,96,2,219.75


In [36]:
#attach the stats columns from the opponent datafram to the info columns of the team dataframe
team_opp_logs = team_logs_clean.loc[:,:'wl']
for stat in stat_cols:
    team_opp_logs['opp_'+stat] = opp_logs_df[stat]
team_opp_logs.head()    

Unnamed: 0,game_date,year,month,day,season_id,season,team_id,team,team_name,opp,game_id,home,matchup,wl,opp_mp,opp_fgm,opp_fga,opp_fg_pct,opp_fg3m,opp_fg3a,opp_fg3_pct,opp_ftm,opp_fta,opp_ft_pct,opp_oreb,opp_dreb,opp_reb,opp_ast,opp_stl,opp_blk,opp_tov,opp_pf,opp_pts,opp_plus_minus,opp_fscore
0,2000-10-31,2000,10,31,22000,2000,1610612762,UTA,Utah Jazz,LAC,20000010,1,UTA vs. LAC,1.0,240,34,69,0.493,4,13,0.308,22,34,0.647,12,32,44,18,3,6,24,32,94,-13,188.5
1,2000-10-31,2000,10,31,22000,2000,1610612747,LAL,Los Angeles Lakers,POR,20000012,0,LAL @ POR,1.0,240,34,85,0.4,4,11,0.364,14,16,0.875,13,19,32,18,13,1,10,28,86,-10,182.5
2,2000-10-31,2000,10,31,22000,2000,1610612763,MEM,Vancouver Grizzlies,OKC,20000013,1,VAN vs. SEA,1.0,240,32,84,0.381,8,23,0.348,16,22,0.727,14,38,52,20,5,9,19,25,88,-6,210.0
3,2000-10-31,2000,10,31,22000,2000,1610612757,POR,Portland Trail Blazers,LAL,20000012,1,POR vs. LAL,0.0,240,36,63,0.571,5,11,0.455,19,30,0.633,8,31,39,28,5,8,20,18,96,10,209.75
4,2000-10-31,2000,10,31,22000,2000,1610612756,PHX,Phoenix Suns,GSW,20000011,0,PHX @ GSW,0.0,240,32,79,0.405,4,8,0.5,28,38,0.737,14,41,55,18,11,5,21,22,96,2,219.75


In [37]:
#Create a dataframe with both teams stats in each row
team_logs_full = team_logs_clean.join(team_opp_logs.loc[:,'opp_mp':])
team_logs_full.head()

Unnamed: 0,game_date,year,month,day,season_id,season,team_id,team,team_name,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,...,pf,pts,plus_minus,fscore,opp_mp,opp_fgm,opp_fga,opp_fg_pct,opp_fg3m,opp_fg3a,opp_fg3_pct,opp_ftm,opp_fta,opp_ft_pct,opp_oreb,opp_dreb,opp_reb,opp_ast,opp_stl,opp_blk,opp_tov,opp_pf,opp_pts,opp_plus_minus,opp_fscore
0,2000-10-31,2000,10,31,22000,2000,1610612762,UTA,Utah Jazz,LAC,20000010,1,UTA vs. LAC,1.0,240,43,85,0.506,5,10,0.5,16,31,0.516,11,...,30,107,13,228.75,240,34,69,0.493,4,13,0.308,22,34,0.647,12,32,44,18,3,6,24,32,94,-13,188.5
1,2000-10-31,2000,10,31,22000,2000,1610612747,LAL,Los Angeles Lakers,POR,20000012,0,LAL @ POR,1.0,240,36,63,0.571,5,11,0.455,19,30,0.633,8,...,18,96,10,209.75,240,34,85,0.4,4,11,0.364,14,16,0.875,13,19,32,18,13,1,10,28,86,-10,182.5
2,2000-10-31,2000,10,31,22000,2000,1610612763,MEM,Vancouver Grizzlies,OKC,20000013,1,VAN vs. SEA,1.0,240,39,90,0.433,4,14,0.286,12,26,0.462,14,...,21,94,6,236.75,240,32,84,0.381,8,23,0.348,16,22,0.727,14,38,52,20,5,9,19,25,88,-6,210.0
3,2000-10-31,2000,10,31,22000,2000,1610612757,POR,Portland Trail Blazers,LAL,20000012,1,POR vs. LAL,0.0,240,34,85,0.4,4,11,0.364,14,16,0.875,13,...,28,86,-10,182.5,240,36,63,0.571,5,11,0.455,19,30,0.633,8,31,39,28,5,8,20,18,96,10,209.75
4,2000-10-31,2000,10,31,22000,2000,1610612756,PHX,Phoenix Suns,GSW,20000011,0,PHX @ GSW,0.0,240,36,91,0.396,6,21,0.286,16,20,0.8,11,...,28,94,-2,216.0,240,32,79,0.405,4,8,0.5,28,38,0.737,14,41,55,18,11,5,21,22,96,2,219.75


#### Add offensive and defensive efficiency numbers

In order to analyze fantasy scores of players against different levels of offenses and defenses, deriving values for a teams offensive and defensive efficiency will be helpful. Offensive and defensive efficiency is calculated by taking the amount of points a team scores and allows per possession. I'll be using Dean Oliver's equation from his book *Basketball on Paper* to calculate the number of possessions in a game.

In [38]:
def possessions(data):
    """Approximate possessions per game"""
    return data.fga - (data.oreb/(data.oreb + data.opp_dreb))*(data.fga - data.fgm)*1.07 + data.tov+ .4*data.fta

def opp_possessions(data):
    """Approximate possessions per game"""
    return data.opp_fga - (data.opp_oreb/(data.opp_oreb + data.dreb))*(data.opp_fga - data.opp_fgm)*1.07 + data.opp_tov+ .4*data.opp_fta

def pace(data):
    """Pace is an estimate of the number of possessions per 48 minutes by a team"""
    return 48*((data.possessions + data.opp_possessions)/(2*(data.mp/5)))

def off_eff(data):
    """Return Offensive Efficiency"""
    return 100 * data.pts / data.possessions

def def_eff(data):
    """Return Defensive Efficiency"""
    return 100 * data.opp_pts / data.possessions

In [39]:
team_logs_full['possessions'] = possessions(team_logs_full)
team_logs_full['opp_possessions'] = opp_possessions(team_logs_full)
team_logs_full['pace'] = pace(team_logs_full)
team_logs_full['off_eff'] = off_eff(team_logs_full)
team_logs_full['def_eff'] = def_eff(team_logs_full)
team_logs_full.head()

Unnamed: 0,game_date,year,month,day,season_id,season,team_id,team,team_name,opp,game_id,home,matchup,wl,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,...,opp_fgm,opp_fga,opp_fg_pct,opp_fg3m,opp_fg3a,opp_fg3_pct,opp_ftm,opp_fta,opp_ft_pct,opp_oreb,opp_dreb,opp_reb,opp_ast,opp_stl,opp_blk,opp_tov,opp_pf,opp_pts,opp_plus_minus,opp_fscore,possessions,opp_possessions,pace,off_eff,def_eff
0,2000-10-31,2000,10,31,22000,2000,1610612762,UTA,Utah Jazz,LAC,20000010,1,UTA vs. LAC,1.0,240,43,85,0.506,5,10,0.5,16,31,0.516,11,...,34,69,0.493,4,13,0.308,22,34,0.647,12,32,44,18,3,6,24,32,94,-13,188.5,97.903721,93.382353,95.643037,109.291045,96.012694
1,2000-10-31,2000,10,31,22000,2000,1610612747,LAL,Los Angeles Lakers,POR,20000012,0,LAL @ POR,1.0,240,36,63,0.571,5,11,0.455,19,30,0.633,8,...,34,85,0.4,4,11,0.364,14,16,0.875,13,19,32,18,13,1,10,28,86,-10,182.5,86.44,85.277045,85.858523,111.059695,99.490976
2,2000-10-31,2000,10,31,22000,2000,1610612763,MEM,Vancouver Grizzlies,OKC,20000013,1,VAN vs. SEA,1.0,240,39,90,0.433,4,14,0.286,12,26,0.462,14,...,32,84,0.381,8,23,0.348,16,22,0.727,14,38,52,20,5,9,19,25,88,-6,210.0,96.708077,96.526275,96.617176,97.199741,90.995502
3,2000-10-31,2000,10,31,22000,2000,1610612757,POR,Portland Trail Blazers,LAL,20000012,1,POR vs. LAL,0.0,240,34,85,0.4,4,11,0.364,14,16,0.875,13,...,36,63,0.571,5,11,0.455,19,30,0.633,8,31,39,28,5,8,20,18,96,10,209.75,85.277045,86.44,85.858523,100.847772,112.574257
4,2000-10-31,2000,10,31,22000,2000,1610612756,PHX,Phoenix Suns,GSW,20000011,0,PHX @ GSW,0.0,240,36,91,0.396,6,21,0.286,16,20,0.8,11,...,32,79,0.405,4,8,0.5,28,38,0.737,14,41,55,18,11,5,21,22,96,2,219.75,102.550962,100.22,101.385481,91.661744,93.611994


In [40]:
team_logs_clean['possessions'] = team_logs_full['possessions']
team_logs_clean['off_eff'] = team_logs_full['off_eff']
team_opp_logs['possessions'] = team_logs_full['possessions']
team_opp_logs['def_eff'] = team_logs_full['def_eff']

In [41]:
team_logs_clean.to_csv('team_logs_clean.csv')
team_opp_logs.to_csv('team_opp_logs.csv')
team_logs_full.to_csv('team_logs_full.csv')

## Conclusion

Now we have daily box score for every team and player since the turn of the century stored in player_logs_clean.csv and team_logs_clean.csv. 