In [1]:
import sys

In [2]:
sys.path.append(".")

In [3]:
import pandas as pd
from pathlib import Path
from basketball_reference_scraper.seasons import get_schedule, get_standings
from basketball_reference_scraper.box_scores import get_box_scores
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [3]:
data_path = Path('data')
assert data_path.exists()

In [4]:
with open(data_path/'team_params.txt') as f:
    teams = f.readlines()
team_dict={}
for team in teams:
    splits = team.split(":")
    team_dict[splits[0].strip().title()] = splits[1].strip()
team_dict

{'Atlanta Hawks': 'ATL',
 'St. Louis Hawks': 'SLH',
 'Milwaukee Hawks': 'MIL',
 'Tri-Cities Blackhawks': 'TCB',
 'Boston Celtics': 'BOS',
 'Brooklyn Nets': 'BRK',
 'New Jersey Nets': 'NJN',
 'Chicago Bulls': 'CHI',
 'Charlotte Hornets (1988-2004)': 'CHH',
 'Charlotte Hornets (2014-Present)': 'CHO',
 'Charlotte Bobcats': 'CHA',
 'Cleveland Cavaliers': 'CLE',
 'Dallas Mavericks': 'DAL',
 'Denver Nuggets': 'DEN',
 'Detroit Pistons': 'DET',
 'Fort Wayne Pistons': 'FWP',
 'Golden State Warriors': 'GSW',
 'San Francisco Warriors': 'SFW',
 'Philadelphia Warriors': 'PHI',
 'Houston Rockets': 'HOU',
 'Indiana Pacers': 'IND',
 'Los Angeles Clippers': 'LAC',
 'San Diego Clippers': 'SDC',
 'Buffalo Braves': 'BUF',
 'Los Angeles Lakers': 'LAL',
 'Minneapolis Lakers': 'MIN',
 'Memphis Grizzlies': 'MEM',
 'Vancouver Grizzlies': 'VAN',
 'Miami Heat': 'MIA',
 'Milwaukee Bucks': 'MIL',
 'Minnesota Timberwolves': 'MIN',
 'New Orleans Pelicans': 'NOP',
 'New Orleans/Oklahoma City Hornets': 'NOK',
 'New Or

# Get Schedule

In [22]:
all_schedules = []
for year in range(2014,2022):
    print(f"{year-1}-{year}")
    year_schedule = get_schedule(year, playoffs=False)
    year_schedule['season']= f"{year-1}-{year}"
    all_schedules.append(year_schedule)

2019-2020
2020-2021


In [23]:
schedule_20142021 = pd.concat(all_schedules)
len(schedule_20142021)

2139

In [26]:
schedule_20142021

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,season
0,2019-10-22,New Orleans Pelicans,122.0,Toronto Raptors,130.0,2019-2020
1,2019-10-22,Los Angeles Lakers,102.0,Los Angeles Clippers,112.0,2019-2020
2,2019-10-23,Denver Nuggets,108.0,Portland Trail Blazers,100.0,2019-2020
3,2019-10-23,Sacramento Kings,95.0,Phoenix Suns,124.0,2019-2020
4,2019-10-23,Oklahoma City Thunder,95.0,Utah Jazz,100.0,2019-2020
...,...,...,...,...,...,...
1074,2021-05-16,Denver Nuggets,,Portland Trail Blazers,,2020-2021
1075,2021-05-16,Utah Jazz,,Sacramento Kings,,2020-2021
1076,2021-05-16,Phoenix Suns,,San Antonio Spurs,,2020-2021
1077,2021-05-16,Indiana Pacers,,Toronto Raptors,,2020-2021


In [30]:
schedule_20142021['VISITOR'].apply(lambda x: x.replace('*','').upper())

0        NEW ORLEANS PELICANS
1          LOS ANGELES LAKERS
2              DENVER NUGGETS
3            SACRAMENTO KINGS
4       OKLAHOMA CITY THUNDER
                ...          
1074           DENVER NUGGETS
1075                UTAH JAZZ
1076             PHOENIX SUNS
1077           INDIANA PACERS
1078        CHARLOTTE HORNETS
Name: VISITOR, Length: 2139, dtype: object

In [9]:
playedgames_20142021 = schedule_20142021.dropna(how='any')
playedgames_20142021 = playedgames_20142021.replace({"Philadelphia 76ers":"Philadelphia 76Ers"})
playedgames_20142021 = playedgames_20142021.replace(team_dict)
playedgames_20142021 = playedgames_20142021.replace({'Charlotte Hornets':'CHO'})
len(playedgames_20142021)

8874

In [12]:
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,season
0,2013-10-29,ORL,87,IND,97,2014-2015
1,2013-10-29,CHI,95,MIA,107,2014-2015
2,2013-10-29,LAC,103,LAL,116,2014-2015
3,2013-10-30,BRK,94,CLE,98,2014-2015
4,2013-10-30,BOS,87,TOR,93,2014-2015


In [13]:
# check if team names are replaced with abbreviation
for visitor in playedgames_20142021['VISITOR'].unique():
    assert len(visitor)==3, f"{visitor}'s team name needs fix"
for home in playedgames_20142021['HOME'].unique():
    assert len(visitor)==3, f"{home}'s team name needs fix"

# Build URLs

In [14]:
box_scores_urls=[]
gameids = []
for key,row in playedgames_20142021.iterrows():
    nums_to_join=[str(num) for num in [row['DATE'].year,f"{row['DATE'].month:02d}",f"{row['DATE'].day:02d}",0,row['HOME']]]
    url = "https://www.basketball-reference.com/boxscores/"+''.join(nums_to_join)+".html"
    gameids.append(''.join(nums_to_join))
    box_scores_urls.append(url)

In [15]:
box_scores_urls[:3]

['https://www.basketball-reference.com/boxscores/201310290IND.html',
 'https://www.basketball-reference.com/boxscores/201310290MIA.html',
 'https://www.basketball-reference.com/boxscores/201310290LAL.html']

In [16]:
len(box_scores_urls)

8874

In [17]:
playedgames_20142021['boxscores_url'] = box_scores_urls
playedgames_20142021['game_id'] = gameids
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,season,boxscores_url,game_id
0,2013-10-29,ORL,87,IND,97,2014-2015,https://www.basketball-reference.com/boxscores...,201310290IND
1,2013-10-29,CHI,95,MIA,107,2014-2015,https://www.basketball-reference.com/boxscores...,201310290MIA
2,2013-10-29,LAC,103,LAL,116,2014-2015,https://www.basketball-reference.com/boxscores...,201310290LAL
3,2013-10-30,BRK,94,CLE,98,2014-2015,https://www.basketball-reference.com/boxscores...,201310300CLE
4,2013-10-30,BOS,87,TOR,93,2014-2015,https://www.basketball-reference.com/boxscores...,201310300TOR


In [21]:
import datetime

playedgames_20142021.to_pickle(f'Data/schedule_{datetime.date.today()}.pkl')

# Scrape four factors

In [40]:
tables = []
bad_games =[]
for key,row in tqdm(playedgames_20142021.iterrows()):
    try:
        response = requests.get(row['boxscores_url'])
        html = response.content
        html = html.decode()
        stat_html = html.replace('<!--', "")
        stat_html = stat_html.replace('-->', "")
        soup = BeautifulSoup(stat_html, 'html.parser')
        table = pd.read_html(str(soup.find_all('table',attrs={"id":"four_factors"})[0]))[0]
        table = table.droplevel(0,axis=1)
        table = table.rename({'Unnamed: 0_level_1':'Team'},axis=1)
        table['game_id'] = row['game_id']
        tables.append(table)
        if key%100==0: print(key)
    except:
        bad_games.append(row['game_id'])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
0
100
200
300



In [41]:
temp_four_factors = pd.concat(tables)
temp_four_factors.head()

Unnamed: 0,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,game_id
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6,201310290IND
1,IND,95.0,0.528,19.0,27.8,0.31,102.1,201310290IND
0,CHI,97.1,0.464,16.2,23.9,0.217,97.9,201310290MIA
1,MIA,97.1,0.59,17.5,14.3,0.306,110.2,201310290MIA
0,LAC,100.4,0.542,14.7,22.7,0.157,102.6,201310290LAL


In [42]:
temp_four_factors.to_pickle("Data/temp_four_factors.pkl")

In [24]:
temp_four_factors = pd.read_pickle("Data/temp_four_factors.pkl")

In [25]:
temp_four_factors

Unnamed: 0,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,game_id
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6,201310290IND
1,IND,95.0,0.528,19.0,27.8,0.310,102.1,201310290IND
0,CHI,97.1,0.464,16.2,23.9,0.217,97.9,201310290MIA
1,MIA,97.1,0.590,17.5,14.3,0.306,110.2,201310290MIA
0,LAC,100.4,0.542,14.7,22.7,0.157,102.6,201310290LAL
...,...,...,...,...,...,...,...,...
1,WAS,96.6,0.651,12.6,16.7,0.157,125.3,202102020WAS
0,BOS,96.4,0.542,13.5,28.6,0.238,115.1,202102020GSW
1,GSW,96.4,0.548,10.6,13.3,0.193,111.0,202102020GSW
0,DET,99.6,0.468,12.8,25.0,0.194,105.4,202102020UTA


In [47]:
# handle the bad games
print(len(bad_games))
print(bad_games)

1
['201711110GSW']


# Join Four Factors with schedule to construct training data

In [26]:
temp_four_factors.head()

Unnamed: 0,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,game_id
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6,201310290IND
1,IND,95.0,0.528,19.0,27.8,0.31,102.1,201310290IND
0,CHI,97.1,0.464,16.2,23.9,0.217,97.9,201310290MIA
1,MIA,97.1,0.59,17.5,14.3,0.306,110.2,201310290MIA
0,LAC,100.4,0.542,14.7,22.7,0.157,102.6,201310290LAL


In [27]:
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,season,boxscores_url,game_id
0,2013-10-29,ORL,87,IND,97,2014-2015,https://www.basketball-reference.com/boxscores...,201310290IND
1,2013-10-29,CHI,95,MIA,107,2014-2015,https://www.basketball-reference.com/boxscores...,201310290MIA
2,2013-10-29,LAC,103,LAL,116,2014-2015,https://www.basketball-reference.com/boxscores...,201310290LAL
3,2013-10-30,BRK,94,CLE,98,2014-2015,https://www.basketball-reference.com/boxscores...,201310300CLE
4,2013-10-30,BOS,87,TOR,93,2014-2015,https://www.basketball-reference.com/boxscores...,201310300TOR


In [28]:
# join on home team
home_joined=pd.merge(playedgames_20142021,temp_four_factors,left_on=['game_id','HOME'],right_on=['game_id','Team'])
home_joined

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,season,boxscores_url,game_id,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg
0,2013-10-29,ORL,87,IND,97,2014-2015,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,19.0,27.8,0.310,102.1
1,2013-10-29,CHI,95,MIA,107,2014-2015,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.590,17.5,14.3,0.306,110.2
2,2013-10-29,LAC,103,LAL,116,2014-2015,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,15.3,37.5,0.194,115.6
3,2013-10-30,BRK,94,CLE,98,2014-2015,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,10.0,36.4,0.274,106.2
4,2013-10-30,BOS,87,TOR,93,2014-2015,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,15.0,42.2,0.140,101.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,2021-02-02,LAC,120.0,BRK,124.0,2021-2022,https://www.basketball-reference.com/boxscores...,202102020BRK,BRK,99.9,0.652,15.0,17.6,0.266,124.1
8739,2021-02-02,MEM,116.0,IND,134.0,2021-2022,https://www.basketball-reference.com/boxscores...,202102020IND,IND,104.2,0.695,16.0,25.7,0.244,128.6
8740,2021-02-02,POR,132.0,WAS,121.0,2021-2022,https://www.basketball-reference.com/boxscores...,202102020WAS,WAS,96.6,0.651,12.6,16.7,0.157,125.3
8741,2021-02-02,BOS,111.0,GSW,107.0,2021-2022,https://www.basketball-reference.com/boxscores...,202102020GSW,GSW,96.4,0.548,10.6,13.3,0.193,111.0


In [56]:
joined=pd.merge(home_joined,temp_four_factors,left_on=['game_id','VISITOR'],right_on=['game_id','Team'],suffixes=('_home','_visitor'))
joined.columns

Index(['DATE', 'VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Team_home', 'Pace_home', 'eFG%_home', 'TOV%_home',
       'ORB%_home', 'FT/FGA_home', 'ORtg_home', 'Team_visitor', 'Pace_visitor',
       'eFG%_visitor', 'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor',
       'ORtg_visitor'],
      dtype='object')

In [57]:
joined

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.310,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.590,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.140,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,2021-02-02,LAC,120,BRK,124,https://www.basketball-reference.com/boxscores...,202102020BRK,BRK,99.9,0.652,...,17.6,0.266,124.1,LAC,99.9,0.530,7.0,22.4,0.152,120.1
8739,2021-02-02,MEM,116,IND,134,https://www.basketball-reference.com/boxscores...,202102020IND,IND,104.2,0.695,...,25.7,0.244,128.6,MEM,104.2,0.500,13.4,23.3,0.247,111.3
8740,2021-02-02,POR,132,WAS,121,https://www.basketball-reference.com/boxscores...,202102020WAS,WAS,96.6,0.651,...,16.7,0.157,125.3,POR,96.6,0.574,7.9,29.8,0.242,136.7
8741,2021-02-02,BOS,111,GSW,107,https://www.basketball-reference.com/boxscores...,202102020GSW,GSW,96.4,0.548,...,13.3,0.193,111.0,BOS,96.4,0.542,13.5,28.6,0.238,115.1


In [58]:
joined.to_pickle("Data/full_data_2014to2021.pkl")

In [4]:
joined = pd.read_pickle("Data/full_data_2014to2021.pkl")

In [5]:
joined.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.31,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.59,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.14,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1


# Add Defensive Rating

## Scrape Box Scores

In [101]:
joined = pd.read_pickle("Data/full_data_2014to2021.pkl")

# from multiprocessing.dummy import Pool as ThreadPool
params = list(zip(list(joined['game_id']),list(joined['DATE']),list(joined['HOME']),list(joined['VISITOR']),['GAME']*len(joined),['ADVANCED']*len(joined)))
box_scores={}
bad_games =[]
for param in tqdm(params,total = len(params)):
    try:
        box_scores[param[0]]=get_box_scores(*param[1:])
    except Exception as exc:
        print(exc,param)
        print('-'*50)
        bad_games.append(param)

HBox(children=(FloatProgress(value=0.0, max=8743.0), HTML(value='')))

HTTPSConnectionPool(host='widgets.sports-reference.com', port=443): Max retries exceeded with url: /wg.fcgi?css=1&site=bbr&url=%2Fboxscores%2F201612210PHO.html&div=div_box-PHO-game-advanced (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f3a3c87a4d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) ('201612210PHO', Timestamp('2016-12-21 00:00:00'), 'PHO', 'HOU', 'GAME', 'ADVANCED')
--------------------------------------------------



In [102]:
bad_games

[('201612210PHO',
  Timestamp('2016-12-21 00:00:00'),
  'PHO',
  'HOU',
  'GAME',
  'ADVANCED')]

In [107]:
more_bad_games=[]
for param in tqdm(bad_games,total = len(bad_games)):
    try:
        box_scores[param[0]]=get_box_scores(*param[1:])
    except Exception as exc:
        print(exc,param)
        print('-'*50)
        more_bad_games.append(param)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [111]:
bad_games.pop()

('201612210PHO',
 Timestamp('2016-12-21 00:00:00'),
 'PHO',
 'HOU',
 'GAME',
 'ADVANCED')

In [112]:
assert len(bad_games)==0
assert len(more_bad_games)==0

In [113]:
len(joined)

8743

In [114]:
len(box_scores)

8743

In [115]:
# save the data
import pickle
fname = "Data/box_scores_2014to2021.p"
pickle.dump(box_scores,open(fname, "wb"))

In [116]:
# get only team level data
box_scores = pickle.load(open(fname, "rb" ))

In [117]:
len(box_scores)

8743

In [None]:
updated_box_score = []
bad_games=[]
for game_id, team_box_scores in box_scores.items():
    try:
        for team, box_score in team_box_scores.items():
            box_score['Team']=team
            box_score['game_id']=game_id
            updated_box_score.append(box_score.iloc[-1])
    except:
        bad_games.append(game_id)

In [136]:
assert len(bad_games)==0, 'please handle bad games'

AssertionError: please handle bad games

In [131]:
bad_games

['201601250UTA']

In [148]:
temp_scores = get_box_scores('2016-01-25','UTA','DET',stat_type='ADVANCED')
temp_scores

{'UTA':             PLAYER            MP           TS%          eFG%          3PAr  \
 0   Gordon Hayward         36:43          .422          .326          .304   
 1      Rudy Gobert         35:58          .723          .700          .000   
 2      Rodney Hood         34:19          .681          .688          .563   
 3       Trey Lyles         16:28          .167          .167          .333   
 4        Raul Neto          6:12          .000          .000         1.000   
 5       Joe Ingles         27:30          .500          .500          .800   
 6       Trey Burke         22:10          .000          .000          .375   
 7   Derrick Favors         19:34          .686          .625          .000   
 8    Chris Johnson         17:06          .750          .750         1.000   
 9    Trevor Booker         14:18          .400          .400          .200   
 10     Jeff Withey          9:41         1.000         1.000          .000   
 11    J.J. O'Brien  Did Not Play  Did Not Pl

In [153]:
temp_scores['UTA']['Team']='UTA'
temp_scores['UTA']['game_id']='201601250UTA'
temp_scores['DET']['Team']='DET'
temp_scores['DET']['game_id']='201601250UTA'

In [157]:
updated_box_score.append(temp_scores['UTA'].iloc[-1])
updated_box_score.append(temp_scores['DET'].iloc[-1])

In [159]:
box_scores=pd.concat(updated_box_score,axis=1).transpose()
box_scores.head()

Unnamed: 0,PLAYER,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,Team,game_id,Unnamed: 16_level_1
13,Team Totals,240,0.57,0.528,0.239,0.451,27.8,72.3,53.0,50.0,4.2,24.3,19.0,100.0,102.1,91.6,,IND,201310290IND,
13,Team Totals,240,0.447,0.435,0.204,0.108,27.7,72.2,47.0,47.2,10.5,11.1,14.9,100.0,91.6,102.1,,ORL,201310290IND,
13,Team Totals,240,0.631,0.59,0.278,0.403,14.3,76.1,49.4,70.3,10.3,12.3,17.5,100.0,110.2,97.9,,MIA,201310290MIA,
13,Team Totals,240,0.51,0.464,0.313,0.277,23.9,85.7,50.6,65.7,11.3,7.7,16.2,100.0,97.9,110.2,,CHI,201310290MIA,
13,Team Totals,240,0.551,0.527,0.312,0.301,37.5,77.3,56.5,54.8,8.0,9.7,15.3,100.0,115.6,102.6,,LAL,201310290LAL,


In [160]:
joined.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.31,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.59,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.14,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1


In [165]:
box_scores.columns

Index(['PLAYER', 'MP', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
       'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM', 'Team',
       'game_id', 'Unnamed: 16_level_1'],
      dtype='object')

In [166]:
joined.columns

Index(['DATE', 'VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Team_home', 'Pace_home', 'eFG%_home', 'TOV%_home',
       'ORB%_home', 'FT/FGA_home', 'ORtg_home', 'Team_visitor', 'Pace_visitor',
       'eFG%_visitor', 'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor',
       'ORtg_visitor'],
      dtype='object')

In [171]:
columns_to_delete = ['PLAYER','MP','eFG%','TOV%','USG%','ORB%','ORtg','BPM','Unnamed: 16_level_1']
box_scores = box_scores.drop(columns=columns_to_delete,errors='ignore')

In [172]:
# join box_scores with joined, on game_id, team. add home or visitor suffix
box_scores

Unnamed: 0,TS%,3PAr,FTr,DRB%,TRB%,AST%,STL%,BLK%,DRtg,Team,game_id
13,.570,.239,.451,72.3,53.0,50.0,4.2,24.3,91.6,IND,201310290IND
13,.447,.204,.108,72.2,47.0,47.2,10.5,11.1,102.1,ORL,201310290IND
13,.631,.278,.403,76.1,49.4,70.3,10.3,12.3,97.9,MIA,201310290MIA
13,.510,.313,.277,85.7,50.6,65.7,11.3,7.7,110.2,CHI,201310290MIA
13,.551,.312,.301,77.3,56.5,54.8,8.0,9.7,102.6,LAL,201310290LAL
...,...,...,...,...,...,...,...,...,...,...,...
14,.516,.290,.215,81.0,51.1,46.2,7.0,12.5,117.5,DET,202102020UTA
13,.545,.269,.215,76.9,54.3,45.5,7.4,12.5,132.2,PHO,201612210PHO
13,.641,.442,.302,66.7,45.7,65.2,4.2,11.8,117.4,HOU,201612210PHO
13,.495,.341,.212,84.1,53.8,52.9,5.5,7.5,104.1,UTA,201601250UTA


In [173]:
joined

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.310,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.590,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.140,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,2021-02-02,LAC,120.0,BRK,124.0,https://www.basketball-reference.com/boxscores...,202102020BRK,BRK,99.9,0.652,...,17.6,0.266,124.1,LAC,99.9,0.530,7.0,22.4,0.152,120.1
8739,2021-02-02,MEM,116.0,IND,134.0,https://www.basketball-reference.com/boxscores...,202102020IND,IND,104.2,0.695,...,25.7,0.244,128.6,MEM,104.2,0.500,13.4,23.3,0.247,111.3
8740,2021-02-02,POR,132.0,WAS,121.0,https://www.basketball-reference.com/boxscores...,202102020WAS,WAS,96.6,0.651,...,16.7,0.157,125.3,POR,96.6,0.574,7.9,29.8,0.242,136.7
8741,2021-02-02,BOS,111.0,GSW,107.0,https://www.basketball-reference.com/boxscores...,202102020GSW,GSW,96.4,0.548,...,13.3,0.193,111.0,BOS,96.4,0.542,13.5,28.6,0.238,115.1


In [176]:
home_joined=pd.merge(joined,box_scores,left_on=['game_id','Team_home'],right_on=['game_id','Team'])
home_joined.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,TS%,3PAr,FTr,DRB%,TRB%,AST%,STL%,BLK%,DRtg,Team
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,0.57,0.239,0.451,72.3,53.0,50.0,4.2,24.3,91.6,IND
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.59,...,0.631,0.278,0.403,76.1,49.4,70.3,10.3,12.3,97.9,MIA
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,0.551,0.312,0.301,77.3,56.5,54.8,8.0,9.7,102.6,LAL
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,0.495,0.179,0.405,78.0,56.5,60.0,7.6,5.2,101.8,CLE
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,0.484,0.198,0.267,80.6,59.3,39.5,9.8,11.3,95.1,TOR


In [178]:
home_away_joined=pd.merge(home_joined,box_scores,left_on=['game_id','Team_visitor'],right_on=['game_id','Team'],
                         suffixes=('_home','_visitor'))
home_away_joined

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,TS%_visitor,3PAr_visitor,FTr_visitor,DRB%_visitor,TRB%_visitor,AST%_visitor,STL%_visitor,BLK%_visitor,DRtg_visitor,Team_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,.447,.204,.108,72.2,47.0,47.2,10.5,11.1,102.1,ORL
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.590,...,.510,.313,.277,85.7,50.6,65.7,11.3,7.7,110.2,CHI
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,.553,.253,.277,62.5,43.5,65.9,11.0,6.3,115.6,LAC
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,.508,.293,.293,63.6,43.5,72.7,8.7,11.6,106.2,BRK
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,.552,.197,.439,57.8,40.7,46.9,10.9,10.1,101.6,BOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,2021-02-02,LAC,120.0,BRK,124.0,https://www.basketball-reference.com/boxscores...,202102020BRK,BRK,99.9,0.652,...,.563,.364,.172,82.4,47.0,46.7,12.0,10.4,124.1,LAC
8739,2021-02-02,MEM,116.0,IND,134.0,https://www.basketball-reference.com/boxscores...,202102020IND,IND,104.2,0.695,...,.560,.269,.258,74.3,46.2,64.3,10.6,13.2,128.6,MEM
8740,2021-02-02,POR,132.0,WAS,121.0,https://www.basketball-reference.com/boxscores...,202102020WAS,WAS,96.6,0.651,...,.628,.484,.242,83.3,53.0,43.2,6.2,2.3,125.3,POR
8741,2021-02-02,BOS,111.0,GSW,107.0,https://www.basketball-reference.com/boxscores...,202102020GSW,GSW,96.4,0.548,...,.579,.405,.321,86.7,58.6,41.0,3.1,8.7,111.0,BOS


In [180]:
home_away_joined.head().transpose()

Unnamed: 0,0,1,2,3,4
DATE,2013-10-29 00:00:00,2013-10-29 00:00:00,2013-10-29 00:00:00,2013-10-30 00:00:00,2013-10-30 00:00:00
VISITOR,ORL,CHI,LAC,BRK,BOS
VISITOR_PTS,87,95,103,94,87
HOME,IND,MIA,LAL,CLE,TOR
HOME_PTS,97,107,116,98,93
boxscores_url,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
game_id,201310290IND,201310290MIA,201310290LAL,201310300CLE,201310300TOR
Team_home,IND,MIA,LAL,CLE,TOR
Pace_home,95.0,97.1,100.4,92.3,91.5
eFG%_home,0.528,0.59,0.527,0.446,0.471


In [182]:
home_away_joined.drop(columns=['Team_home','Team_visitor'],inplace=True)

In [183]:
home_away_joined.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Pace_home,eFG%_home,TOV%_home,...,DRtg_home,TS%_visitor,3PAr_visitor,FTr_visitor,DRB%_visitor,TRB%_visitor,AST%_visitor,STL%_visitor,BLK%_visitor,DRtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,95.0,0.528,19.0,...,91.6,0.447,0.204,0.108,72.2,47.0,47.2,10.5,11.1,102.1
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,97.1,0.59,17.5,...,97.9,0.51,0.313,0.277,85.7,50.6,65.7,11.3,7.7,110.2
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,100.4,0.527,15.3,...,102.6,0.553,0.253,0.277,62.5,43.5,65.9,11.0,6.3,115.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,92.3,0.446,10.0,...,101.8,0.508,0.293,0.293,63.6,43.5,72.7,8.7,11.6,106.2
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,91.5,0.471,15.0,...,95.1,0.552,0.197,0.439,57.8,40.7,46.9,10.9,10.1,101.6


In [184]:
home_away_joined.to_pickle('Data/full_data_2014to2021.pkl')

# Add auxiliary columns

In [22]:
home_away_joined = pd.read_pickle('Data/full_data_2014to2021.pkl')

In [37]:
home_away_joined.shape

(8743, 37)

In [39]:
home_away_joined = pd.merge(home_away_joined,playedgames_20142021[['game_id','season']],on='game_id')

In [54]:
home_away_joined['season_nth_game'] = home_away_joined.groupby(['season']).cumcount()+1
home_away_joined['hometeam_nth_homegame'] = home_away_joined.groupby(['season','HOME']).cumcount()+1
home_away_joined['visitorteam_nth_visitorgame'] = home_away_joined.groupby(['season','VISITOR']).cumcount()+1

In [55]:
home_away_joined.shape

(8743, 41)

In [52]:
home_away_joined.columns

Index(['DATE', 'VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Pace_home', 'eFG%_home', 'TOV%_home', 'ORB%_home',
       'FT/FGA_home', 'ORtg_home', 'Pace_visitor', 'eFG%_visitor',
       'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor', 'ORtg_visitor',
       'TS%_home', '3PAr_home', 'FTr_home', 'DRB%_home', 'TRB%_home',
       'AST%_home', 'STL%_home', 'BLK%_home', 'DRtg_home', 'TS%_visitor',
       '3PAr_visitor', 'FTr_visitor', 'DRB%_visitor', 'TRB%_visitor',
       'AST%_visitor', 'STL%_visitor', 'BLK%_visitor', 'DRtg_visitor',
       'season', 'season_nth_game', 'hometeam_nth_homegame',
       'visitorteam_nth_visitorgame'],
      dtype='object')

In [56]:
home_away_joined.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Pace_home,eFG%_home,TOV%_home,...,DRB%_visitor,TRB%_visitor,AST%_visitor,STL%_visitor,BLK%_visitor,DRtg_visitor,season,season_nth_game,hometeam_nth_homegame,visitorteam_nth_visitorgame
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,95.0,0.528,19.0,...,72.2,47.0,47.2,10.5,11.1,102.1,2014-2015,1,1,1
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,97.1,0.59,17.5,...,85.7,50.6,65.7,11.3,7.7,110.2,2014-2015,2,1,1
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,100.4,0.527,15.3,...,62.5,43.5,65.9,11.0,6.3,115.6,2014-2015,3,1,1
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,92.3,0.446,10.0,...,63.6,43.5,72.7,8.7,11.6,106.2,2014-2015,4,1,1
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,91.5,0.471,15.0,...,57.8,40.7,46.9,10.9,10.1,101.6,2014-2015,5,1,1


In [58]:
home_away_joined.to_pickle('Data/full_data_2014to2021.pkl')