In [1]:
import pandas as pd
from pathlib import Path
from basketball_reference_scraper.seasons import get_schedule, get_standings
from basketball_reference_scraper.box_scores import get_box_scores
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


In [2]:
data_path = Path('Data')
assert data_path.exists()

In [3]:
with open(data_path/'team_params.txt') as f:
    teams = f.readlines()
team_dict={}
for team in teams:
    splits = team.split(":")
    team_dict[splits[0].strip().title()] = splits[1].strip()
team_dict

{'Atlanta Hawks': 'ATL',
 'St. Louis Hawks': 'SLH',
 'Milwaukee Hawks': 'MIL',
 'Tri-Cities Blackhawks': 'TCB',
 'Boston Celtics': 'BOS',
 'Brooklyn Nets': 'BRK',
 'New Jersey Nets': 'NJN',
 'Chicago Bulls': 'CHI',
 'Charlotte Hornets (1988-2004)': 'CHH',
 'Charlotte Hornets (2014-Present)': 'CHO',
 'Charlotte Bobcats': 'CHA',
 'Cleveland Cavaliers': 'CLE',
 'Dallas Mavericks': 'DAL',
 'Denver Nuggets': 'DEN',
 'Detroit Pistons': 'DET',
 'Fort Wayne Pistons': 'FWP',
 'Golden State Warriors': 'GSW',
 'San Francisco Warriors': 'SFW',
 'Philadelphia Warriors': 'PHI',
 'Houston Rockets': 'HOU',
 'Indiana Pacers': 'IND',
 'Los Angeles Clippers': 'LAC',
 'San Diego Clippers': 'SDC',
 'Buffalo Braves': 'BUF',
 'Los Angeles Lakers': 'LAL',
 'Minneapolis Lakers': 'MIN',
 'Memphis Grizzlies': 'MEM',
 'Vancouver Grizzlies': 'VAN',
 'Miami Heat': 'MIA',
 'Milwaukee Bucks': 'MIL',
 'Minnesota Timberwolves': 'MIN',
 'New Orleans Pelicans': 'NOP',
 'New Orleans/Oklahoma City Hornets': 'NOK',
 'New Or

# Get Schedule

In [5]:
all_schedules = []
for year in range(2014,2022):
    print(year)
    year_schedule = get_schedule(year, playoffs=False)
    all_schedules.append(year_schedule)

2014
2015
2016
2017
2018
2019
2020
2021


In [6]:
schedule_20142021 = pd.concat(all_schedules)
len(schedule_20142021)

8980

In [7]:
playedgames_20142021 = schedule_20142021.dropna(how='any')
playedgames_20142021 = playedgames_20142021.replace({"Philadelphia 76ers":"Philadelphia 76Ers"})
playedgames_20142021 = playedgames_20142021.replace(team_dict)
playedgames_20142021 = playedgames_20142021.replace({'Charlotte Hornets':'CHO'})
len(playedgames_20142021)

8744

In [8]:
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS
0,2013-10-29,ORL,87,IND,97
1,2013-10-29,CHI,95,MIA,107
2,2013-10-29,LAC,103,LAL,116
3,2013-10-30,BRK,94,CLE,98
4,2013-10-30,BOS,87,TOR,93


In [9]:
# check if team names are replaced with abbreviation
for visitor in playedgames_20142021['VISITOR'].unique():
    assert len(visitor)==3, f"{visitor}'s team name needs fix"
for home in playedgames_20142021['HOME'].unique():
    assert len(visitor)==3, f"{home}'s team name needs fix"

# Build URLs

In [35]:
box_scores_urls=[]
gameids = []
for key,row in playedgames_20142021.iterrows():
    nums_to_join=[str(num) for num in [row['DATE'].year,f"{row['DATE'].month:02d}",f"{row['DATE'].day:02d}",0,row['HOME']]]
    url = "https://www.basketball-reference.com/boxscores/"+''.join(nums_to_join)+".html"
    gameids.append(''.join(nums_to_join))
    box_scores_urls.append(url)

In [36]:
box_scores_urls[:3]

['https://www.basketball-reference.com/boxscores/201310290IND.html',
 'https://www.basketball-reference.com/boxscores/201310290MIA.html',
 'https://www.basketball-reference.com/boxscores/201310290LAL.html']

In [12]:
len(box_scores_urls)

8744

In [37]:
playedgames_20142021['boxscores_url'] = box_scores_urls
playedgames_20142021['game_id'] = gameids
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR


# Scrape four factors

In [40]:
tables = []
bad_games =[]
for key,row in tqdm(playedgames_20142021.iterrows()):
    try:
        response = requests.get(row['boxscores_url'])
        html = response.content
        html = html.decode()
        stat_html = html.replace('<!--', "")
        stat_html = stat_html.replace('-->', "")
        soup = BeautifulSoup(stat_html, 'html.parser')
        table = pd.read_html(str(soup.find_all('table',attrs={"id":"four_factors"})[0]))[0]
        table = table.droplevel(0,axis=1)
        table = table.rename({'Unnamed: 0_level_1':'Team'},axis=1)
        table['game_id'] = row['game_id']
        tables.append(table)
        if key%100==0: print(key)
    except:
        bad_games.append(row['game_id'])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
0
100
200
300



In [41]:
temp_four_factors = pd.concat(tables)
temp_four_factors.head()

Unnamed: 0,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,game_id
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6,201310290IND
1,IND,95.0,0.528,19.0,27.8,0.31,102.1,201310290IND
0,CHI,97.1,0.464,16.2,23.9,0.217,97.9,201310290MIA
1,MIA,97.1,0.59,17.5,14.3,0.306,110.2,201310290MIA
0,LAC,100.4,0.542,14.7,22.7,0.157,102.6,201310290LAL


In [42]:
temp_four_factors.to_pickle("Data/temp_four_factors.pkl")

In [6]:
temp_four_factors = pd.read_pickle("Data/temp_four_factors.pkl")

In [7]:
temp_four_factors

Unnamed: 0,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,game_id
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6,201310290IND
1,IND,95.0,0.528,19.0,27.8,0.310,102.1,201310290IND
0,CHI,97.1,0.464,16.2,23.9,0.217,97.9,201310290MIA
1,MIA,97.1,0.590,17.5,14.3,0.306,110.2,201310290MIA
0,LAC,100.4,0.542,14.7,22.7,0.157,102.6,201310290LAL
...,...,...,...,...,...,...,...,...
1,WAS,96.6,0.651,12.6,16.7,0.157,125.3,202102020WAS
0,BOS,96.4,0.542,13.5,28.6,0.238,115.1,202102020GSW
1,GSW,96.4,0.548,10.6,13.3,0.193,111.0,202102020GSW
0,DET,99.6,0.468,12.8,25.0,0.194,105.4,202102020UTA


In [47]:
# handle the bad games
print(len(bad_games))
print(bad_games)

1
['201711110GSW']


# Join Four Factors with schedule to construct training data

In [52]:
temp_four_factors.head()

Unnamed: 0,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg,game_id
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6,201310290IND
1,IND,95.0,0.528,19.0,27.8,0.31,102.1,201310290IND
0,CHI,97.1,0.464,16.2,23.9,0.217,97.9,201310290MIA
1,MIA,97.1,0.59,17.5,14.3,0.306,110.2,201310290MIA
0,LAC,100.4,0.542,14.7,22.7,0.157,102.6,201310290LAL


In [53]:
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR


In [55]:
# join on home team
home_joined=pd.merge(playedgames_20142021,temp_four_factors,left_on=['game_id','HOME'],right_on=['game_id','Team'])
home_joined

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,19.0,27.8,0.310,102.1
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.590,17.5,14.3,0.306,110.2
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,15.3,37.5,0.194,115.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,10.0,36.4,0.274,106.2
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,15.0,42.2,0.140,101.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,2021-02-02,LAC,120,BRK,124,https://www.basketball-reference.com/boxscores...,202102020BRK,BRK,99.9,0.652,15.0,17.6,0.266,124.1
8739,2021-02-02,MEM,116,IND,134,https://www.basketball-reference.com/boxscores...,202102020IND,IND,104.2,0.695,16.0,25.7,0.244,128.6
8740,2021-02-02,POR,132,WAS,121,https://www.basketball-reference.com/boxscores...,202102020WAS,WAS,96.6,0.651,12.6,16.7,0.157,125.3
8741,2021-02-02,BOS,111,GSW,107,https://www.basketball-reference.com/boxscores...,202102020GSW,GSW,96.4,0.548,10.6,13.3,0.193,111.0


In [56]:
joined=pd.merge(home_joined,temp_four_factors,left_on=['game_id','VISITOR'],right_on=['game_id','Team'],suffixes=('_home','_visitor'))
joined.columns

Index(['DATE', 'VISITOR', 'VISITOR_PTS', 'HOME', 'HOME_PTS', 'boxscores_url',
       'game_id', 'Team_home', 'Pace_home', 'eFG%_home', 'TOV%_home',
       'ORB%_home', 'FT/FGA_home', 'ORtg_home', 'Team_visitor', 'Pace_visitor',
       'eFG%_visitor', 'TOV%_visitor', 'ORB%_visitor', 'FT/FGA_visitor',
       'ORtg_visitor'],
      dtype='object')

In [57]:
joined

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.310,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.590,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.140,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,2021-02-02,LAC,120,BRK,124,https://www.basketball-reference.com/boxscores...,202102020BRK,BRK,99.9,0.652,...,17.6,0.266,124.1,LAC,99.9,0.530,7.0,22.4,0.152,120.1
8739,2021-02-02,MEM,116,IND,134,https://www.basketball-reference.com/boxscores...,202102020IND,IND,104.2,0.695,...,25.7,0.244,128.6,MEM,104.2,0.500,13.4,23.3,0.247,111.3
8740,2021-02-02,POR,132,WAS,121,https://www.basketball-reference.com/boxscores...,202102020WAS,WAS,96.6,0.651,...,16.7,0.157,125.3,POR,96.6,0.574,7.9,29.8,0.242,136.7
8741,2021-02-02,BOS,111,GSW,107,https://www.basketball-reference.com/boxscores...,202102020GSW,GSW,96.4,0.548,...,13.3,0.193,111.0,BOS,96.4,0.542,13.5,28.6,0.238,115.1


In [58]:
joined.to_pickle("Data/full_data_2014to2021.pkl")

In [4]:
joined = pd.read_pickle("Data/full_data_2014to2021.pkl")

In [5]:
joined.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url,game_id,Team_home,Pace_home,eFG%_home,...,ORB%_home,FT/FGA_home,ORtg_home,Team_visitor,Pace_visitor,eFG%_visitor,TOV%_visitor,ORB%_visitor,FT/FGA_visitor,ORtg_visitor
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...,201310290IND,IND,95.0,0.528,...,27.8,0.31,102.1,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...,201310290MIA,MIA,97.1,0.59,...,14.3,0.306,110.2,CHI,97.1,0.464,16.2,23.9,0.217,97.9
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...,201310290LAL,LAL,100.4,0.527,...,37.5,0.194,115.6,LAC,100.4,0.542,14.7,22.7,0.157,102.6
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...,201310300CLE,CLE,92.3,0.446,...,36.4,0.274,106.2,BRK,92.3,0.457,13.9,22.0,0.232,101.8
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...,201310300TOR,TOR,91.5,0.471,...,42.2,0.14,101.6,BOS,91.5,0.508,21.8,19.4,0.303,95.1


# Add Defensive Rating

In [4]:
joined = pd.read_pickle("Data/full_data_2014to2021.pkl")

In [5]:
from multiprocessing.dummy import Pool as ThreadPool
import time
time1 = time.time()
params = list(zip(list(range(len(joined))),list(joined['DATE']),list(joined['HOME']),list(joined['VISITOR']),['GAME']*len(joined),['ADVANCED']*len(joined)))

In [None]:
# pbar=tqdm(total=10)

# for bar in [1,2,3]:
#     pbar.set_description_str(str(bar))
#     time.sleep(0.5)

def new_get_box_scores(*f_args):
    if f_args[0]%50==0:
        print(f_args[0])
#     pbar.update(1)
    return get_box_scores(*f_args[1:])

with ThreadPool(4) as p:
    box_scores = p.starmap(new_get_box_scores,params)
time2 = time.time()
time2-time1

0
1100
550
1650
1150
600
50
1700
1200
650
100
1750
1250
700
150
1800
1300
750
200
1850
1350
800
250
1900
1400
850
300
1950
1450
2200
350
2000
1500
2250
400
2050
1550
2300
450
2100
1600
2350
500
2150
2400
2750
3300
3850
2450
2800
3350
2500
3900
2850
3400
3950
2550
2900
3450
4000
2600
2950
3500
4050
2650
3000
3550
4100
3050
2700
3600
3100
4150
4400
3650
3150
4200
4450
3700
3200
4500
4250
3750
3250
4300
4550
3800
