In [9]:
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from basketball_reference_scraper.box_scores import get_box_scores
from basketball_reference_scraper.seasons import get_schedule, get_standings
from bs4 import BeautifulSoup


In [10]:
# Folder that holds the notebook's input files, relative to the working directory.
data_path = Path('Data')
# Fail fast, and report where we looked, if the notebook was started from the wrong directory.
assert data_path.exists(), f"Data directory not found: {data_path.resolve()}"

In [22]:
# Read the team lookup file: one "<team name>:<abbreviation>" entry per line.
with open(data_path/'team_params.txt') as f:
    teams = f.readlines()

# Map title-cased team name -> abbreviation.
# Note that str.title() also capitalises a letter that follows a digit
# (e.g. "76ers" becomes "76Ers"), so lookups must use that spelling.
team_dict={}
for team in teams:
    if not team.strip():
        # Blank lines (for example trailing newlines at the end of the file)
        # carry no mapping and would otherwise fail on splits[1].
        continue
    splits = team.split(":")
    team_dict[splits[0].strip().title()] = splits[1].strip()
team_dict

{'Atlanta Hawks': 'ATL',
 'St. Louis Hawks': 'SLH',
 'Milwaukee Hawks': 'MIL',
 'Tri-Cities Blackhawks': 'TCB',
 'Boston Celtics': 'BOS',
 'Brooklyn Nets': 'BRK',
 'New Jersey Nets': 'NJN',
 'Chicago Bulls': 'CHI',
 'Charlotte Hornets (1988-2004)': 'CHH',
 'Charlotte Hornets (2014-Present)': 'CHO',
 'Charlotte Bobcats': 'CHA',
 'Cleveland Cavaliers': 'CLE',
 'Dallas Mavericks': 'DAL',
 'Denver Nuggets': 'DEN',
 'Detroit Pistons': 'DET',
 'Fort Wayne Pistons': 'FWP',
 'Golden State Warriors': 'GSW',
 'San Francisco Warriors': 'SFW',
 'Philadelphia Warriors': 'PHI',
 'Houston Rockets': 'HOU',
 'Indiana Pacers': 'IND',
 'Los Angeles Clippers': 'LAC',
 'San Diego Clippers': 'SDC',
 'Buffalo Braves': 'BUF',
 'Los Angeles Lakers': 'LAL',
 'Minneapolis Lakers': 'MIN',
 'Memphis Grizzlies': 'MEM',
 'Vancouver Grizzlies': 'VAN',
 'Miami Heat': 'MIA',
 'Milwaukee Bucks': 'MIL',
 'Minnesota Timberwolves': 'MIN',
 'New Orleans Pelicans': 'NOP',
 'New Orleans/Oklahoma City Hornets': 'NOK',
 'New Or

# Get Schedule

In [13]:
# Collect one schedule per year in range(2014, 2022) with playoffs excluded,
# printing each year as a simple progress indicator.
all_schedules = []
for year in range(2014, 2022):
    print(year)
    all_schedules.append(get_schedule(year, playoffs=False))

2014
2015
2016
2017
2018
2019
2020
2021


In [15]:
# Stack the per-year schedules into a single frame and show its row count.
schedule_20142021 = pd.concat(all_schedules, axis=0)
schedule_20142021.shape[0]

8980

In [35]:
# Keep only rows without missing values, then swap team names for abbreviations.
playedgames_20142021 = (
    schedule_20142021
    .dropna(how='any')
    # team_dict keys are title-cased, so "76ers" must be spelled "76Ers" to match.
    .replace({"Philadelphia 76ers":"Philadelphia 76Ers"})
    .replace(team_dict)
    # The plain "Charlotte Hornets" name is mapped explicitly after the team_dict lookup.
    .replace({'Charlotte Hornets':'CHO'})
)
len(playedgames_20142021)

8738

In [36]:
# Preview the first five rows of the cleaned schedule.
playedgames_20142021.head(5)

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS
0,2013-10-29,ORL,87,IND,97
1,2013-10-29,CHI,95,MIA,107
2,2013-10-29,LAC,103,LAL,116
3,2013-10-30,BRK,94,CLE,98
4,2013-10-30,BOS,87,TOR,93


In [38]:
# Check that every team name was replaced with its three-letter abbreviation.
for visitor in playedgames_20142021['VISITOR'].unique():
    assert len(visitor)==3, f"{visitor}'s team name needs fix"
for home in playedgames_20142021['HOME'].unique():
    # Validate the HOME value itself: testing the leftover `visitor` variable here
    # would let an unmapped home team pass unnoticed.
    assert len(home)==3, f"{home}'s team name needs fix"

# Build URLs

In [51]:
# Preview the pieces that make up one box-score URL for a sample game.
# `row` is bound explicitly so the cell does not rely on a variable left over from
# an earlier run; position 4 is the 2013-10-30 game played at TOR.
row = playedgames_20142021.iloc[4]
# Month and day are zero-padded to two digits so the pieces join into YYYYMMDD.
[str(row['DATE'].year), f"{row['DATE'].month:02d}", f"{row['DATE'].day:02d}", '0', str(row['HOME'])]

['2013', '10', '30', '0', 'TOR']

In [57]:
# Build one box-score URL per played game, in the same row order as the frame.
box_scores_urls=[]
for key,row in playedgames_20142021.iterrows():
    game_date = row['DATE']
    # The page id has the form YYYYMMDD0<HOME>. Month and day must be zero-padded:
    # without padding, 2013-11-01 would collapse to "2013111" and point at the wrong page.
    game_id = f"{game_date.year}{game_date.month:02d}{game_date.day:02d}0{row['HOME']}"
    url = "https://www.basketball-reference.com/boxscores/"+game_id+".html"
    box_scores_urls.append(url)

In [58]:
# Spot-check the first three generated URLs.
box_scores_urls[0:3]

['https://www.basketball-reference.com/boxscores/201310290IND.html',
 'https://www.basketball-reference.com/boxscores/201310290MIA.html',
 'https://www.basketball-reference.com/boxscores/201310290LAL.html']

In [59]:
# Number of generated URLs; one is appended per row of playedgames_20142021.
len(box_scores_urls)

8738

In [60]:
# Attach the URLs as a new column (assigned positionally, so the list order must
# match the row order of the frame), then preview the result.
playedgames_20142021['boxscores_url'] = box_scores_urls
playedgames_20142021.head()

Unnamed: 0,DATE,VISITOR,VISITOR_PTS,HOME,HOME_PTS,boxscores_url
0,2013-10-29,ORL,87,IND,97,https://www.basketball-reference.com/boxscores...
1,2013-10-29,CHI,95,MIA,107,https://www.basketball-reference.com/boxscores...
2,2013-10-29,LAC,103,LAL,116,https://www.basketball-reference.com/boxscores...
3,2013-10-30,BRK,94,CLE,98,https://www.basketball-reference.com/boxscores...
4,2013-10-30,BOS,87,TOR,93,https://www.basketball-reference.com/boxscores...


# Scrape four factors

In [73]:

# Fetch and parse the box-score page for the first game only, as a trial run.
for key,row in playedgames_20142021.head(1).iterrows():
    print(row['boxscores_url'])
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(row['boxscores_url'], timeout=30)
    # Stop on an HTTP error status instead of silently parsing an error page.
    response.raise_for_status()
    html = response.content
    html = html.decode()
    # Remove the HTML comment delimiters so that markup wrapped in comments
    # becomes part of the parsed document.
    stat_html = html.replace('<!--', "")
    stat_html = stat_html.replace('-->', "")
    soup = BeautifulSoup(stat_html, 'html.parser')


https://www.basketball-reference.com/boxscores/201310290IND.html


In [82]:
# Show the raw HTML of the first <table> whose id is "four_factors".
four_factors_matches = soup.find_all('table', attrs={"id": "four_factors"})
str(four_factors_matches[0])

'<table class="suppress_all stats_table" data-cols-to-freeze=",1" id="four_factors">\n<caption>Four Factors Table</caption>\n<colgroup><col/><col/><col/><col/><col/><col/><col/></colgroup>\n<thead>\n<tr class="over_header">\n<th aria-label="" class="over_header center" colspan="2" data-stat=""></th>\n<th aria-label="" class="over_header center" colspan="4" data-stat="header_tmp">Four Factors</th><th></th>\n</tr>\n<tr>\n<th aria-label="Team" class="poptip sort_default_asc left" data-stat="team_id" data-tip="Team" scope="col">\xa0</th>\n<th aria-label="Pace Factor" class="poptip right" data-stat="pace" data-tip="&lt;b&gt;Pace Factor&lt;/b&gt;: An estimate of possessions per 48 minutes" scope="col">Pace</th>\n<th aria-label="Effective Field Goal Percentage" class="poptip right" data-over-header="Four Factors" data-stat="efg_pct" data-tip="&lt;strong&gt;Effective Field Goal Percentage&lt;/strong&gt;&lt;br&gt;This statistic adjusts for the fact that a 3-point field goal is worth one more po

In [157]:
# Parse the first <table> with id "four_factors" into a DataFrame.
# The markup is wrapped in StringIO because recent pandas releases deprecate
# passing a literal HTML string to read_html.
four_factors_html = str(soup.find_all('table',attrs={"id":"four_factors"})[0])
tables = pd.read_html(StringIO(four_factors_html))[0]
tables

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Four Factors,Four Factors,Four Factors,Four Factors,Unnamed: 6_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1.1,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,IND,95.0,0.528,19.0,27.8,0.31,102.1


In [159]:
# Drop the outer level of the two-level column header, leaving the inner labels.
tables.droplevel(level=0, axis="columns")

Unnamed: 0,Unnamed: 0_level_1,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg
0,ORL,95.0,0.435,14.9,27.7,0.065,91.6
1,IND,95.0,0.528,19.0,27.8,0.31,102.1
