In [1]:
from bs4 import BeautifulSoup
import requests

from IPython.core.display import display, HTML

import pandas as pd

import re

import numpy as np

import datetime

In [2]:
#Retrieve breakdown of games from october to end of march

urlList = [
    'https://www.basketball-reference.com/leagues/NBA_2019_games-october.html',
    'https://www.basketball-reference.com/leagues/NBA_2019_games-november.html',
    'https://www.basketball-reference.com/leagues/NBA_2019_games-december.html',
    'https://www.basketball-reference.com/leagues/NBA_2019_games-january.html',
    'https://www.basketball-reference.com/leagues/NBA_2019_games-february.html',
    'https://www.basketball-reference.com/leagues/NBA_2019_games-march.html'
]

# Download and parse every monthly schedule page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error response here instead of letting an
# error page be parsed and fail later when the schedule table is looked up.
soupList = []
for url in urlList:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    soupList.append(soup)

# The individual names are kept because a later cell reads soup1 directly.
[soup1,soup2,soup3,soup4,soup5,soup6] = soupList

In [3]:
# Header cells (<th>) of the first row of the first month's schedule table.
scheduleTable = soup1.find(class_ = 'overthrow table_container')
headerData = scheduleTable.find_all('tr')[0].find_all('th')

In [4]:
#get headers
# Each header cell carries its column name in the 'data-stat' attribute.
column_headers = [header['data-stat'] for header in headerData]
column_headers

['date_game',
 'game_start_time',
 'visitor_team_name',
 'visitor_pts',
 'home_team_name',
 'home_pts',
 'box_score_text',
 'overtimes',
 'attendance',
 'game_remarks']

In [5]:
def _cell_value(cell):
    """Return the value recorded for one schedule-table cell.

    An empty cell gives ''. A cell whose first child is a link gives the
    absolute box-score URL when the link text is 'Box Score', otherwise the
    link's first child. Any other cell gives its own first child.
    """
    if not cell.contents:
        return ''
    first = cell.contents[0]
    if first.name != 'a':
        return first
    if first.contents[0] == 'Box Score':
        return 'https://www.basketball-reference.com'+ first['href']
    return first.contents[0]


fullDataList = []

for soup in soupList:
    cellData = soup.find(class_ = 'overthrow table_container').find_all('tr')
    # The first <tr> is the header row, so data starts at the second one.
    for tableRow in cellData[1:]:
        rowCells = tableRow.find_all(lambda tag: tag.name in ('td', 'th'))
        fullDataList.append([_cell_value(cell) for cell in rowCells])

# Discard rows that yielded no cells at all.
cleanCellData = [rowValues for rowValues in fullDataList if rowValues]

cleanCellData

[['Tue, Oct 16, 2018',
  '8:00p',
  'Philadelphia 76ers',
  '87',
  'Boston Celtics',
  '105',
  'https://www.basketball-reference.com/boxscores/201810160BOS.html',
  '',
  '18,624',
  ''],
 ['Tue, Oct 16, 2018',
  '10:30p',
  'Oklahoma City Thunder',
  '100',
  'Golden State Warriors',
  '108',
  'https://www.basketball-reference.com/boxscores/201810160GSW.html',
  '',
  '19,596',
  ''],
 ['Wed, Oct 17, 2018',
  '7:00p',
  'Milwaukee Bucks',
  '113',
  'Charlotte Hornets',
  '112',
  'https://www.basketball-reference.com/boxscores/201810170CHO.html',
  '',
  '17,889',
  ''],
 ['Wed, Oct 17, 2018',
  '7:00p',
  'Brooklyn Nets',
  '100',
  'Detroit Pistons',
  '103',
  'https://www.basketball-reference.com/boxscores/201810170DET.html',
  '',
  '20,332',
  ''],
 ['Wed, Oct 17, 2018',
  '7:00p',
  'Memphis Grizzlies',
  '83',
  'Indiana Pacers',
  '111',
  'https://www.basketball-reference.com/boxscores/201810170IND.html',
  '',
  '17,923',
  ''],
 ['Wed, Oct 17, 2018',
  '7:00p',
  'Miam

In [6]:
# Combine the scraped header names and row values into the initial DataFrame

df = pd.DataFrame(data = cleanCellData, columns = column_headers)

In [7]:
def date_change(row):
    """Convert a row's 'date_game' text to an 'MM/DD/YYYY' string.

    `row` must support `row['date_game']` and hold text such as
    'Tue, Oct 16, 2018'; the commas are stripped before parsing.
    """
    withoutCommas = row['date_game'].replace(',','')
    parsed = datetime.datetime.strptime(withoutCommas, "%a %b %d %Y")
    return parsed.strftime('%m/%d/%Y')

# Add a datetime-typed 'Date' column built from the reformatted date strings.
reformattedDates = df.apply(date_change, axis = 1)
df['Date'] = pd.to_datetime(reformattedDates)

In [8]:
# Full team name -> team abbreviation. statRecorder uses the abbreviation to
# build the id of each team's advanced box-score table.
nbaDict = {
    'Atlanta Hawks': 'ATL',
    'Brooklyn Nets': 'BRK',
    'Boston Celtics': 'BOS',
    'Charlotte Hornets': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}

In [9]:
# Labels for the six values statRecorder returns per game: the first three
# belong to the visiting team ('Team ...'), the last three to the home team
# ('Opp ...').
statColumns = [
    'Team TS%', 'Team eFG%', 'Team ORtg',
    'Opp TS%', 'Opp eFG%', 'Opp ORtg',
]

In [10]:
def statRecorder(row):
    """Scrape both teams' advanced totals from one game's box-score page.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide 'box_score_text' (the box-score URL) and
        'visitor_team_name' / 'home_team_name' (keys of nbaDict).

    Returns
    -------
    list
        Six values: cells 1, 2 and -2 of the last row of the visitor's
        advanced table, followed by the same three cells for the home team.
        NOTE(review): statColumns labels these TS%, eFG% and ORtg; confirm the
        cell positions still match the page layout.

    Raises
    ------
    requests.HTTPError
        If the box-score page answers with an error status.
    ValueError
        If a team's advanced table is not found in the page.
    KeyError
        If a team name is not a key of nbaDict.
    """
    url1 = row['box_score_text']
    # Fail here on a stalled connection or an HTTP error rather than parsing
    # an error page and crashing later on a missing table.
    response1 = requests.get(url1, timeout=30)
    response1.raise_for_status()
    # Local name chosen so it does not shadow the notebook-level soup1.
    boxScoreSoup = BeautifulSoup(response1.text, "lxml")
    team_list = [nbaDict[row['visitor_team_name']],nbaDict[row['home_team_name']]]
    # Progress trace: one line per scraped game.
    print([row['visitor_team_name'],row['home_team_name']])
    statsList = []
    index = [1,2,-2]
    for team in team_list:
        boxScoreLink = "box-{}-game-advanced".format(team)
        advancedTable = boxScoreSoup.find('table', id =boxScoreLink)
        if advancedTable is None:
            raise ValueError(
                "table {!r} not found in {}".format(boxScoreLink, url1)
            )
        # The last <tr> of the table is the row the stats are read from.
        totalStats = advancedTable.find_all('tr')[-1].find_all('td')
        statsList.extend(totalStats[i].contents[0] for i in index)
    return(statsList)

In [11]:
# Accumulator for the per-game stat lists. It is filled batch by batch and
# converted to a DataFrame afterwards. Re-running this cell empties it.
stats_list = []

In [12]:
# Scrape the box score of every game in rows 0-1199 of the schedule (one HTTP
# request per row) and add the resulting stat lists to the accumulator.
firstBatch = df[0:1200].apply(statRecorder,axis = 1)
stats_list.extend(firstBatch)

['Philadelphia 76ers', 'Boston Celtics']
['Oklahoma City Thunder', 'Golden State Warriors']
['Milwaukee Bucks', 'Charlotte Hornets']
['Brooklyn Nets', 'Detroit Pistons']
['Memphis Grizzlies', 'Indiana Pacers']
['Miami Heat', 'Orlando Magic']
['Atlanta Hawks', 'New York Knicks']
['Cleveland Cavaliers', 'Toronto Raptors']
['New Orleans Pelicans', 'Houston Rockets']
['Minnesota Timberwolves', 'San Antonio Spurs']
['Utah Jazz', 'Sacramento Kings']
['Denver Nuggets', 'Los Angeles Clippers']
['Dallas Mavericks', 'Phoenix Suns']
['Chicago Bulls', 'Philadelphia 76ers']
['Miami Heat', 'Washington Wizards']
['Los Angeles Lakers', 'Portland Trail Blazers']
['Charlotte Hornets', 'Orlando Magic']
['New York Knicks', 'Brooklyn Nets']
['Atlanta Hawks', 'Memphis Grizzlies']
['Cleveland Cavaliers', 'Minnesota Timberwolves']
['Sacramento Kings', 'New Orleans Pelicans']
['Boston Celtics', 'Toronto Raptors']
['Indiana Pacers', 'Milwaukee Bucks']
['Oklahoma City Thunder', 'Los Angeles Clippers']
['Golden S

['Atlanta Hawks', 'Golden State Warriors']
['Philadelphia 76ers', 'Orlando Magic']
['Cleveland Cavaliers', 'Washington Wizards']
['Chicago Bulls', 'Boston Celtics']
['Miami Heat', 'Brooklyn Nets']
['Detroit Pistons', 'Toronto Raptors']
['Memphis Grizzlies', 'Milwaukee Bucks']
['New Orleans Pelicans', 'Minnesota Timberwolves']
['New York Knicks', 'Oklahoma City Thunder']
['Utah Jazz', 'Dallas Mavericks']
['San Antonio Spurs', 'Phoenix Suns']
['Portland Trail Blazers', 'Los Angeles Lakers']
['Golden State Warriors', 'Houston Rockets']
['Atlanta Hawks', 'Denver Nuggets']
['San Antonio Spurs', 'Los Angeles Clippers']
['Toronto Raptors', 'Boston Celtics']
['Miami Heat', 'Indiana Pacers']
['Utah Jazz', 'Philadelphia 76ers']
['Brooklyn Nets', 'Washington Wizards']
['Sacramento Kings', 'Memphis Grizzlies']
['Portland Trail Blazers', 'Minnesota Timberwolves']
['New York Knicks', 'New Orleans Pelicans']
['Chicago Bulls', 'Milwaukee Bucks']
['Los Angeles Clippers', 'Brooklyn Nets']
['Philadelphia

['Miami Heat', 'Los Angeles Lakers']
['Portland Trail Blazers', 'Houston Rockets']
['Phoenix Suns', 'San Antonio Spurs']
['Toronto Raptors', 'Los Angeles Clippers']
['Detroit Pistons', 'Charlotte Hornets']
['New York Knicks', 'Cleveland Cavaliers']
['Milwaukee Bucks', 'Indiana Pacers']
['Brooklyn Nets', 'Philadelphia 76ers']
['Boston Celtics', 'Washington Wizards']
['Portland Trail Blazers', 'Memphis Grizzlies']
['Oklahoma City Thunder', 'New Orleans Pelicans']
['Atlanta Hawks', 'Dallas Mavericks']
['Miami Heat', 'Utah Jazz']
['Minnesota Timberwolves', 'Sacramento Kings']
['Toronto Raptors', 'Golden State Warriors']
['Los Angeles Lakers', 'Houston Rockets']
['Los Angeles Clippers', 'San Antonio Spurs']
['Chicago Bulls', 'Orlando Magic']
['Dallas Mavericks', 'Phoenix Suns']
['Atlanta Hawks', 'Boston Celtics']
['New York Knicks', 'Charlotte Hornets']
['Washington Wizards', 'Brooklyn Nets']
['Milwaukee Bucks', 'Cleveland Cavaliers']
['Indiana Pacers', 'Philadelphia 76ers']
['Miami Heat', 

['Washington Wizards', 'Philadelphia 76ers']
['Denver Nuggets', 'Miami Heat']
['Atlanta Hawks', 'Toronto Raptors']
['Minnesota Timberwolves', 'Oklahoma City Thunder']
['Sacramento Kings', 'Phoenix Suns']
['New York Knicks', 'Golden State Warriors']
['Charlotte Hornets', 'Los Angeles Clippers']
['Indiana Pacers', 'Boston Celtics']
['Philadelphia 76ers', 'Washington Wizards']
['Atlanta Hawks', 'Brooklyn Nets']
['Milwaukee Bucks', 'Houston Rockets']
['San Antonio Spurs', 'Memphis Grizzlies']
['Cleveland Cavaliers', 'New Orleans Pelicans']
['Phoenix Suns', 'Dallas Mavericks']
['Orlando Magic', 'Utah Jazz']
['Chicago Bulls', 'Portland Trail Blazers']
['Detroit Pistons', 'Los Angeles Lakers']
['Boston Celtics', 'Miami Heat']
['Los Angeles Clippers', 'Denver Nuggets']
['Oklahoma City Thunder', 'San Antonio Spurs']
['Detroit Pistons', 'Sacramento Kings']
['Atlanta Hawks', 'Philadelphia 76ers']
['Milwaukee Bucks', 'Washington Wizards']
['Indiana Pacers', 'New York Knicks']
['Brooklyn Nets', 'To

['Minnesota Timberwolves', 'Memphis Grizzlies']
['Orlando Magic', 'Oklahoma City Thunder']
['Toronto Raptors', 'Philadelphia 76ers']
['Miami Heat', 'Portland Trail Blazers']
['Denver Nuggets', 'Brooklyn Nets']
['New Orleans Pelicans', 'Chicago Bulls']
['Washington Wizards', 'Milwaukee Bucks']
['Charlotte Hornets', 'Dallas Mavericks']
['Phoenix Suns', 'Utah Jazz']
['Houston Rockets', 'Sacramento Kings']
['San Antonio Spurs', 'Golden State Warriors']
['Los Angeles Clippers', 'Indiana Pacers']
['Minnesota Timberwolves', 'Orlando Magic']
['Toronto Raptors', 'Atlanta Hawks']
['Los Angeles Lakers', 'Boston Celtics']
['Memphis Grizzlies', 'Oklahoma City Thunder']
['San Antonio Spurs', 'Portland Trail Blazers']
['New York Knicks', 'Detroit Pistons']
['Denver Nuggets', 'Philadelphia 76ers']
['Cleveland Cavaliers', 'Washington Wizards']
['Chicago Bulls', 'Brooklyn Nets']
['Milwaukee Bucks', 'Dallas Mavericks']
['Golden State Warriors', 'Phoenix Suns']
['Minnesota Timberwolves', 'New Orleans Peli

['Milwaukee Bucks', 'San Antonio Spurs']
['Phoenix Suns', 'Golden State Warriors']
['Toronto Raptors', 'Cleveland Cavaliers']
['Sacramento Kings', 'Washington Wizards']
['Detroit Pistons', 'Brooklyn Nets']
['Charlotte Hornets', 'Houston Rockets']
['Oklahoma City Thunder', 'Utah Jazz']
['Boston Celtics', 'Los Angeles Clippers']
['New York Knicks', 'Indiana Pacers']
['Cleveland Cavaliers', 'Philadelphia 76ers']
['Los Angeles Lakers', 'Chicago Bulls']
['San Antonio Spurs', 'Dallas Mavericks']
['Milwaukee Bucks', 'New Orleans Pelicans']
['Minnesota Timberwolves', 'Denver Nuggets']
['Portland Trail Blazers', 'Los Angeles Clippers']
['Brooklyn Nets', 'Oklahoma City Thunder']
['Orlando Magic', 'Washington Wizards']
['Memphis Grizzlies', 'Atlanta Hawks']
['Detroit Pistons', 'Miami Heat']
['Golden State Warriors', 'Houston Rockets']
['Utah Jazz', 'Phoenix Suns']
['Oklahoma City Thunder', 'Indiana Pacers']
['Cleveland Cavaliers', 'Orlando Magic']
['Sacramento Kings', 'Boston Celtics']
['Los Ange

In [13]:
# One row per scraped game, one column per label in statColumns.
statsdf = pd.DataFrame(data = stats_list, columns = statColumns)

In [14]:
# Put the scraped stats beside the schedule rows; rows are matched by index.
resultdf = pd.concat([df, statsdf], axis = 'columns')
resultdf.head()

Unnamed: 0,date_game,game_start_time,visitor_team_name,visitor_pts,home_team_name,home_pts,box_score_text,overtimes,attendance,game_remarks,Date,Team TS%,Team eFG%,Team ORtg,Opp TS%,Opp eFG%,Opp ORtg
0,"Tue, Oct 16, 2018",8:00p,Philadelphia 76ers,87,Boston Celtics,105,https://www.basketball-reference.com/boxscores...,,18624,,2018-10-16,0.448,0.42,83.4,0.509,0.49,100.7
1,"Tue, Oct 16, 2018",10:30p,Oklahoma City Thunder,100,Golden State Warriors,108,https://www.basketball-reference.com/boxscores...,,19596,,2018-10-16,0.466,0.418,97.7,0.525,0.479,105.6
2,"Wed, Oct 17, 2018",7:00p,Milwaukee Bucks,113,Charlotte Hornets,112,https://www.basketball-reference.com/boxscores...,,17889,,2018-10-17,0.602,0.576,110.2,0.551,0.533,109.2
3,"Wed, Oct 17, 2018",7:00p,Brooklyn Nets,100,Detroit Pistons,103,https://www.basketball-reference.com/boxscores...,,20332,,2018-10-17,0.545,0.518,100.0,0.506,0.457,103.0
4,"Wed, Oct 17, 2018",7:00p,Memphis Grizzlies,83,Indiana Pacers,111,https://www.basketball-reference.com/boxscores...,,17923,,2018-10-17,0.431,0.357,88.8,0.626,0.627,118.7


In [15]:
# Positional column selections into resultdf.
# Visitor view: the first seven schedule columns followed by the 'Team' stats.
visitorColumns = np.r_[0:7,11:14]
# Home view: same layout with the home/visitor name and points columns
# swapped, followed by the 'Opp' stats.
homeColumns = np.r_[0:2,4:6,2:4,6,14:17]

#visitor stats
visitor_df = resultdf.iloc[:, visitorColumns]

#home stats
home_df = resultdf.iloc[:, homeColumns]

visitor_df.head()


Unnamed: 0,date_game,game_start_time,visitor_team_name,visitor_pts,home_team_name,home_pts,box_score_text,Team TS%,Team eFG%,Team ORtg
0,"Tue, Oct 16, 2018",8:00p,Philadelphia 76ers,87,Boston Celtics,105,https://www.basketball-reference.com/boxscores...,0.448,0.42,83.4
1,"Tue, Oct 16, 2018",10:30p,Oklahoma City Thunder,100,Golden State Warriors,108,https://www.basketball-reference.com/boxscores...,0.466,0.418,97.7
2,"Wed, Oct 17, 2018",7:00p,Milwaukee Bucks,113,Charlotte Hornets,112,https://www.basketball-reference.com/boxscores...,0.602,0.576,110.2
3,"Wed, Oct 17, 2018",7:00p,Brooklyn Nets,100,Detroit Pistons,103,https://www.basketball-reference.com/boxscores...,0.545,0.518,100.0
4,"Wed, Oct 17, 2018",7:00p,Memphis Grizzlies,83,Indiana Pacers,111,https://www.basketball-reference.com/boxscores...,0.431,0.357,88.8


In [16]:
# Both per-team views share one neutral set of column names so they can be
# stacked into a single frame.
teamViewColumns = [
    'date_game', 'game_start_time',
    'team_name', 'team_pts',
    'opposing_team', 'opposing_pts',
    'box_score',
    'TS%', 'eFG%', 'ORtg',
]

home_df.columns = teamViewColumns
visitor_df.columns = teamViewColumns


In [17]:
# Stack the visitor rows on top of the home rows and renumber the index.
frames = [visitor_df, home_df]
agg_data = pd.concat(objs = frames, ignore_index = True)

In [18]:
# date_change is defined once, in the earlier cell that builds df['Date'].
# The identical second definition that used to live here was removed so there
# is a single source of truth for the date format.
agg_data['Date'] = pd.to_datetime(agg_data.apply(date_change,axis = 1))

In [19]:
# Replace the text dates with the parsed datetimes and remove the temporary
# 'Date' column in one step.
agg_data['date_game'] = agg_data.pop('Date')

In [20]:
# Chronological game log per team. reset_index() keeps the previous row
# labels in an 'index' column.
ordered_agg_data = (
    agg_data
    .sort_values(by = ['team_name','date_game'])
    .reset_index()
)

In [21]:
rolling_stat_data = agg_data.sort_values(by = ['team_name','date_game'])

# Per-team rolling mean of the three stats over a window of up to 30 rows.
# min_periods=1 yields a value from the first row of each team onwards.
testdata_grouped_rolling = (
    rolling_stat_data
    .groupby('team_name')[['TS%','eFG%','ORtg']]
    .rolling(window=30, min_periods=1)
    .mean()
    .reset_index()
)



In [22]:
# Discard the two columns produced by reset_index(); only the rolling stats
# are needed from here on.
testdata_grouped_rolling = testdata_grouped_rolling.drop(columns = ['level_1', 'team_name'])

In [23]:
# Temporary names so the rolling means do not clash with the per-game
# 'TS%', 'eFG%' and 'ORtg' columns when the frames are joined.
testdata_grouped_rolling.columns = [
    'TS1%',
    'eFG1%',
    'ORtg1',
]

In [24]:
# Side-by-side join on the row index. Both frames were sorted by team_name
# and date_game before their indexes were reset.
full_stats_data = pd.concat(
    [ordered_agg_data, testdata_grouped_rolling],
    axis = 'columns',
)

In [25]:
#Shift average stats so each row has the average of the games previously
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names is deprecated and rejected by current pandas. GroupBy.shift keeps
# the original row index, so the result aligns with full_stats_data on
# assignment. The first game of every team becomes NaN.
full_stats_data[['TS%','eFG%','ORtg']] = full_stats_data.groupby('team_name')[['TS1%','eFG1%','ORtg1']].shift(1)

In [26]:
# Remove the non-shifted rolling columns now that their shifted values have
# been stored in 'TS%', 'eFG%' and 'ORtg'.

full_stats_data = full_stats_data.drop(columns = ['TS1%','eFG1%','ORtg1'])

In [27]:
# Second name for the same DataFrame object (not a copy); it serves as the
# left side of the self-merge in the next cell.
full_stats_data_2 = full_stats_data

In [28]:
# Self-join: match each team's row with the row of its opponent on the same
# date, so one row holds both teams' stats (suffixes _x and _y).
newDf = pd.merge(
    left = full_stats_data_2,
    right = full_stats_data,
    left_on = ['date_game','team_name'],
    right_on = ['date_game','opposing_team'],
)

In [29]:
# Rows that share a box-score URL describe the same game; keep only the
# first of them.
dedupe_new_df = newDf.drop_duplicates(subset = ['box_score_x'], keep = 'first')

In [30]:
# Preview the first rows of the de-duplicated frame.
dedupe_new_df.head()

Unnamed: 0,index_x,date_game,game_start_time_x,team_name_x,team_pts_x,opposing_team_x,opposing_pts_x,box_score_x,TS%_x,eFG%_x,...,index_y,game_start_time_y,team_name_y,team_pts_y,opposing_team_y,opposing_pts_y,box_score_y,TS%_y,eFG%_y,ORtg_y
0,6,2018-10-17,7:30p,Atlanta Hawks,107,New York Knicks,126,https://www.basketball-reference.com/boxscores...,,,...,1157,7:30p,New York Knicks,126,Atlanta Hawks,107,https://www.basketball-reference.com/boxscores...,,,
1,18,2018-10-19,8:00p,Atlanta Hawks,117,Memphis Grizzlies,131,https://www.basketball-reference.com/boxscores...,0.546,0.511,...,1169,8:00p,Memphis Grizzlies,131,Atlanta Hawks,117,https://www.basketball-reference.com/boxscores...,0.431,0.357,88.8
2,35,2018-10-21,6:00p,Atlanta Hawks,133,Cleveland Cavaliers,111,https://www.basketball-reference.com/boxscores...,0.569,0.5445,...,1186,6:00p,Cleveland Cavaliers,111,Atlanta Hawks,133,https://www.basketball-reference.com/boxscores...,0.5455,0.484,111.35
3,1202,2018-10-24,7:00p,Atlanta Hawks,111,Dallas Mavericks,104,https://www.basketball-reference.com/boxscores...,0.587667,0.563667,...,51,7:00p,Dallas Mavericks,104,Atlanta Hawks,111,https://www.basketball-reference.com/boxscores...,0.566333,0.529,117.7
4,1226,2018-10-27,7:30p,Atlanta Hawks,85,Chicago Bulls,97,https://www.basketball-reference.com/boxscores...,0.57325,0.545,...,75,7:30p,Chicago Bulls,97,Atlanta Hawks,85,https://www.basketball-reference.com/boxscores...,0.5748,0.5408,110.88


In [31]:
# List the merged column names (with their _x / _y suffixes) before selecting.
dedupe_new_df.columns

Index(['index_x', 'date_game', 'game_start_time_x', 'team_name_x',
       'team_pts_x', 'opposing_team_x', 'opposing_pts_x', 'box_score_x',
       'TS%_x', 'eFG%_x', 'ORtg_x', 'index_y', 'game_start_time_y',
       'team_name_y', 'team_pts_y', 'opposing_team_y', 'opposing_pts_y',
       'box_score_y', 'TS%_y', 'eFG%_y', 'ORtg_y'],
      dtype='object')

In [32]:
# Columns to export, mapped to their final names: _x columns describe the
# team, _y columns describe its opponent.
exportColumnNames = {
    'date_game': 'date_game',
    'team_name_x': 'team_name',
    'opposing_team_x': 'opposing_team',
    'TS%_x': 'Team TS%',
    'eFG%_x': 'Team eFG%',
    'ORtg_x': 'Team ORtg',
    'TS%_y': 'Opp TS%',
    'eFG%_y': 'Opp eFG%',
    'ORtg_y': 'Opp ORtg',
}

allowed_df = dedupe_new_df[list(exportColumnNames)].rename(columns = exportColumnNames)

In [33]:
# Write the final table to CSV; the path must end in '.csv'.
# NOTE(review): this is a hard-coded absolute path on one user's machine and
# will fail elsewhere; consider a configurable output directory.
# NOTE(review): to_csv is documented to return None when given a path, so
# export_csv presumably holds None; confirm before relying on it.
export_csv = allowed_df.to_csv (r'C:\Users\jeromerufin\Desktop\Metis\advstats_18_19.csv')