In [1]:
from bs4 import BeautifulSoup
import requests

from IPython.core.display import display, HTML

import pandas as pd

import re

import numpy as np

import datetime

In [2]:
# Retrieve the 2017-18 schedule/results pages, one per month, from October
# through the end of March.

urlList = [
    'https://www.basketball-reference.com/leagues/NBA_2018_games-october.html',
    'https://www.basketball-reference.com/leagues/NBA_2018_games-november.html',
    'https://www.basketball-reference.com/leagues/NBA_2018_games-december.html',
    'https://www.basketball-reference.com/leagues/NBA_2018_games-january.html',
    'https://www.basketball-reference.com/leagues/NBA_2018_games-february.html',
    'https://www.basketball-reference.com/leagues/NBA_2018_games-march.html'
]

# Fail fast on a bad response: without the status check an error page would be
# parsed like a schedule page and only break later, when the results table
# cannot be found. The timeout keeps a stalled connection from hanging the cell.
soupList = []
for url in urlList:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    soupList.append(soup)

# One name per month, in the same order as urlList (soup1 is October).
[soup1,soup2,soup3,soup4,soup5,soup6] = soupList

In [3]:
# Header cells (<th>) from the first row of the October page's results table.
headerData = (
    soup1.find(class_='overthrow table_container')
    .find_all('tr')[0]
    .find_all('th')
)

In [4]:
# Column names are taken from each header cell's `data-stat` attribute.
column_headers = [header['data-stat'] for header in headerData]
column_headers

['date_game',
 'game_start_time',
 'visitor_team_name',
 'visitor_pts',
 'home_team_name',
 'home_pts',
 'box_score_text',
 'overtimes',
 'attendance',
 'game_remarks']

In [5]:
def _cell_value(cell):
    """Return the value of one results-table cell as a plain ``str``.

    * An empty cell yields ``''``.
    * A cell whose first child is a link yields the link text, except for the
      'Box Score' link, which yields the absolute box-score URL.
    * Any other cell yields its first child.

    Values are converted with ``str()`` so the rows do not hold
    ``NavigableString`` objects, which keep a reference to the whole parse
    tree they came from.
    """
    if not cell.contents:
        return ''
    first = cell.contents[0]
    if first.name == 'a':
        link_text = first.contents[0]
        if link_text == 'Box Score':
            return 'https://www.basketball-reference.com' + first['href']
        return str(link_text)
    return str(first)


fullDataList = []

for soup in soupList:
    cellData = soup.find(class_ = 'overthrow table_container').find_all('tr')
    # The first <tr> is the header row (it supplied the column names), so
    # data rows start at position 1.
    for row in cellData[1:]:
        fullDataList.append(
            [_cell_value(cell) for cell in row.find_all(['td', 'th'])]
        )

# Rows that produced no cells at all are dropped.
cleanCellData = [row for row in fullDataList if row]

# Preview only the first few rows instead of dumping the whole list.
cleanCellData[:5]

[['Tue, Oct 17, 2017',
  '8:01p',
  'Boston Celtics',
  '99',
  'Cleveland Cavaliers',
  '102',
  'https://www.basketball-reference.com/boxscores/201710170CLE.html',
  '',
  '20,562',
  ''],
 ['Tue, Oct 17, 2017',
  '10:30p',
  'Houston Rockets',
  '122',
  'Golden State Warriors',
  '121',
  'https://www.basketball-reference.com/boxscores/201710170GSW.html',
  '',
  '19,596',
  ''],
 ['Wed, Oct 18, 2017',
  '7:00p',
  'Charlotte Hornets',
  '90',
  'Detroit Pistons',
  '102',
  'https://www.basketball-reference.com/boxscores/201710180DET.html',
  '',
  '20,491',
  ''],
 ['Wed, Oct 18, 2017',
  '7:00p',
  'Brooklyn Nets',
  '131',
  'Indiana Pacers',
  '140',
  'https://www.basketball-reference.com/boxscores/201710180IND.html',
  '',
  '15,008',
  ''],
 ['Wed, Oct 18, 2017',
  '7:00p',
  'Miami Heat',
  '109',
  'Orlando Magic',
  '116',
  'https://www.basketball-reference.com/boxscores/201710180ORL.html',
  '',
  '18,846',
  ''],
 ['Wed, Oct 18, 2017',
  '7:00p',
  'Philadelphia 76ers

In [6]:
# Combine the header names and the scraped rows to create the initial DataFrame.

df = pd.DataFrame(data=cleanCellData, columns=column_headers)

In [7]:
def date_change(row):
    """Reformat ``row['date_game']`` as an ``MM/DD/YYYY`` string.

    The value is expected to look like ``'Tue, Oct 17, 2017'``; commas are
    removed before it is parsed with the ``"%a %b %d %Y"`` pattern.
    """
    without_commas = row['date_game'].replace(',', '')
    parsed = datetime.datetime.strptime(without_commas, "%a %b %d %Y")
    return parsed.strftime('%m/%d/%Y')

# Turn the MM/DD/YYYY strings produced by date_change into datetimes. The
# format is passed explicitly so pandas does not have to infer it.
df['Date'] = pd.to_datetime(df.apply(date_change, axis=1), format='%m/%d/%Y')

In [8]:
# Full team name (as it appears in the schedule table) -> three-letter
# abbreviation used to build the box-score table ids in statRecorder.
nbaDict = {
    'Atlanta Hawks': 'ATL',
    'Brooklyn Nets': 'BRK',
    'Boston Celtics': 'BOS',
    'Charlotte Hornets': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}

In [24]:
# Column names for the six values statRecorder returns per game: the first
# three belong to the visiting team, the last three to the home team.
statColumns = [
    'Team TS%', 'Team eFG%', 'Team ORtg',
    'Opp TS%', 'Opp eFG%', 'Opp ORtg',
]

In [16]:
def statRecorder(row):
    """Scrape six advanced-stat values from one game's box-score page.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide 'box_score_text' (the box-score URL) and
        'visitor_team_name' / 'home_team_name' (both keys of ``nbaDict``).

    Returns
    -------
    list of str
        Six values: the cells at positions 1, 2 and -2 of the last row of the
        visitor's ``box-<ABBR>-game-advanced`` table, followed by the same
        three cells for the home team. Presumably TS%, eFG% and ORtg, judging
        by ``statColumns`` -- TODO confirm against the page layout.
        The values are text and are not converted to numbers here.

    Raises
    ------
    requests.HTTPError
        If the box-score page answers with an error status.
    KeyError
        If a team name is missing from ``nbaDict``.
    """
    response = requests.get(row['box_score_text'], timeout=30)
    # Stop on an error page instead of parsing it and failing on a missing table.
    response.raise_for_status()
    # Local name chosen so it does not shadow the notebook-level `soup1`.
    boxScoreSoup = BeautifulSoup(response.text, "lxml")
    team_list = [nbaDict[row['visitor_team_name']],nbaDict[row['home_team_name']]]
    # Progress output: one line per scraped game.
    print([row['visitor_team_name'],row['home_team_name']])
    statsList = []
    index = [1,2,-2]
    for team in team_list:
        boxScoreLink = "box-{}-game-advanced".format(team)
        # Last <tr> of the team's advanced table; its <td> cells hold the values.
        totalStats = boxScoreSoup.find('table', id =boxScoreLink).find_all('tr')[-1].find_all('td')
        for i in index:
            # str() detaches the value from the parse tree so the page can be
            # garbage-collected once this function returns.
            statsList.append(str(totalStats[i].contents[0]))
    return statsList

In [17]:
# Accumulator for the per-game stat lists: filled incrementally, batch by
# batch, and concatenated into a DataFrame afterwards.
stats_list = list()

In [18]:
# Scrape the stats for the first 1200 rows of df (statRecorder makes one HTTP
# request per row) and add each game's stat list to stats_list.
firstBatch = df.iloc[:1200].apply(statRecorder, axis=1)
stats_list.extend(firstBatch)

['Boston Celtics', 'Cleveland Cavaliers']
['Houston Rockets', 'Golden State Warriors']
['Charlotte Hornets', 'Detroit Pistons']
['Brooklyn Nets', 'Indiana Pacers']
['Miami Heat', 'Orlando Magic']
['Philadelphia 76ers', 'Washington Wizards']
['Milwaukee Bucks', 'Boston Celtics']
['New Orleans Pelicans', 'Memphis Grizzlies']
['Atlanta Hawks', 'Dallas Mavericks']
['Denver Nuggets', 'Utah Jazz']
['Minnesota Timberwolves', 'San Antonio Spurs']
['Portland Trail Blazers', 'Phoenix Suns']
['Houston Rockets', 'Sacramento Kings']
['Chicago Bulls', 'Toronto Raptors']
['New York Knicks', 'Oklahoma City Thunder']
['Los Angeles Clippers', 'Los Angeles Lakers']
['Atlanta Hawks', 'Charlotte Hornets']
['Portland Trail Blazers', 'Indiana Pacers']
['Cleveland Cavaliers', 'Milwaukee Bucks']
['Boston Celtics', 'Philadelphia 76ers']
['Detroit Pistons', 'Washington Wizards']
['Orlando Magic', 'Brooklyn Nets']
['Utah Jazz', 'Minnesota Timberwolves']
['Sacramento Kings', 'Dallas Mavericks']
['Golden State Warr

['Boston Celtics', 'Brooklyn Nets']
['Toronto Raptors', 'Houston Rockets']
['San Antonio Spurs', 'Dallas Mavericks']
['Sacramento Kings', 'Atlanta Hawks']
['Washington Wizards', 'Miami Heat']
['Utah Jazz', 'New York Knicks']
['Cleveland Cavaliers', 'Charlotte Hornets']
['Indiana Pacers', 'Memphis Grizzlies']
['Detroit Pistons', 'Milwaukee Bucks']
['San Antonio Spurs', 'Minnesota Timberwolves']
['Toronto Raptors', 'New Orleans Pelicans']
['Chicago Bulls', 'Oklahoma City Thunder']
['Orlando Magic', 'Portland Trail Blazers']
['Philadelphia 76ers', 'Los Angeles Lakers']
['Golden State Warriors', 'Boston Celtics']
['Houston Rockets', 'Phoenix Suns']
['Detroit Pistons', 'Indiana Pacers']
['Miami Heat', 'Washington Wizards']
['Utah Jazz', 'Brooklyn Nets']
['Los Angeles Clippers', 'Cleveland Cavaliers']
['New York Knicks', 'Toronto Raptors']
['Charlotte Hornets', 'Chicago Bulls']
['Oklahoma City Thunder', 'San Antonio Spurs']
['Minnesota Timberwolves', 'Dallas Mavericks']
['Portland Trail Blaz

['San Antonio Spurs', 'Dallas Mavericks']
['Philadelphia 76ers', 'Minnesota Timberwolves']
['Phoenix Suns', 'Sacramento Kings']
['Oklahoma City Thunder', 'Indiana Pacers']
['Los Angeles Clippers', 'Orlando Magic']
['Memphis Grizzlies', 'Washington Wizards']
['Denver Nuggets', 'Boston Celtics']
['Portland Trail Blazers', 'Miami Heat']
['Utah Jazz', 'Chicago Bulls']
['Milwaukee Bucks', 'New Orleans Pelicans']
['Toronto Raptors', 'Phoenix Suns']
['Charlotte Hornets', 'Houston Rockets']
['Detroit Pistons', 'Atlanta Hawks']
['New York Knicks', 'Brooklyn Nets']
['Los Angeles Lakers', 'Cleveland Cavaliers']
['Sacramento Kings', 'Minnesota Timberwolves']
['Dallas Mavericks', 'Golden State Warriors']
['Miami Heat', 'Charlotte Hornets']
['Detroit Pistons', 'Indiana Pacers']
['Portland Trail Blazers', 'Orlando Magic']
['Oklahoma City Thunder', 'Philadelphia 76ers']
['Los Angeles Clippers', 'Washington Wizards']
['Utah Jazz', 'Boston Celtics']
['Brooklyn Nets', 'Toronto Raptors']
['Atlanta Hawks',

['Orlando Magic', 'Dallas Mavericks']
['Sacramento Kings', 'Los Angeles Lakers']
['Dallas Mavericks', 'Charlotte Hornets']
['Miami Heat', 'Indiana Pacers']
['Utah Jazz', 'Washington Wizards']
['Detroit Pistons', 'Brooklyn Nets']
['Chicago Bulls', 'New York Knicks']
['Portland Trail Blazers', 'Houston Rockets']
['New Orleans Pelicans', 'Memphis Grizzlies']
['Orlando Magic', 'Milwaukee Bucks']
['Oklahoma City Thunder', 'Minnesota Timberwolves']
['Atlanta Hawks', 'Denver Nuggets']
['Los Angeles Clippers', 'Golden State Warriors']
['Boston Celtics', 'Philadelphia 76ers']
['Cleveland Cavaliers', 'Toronto Raptors']
['Los Angeles Clippers', 'Sacramento Kings']
['San Antonio Spurs', 'Los Angeles Lakers']
['Utah Jazz', 'Charlotte Hornets']
['Cleveland Cavaliers', 'Indiana Pacers']
['Orlando Magic', 'Washington Wizards']
['Brooklyn Nets', 'Atlanta Hawks']
['Golden State Warriors', 'Milwaukee Bucks']
['New York Knicks', 'Minnesota Timberwolves']
['Portland Trail Blazers', 'New Orleans Pelicans']


['Boston Celtics', 'Toronto Raptors']
['Washington Wizards', 'Philadelphia 76ers']
['Oklahoma City Thunder', 'Golden State Warriors']
['Phoenix Suns', 'Los Angeles Lakers']
['Brooklyn Nets', 'Detroit Pistons']
['Houston Rockets', 'Miami Heat']
['Minnesota Timberwolves', 'Cleveland Cavaliers']
['Utah Jazz', 'Memphis Grizzlies']
['San Antonio Spurs', 'Phoenix Suns']
['Atlanta Hawks', 'Orlando Magic']
['New York Knicks', 'Toronto Raptors']
['Boston Celtics', 'Washington Wizards']
['Charlotte Hornets', 'Portland Trail Blazers']
['Dallas Mavericks', 'Golden State Warriors']
['Oklahoma City Thunder', 'Los Angeles Lakers']
['Los Angeles Clippers', 'Detroit Pistons']
['New Orleans Pelicans', 'Philadelphia 76ers']
['Cleveland Cavaliers', 'Atlanta Hawks']
['Indiana Pacers', 'Boston Celtics']
['Denver Nuggets', 'Houston Rockets']
['Milwaukee Bucks', 'Miami Heat']
['Charlotte Hornets', 'Utah Jazz']
['Minnesota Timberwolves', 'Chicago Bulls']
['Portland Trail Blazers', 'Sacramento Kings']
['New Orl

['Milwaukee Bucks', 'Memphis Grizzlies']
['Sacramento Kings', 'Oklahoma City Thunder']
['Miami Heat', 'Portland Trail Blazers']
['Indiana Pacers', 'Philadelphia 76ers']
['Minnesota Timberwolves', 'Washington Wizards']
['Oklahoma City Thunder', 'Atlanta Hawks']
['Toronto Raptors', 'Brooklyn Nets']
['Dallas Mavericks', 'New York Knicks']
['Los Angeles Clippers', 'Chicago Bulls']
['Charlotte Hornets', 'New Orleans Pelicans']
['Orlando Magic', 'San Antonio Spurs']
['Detroit Pistons', 'Utah Jazz']
['Cleveland Cavaliers', 'Phoenix Suns']
['Denver Nuggets', 'Los Angeles Lakers']
['Milwaukee Bucks', 'Orlando Magic']
['Washington Wizards', 'Boston Celtics']
['Miami Heat', 'Sacramento Kings']
['Los Angeles Lakers', 'Golden State Warriors']
['Toronto Raptors', 'Indiana Pacers']
['Charlotte Hornets', 'Atlanta Hawks']
['Philadelphia 76ers', 'New York Knicks']
['Los Angeles Clippers', 'Houston Rockets']
['Chicago Bulls', 'Memphis Grizzlies']
['New Orleans Pelicans', 'San Antonio Spurs']
['Detroit Pi

In [62]:
# statRecorder returns the scraped values as text, so cast them to float here;
# the rolling means computed later need numeric columns.
statsdf = pd.DataFrame(stats_list, columns=statColumns).astype(float)

In [63]:
# Put the scraped stats next to the schedule columns; rows are matched on index.
resultdf = pd.concat(objs=[df, statsdf], axis='columns')
resultdf.head()

Unnamed: 0,date_game,game_start_time,visitor_team_name,visitor_pts,home_team_name,home_pts,box_score_text,overtimes,attendance,game_remarks,Date,Team TS%,Team eFG%,Team ORtg,Opp TS%,Opp eFG%,Opp ORtg
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,https://www.basketball-reference.com/boxscores...,,20562,,2017-10-17,0.5,0.455,99.7,0.543,0.488,102.7
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,https://www.basketball-reference.com/boxscores...,,19596,,2017-10-17,0.579,0.562,119.6,0.678,0.638,118.6
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,https://www.basketball-reference.com/boxscores...,,20491,,2017-10-18,0.525,0.459,91.4,0.504,0.474,103.6
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,https://www.basketball-reference.com/boxscores...,,15008,,2017-10-18,0.606,0.543,115.7,0.603,0.564,123.6
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,https://www.basketball-reference.com/boxscores...,,18846,,2017-10-18,0.502,0.475,103.6,0.564,0.522,110.3


In [64]:
# Build one frame per side of every game. Columns are selected by name rather
# than by position, so the selection does not silently shift if a column is
# added to or removed from resultdf.

#visitor stats
visitor_df = resultdf[[
    'date_game', 'game_start_time',
    'visitor_team_name', 'visitor_pts',
    'home_team_name', 'home_pts',
    'box_score_text',
    'Team TS%', 'Team eFG%', 'Team ORtg',
]]

#home stats (own team first, then the opponent, mirroring visitor_df)
home_df = resultdf[[
    'date_game', 'game_start_time',
    'home_team_name', 'home_pts',
    'visitor_team_name', 'visitor_pts',
    'box_score_text',
    'Opp TS%', 'Opp eFG%', 'Opp ORtg',
]]

visitor_df.head()


Unnamed: 0,date_game,game_start_time,visitor_team_name,visitor_pts,home_team_name,home_pts,box_score_text,Team TS%,Team eFG%,Team ORtg
0,"Tue, Oct 17, 2017",8:01p,Boston Celtics,99,Cleveland Cavaliers,102,https://www.basketball-reference.com/boxscores...,0.5,0.455,99.7
1,"Tue, Oct 17, 2017",10:30p,Houston Rockets,122,Golden State Warriors,121,https://www.basketball-reference.com/boxscores...,0.579,0.562,119.6
2,"Wed, Oct 18, 2017",7:00p,Charlotte Hornets,90,Detroit Pistons,102,https://www.basketball-reference.com/boxscores...,0.525,0.459,91.4
3,"Wed, Oct 18, 2017",7:00p,Brooklyn Nets,131,Indiana Pacers,140,https://www.basketball-reference.com/boxscores...,0.606,0.543,115.7
4,"Wed, Oct 18, 2017",7:00p,Miami Heat,109,Orlando Magic,116,https://www.basketball-reference.com/boxscores...,0.502,0.475,103.6


In [65]:
# Give both frames the same team-centric column names so they can be stacked.
home_df.columns = visitor_df.columns = [
    'date_game', 'game_start_time', 'team_name', 'team_pts',
    'opposing_team', 'opposing_pts', 'box_score', 'TS%', 'eFG%', 'ORtg',
]


In [66]:
# Stack the visitor rows on top of the home rows and renumber the index from 0.
frames = [visitor_df, home_df]
agg_data = pd.concat(objs=frames, ignore_index=True)

In [67]:
# `date_change` is defined once, earlier in this notebook; it is reused here
# instead of being redefined with an identical body, so the date format lives
# in a single place.
agg_data['Date'] = pd.to_datetime(
    agg_data.apply(date_change, axis=1),
    format='%m/%d/%Y',  # the format date_change emits; avoids format inference
)

In [68]:
# Replace the text dates with the parsed datetimes and drop the helper column.
agg_data['date_game'] = agg_data.pop('Date')

In [69]:
# Order games by team, then chronologically. reset_index() keeps the previous
# index as an `index` column.
ordered_agg_data = (
    agg_data
    .sort_values(by=['team_name', 'date_game'])
    .reset_index()
)

In [70]:
# Same team/date ordering as ordered_agg_data, so the rows line up later.
rolling_stat_data = agg_data.sort_values(by=['team_name', 'date_game'])

# Per-team mean over a window of up to 30 games (a single game is enough to
# produce a value), with the group key and original index restored as columns.
testdata_grouped_rolling = (
    rolling_stat_data
    .groupby('team_name')[['TS%', 'eFG%', 'ORtg']]
    .rolling(window=30, min_periods=1)
    .mean()
    .reset_index()
)



In [71]:
# Remove the two columns added by reset_index(); only the stat columns remain.
testdata_grouped_rolling = testdata_grouped_rolling.drop(['level_1', 'team_name'], axis=1)

In [72]:
# Temporary names for the rolling averages, so they do not clash with the
# per-game TS% / eFG% / ORtg columns when the frames are joined.
testdata_grouped_rolling.columns = [
    'TS1%',
    'eFG1%',
    'ORtg1',
]

In [73]:
# Attach the rolling averages to the ordered game rows (matched on index).
full_stats_data = pd.concat(
    objs=[ordered_agg_data, testdata_grouped_rolling],
    axis='columns',
)

In [74]:
# Shift the rolling averages down by one game within each team, so every row
# holds the average of that team's previous games only (a team's first game
# gets NaN). The columns are selected with a list: selecting several groupby
# columns with a bare tuple (['a', 'b', 'c']) is rejected by pandas 2.x.
# GroupBy.shift keeps the original index, so the result aligns row-for-row.
full_stats_data[['TS%','eFG%','ORtg']] = (
    full_stats_data
    .groupby('team_name')[['TS1%', 'eFG1%', 'ORtg1']]
    .shift(1)
)

In [75]:
# Delete the non-shifted rolling-average columns.

full_stats_data = full_stats_data.drop(
    labels=['TS1%', 'eFG1%', 'ORtg1'],
    axis='columns',
)

In [77]:
# Second name for the same DataFrame object (an alias, not a copy); it is the
# left-hand side of the self-merge that pairs each team row with its opponent.
full_stats_data_2 = full_stats_data

In [78]:
# Pair every team row with its opponent's row from the same date: the left
# side's team_name must equal the right side's opposing_team.
newDf = pd.merge(
    full_stats_data_2,
    full_stats_data,
    left_on=['date_game', 'team_name'],
    right_on=['date_game', 'opposing_team'],
)

In [79]:
# Keep only the first row for each box-score URL.
dedupe_new_df = newDf.drop_duplicates(
    subset='box_score_x',
    keep='first',
)

In [80]:
# Preview the first rows of the de-duplicated merge result.
dedupe_new_df.head()

Unnamed: 0,index_x,date_game,game_start_time_x,team_name_x,team_pts_x,opposing_team_x,opposing_pts_x,box_score_x,TS%_x,eFG%_x,...,index_y,game_start_time_y,team_name_y,team_pts_y,opposing_team_y,opposing_pts_y,box_score_y,TS%_y,eFG%_y,ORtg_y
0,8,2017-10-18,8:30p,Atlanta Hawks,117,Dallas Mavericks,111,https://www.basketball-reference.com/boxscores...,,,...,1150,8:30p,Dallas Mavericks,111,Atlanta Hawks,117,https://www.basketball-reference.com/boxscores...,,,
1,16,2017-10-20,7:00p,Atlanta Hawks,91,Charlotte Hornets,109,https://www.basketball-reference.com/boxscores...,0.582,0.559,...,1158,7:00p,Charlotte Hornets,109,Atlanta Hawks,91,https://www.basketball-reference.com/boxscores...,0.525,0.459,91.4
2,37,2017-10-22,3:30p,Atlanta Hawks,104,Brooklyn Nets,116,https://www.basketball-reference.com/boxscores...,0.516,0.4875,...,1179,3:30p,Brooklyn Nets,116,Atlanta Hawks,104,https://www.basketball-reference.com/boxscores...,0.5965,0.546,116.5
3,41,2017-10-23,7:30p,Atlanta Hawks,93,Miami Heat,104,https://www.basketball-reference.com/boxscores...,0.501667,0.451,...,1183,7:30p,Miami Heat,104,Atlanta Hawks,93,https://www.basketball-reference.com/boxscores...,0.5515,0.5365,107.9
4,64,2017-10-26,8:00p,Atlanta Hawks,86,Chicago Bulls,91,https://www.basketball-reference.com/boxscores...,0.51475,0.46825,...,1206,8:00p,Chicago Bulls,91,Atlanta Hawks,86,https://www.basketball-reference.com/boxscores...,0.520667,0.491,100.9


In [81]:
# List the merged column names (_x = left side of the merge, _y = right side).
dedupe_new_df.columns

Index(['index_x', 'date_game', 'game_start_time_x', 'team_name_x',
       'team_pts_x', 'opposing_team_x', 'opposing_pts_x', 'box_score_x',
       'TS%_x', 'eFG%_x', 'ORtg_x', 'index_y', 'game_start_time_y',
       'team_name_y', 'team_pts_y', 'opposing_team_y', 'opposing_pts_y',
       'box_score_y', 'TS%_y', 'eFG%_y', 'ORtg_y'],
      dtype='object')

In [83]:
# Keep the date, the two team names and both sides' shifted rolling stats,
# relabelling the _x columns as the team's and the _y columns as the opponent's.
allowed_df = dedupe_new_df[[
    'date_game', 'team_name_x', 'opposing_team_x',
    'TS%_x', 'eFG%_x', 'ORtg_x',
    'TS%_y', 'eFG%_y', 'ORtg_y',
]].rename(columns={
    'team_name_x': 'team_name',
    'opposing_team_x': 'opposing_team',
    'TS%_x': 'Team TS%',
    'eFG%_x': 'Team eFG%',
    'ORtg_x': 'Team ORtg',
    'TS%_y': 'Opp TS%',
    'eFG%_y': 'Opp eFG%',
    'ORtg_y': 'Opp ORtg',
})

In [85]:
# Write the final table to CSV. The destination is an absolute path on one
# specific Windows machine, so it must be edited before running elsewhere.
# NOTE(review): DataFrame.to_csv returns None when it is given a path, so
# `export_csv` ends up as None -- confirm nothing relies on its value.
export_csv = allowed_df.to_csv (r'C:\Users\jeromerufin\Desktop\Metis\advstats_17_18.csv') #Don't forget to add '.csv' at the end of the path