In [1]:
from bs4 import BeautifulSoup
import requests

from IPython.core.display import display, HTML

import pandas as pd

import re

import numpy as np

import datetime

In [2]:
# Retrieve the monthly schedule pages (October through March) listed below
# and parse each one into a BeautifulSoup document.

urlList = [
    'https://www.basketball-reference.com/leagues/NBA_2016_games-october.html',
    'https://www.basketball-reference.com/leagues/NBA_2016_games-november.html',
    'https://www.basketball-reference.com/leagues/NBA_2016_games-december.html',
    'https://www.basketball-reference.com/leagues/NBA_2016_games-january.html',
    'https://www.basketball-reference.com/leagues/NBA_2016_games-february.html',
    'https://www.basketball-reference.com/leagues/NBA_2016_games-march.html'
]

# Upper bound (seconds) on each request so a stalled connection cannot hang
# the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

soupList = []
for url in urlList:
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Stop on HTTP errors (4xx/5xx) here; otherwise the error page would be
    # parsed and the later table lookup would fail with an AttributeError
    # that hides the real cause.
    response.raise_for_status()
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    soupList.append(soup)

# Per-month names, in urlList order; soup1 is used later to read the header row.
[soup1,soup2,soup3,soup4,soup5,soup6] = soupList

In [3]:
# <th> cells of the first <tr> inside the element with class
# 'overthrow table_container' on the first (October) page.
headerData = (
    soup1
    .find(class_='overthrow table_container')
    .find_all('tr')[0]
    .find_all('th')
)

In [4]:
# Column names are taken from each header cell's 'data-stat' attribute.
column_headers = [header_cell['data-stat'] for header_cell in headerData]
column_headers

['date_game',
 'game_start_time',
 'visitor_team_name',
 'visitor_pts',
 'home_team_name',
 'home_pts',
 'box_score_text',
 'overtimes',
 'attendance',
 'game_remarks']

In [5]:
# Flatten every schedule table into a list of rows, one list of cell values
# per <tr>.
fullDataList = []

for soup in soupList:
    cellData = soup.find(class_ = 'overthrow table_container').find_all('tr')
    # Skip the first <tr>: it holds the column headers, read separately.
    for row in cellData[1:]:
        rowList = []
        for cell in row.find_all(['td', 'th']):
            if not cell.contents:
                value = ''
            else:
                first_child = cell.contents[0]
                if first_child.name == 'a':
                    # Linked cell: use the link text, except for the
                    # 'Box Score' link, where the absolute URL is kept instead.
                    link_text = first_child.contents[0] if first_child.contents else ''
                    if link_text == 'Box Score':
                        value = 'https://www.basketball-reference.com'+ first_child['href']
                    else:
                        value = link_text
                else:
                    value = first_child
            # Store a plain str. A NavigableString keeps a reference to the
            # whole parse tree, which would otherwise stay alive inside the
            # DataFrame built from these rows.
            rowList.append(str(value))
        fullDataList.append(rowList)

# Drop rows that produced no cells at all.
cleanCellData = [row_values for row_values in fullDataList if row_values]

# Preview only the first few rows instead of dumping the whole list.
cleanCellData[:5]

[['Tue, Oct 27, 2015',
  '8:00p',
  'Detroit Pistons',
  '106',
  'Atlanta Hawks',
  '94',
  'https://www.basketball-reference.com/boxscores/201510270ATL.html',
  '',
  '19,187',
  ''],
 ['Tue, Oct 27, 2015',
  '8:00p',
  'Cleveland Cavaliers',
  '95',
  'Chicago Bulls',
  '97',
  'https://www.basketball-reference.com/boxscores/201510270CHI.html',
  '',
  '21,957',
  ''],
 ['Tue, Oct 27, 2015',
  '10:30p',
  'New Orleans Pelicans',
  '95',
  'Golden State Warriors',
  '111',
  'https://www.basketball-reference.com/boxscores/201510270GSW.html',
  '',
  '19,596',
  ''],
 ['Wed, Oct 28, 2015',
  '7:00p',
  'Washington Wizards',
  '88',
  'Orlando Magic',
  '87',
  'https://www.basketball-reference.com/boxscores/201510280ORL.html',
  '',
  '18,846',
  ''],
 ['Wed, Oct 28, 2015',
  '7:30p',
  'Indiana Pacers',
  '99',
  'Toronto Raptors',
  '106',
  'https://www.basketball-reference.com/boxscores/201510280TOR.html',
  '',
  '19,800',
  ''],
 ['Wed, Oct 28, 2015',
  '7:30p',
  'Charlotte Hor

In [6]:
# Combine the header names and the scraped rows into the initial DataFrame.

df = pd.DataFrame(data=cleanCellData, columns=column_headers)

In [7]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,date_game,game_start_time,visitor_team_name,visitor_pts,home_team_name,home_pts,box_score_text,overtimes,attendance,game_remarks
0,"Tue, Oct 27, 2015",8:00p,Detroit Pistons,106,Atlanta Hawks,94,https://www.basketball-reference.com/boxscores...,,19187,
1,"Tue, Oct 27, 2015",8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,https://www.basketball-reference.com/boxscores...,,21957,
2,"Tue, Oct 27, 2015",10:30p,New Orleans Pelicans,95,Golden State Warriors,111,https://www.basketball-reference.com/boxscores...,,19596,
3,"Wed, Oct 28, 2015",7:00p,Washington Wizards,88,Orlando Magic,87,https://www.basketball-reference.com/boxscores...,,18846,
4,"Wed, Oct 28, 2015",7:30p,Indiana Pacers,99,Toronto Raptors,106,https://www.basketball-reference.com/boxscores...,,19800,


In [8]:
# Visitor perspective: the first seven columns in their original order.
visitor_df = df.iloc[:, [0, 1, 2, 3, 4, 5, 6]]

# Home perspective: the same seven columns, with positions 4-5 placed ahead
# of positions 2-3 so the home pair comes first.
home_df = df.iloc[:, [0, 1, 4, 5, 2, 3, 6]]


In [9]:
# Relabel the home-perspective frame with the shared team/opponent names.
home_df.columns = [
    'date_game',
    'game_start_time',
    'team_name',
    'team_pts',
    'opposing_team',
    'opposing_pts',
    'box_score',
]


In [10]:
# Relabel the visitor-perspective frame with the shared team/opponent names.
visitor_df.columns = [
    'date_game',
    'game_start_time',
    'team_name',
    'team_pts',
    'opposing_team',
    'opposing_pts',
    'box_score',
]


In [11]:
# Stack the visitor rows on top of the home rows and renumber the index,
# giving two rows per game (one per team perspective).
frames = [visitor_df, home_df]
agg_data = pd.concat(objs=frames, axis=0, ignore_index=True)

In [12]:
def date_change(row):
    """Reformat row['date_game'] (e.g. 'Tue, Oct 27, 2015') as 'MM/DD/YYYY'.

    `row` is anything indexable by 'date_game' that yields a string in
    '<weekday>, <month> <day>, <year>' form with abbreviated names.
    """
    without_commas = row['date_game'].replace(',', '')
    parsed = datetime.datetime.strptime(without_commas, "%a %b %d %Y")
    return parsed.strftime('%m/%d/%Y')

# Parse the scraped 'Tue, Oct 27, 2015'-style strings in a single vectorized
# call with an explicit format. This replaces a row-wise apply that rendered
# each date as 'MM/DD/YYYY' text and then had pandas re-parse that text by
# format inference.
agg_data['Date'] = pd.to_datetime(agg_data['date_game'], format='%a, %b %d, %Y')

In [13]:
# Replace the text dates with the parsed ones and remove the temporary
# 'Date' column in one step (pop returns the column and deletes it).
agg_data['date_game'] = agg_data.pop('Date')

In [14]:
# Preview the combined frame after date conversion.
agg_data.head(n=5)

Unnamed: 0,date_game,game_start_time,team_name,team_pts,opposing_team,opposing_pts,box_score
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,https://www.basketball-reference.com/boxscores...
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,https://www.basketball-reference.com/boxscores...
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,https://www.basketball-reference.com/boxscores...
3,2015-10-28,7:00p,Washington Wizards,88,Orlando Magic,87,https://www.basketball-reference.com/boxscores...
4,2015-10-28,7:30p,Indiana Pacers,99,Toronto Raptors,106,https://www.basketball-reference.com/boxscores...


In [15]:
# Order rows by team, then by date. reset_index() renumbers the rows from 0
# and keeps the pre-sort labels in an 'index' column.
ordered_agg_data = (
    agg_data
    .sort_values(by=['team_name', 'date_game'])
    .reset_index(drop=False)
)

In [16]:
# Same team/date ordering as ordered_agg_data. The scores were scraped as
# text, so convert them to numbers explicitly; a blank score becomes NaN
# rather than relying on implicit coercion inside the rolling aggregation.
rolling_stat_data = (
    agg_data
    .sort_values(by = ['team_name','date_game'])
    .assign(opposing_pts=lambda frame: pd.to_numeric(frame['opposing_pts'], errors='coerce'))
)

# Per team: mean of the opponent's points over up to the 30 most recent games
# (current game included). min_periods=1 yields a value from the first game on.
# reset_index() turns the (team_name, original row label) index into columns.
testdata_grouped_rolling = (
    rolling_stat_data
    .groupby('team_name')['opposing_pts']
    .rolling(window=30, min_periods=1)
    .mean()
    .reset_index()
)

In [17]:
# Remove the two index-derived columns, leaving only the rolling mean.
testdata_grouped_rolling = testdata_grouped_rolling.drop(columns=['level_1', 'team_name'])

In [18]:
# Label the frame's columns with the single name 'allowed_pts'
# (the assignment requires the frame to have exactly one column here).
testdata_grouped_rolling.columns = ['allowed_pts']

In [19]:
# Attach the rolling means side by side with the sorted game rows.
# Rows are matched on index labels, so both frames must be numbered alike.
full_stats_data = pd.concat(
    objs=[ordered_agg_data, testdata_grouped_rolling],
    axis='columns',
)

In [20]:
# Shift each team's rolling mean down by one game so a row only reflects games
# played before it; a team's first game therefore gets NaN.
# GroupBy.shift keeps the frame's own index. The equivalent
# apply(lambda grp: grp.shift(1)) returns a result carrying the group key as an
# extra index level on pandas 2.x, which cannot be assigned back as a column.
full_stats_data['avg_allowed_pts'] = full_stats_data.groupby('team_name')['allowed_pts'].shift(1)

In [21]:
# Keep only the columns needed for the team-versus-opponent join.
full_stats_data = full_stats_data[[
    'date_game',
    'team_name',
    'opposing_team',
    'box_score',
    'avg_allowed_pts',
]]

In [22]:
# Second name for the same DataFrame object (an alias, not a copy); it serves
# as the left side of the self-merge below.
full_stats_data_2 = full_stats_data

In [23]:
# Preview the per-team rows with the prior-games average attached.
full_stats_data.head(n=5)

Unnamed: 0,date_game,team_name,opposing_team,box_score,avg_allowed_pts
0,2015-10-27,Atlanta Hawks,Detroit Pistons,https://www.basketball-reference.com/boxscores...,
1,2015-10-29,Atlanta Hawks,New York Knicks,https://www.basketball-reference.com/boxscores...,106.0
2,2015-10-30,Atlanta Hawks,Charlotte Hornets,https://www.basketball-reference.com/boxscores...,103.5
3,2015-11-01,Atlanta Hawks,Charlotte Hornets,https://www.basketball-reference.com/boxscores...,100.333333
4,2015-11-03,Atlanta Hawks,Miami Heat,https://www.basketball-reference.com/boxscores...,98.25


In [24]:
# Self-join: pair each (date, team) row with the row for the same date whose
# opposing_team is that team, i.e. the opponent's view of the same game.
# Left-side columns get the '_x' suffix and right-side columns '_y'.
newDf = pd.merge(
    left=full_stats_data_2,
    right=full_stats_data,
    how='inner',
    left_on=['date_game', 'team_name'],
    right_on=['date_game', 'opposing_team'],
    suffixes=('_x', '_y'),
)

In [25]:
# Keep only the first row seen for each box-score URL.
dedupe_new_df = newDf.drop_duplicates(
    subset=['box_score_x'],
    keep='first',
)

In [26]:
# Select the date, both team names and both sides' prior-games averages.
allowed_df = dedupe_new_df[[
    'date_game',
    'team_name_x',
    'opposing_team_x',
    'avg_allowed_pts_x',
    'avg_allowed_pts_y',
]]

In [27]:
# Replace the merge-suffixed labels with descriptive names.
allowed_df.columns = [
    'date_game',
    'team_name',
    'opposing_team',
    'team_avg_allowed',
    'opp_avg_allowed',
]

In [28]:
# Preview the final table before export.
allowed_df.head(n=5)

Unnamed: 0,date_game,team_name,opposing_team,team_avg_allowed,opp_avg_allowed
0,2015-10-27,Atlanta Hawks,Detroit Pistons,,
1,2015-10-29,Atlanta Hawks,New York Knicks,106.0,97.0
2,2015-10-30,Atlanta Hawks,Charlotte Hornets,103.5,104.0
3,2015-11-01,Atlanta Hawks,Charlotte Hornets,100.333333,100.5
4,2015-11-03,Atlanta Hawks,Miami Heat,98.25,95.0


In [29]:
# Write the final table to disk; the path must end in '.csv'.
# NOTE(review): this is an absolute, machine-specific path; consider moving
# it to a configurable location.
csv_path = r'C:\Users\jeromerufin\Desktop\Metis\allowed_15_16.csv'
# to_csv returns None when given a path, so export_csv holds None.
export_csv = allowed_df.to_csv(csv_path)