### Data Scraping
For each season, scrape box scores from Basketball-Reference and DraftKings fantasy salary information from RotoGuru. Existing files are detected so that scraping resumes from the last saved date, and missing season directories are created automatically.

In [1]:
import os
import re
import time
import glob

import numpy as np
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

from constants import DATA_DIR, SEASON_DATES, SECONDS_SLEEP

In [6]:
class DataScraper():
    """Scrape per-day NBA box scores and DraftKings salaries into CSV files.

    Both public methods take a season label (used as the output sub-directory
    name under DATA_DIR) and a list of date strings in 'YYYYMMDD' form, and
    write their results to disk; neither returns a value.
    """

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and parse it with lxml, closing the HTTP response afterwards."""
        # The response is read by BeautifulSoup inside the `with`, so the
        # underlying connection is released as soon as parsing is done.
        with urlopen(url) as response:
            return BeautifulSoup(response, 'lxml')

    @staticmethod
    def _minutes_to_decimal(minutes):
        """Convert an 'MM:SS' string to decimal minutes rounded to 2 places.

        Any value without a ':' (e.g. an empty cell) maps to 0.0.
        """
        if ':' not in minutes:
            return 0.00
        parts = minutes.split(':')
        return round(int(parts[0]) + int(parts[1]) / 60, 2)

    # Scraping Historical Game Data from Basketball-Reference.com
    def get_boxscores(self, season, date_list):
        """Scrape every box score played on each date and save one CSV per team.

        Args:
            season: Season label; files are written to
                DATA_DIR/Boxscores/<season>/.
            date_list: Iterable of 'YYYYMMDD' date strings to scrape.

        Side effects:
            Writes '<date>-<team>.csv' for each team of each game found and
            sleeps SECONDS_SLEEP seconds after each game page is processed.
        """
        url_parent = "https://www.basketball-reference.com"
        url_boxscore = "https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}"

        # Loop-invariant lookup values, built once instead of per game/team.
        game_columns = ['Name','Date','Team','Home','W','W_PTS','L','L_PTS']
        teams = ['winner','loser']
        basic_stat_template = 'box_{team}_basic'
        advanced_stat_template = 'box_{team}_advanced'
        injury_keywords = ['Did Not Dress', 'Not With Team']

        print("Scraping boxscores from the {} regular season".format(season))

        for date in tqdm(date_list):
            # BeautifulSoup object for a list of boxscores on a given day
            url_summaries = url_boxscore.format(month=date[4:6],day=date[6:8],year=date[0:4])
            soup_summaries = self._fetch_soup(url_summaries)
            games = soup_summaries.find_all('div',class_='game_summary expanded nohover')

            for game in games:
                summary = {}

                # Characters 7-9 of a team link are used as the team code
                # (presumably hrefs look like '/teams/ABC/...' -- confirm).
                host = game.find_all('table')[1].find_all('a')[1]['href'][7:10]

                winner = game.find('tr',class_='winner').find_all('td')
                loser = game.find('tr',class_='loser').find_all('td')

                # Each entry is [team code, points scored].
                summary['winner'] = [winner[0].find('a')['href'][7:10],int(winner[1].get_text())]
                summary['loser'] = [loser[0].find('a')['href'][7:10],int(loser[1].get_text())]

                url_game = url_parent+game.find('a',text='Box Score')['href']
                soup_game = self._fetch_soup(url_game)

                # The 3rd and 4th tables on the page supply the basic and
                # advanced stat column names.
                tables = soup_game.find_all('table',limit=4)[2:]

                columns_basic = [th.get_text() for th in tables[0].find('thead').find_all('tr')[1].find_all('th')][1:]
                columns_advanced = [th.get_text() for th in tables[1].find('thead').find_all('tr')[1].find_all('th')][2:]

                column_headers = game_columns + columns_basic + columns_advanced

                for team in teams:
                    team_code = summary[team][0]
                    home = 1 if team_code == host else 0

                    basic_stat = basic_stat_template.format(team=team_code.lower())
                    advanced_stat = advanced_stat_template.format(team=team_code.lower())

                    game_data = [date, team_code, home,summary['winner'][0],
                                 summary['winner'][1], summary['loser'][0],summary['loser'][1]]

                    # Rows without a class attribute are the player rows.
                    data_basic = soup_game.find('table',id=basic_stat).find('tbody').find_all('tr',class_=None)
                    data_advanced = soup_game.find('table',id=advanced_stat).find('tbody').find_all('tr',class_=None)

                    player_data = []

                    for i, basic_row in enumerate(data_basic):
                        # Skip players who did not dress / were not with the team.
                        if basic_row.find('td').get_text() in injury_keywords:
                            continue

                        # The advanced table is indexed in parallel with the
                        # basic one; its first cell is dropped (it repeats a
                        # column already taken from the basic table).
                        data = [basic_row.find('a').get_text()] + game_data + \
                               [td.get_text() for td in basic_row.find_all('td')] + \
                               [td.get_text() for td in data_advanced[i].find_all('td')[1:]]

                        player_data.append(data)

                    df = pd.DataFrame(player_data,columns=column_headers)
                    df.columns = df.columns.str.replace('%','_perc').str.replace('/','')
                    df = df.fillna(0)
                    df.loc[:,'FG':'+-'] = df.loc[:,'FG':'+-'].apply(pd.to_numeric)
                    df['MP'] = [self._minutes_to_decimal(t) for t in df['MP']]
                    df.to_csv(os.path.join(*[DATA_DIR, 'Boxscores', season, date+'-'+team_code+'.csv']), index=False)

                # Throttle requests: pause once per game page.
                time.sleep(SECONDS_SLEEP)
        return None

    # Scraping DraftKings salary data from RotoGuru.com
    def get_fantasy_salary(self, season, date_list):
        """Scrape DraftKings salaries for each date and save one CSV per date.

        Args:
            season: Season label; files are written to
                DATA_DIR/DKSalary/<season>/.
            date_list: Iterable of 'YYYYMMDD' date strings to scrape.

        Side effects:
            Prints each date, writes 'salary_<date>.csv' (header-only when no
            player rows are found for that date) and sleeps SECONDS_SLEEP
            seconds after each date.
        """
        url_roto = "http://rotoguru1.com/cgi-bin/hyday.pl?mon={month}&day={day}&year={year}&game=dk"  
        print("Scraping salary information from the {} regular season".format(season))

        for date in tqdm(date_list):
            print(date)
            teams, positions, players, starters, salaries = [], [], [], [], []

            url_date = url_roto.format(month=date[4:6],day=date[6:8],year=date[0:4])
            soup = self._fetch_soup(url_date)

            # Check if there were any games on a given date: a page without
            # the salary table is treated as having no rows instead of
            # raising AttributeError on None.
            body = soup.find('body')
            soup_table = None
            if body is not None:
                soup_table = body.find('table', border="0", cellspacing="5")
            soup_rows = soup_table.find_all('tr') if soup_table is not None else []

            for row in soup_rows:
                cells = row.find_all('td')
                link = row.find('a')

                # Skip rows whose first cell spans columns, and rows that do
                # not carry a full player record (position, name, starter
                # flag, ..., salary, team).
                if len(cells) < 5 or cells[0].has_attr('colspan') or link is None:
                    continue

                raw_name = link.get_text()
                if raw_name == '':
                    continue

                position = cells[0].get_text()

                # Names are listed as "Last, First"; flip them to "First Last".
                # A name without the separator is kept unchanged.
                player_tmp = raw_name.split(", ")
                if len(player_tmp) > 1:
                    player = player_tmp[1] + ' ' + player_tmp[0]
                else:
                    player = raw_name

                # A '^' in the name cell marks a starter.
                starter = 1 if '^' in cells[1].get_text() else 0

                # Strip the currency formatting, e.g. "$5,400" -> "5400".
                salary = re.sub('[$,]', '', cells[3].get_text())

                team = cells[4].get_text()

                positions.append(position)
                players.append(player)
                starters.append(starter)
                salaries.append(salary)
                teams.append(team)

            df = pd.DataFrame({'Date': [date] * len(players),
                               'Team': [team.upper() for team in teams],
                               'Starter': starters,
                               'Pos': positions,
                               'Name': players,
                               'Salary': salaries})

            df = df.loc[:,['Date','Team','Pos','Name','Starter','Salary']]
            df.to_csv(os.path.join(DATA_DIR, 'DKSalary', season, 'salary_'+date+'.csv'), index=False)

            time.sleep(SECONDS_SLEEP)
        return None

In [7]:
scraper = DataScraper()

# Comment out season dates in SEASON_DATES in constants.py to extract data for specific seasons
for data_type in ['Boxscores', 'DKSalary']:
    for season in SEASON_DATES.keys():
        season_dir = os.path.join(DATA_DIR, data_type, season)

        # Every calendar day between the season's start and end dates (inclusive).
        start_date = SEASON_DATES[season][0]
        end_date = SEASON_DATES[season][1]
        date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date)]

        if not os.path.exists(season_dir):
            # New season: create the directory (including any missing parent
            # directories) and scrape the entire season.
            os.makedirs(season_dir, exist_ok=True)
        else:
            # Existing season: find which dates already have output files.
            if data_type == 'Boxscores':
                # Box score files are named {date}-{team}.csv (i.e. 20131029-CHI.csv)
                saved = [len(glob.glob(os.path.join(season_dir, date + "*.csv"))) > 0
                         for date in date_list]
            else:
                # Salary files are named salary_{date}.csv (i.e. salary_20131029.csv)
                saved = [os.path.exists(os.path.join(season_dir, "salary_{}.csv".format(date)))
                         for date in date_list]

            # Resume from the last date that has a file. That date is scraped
            # again on purpose, since its files may be incomplete; when
            # nothing was saved yet the whole season is scraped.
            last_saved = max((i for i, done in enumerate(saved) if done), default=0)
            date_list = date_list[last_saved:]

        if data_type == 'Boxscores':
            scraper.get_boxscores(season, date_list)
        else:
            scraper.get_fantasy_salary(season, date_list)

Scraping salary information from the 2014-15 regular season


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

20141031
20141101
20141102
20141103
20141104
20141105
20141106
20141107
20141108
20141109
20141110
20141111
20141112
20141113
20141114
20141115
20141116
20141117
20141118
20141119
20141120
20141121
20141122
20141123
20141124
20141125
20141126
20141127
20141128
20141129
20141130
20141201
20141202
20141203
20141204
20141205
20141206
20141207
20141208
20141209
20141210
20141211
20141212
20141213
20141214
20141215
20141216
20141217
20141218
20141219
20141220
20141221
20141222
20141223
20141224
20141225
20141226
20141227
20141228
20141229
20141230
20141231
20150101
20150102
20150103
20150104
20150105
20150106
20150107
20150108
20150109
20150110
20150111
20150112
20150113
20150114
20150115
20150116
20150117
20150118
20150119
20150120
20150121
20150122
20150123
20150124
20150125
20150126
20150127
20150128
20150129
20150130
20150131
20150201
20150202
20150203
20150204
20150205
20150206
20150207
20150208
20150209
20150210
20150211
20150212
20150213
20150214
20150215
20150216
20150217
20150218
2

HBox(children=(IntProgress(value=0, max=170), HTML(value='')))

20151027
20151028
20151029
20151030
20151031
20151101
20151102
20151103
20151104
20151105
20151106
20151107
20151108
20151109
20151110
20151111
20151112
20151113
20151114
20151115
20151116
20151117
20151118
20151119
20151120
20151121
20151122
20151123
20151124
20151125
20151126
20151127
20151128
20151129
20151130
20151201
20151202
20151203
20151204
20151205
20151206
20151207
20151208
20151209
20151210
20151211
20151212
20151213
20151214
20151215
20151216
20151217
20151218
20151219
20151220
20151221
20151222
20151223
20151224
20151225
20151226
20151227
20151228
20151229
20151230
20151231
20160101
20160102
20160103
20160104
20160105
20160106
20160107
20160108
20160109
20160110
20160111
20160112
20160113
20160114
20160115
20160116
20160117
20160118
20160119
20160120
20160121
20160122
20160123
20160124
20160125
20160126
20160127
20160128
20160129
20160130
20160131
20160201
20160202
20160203
20160204
20160205
20160206
20160207
20160208
20160209
20160210
20160211
20160212
20160213
20160214
2

HBox(children=(IntProgress(value=0, max=170), HTML(value='')))

20161025
20161026
20161027
20161028
20161029
20161030
20161031
20161101
20161102
20161103
20161104
20161105
20161106
20161107
20161108
20161109
20161110
20161111
20161112
20161113
20161114
20161115
20161116
20161117
20161118
20161119
20161120
20161121
20161122
20161123
20161124
20161125
20161126
20161127
20161128
20161129
20161130
20161201
20161202
20161203
20161204
20161205
20161206
20161207
20161208
20161209
20161210
20161211
20161212
20161213
20161214
20161215
20161216
20161217
20161218
20161219
20161220
20161221
20161222
20161223
20161224
20161225
20161226
20161227
20161228
20161229
20161230
20161231
20170101
20170102
20170103
20170104
20170105
20170106
20170107
20170108
20170109
20170110
20170111
20170112
20170113
20170114
20170115
20170116
20170117
20170118
20170119
20170120
20170121
20170122
20170123
20170124
20170125
20170126
20170127
20170128
20170129
20170130
20170131
20170201
20170202
20170203
20170204
20170205
20170206
20170207
20170208
20170209
20170210
20170211
20170212
2

HBox(children=(IntProgress(value=0, max=177), HTML(value='')))

20171017
20171018
20171019
20171020
20171021
20171022
20171023
20171024
20171025
20171026
20171027
20171028
20171029
20171030
20171031
20171101
20171102
20171103
20171104
20171105
20171106
20171107
20171108
20171109
20171110
20171111
20171112
20171113
20171114
20171115
20171116
20171117
20171118
20171119
20171120
20171121
20171122
20171123
20171124
20171125
20171126
20171127
20171128
20171129
20171130
20171201
20171202
20171203
20171204
20171205
20171206
20171207
20171208
20171209
20171210
20171211
20171212
20171213
20171214
20171215
20171216
20171217
20171218
20171219
20171220
20171221
20171222
20171223
20171224
20171225
20171226
20171227
20171228
20171229
20171230
20171231
20180101
20180102
20180103
20180104
20180105
20180106
20180107
20180108
20180109
20180110
20180111
20180112
20180113
20180114
20180115
20180116
20180117
20180118
20180119
20180120
20180121
20180122
20180123
20180124
20180125
20180126
20180127
20180128
20180129
20180130
20180131
20180201
20180202
20180203
20180204
2

HBox(children=(IntProgress(value=0, max=177), HTML(value='')))

20181016
20181017
20181018
20181019
20181020
20181021
20181022
20181023
20181024
20181025
20181026
20181027
20181028
20181029
20181030
20181031
20181101
20181102
20181103
20181104
20181105
20181106
20181107
20181108
20181109
20181110
20181111
20181112
20181113
20181114
20181115
20181116
20181117
20181118
20181119
20181120
20181121
20181122
20181123
20181124
20181125
20181126
20181127
20181128
20181129
20181130
20181201
20181202
20181203
20181204
20181205
20181206
20181207
20181208
20181209
20181210
20181211
20181212
20181213
20181214
20181215
20181216
20181217
20181218
20181219
20181220
20181221
20181222
20181223
20181224
20181225
20181226
20181227
20181228
20181229
20181230
20181231
20190101
20190102
20190103
20190104
20190105
20190106
20190107
20190108
20190109
20190110
20190111
20190112
20190113
20190114
20190115
20190116
20190117
20190118
20190119
20190120
20190121
20190122
20190123
20190124
20190125
20190126
20190127
20190128
20190129
20190130
20190131
20190201
20190202
20190203
2