### Data Scraping

In [1]:
import os
import re
import pickle
import time
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from urllib.request  import urlopen
from bs4             import BeautifulSoup

In [2]:
# Every scraper below writes its CSV files somewhere under <working dir>/data.
cwd = os.getcwd()
data_subfolder = 'data'
data_dir = os.path.join(cwd, data_subfolder)

### Scraping Historical Game Data

In [18]:
#2017-18
#Done: [20171017-20180411]

#2016-17
#Done: [20161025-20170412]

#2015-16
#Done: [20151025-20160413]

In [19]:
# Scrape window (both ends inclusive) as YYYYMMDD strings, plus the season
# label used as the output sub-folder name.
start_date = '20151025'
end_date = '20160413'
season = '2015-16'

# One YYYYMMDD string per calendar day in the window.
# The former `index=False` argument is gone: date_range defines no such
# parameter, and current pandas raises TypeError when it is passed.
date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date)]

In [21]:
url_parent = "https://www.basketball-reference.com"
url_boxscore = "https://www.basketball-reference.com/boxscores/?month={month}&day={day}&year={year}"

# Loop-invariant settings, built once instead of once per game.
# game_columns are the leading per-row columns describing the game itself;
# the stat headers scraped from the box-score tables are appended after them.
game_columns = ['Name','Date','Team','Home','W','W_PTS','L','L_PTS']
teams = ['winner','loser']
basic_stat_template = 'box_{team}_basic'
advanced_stat_template = 'box_{team}_advanced'
# Rows whose first cell holds one of these labels are left out of the output.
injury_keywords = ['Did Not Dress', 'Not With Team']


def _fetch_soup(url):
    """Download *url*, parse it with html5lib and close the connection."""
    with urlopen(url) as response:
        return BeautifulSoup(response, 'html5lib')


# For every date: read the daily box-score index, follow each game's
# "Box Score" link and write one CSV per team per game to
# data_dir/Games/<season>/<date>-<TEAM>.csv (the folder must already exist).
for date in tqdm(date_list):

    url_summaries = url_boxscore.format(month=date[4:6],day=date[6:8],year=date[0:4])
    soup_summaries = _fetch_soup(url_summaries)
    # Pause after the index request as well. Previously only game pages were
    # followed by a sleep, so dates without games were requested back-to-back.
    # NOTE(review): the recorded run of this cell ended with
    # "URLError: Connection refused"; unthrottled requests are a plausible
    # cause, but that is not confirmed.
    time.sleep(1)
    games = soup_summaries.find_all('div',class_='game_summary expanded nohover')

    print(date)

    for game in games:
        summary = {}

        # Team codes are taken as characters [7:10] of the team link's href
        # (presumably hrefs look like "/teams/XXX/..." -- TODO confirm).
        # The second link of the summary's second table is treated as the host.
        host = game.find_all('table')[1].find_all('a')[1]['href'][7:10]

        winner = game.find('tr',class_='winner').find_all('td')
        loser = game.find('tr',class_='loser').find_all('td')

        # summary[side] == [team code, points scored]
        summary['winner'] = [winner[0].find('a')['href'][7:10],int(winner[1].get_text())]
        summary['loser'] = [loser[0].find('a')['href'][7:10],int(loser[1].get_text())]

        url_game = url_parent+game.find('a',text='Box Score')['href']
        soup_game = _fetch_soup(url_game)

        # Only the 3rd and 4th tables of the page are used, and only for
        # their header rows (assumed: one basic and one advanced box score).
        tables = soup_game.find_all('table',limit=4)[2:]

        # Headers come from the second header row. The first basic header and
        # the first two advanced headers are dropped so the remaining headers
        # line up with the <td> cells collected per player below.
        columns_basic = [th.get_text() for th in tables[0].find('thead').find_all('tr')[1].find_all('th')][1:]
        columns_advanced = [th.get_text() for th in tables[1].find('thead').find_all('tr')[1].find_all('th')][2:]

        column_headers = game_columns + columns_basic + columns_advanced

        for team in teams:
            team_code = summary[team][0]
            # 1 when this team is the one parsed as host above, otherwise 0.
            home = int(team_code == host)

            basic_stat = basic_stat_template.format(team=team_code.lower())
            advanced_stat = advanced_stat_template.format(team=team_code.lower())

            game_data = [date, team_code, home, summary['winner'][0],
                         summary['winner'][1], summary['loser'][0], summary['loser'][1]]

            # class_=None keeps only rows without a class attribute.
            data_basic = soup_game.find('table',id=basic_stat).find('tbody').find_all('tr',class_=None)
            data_advanced = soup_game.find('table',id=advanced_stat).find('tbody').find_all('tr',class_=None)

            player_data = []
            for i, row_basic in enumerate(data_basic):
                if row_basic.find('td').get_text() in injury_keywords:
                    continue
                # Basic and advanced rows are paired by position. Indexing
                # (rather than zip) keeps a length mismatch loud instead of
                # silently dropping players.
                data = [row_basic.find('a').get_text()] + game_data + \
                       [td.get_text() for td in row_basic.find_all('td')] + \
                       [td.get_text() for td in data_advanced[i].find_all('td')[1:]]
                player_data.append(data)

            df = pd.DataFrame(player_data,columns=column_headers)
            # Make the headers identifier-friendly: '%' -> '_perc', '/' removed.
            df.columns = df.columns.str.replace('%','_perc').str.replace('/','')
            df = df.fillna(0)
            df.loc[:,'FG':'+-'] = df.loc[:,'FG':'+-'].apply(pd.to_numeric)
            # "MM:SS" -> minutes as a float rounded to 2 decimals; any value
            # without ':' becomes 0.00.
            df['MP'] = [0.00 if ':' not in t else round(int(t.split(':')[0])+int(t.split(':')[1])/60, 2) for t in df['MP']]
            df.to_csv(os.path.join(data_dir, 'Games', season, date+'-'+team_code+'.csv'), index=False)

        time.sleep(1)


20151025
20151026
20151027
20151028
20151029
20151030
20151031
20151101
20151102
20151103
20151104
20151105
20151106
20151107
20151108
20151109
20151110
20151111
20151112
20151113
20151114
20151115
20151116
20151117
20151118
20151119
20151120
20151121
20151122
20151123
20151124
20151125
20151126
20151127
20151128
20151129
20151130
20151201
20151202
20151203
20151204
20151205
20151206
20151207
20151208
20151209
20151210
20151211
20151212
20151213
20151214
20151215
20151216
20151217
20151218
20151219
20151220
20151221
20151222
20151223
20151224
20151225
20151226
20151227
20151228
20151229
20151230
20151231
20160101
20160102
20160103
20160104
20160105
20160106
20160107
20160108
20160109
20160110
20160111
20160112
20160113
20160114
20160115
20160116
20160117
20160118
20160119
20160120
20160121
20160122
20160123
20160124
20160125
20160126
20160127
20160128
20160129
20160130
20160131
20160201
20160202
20160203


URLError: <urlopen error [Errno 111] Connection refused>

### Scraping Historical Salary Data from RotoGuru

In [22]:
# Scrape window (both ends inclusive) as YYYYMMDD strings, plus the season
# label used as the output sub-folder name.
start_date = '20151025'
end_date = '20160413'
season = '2015-16'

# One YYYYMMDD string per calendar day in the window.
# The former `index=False` argument is gone: date_range defines no such
# parameter, and current pandas raises TypeError when it is passed.
date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date)]

In [23]:
# RotoGuru daily page template: {month}, {day} and {year} are filled in per
# date; the trailing game=dk query parameter is fixed.
url_roto = (
    "http://rotoguru1.com/cgi-bin/hyday.pl"
    "?mon={month}&day={day}&year={year}&game=dk"
)

In [25]:
# For every date: download the RotoGuru page, pull one record per player row
# and write data_dir/DKSalary/<season>/salary_<date>.csv (the folder must
# already exist). Dates whose page has no usable table body write nothing.
for date in tqdm(date_list):
    teams, positions, players, starters, salaries = [], [], [], [], []

    url_date = url_roto.format(month=date[4:6],day=date[6:8],year=date[0:4])
    # Close the HTTP response as soon as it has been parsed.
    with urlopen(url_date) as response:
        soup = BeautifulSoup(response,'html5lib')

    # Check if there were any games on a given date. A missing table is now
    # treated like a missing <tbody> instead of raising AttributeError.
    soup_table = soup.find('body').find('table', border="0", cellspacing="5")
    soup_body = soup_table.find('tbody') if soup_table is not None else None

    if soup_body is not None:
        for row in soup_body.find_all('tr'):
            cells = row.find_all('td')   # looked up once per row
            link = row.find('a')

            # Skip rows with no cells or whose first cell spans columns
            # (presumably section headings -- TODO confirm).
            if not cells or cells[0].has_attr('colspan'):
                continue
            # Skip rows that carry no player link or an empty one.
            if link is None or link.get_text() == '':
                continue

            position = cells[0].get_text()

            # The link text is "Last, First"; store it as "First Last".
            player_tmp = link.get_text().split(", ")
            player = player_tmp[1] + ' ' + player_tmp[0]

            # A '^' in the second cell flags the player as a starter.
            starter = 1 if '^' in cells[1].get_text() else 0

            # Drop '$' and ',' from the fourth cell; the value stays a string.
            salary = re.sub('[$,]', '', cells[3].get_text())

            team = cells[4].get_text()

            positions.append(position)
            players.append(player)
            starters.append(starter)
            salaries.append(salary)
            teams.append(team)

        df = pd.DataFrame({'Date': [date] * len(players),
                           'Team': [team.upper() for team in teams],
                           'Starter': starters,
                           'Pos': positions,
                           'Name': players,
                           'Salary': salaries},
                          columns=['Date','Team','Pos','Name','Starter','Salary'])

        df.to_csv(os.path.join(data_dir, 'DKSalary', season, 'salary_'+date+'.csv'), index=False)

    # Be polite to the server: one request per second.
    time.sleep(1)
    




Exception in thread Thread-8:
Traceback (most recent call last):
  File "/home/kengo/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/kengo/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/home/kengo/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






KeyboardInterrupt: 

### Scraping Contest Info from RotoGrinders

In [20]:
# Scrape window (both ends inclusive) for the contest listing.
start_date = '20171017'
end_date = '20180404'

# One YYYY-MM-DD string per calendar day in the window (dashed format, unlike
# the YYYYMMDD lists used by the other scrapers).
# The former `index=False` argument is gone: date_range defines no such
# parameter, and current pandas raises TypeError when it is passed.
date_list = [d.strftime('%Y-%m-%d') for d in pd.date_range(start_date, end_date)]

In [21]:
# RotoGrinders contest listing template; {date} is filled in per day and the
# site=draftkings query parameter is fixed.
url_rg = (
    'https://rotogrinders.com/contests/nba'
    '?site=draftkings&date={date}'
)

In [42]:
# Exploratory pass over the contest pages: nothing is stored yet, the cell only
# prints the <div> children of the first 'rgt-col' element of each page.
# NOTE(review): the recorded output shows just the header <div> per page; the
# contest rows may be filled in client-side -- TODO confirm before building on it.
for date in tqdm(date_list):

    url_date = url_rg.format(date=date)
    page = urlopen(url_date)
    soup = BeautifulSoup(page, 'html5lib')

    # First element with class 'rgt-col' only.
    columns = soup.find('div', class_='rgt-col')
    nested_divs = columns.find_all('div')
    print(nested_divs)

    # Selector experiments kept for reference (not executed):
    #body = soup.find('section', class_='bdy content contest cflex reset')
    #title = body.find('div', class_='hdr', id='contests-title').find('h2').get_text()
    #print(columns)
    #columns = body.find('div', class_='rgtable').find('div', class_='rg-colwrap').find_all('div', class_='rgt-col')[0]
    #styles = [style for style in columns[0].find_all('div', class_=None)]
    #fees = [fee.replace('$','') for fee in columns[3].find_all('div', class_=None)]
    #print(styles)
    #print(fees)
            
            
                
                
        
    
    

[<div class="rgt-hdr">Name<span class="icn-arw-down"></span></div>]
[<div class="rgt-hdr">Name<span class="icn-arw-down"></span></div>]


KeyboardInterrupt: 

### Scraping Injury Report from Pro Sports Transactions

In [None]:
def standardize_teams(df):
    """Replace team nicknames in ``df['Team']`` with three-letter codes, in place.

    Values found in the nickname table below are swapped for their code;
    every other value is left exactly as it is.

    Args:
        df: DataFrame with a ``'Team'`` column. It is modified in place.

    Returns:
        None.

    Raises:
        KeyError: if ``df`` has no ``'Team'`` column.
    """
    teams_conversion = {'Mavericks':'DAL',
                        'Hawks':'ATL',
                        'Celtics':'BOS',
                        'Raptors':'TOR',
                        'Bulls':'CHI',
                        'Magic':'ORL',
                        'Pelicans':'NOP',
                        'Suns':'PHO',
                        'Jazz':'UTA',
                        'Hornets':'CHA',
                        'Grizzlies':'MEM',
                        'Cavaliers':'CLE',
                        'Rockets':'HOU',
                        'Clippers':'LAC',
                        'Nuggets':'DEN',
                        'Knicks':'NYK',
                        'Heat':'MIA',
                        'Nets':'BRK',
                        'Kings':'SAC',
                        'Pacers':'IND',
                        'Blazers':'POR',
                        'Timberwolves':'MIN',
                        'Thunder':'OKC',
                        'Pistons':'DET',
                        'Bucks':'MIL',
                        'Wizards':'WAS',
                        'Warriors':'GSW',
                        'Lakers':'LAL',
                        'Spurs':'SAS',
                        '76ers':'PHI'
                       }
    # One pass over the column. The previous version rebuilt a full-column
    # mask for every row while iterating the very Series it was rewriting.
    # Falling back to the original value keeps unknown teams untouched.
    df['Team'] = df['Team'].map(lambda team: teams_conversion.get(team, team))
            

In [None]:
# Search window for the injury report, as YYYY-MM-DD strings; both values are
# substituted into the BeginDate/EndDate query parameters.
start, end = '2017-10-17', '2018-04-04'

In [None]:
# Pro Sports Transactions search template: {start} and {end} fill
# BeginDate/EndDate; the remaining query parameters are fixed.
url_injury_parent = (
    'https://www.prosportstransactions.com/basketball/Search/SearchResults.php'
    '?Player=&Team=&BeginDate={start}&EndDate={end}'
    '&ILChkBx=yes&Submit=Search&start=0'
)

In [None]:
# Load the first page of search results and collect the URLs of all pages.
url_injury = url_injury_parent.format(start=start, end=end)
soup = BeautifulSoup(urlopen(url_injury),'html5lib')

# The links to the other result pages are read from the <p> inside the third
# <td> of the centre-aligned table; their hrefs are relative to the Search
# folder.
pager_cell = soup.find('table', align='center').find_all('td')[2]
page_links = pager_cell.find('p').find_all('a')
search_root = 'http://www.prosportstransactions.com/basketball/Search/'

# First page (already known) followed by every linked page.
urls = [url_injury]
urls.extend(search_root + link['href'] for link in page_links)

In [None]:
# Column accumulators, one entry per transaction row.
# The player lists are deliberately NOT called `In` / `Out`: in IPython and
# Jupyter those names are the session's input/output history containers, and
# rebinding them clobbers that history for the rest of the session.
Date, Team, players_in, players_out, Note = [], [], [], [], []

for url in tqdm(urls):
    # Close the HTTP response as soon as it has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response,'html5lib')
    soup_table = soup.find('body').find('table')
    # Only rows with align='left' are read as data rows.
    soup_rows = soup_table.find_all('tr', align='left')

    for row in soup_rows:
        td = row.find_all('td')
        Date.append(td[0].get_text().replace('-',''))      # YYYY-MM-DD -> YYYYMMDD
        Team.append(td[1].get_text().replace(' ', ''))
        # Strip the bullet prefix and the '(a)' marker from the player cells.
        players_in.append(td[2].get_text().replace(' • ','').replace('(a)',''))
        players_out.append(td[3].get_text().replace(' • ','').replace('(a)',''))

        note_text = td[4].get_text()
        if 'placed on IL' in note_text:
            Note.append('placed')
        elif 'activated from IL' in note_text:
            Note.append('activated')
        else:
            Note.append(np.nan)

    # Be polite to the server: one request per second.
    time.sleep(1)

df = pd.DataFrame({'Date':Date, 'Team':Team, 'In':players_in, 'Out':players_out, 'Note':Note})

# The two player cells are concatenated into one name and the surrounding
# whitespace is trimmed. Trimming both ends at once replaces the old cleanup,
# which removed a single leading OR a single trailing space and raised
# IndexError when the combined name was empty.
df['Name'] = [(name_in + name_out).strip() for name_in, name_out in zip(players_in, players_out)]
        

In [None]:
# Keep the four report columns, convert team nicknames to codes and write the
# result to data_dir/Injury/<season>.csv.
# NOTE(review): `season` is not assigned in this section. The last visible
# assignment is '2015-16', while the injury search above covers 2017-10-17 to
# 2018-04-04 -- confirm the intended file name before running.
injury_columns = ['Date','Team','Name','Note']
df = df.loc[:, injury_columns]
standardize_teams(df)
injury_path = os.path.join(data_dir, 'Injury', season + '.csv')
df.to_csv(injury_path, index=False)