# Data Acquisition: Scraping DraftKings and NBA Data

## 1) Import relevant packages

In [30]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime as dt
import pickle
import json
import time



## 2) Scrape DraftKings data from Rotoguru

Rotoguru is a website that contains DraftKings and FanDuel daily NBA fantasy data for each day of the NBA season. 
The data in each table contains the player names, the team they play for, the opposing team, and the daily fantasy score
and salary of each player on a given night, on a datetime index. The data only goes back to the 2014-15 season, and the stats column is all text with missing information, so this dataset alone will not be enough for the project. 

### Define constants and functions

In [31]:
#URL template for a Rotoguru daily page; MONTH, DAY and YEAR are placeholders filled in per request
base_url = 'http://rotoguru1.com/cgi-bin/hyday.pl?game=dk&mon=MONTH&day=DAY&year=YEAR'

#every possible day of the month, as strings, for substitution into the URL
days = list(map(str, range(1, 32)))

#July through September are skipped because no NBA games are played then
months = [str(m) for m in range(1, 13) if m not in (7, 8, 9)]

#Rotoguru coverage starts with the 2014-15 season
years = list(map(str, range(2014, 2019)))

#Loop through each day, month, and year with function to pull the relevant table from the base_url
def rotoScrape(year,month,day):
    """Fetch the Rotoguru DraftKings table for one calendar date.

    Parameters
    ----------
    year, month, day : str
        Date components as strings; they replace the YEAR, MONTH and DAY
        placeholders in the module-level ``base_url``.

    Returns
    -------
    pandas.DataFrame or str
        The first DataFrame parsed from the 10th <table> element of the page,
        or the string 'error' when the page has fewer than ten tables or the
        table cannot be parsed (e.g. days with no games or no data).
    """
    #sanity check to see where your loop is, not required
    print(year + '-' + month + '-' + day)

    #build the url for this date, request it, then parse the page with Beautiful Soup
    url = base_url.replace('DAY',day).replace('MONTH',month).replace('YEAR',year)
    r = requests.get(url)
    soup = BeautifulSoup(r.content,'lxml')

    #some days don't have any games, or no data, mark those pages with 'error' string
    try:
        #the 10th table on each page is where the relevant information lies;
        #the lookup is inside the try so a page with fewer tables is flagged instead of raising
        table = soup.find_all('table')[9]
        #read_html returns a list of DataFrames, we only want the first
        return pd.read_html(str(table))[0]
    except (IndexError, ValueError):
        return 'error'



### Scrape the data
#### NOTE: the function takes about 2-3 seconds per call; looping through every year, month, and day will take 10-15 minutes

In [32]:
#create a dictionary with the dates ('YYYY-M-D' strings) as keys and the scraped data as values,
#remove slicing to loop through all data
DKdata = {(year+'-'+month+'-'+day):rotoScrape(year,month,day) 
                for year in years[1:2] for month in months[:1] for day in days[:3]}

#write the dictionary of dataframes into a pickle file;
#the context manager guarantees the file is flushed and closed once the dump finishes
with open('dk_partial.p','wb') as f:
    pickle.dump(DKdata,f)


2015-1-1
2015-1-2
2015-1-3


## 3) Scrape NBA  Boxscore Data

The nba_py package on GitHub is a great resource for scraping the NBA.com API. In order to have a large enough dataset, I scraped every player's game logs for every regular season game in every year since the 2007-08 season. This dataset contains the player name, NBA player ID, the team they're on and the opponent, as well as every box score stat on a datetime index.

### Import relevant nba_py functions

In [33]:
#For NBA data use the nba_py package
import nba_py
from nba_py.player import PlayerList, PlayerGameLogs
from nba_py.game import Boxscore


### Obtain a list of players for each season

In [34]:
#nba_py's PlayerList expects seasons written in the YYYY-yy format, e.g. '2007-08'
seasons = ['{}-{:02d}'.format(start, (start + 1) % 100) for start in range(2007, 2018)]

#only_current=0 requests every player in league history;
#only_current=1 would return only the players of the current season
players = PlayerList(league_id='00', only_current=0).info()

#make the career start/end year columns numeric so they can be compared against a season's start year
players = players.assign(
    TO_YEAR=pd.to_numeric(players.TO_YEAR),
    FROM_YEAR=pd.to_numeric(players.FROM_YEAR),
)

#for each season keep the players whose FROM_YEAR..TO_YEAR range covers the season's first year,
#sorted by display name, and store them in a dictionary keyed by season
seasonPlyr = {
    s: players.loc[
        players.FROM_YEAR.le(int(s[:4])) & players.TO_YEAR.ge(int(s[:4]))
    ].sort_values('DISPLAY_FIRST_LAST')
    for s in seasons
}

### Define the scraping function

In [37]:
#define function to scrape gamelogs of a certain player in a given season
def getlogs(player, season):
    """Scrape the game logs of one player for one season.

    Parameters
    ----------
    player : str
        Player name, matched against the DISPLAY_FIRST_LAST column of the
        module-level ``players`` DataFrame to look up the PERSON_ID.
    season : str
        Season string passed through to nba_py's PlayerGameLogs (the file
        uses the YYYY-yy format, e.g. '2007-08').

    Returns
    -------
    The result of ``PlayerGameLogs(...).info()`` on success, or the list
    ``['ERROR']`` if constructing the PlayerGameLogs object raised.
    """
    try:
        #the PlayerGameLogs function takes playerID and season as inputs, players DataFrame as lookup for id's
        playerlogs = PlayerGameLogs(players.PERSON_ID.loc[players.DISPLAY_FIRST_LAST == player],season = season)
    except Exception:
        #if there is an error in pulling the data, flag it.
        #Exception (not a bare except) so Ctrl-C / KeyboardInterrupt can still stop a long scrape
        return ['ERROR']

    #another sanity check when the scraping actually begins, not required
    print(season, player)

    #the NBA API seems to freeze if it is called too many times in a short period of time
    #pause for 1 second after each call
    time.sleep(1)

    #The PlayerGameLogs function has a json endpoint, info() gives us a DataFrame for each
    #player containing boxscore data for each game they played that season
    return playerlogs.info()


### Scrape gamelog data per season

#### NOTE: Looping through every player in every season will take about 2.5 hours, about 1.5 seconds per player

In [38]:
#Create nested dictionary, Level1 keys = season, Level2 keys = player, values = whatever getlogs returns
#(the player's gamelog data, or ['ERROR'] when the pull failed)
#Remove slicing to loop through every player in past 10 years, CAUTION SEE NOTE
gamelogs = {s:{player:getlogs(player,s) for player in seasonPlyr[s].DISPLAY_FIRST_LAST[:3]} for s in seasons[:1]}

#write the scraped data to pickle file as soon as it is done;
#the context manager guarantees the file is flushed and closed once the dump finishes
with open('gamelogsraw_partial.p','wb') as f:
    pickle.dump(gamelogs,f)

2007-08 Aaron Brooks
2007-08 Aaron Gray
2007-08 Aaron Williams


## 4) Scrape NBA Roster Data

This will give us roster information such as the teamID of each player as well as their position, weight, height, and more.

In [41]:
#Import team module from nba_py
from nba_py import team

#define function to get roster for a team in a given season, 
def getRoster(teamId, season):
    """Return the roster that nba_py's TeamCommonRoster reports for ``teamId`` in ``season``."""
    #progress marker so a long loop can be followed
    print(season)
    #brief pause before each request; without it the NBA API tends to freeze
    time.sleep(0.5)
    roster_endpoint = team.TeamCommonRoster(teamId, season=season)
    return roster_endpoint.roster()

In [42]:
#Fetch the table of teams through nba_py's TeamList endpoint
teams = team.TeamList().info()
#show the first five rows as a quick look at the columns
teams.head(n=5)

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION
0,0,1610612737,1949,2017,ATL
1,0,1610612738,1946,2017,BOS
2,0,1610612739,1970,2017,CLE
3,0,1610612740,2002,2017,NOP
4,0,1610612741,1966,2017,CHI


In [46]:
#Keep only the teams whose MAX_YEAR is '2017', i.e. the ones that are currently active
teams = teams[teams['MAX_YEAR'].eq('2017')]
print('Number of Teams:', teams.shape[0])

Number of Teams: 30


In [47]:
#Nested dictionary: Level1 keys = season, Level2 keys = team id, values = roster returned by getRoster
#Remove slicing to loop through every team in past 10 years
#NOTE: this will take about 10 minutes if you remove slicing
#the loop variable is called team_id so it does not shadow the imported nba_py `team` module
rosters = {season:{team_id:getRoster(team_id,season) for team_id in teams.TEAM_ID[:2]} for season in seasons[:2]}

#the context manager guarantees the pickle file is flushed and closed once the dump finishes
with open('rostersraw.p','wb') as f:
    pickle.dump(rosters,f)

2007-08
2007-08
2008-09
2008-09


## Final Thoughts

Patience is definitely required when scraping this data. My method was to make sure the code worked for the first few iterations of each function and run the code on its own while I stepped away from the computer. The full raw data files are located in the data folder of the repo as "dkDataraw.p" and "gamelogsraw.json". To see how I went about cleaning the data, shift over to the Data Wrangling Notebook.

In [40]:
#spot check: first five gamelog rows scraped for one player in the 2007-08 season
gamelogs['2007-08']['Aaron Brooks'].head(n=5)

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22007,201166,20701226,"APR 16, 2008",HOU vs. LAC,W,17,3,8,0.375,...,1,1,1,2,0,0,2,8,-7,0
1,22007,201166,20701208,"APR 14, 2008",HOU @ UTA,L,14,2,5,0.4,...,0,0,3,0,0,1,4,8,-5,0
2,22007,201166,20701201,"APR 13, 2008",HOU @ DEN,L,17,2,8,0.25,...,0,0,3,0,0,1,3,7,-5,0
3,22007,201166,20701184,"APR 11, 2008",HOU vs. PHX,W,18,3,4,0.75,...,2,2,3,0,0,1,4,7,7,0
4,22007,201166,20701171,"APR 09, 2008",HOU vs. SEA,W,17,4,8,0.5,...,4,4,4,0,1,4,1,11,10,0
