# Data Acquisition: Scraping DraftKings and NBA Data

## 1) Import relevant packages

In [30]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime as dt
import pickle
import json
import time



## 2) Scrape DraftKings data from Rotoguru

Rotoguru is a website that contains DraftKings and FanDuel daily NBA fantasy data for each day of the NBA season. 
Each table contains the player names, the team they play for, the opposing team, and the daily fantasy score
and salary of each player on a given night, keyed by date. The data only goes back to the 2014-15 season, and the stats column is all text with missing information, so this dataset alone will not be enough for the project. 

### Define constants and functions

In [31]:
#URL template for Rotoguru's daily DraftKings page; MONTH, DAY and YEAR are placeholders replaced per request
base_url = 'http://rotoguru1.com/cgi-bin/hyday.pl?game=dk&mon=MONTH&day=DAY&year=YEAR'

#every possible day of the month (1 through 31) as strings
days = list(map(str, range(1, 32)))

#months to scrape, as strings; July through September are left out because there are no NBA games then
months = [str(m) for m in (1, 2, 3, 4, 5, 6, 10, 11, 12)]

#calendar years 2014 through 2018 as strings; Rotoguru data only goes back to the 2014-15 season
years = list(map(str, range(2014, 2019)))

#Called once per day, month, and year to pull the relevant table from the base_url
def rotoScrape(year,month,day):
    """Scrape the Rotoguru DraftKings table for a single date.

    Parameters
    ----------
    year, month, day : str
        Date components as strings; they replace the YEAR, MONTH and DAY
        placeholders in ``base_url``.

    Returns
    -------
    pandas.DataFrame or str
        The first DataFrame parsed from the 10th table on the page, or the
        string 'error' when the page has fewer than ten tables or the table
        cannot be parsed (e.g. days without games or without data).
    """
    #local import so the function does not rely on a new file-level import
    from io import StringIO

    #sanity check to see where your loop is, not required
    print(year + '-' + month + '-' + day)

    #build and request the url, then use Beautiful Soup to parse the page
    url = base_url.replace('DAY',day).replace('MONTH',month).replace('YEAR',year)
    r = requests.get(url)
    soup = BeautifulSoup(r.content,'lxml')

    #some days don't have any games, or no data, mark those pages with 'error' string
    try:
        #the 10th table on each page is where the relevant information lies;
        #indexing inside the try so a page with fewer tables is also marked 'error'
        table = soup.find_all('table')[9]
        #read_html returns a list of dataframes, we only want the first;
        #StringIO avoids passing literal HTML, which newer pandas deprecates
        return pd.read_html(StringIO(str(table)))[0]
    except (IndexError, ValueError):
        return 'error'



### Scrape the data
#### NOTE: function takes about 2-3 seconds per loop, looping through every year, month, day will take 10-15 minutes

In [32]:
#create a dictionary with 'year-month-day' strings as keys and the scraped data as values, remove slicing to loop through all data
#NOTE: a value is the string 'error' instead of a dataframe when rotoScrape could not find a table for that date
DKdata = {(year+'-'+month+'-'+day):rotoScrape(year,month,day) 
                for year in years[1:2] for month in months[:1] for day in days[:3]}

#write the dictionary of dataframes into a pickle file; the with-block closes the file once the dump is done
with open('dk_partial.p','wb') as f:
    pickle.dump(DKdata,f)


2015-1-1
2015-1-2
2015-1-3


## 3) Scrape NBA  Boxscore Data

The nba_py package on GitHub is a great resource for scraping the NBA.com API. In order to have a large enough dataset, I scraped every player's gamelogs for every regular season game in every year since the turn of the century. This dataset contains the player name, NBA player ID, the team they're on and the opponent, as well as every box score stat.

### Import relevant nba_py functions

The GameLog function in nba_py takes a season as the input, and its .overall() method returns a dataframe of each boxscore of every player or team for the season.

In [49]:
#For NBA data use the nba_py package
import nba_py
from nba_py.player import PlayerList
from nba_py.league import GameLog, PlayerStats


### Obtain Player Gamelogs

In [52]:
#nba_py's GameLog takes the season as a YYYY-yy string (e.g. '2000-01'); build one for each start year from 2000 through 2017
seasons = ['{}-{}'.format(start, str(start + 1)[-2:]) for start in range(2000, 2018)]

#loop through each season and collect one dataframe per season
#NOTE this will take about 10 minutes if you loop through all seasons
#setting player_or_team to 'P' returns player boxscores.
season_frames = [GameLog(season=season, season_type='Regular Season', player_or_team='P').overall()
                 for season in seasons]

#DataFrame.append was removed in pandas 2.0; a single concat also avoids re-copying the accumulated rows on every iteration
plyr_logs = pd.concat(season_frames)

plyr_logs.to_csv('player_logs_partial.csv')
plyr_logs.head()

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22000,711,Jerry Stackhouse,1610612765,DET,Detroit Pistons,20001068,2001-04-03,DET @ CHI,W,...,4,4,5,1,0,4,2,57,34,0
1,22000,947,Allen Iverson,1610612755,PHI,Philadelphia 76ers,20000477,2001-01-06,PHI @ CLE,W,...,3,3,3,3,1,4,3,54,5,0
2,22000,960,Tony Delk,1610612756,PHX,Phoenix Suns,20000449,2001-01-02,PHX @ SAC,L,...,3,6,0,1,0,2,1,53,7,0
3,22000,947,Allen Iverson,1610612755,PHI,Philadelphia 76ers,20000579,2001-01-21,PHI vs. TOR,L,...,2,3,4,1,0,4,4,51,-8,0
4,22000,185,Chris Webber,1610612758,SAC,Sacramento Kings,20000471,2001-01-05,SAC vs. IND,L,...,16,26,5,3,2,4,4,51,-1,0


### Obtain Team Gamelogs

In [37]:
#same process as obtaining player logs: one dataframe per season, with player_or_team set to 'T' for team boxscores
team_frames = [GameLog(season=season, season_type='Regular Season', player_or_team='T').overall()
               for season in seasons]

#DataFrame.append was removed in pandas 2.0; a single concat also avoids re-copying the accumulated rows on every iteration
team_logs = pd.concat(team_frames)
team_logs.to_csv('team_logs.csv')

## 4) Scrape NBA Roster Data

This will give us roster information such as the teamID of each player as well as their position, weight, height, and more.

In [41]:
#Import team module from nba_py
from nba_py import team

#helper that fetches the roster for one team in one season
def getRoster(teamId, season):
    """Return the result of team.TeamCommonRoster(...).roster() for ``teamId`` in ``season``.

    Prints ``season`` as a progress marker and sleeps for half a second
    before making the request.
    """
    #progress marker so you can see how far the loop has gotten
    print(season)
    #pause before every request; the NBA API freezes without some sleep time
    time.sleep(0.5)
    roster_request = team.TeamCommonRoster(teamId, season=season)
    return roster_request.roster()

In [42]:
#Ask nba_py's TeamList for its info table, one row per team
team_list = team.TeamList()
teams = team_list.info()
#preview the first five rows
teams.head(5)

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION
0,0,1610612737,1949,2017,ATL
1,0,1610612738,1946,2017,BOS
2,0,1610612739,1970,2017,CLE
3,0,1610612740,2002,2017,NOP
4,0,1610612741,1966,2017,CHI


In [46]:
#Keep only the currently active teams, i.e. the rows whose MAX_YEAR equals the string '2017'
is_active = teams['MAX_YEAR'] == '2017'
teams = teams[is_active]
print('Number of Teams:', len(teams))

Number of Teams: 30


In [47]:
#Remove slicing to loop through every active team for every season in `seasons`
#NOTE: this will take about 10 minutes if you remove slicing
#rosters maps season -> {team id -> result of getRoster}; the loop variable is team_id so it does not shadow the imported `team` module
rosters = {season: {team_id: getRoster(team_id, season) for team_id in teams.TEAM_ID[:2]}
           for season in seasons[:2]}

#the with-block closes the pickle file once the dump is done
with open('rostersraw_partial.p','wb') as f:
    pickle.dump(rosters, f)

2007-08
2007-08
2008-09
2008-09


## Final Thoughts

Patience is definitely required when scraping this data. My method was to make sure the code worked for the first few iterations of each function, then run the full code on its own while I stepped away from the computer. The full raw data files are located in the data folder of the repo as "dkDataraw.p", "player_logs.csv", and "team_logs.csv". To see how I went about cleaning the data, shift over to the Data Wrangling Notebooks.