# NBA Stats Crawler
- NBA stats official site: https://stats.nba.com/teams/boxscores/
- Selenium, locating elements: https://selenium-python.readthedocs.io/locating-elements.html
- Reference: https://ithelp.ithome.com.tw/articles/10185964
    - Example: /Users/ino/Projects/notes/crawler.ipynb

## Setup

In [1]:
import time
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# ---- User input ----
outPathRaw = './Z_raw/'                # folder receiving the raw box-score csv
outPathArranged = './Z_arranged/'      # folder receiving the arranged box-score csv
outFile = './nbaGamePair_test.csv'
isAppend = 1
seasonYear = '2018-19'
seasonType = 'RegularSeason'           # must be one of the keys of `url` below
dateStart = '2018-10-15'               # compared against the 'YYYY-MM-DD' date column later on
pageNum = 5                            # requested number of pages (capped by the site's maximum later)
loadTime = 2                           # seconds to sleep after each browser action

# ---- Create the output folders if they are missing ----
for outDir in (outPathRaw, outPathArranged):
    if not os.path.exists(outDir):
        os.makedirs(outDir)

# ---- Timestamp of this crawl (local time); used in the csv file names ----
# time.strftime() formats the current local time when no time tuple is given.
dateOfCrawl = time.strftime("%Y-%m-%d-h%Hm%Ms%S")

# ---- URL selection: one box-score URL per season type ----
prefix = 'https://stats.nba.com/teams/boxscores/' + '?Season='
seasonPrefix = prefix + seasonYear
url = {
    'Preseason': seasonPrefix + '&SeasonType=Pre%20Season',
    'RegularSeason': seasonPrefix + '&SeasonType=Regular%20Season',
    'Playoffs': seasonPrefix + '&SeasonType=Playoffs',
    'All-Star': seasonPrefix + '&SeasonType=All%20Star'
}

# ---- Open the browser on the selected page ----
driver = webdriver.Chrome(executable_path='./webDriver/chromedriver')
driver.get(url[seasonType])

# Give the page time to load before its source is read
time.sleep(loadTime)

## Scrape Maximum Page Information

In [2]:
# Scrape the maximum page information
# The option texts of the last pagination <select> are collected; the last
# option text is taken as the highest available page number.
content = []
curSoup = BeautifulSoup(driver.page_source, 'html.parser')
info = curSoup.findAll('select', {'class':'stats-table-pagination__select'})
try:
    for x in info[-1]:
        content.append(x.get_text())
    maxPage = int(content[-1])
    isPagination = True
except (IndexError, ValueError, AttributeError):
    # IndexError: no pagination <select> found, or it has no children.
    # ValueError: the last option text is not an integer.
    # AttributeError: a child node without get_text() (depends on the bs4 version).
    # Assign the fallback BEFORE printing it: the message used to reference
    # maxPage while it was still undefined, which raised NameError here.
    isPagination = False
    maxPage = 1
    print('Pagination prohibited. Set maxPage to %d' %(maxPage))

Available page: 1, pagination action is not permitted


## Load Pages

In [3]:
# Load pages
# Collect the HTML source of each requested page into `pages`.
pages = []
# Never ask for more pages than the site offers
pageNum = min(pageNum, maxPage)
try:
    if isPagination:
        pageSel = Select(driver.find_element_by_class_name('stats-table-pagination__select'))
        for i in range(1, pageNum+1):
            # Pagination (Mimic a browser that clicks "next page".)
            pageSel.select_by_value('number:'+str(i))
            # Wait for loading the web
            time.sleep(loadTime)
            # Capture current page
            pages.append(driver.page_source)
    else:
        # No pagination control: only the page that is already open is captured
        pages.append(driver.page_source)
finally:
    # Shutdown the browser, even when a pagination step raises, so that no
    # browser process is left running after a failed crawl
    driver.quit()
print('----- Note: %d of %d page(s) scraped -----' %(pageNum, maxPage))

----- Note: 1 of 1 page(s) scraped -----


## Raw Box Acquisition

In [4]:
# Scrape box from each page and create DataFrame
# Each captured page is parsed into a list of rows; one DataFrame is built
# per page and all of them are concatenated once at the end.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; pd.concat is the supported replacement.)
expectedCols = 24          # a row is kept only if it splits into exactly this many fields
firstNumericCol = 4        # fields from this position onward are converted to numbers
pctCols = (8, 11, 14)      # positions of FG%, 3P% and FT% (see the column order in the output)

soups = [BeautifulSoup(page, 'html.parser') for page in pages]
frames = []
for soup in soups:
    # Scrape boxes' title
    boxTitle = [item.get_text() for item in soup.findAll('thead')]
    # Store boxes' title as list (taken from the first <thead>; 'Season' is skipped)
    title = [x for x in boxTitle[0].split('\n') if x != '' and x != 'Season']

    # Scrape boxes
    boxes = [item.get_text()
             for item in soup.findAll('tr', {'data-ng-repeat':'(i, row) in page track by row.$hash'})]

    # Store boxes as list
    # NOTE(review): only the first half of the matched rows is used, exactly
    # as before; presumably the rows are matched twice in the page source --
    # confirm against the live page.
    boxData = []
    for rowText in boxes[:len(boxes)//2]:
        row = [x for x in rowText.split('\n') if x != '']
        if len(row) != expectedCols:
            # Skip rows that do not have the expected number of fields
            continue
        for j in range(firstNumericCol, len(row)):
            if j in pctCols:
                # Percentages are stored as fractions rounded to 3 decimals
                row[j] = round(float(row[j])*0.01, 3)
            else:
                row[j] = int(row[j])
        boxData.append(row)

    frames.append(pd.DataFrame(boxData, columns=title))

# Single concatenation with a fresh 0..n-1 index; None when no page was parsed
df_box = pd.concat(frames, ignore_index=True) if frames else None

In [5]:
# Display the DataFrame of scraped box scores as the cell output
df_box

Unnamed: 0,Team,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,DEN,DEN @ LAC,10/17/2018,W,240,107,33,87,0.379,8,...,0.786,14,42,56,20,6,8,11,22,9
1,LAC,LAC vs. DEN,10/17/2018,L,240,98,35,87,0.402,8,...,0.833,9,38,47,21,3,9,14,32,-9
2,PHX,PHX vs. DAL,10/17/2018,W,240,121,44,81,0.543,19,...,0.875,6,38,44,35,5,2,12,22,21
3,DAL,DAL @ PHX,10/17/2018,L,240,100,38,88,0.432,10,...,0.7,10,28,38,28,7,5,9,16,-21
4,SAC,SAC vs. UTA,10/17/2018,L,240,117,49,95,0.516,7,...,0.667,5,32,37,17,8,3,9,27,-6
5,UTA,UTA @ SAC,10/17/2018,W,240,123,41,79,0.519,13,...,0.737,5,39,44,21,8,4,17,19,6
6,MIN,MIN @ SAS,10/17/2018,L,240,108,39,91,0.429,6,...,0.889,14,32,46,20,9,2,11,27,-4
7,SAS,SAS vs. MIN,10/17/2018,W,240,112,40,93,0.43,11,...,0.724,19,33,52,22,3,4,12,22,4
8,NOP,NOP @ HOU,10/17/2018,W,240,131,52,98,0.531,10,...,0.773,14,40,54,36,8,3,12,25,19
9,HOU,HOU vs. NOP,10/17/2018,L,240,112,39,92,0.424,16,...,0.75,8,29,37,21,8,7,12,22,-19


In [6]:
# Date reformation and selection
# The scraped column title contains a non-breaking space ('\xa0'); rename it to 'Date'
df_box.rename(columns = {'Game\xa0Date':'Date'}, inplace=True)
# Rebuild each date string from fixed character positions:
# last 4 characters, then characters 0-1, then characters 3-4
# (turns '10/17/2018' into '2018-10-17')
date = [raw[-4:] + '-' + raw[0:2] + '-' + raw[3:5] for raw in df_box['Date']]
df_box['Date'] = date
# Keep only rows whose date string is not earlier than dateStart (string comparison)
onOrAfterStart = df_box['Date'] >= dateStart
df_box = df_box[onOrAfterStart].reset_index(drop=True)
df_box

Unnamed: 0,Team,Match Up,Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,DEN,DEN @ LAC,2018-10-17,W,240,107,33,87,0.379,8,...,0.786,14,42,56,20,6,8,11,22,9
1,LAC,LAC vs. DEN,2018-10-17,L,240,98,35,87,0.402,8,...,0.833,9,38,47,21,3,9,14,32,-9
2,PHX,PHX vs. DAL,2018-10-17,W,240,121,44,81,0.543,19,...,0.875,6,38,44,35,5,2,12,22,21
3,DAL,DAL @ PHX,2018-10-17,L,240,100,38,88,0.432,10,...,0.7,10,28,38,28,7,5,9,16,-21
4,SAC,SAC vs. UTA,2018-10-17,L,240,117,49,95,0.516,7,...,0.667,5,32,37,17,8,3,9,27,-6
5,UTA,UTA @ SAC,2018-10-17,W,240,123,41,79,0.519,13,...,0.737,5,39,44,21,8,4,17,19,6
6,MIN,MIN @ SAS,2018-10-17,L,240,108,39,91,0.429,6,...,0.889,14,32,46,20,9,2,11,27,-4
7,SAS,SAS vs. MIN,2018-10-17,W,240,112,40,93,0.43,11,...,0.724,19,33,52,22,3,4,12,22,4
8,NOP,NOP @ HOU,2018-10-17,W,240,131,52,98,0.531,10,...,0.773,14,40,54,36,8,3,12,25,19
9,HOU,HOU vs. NOP,2018-10-17,L,240,112,39,92,0.424,16,...,0.75,8,29,37,21,8,7,12,22,-19


In [7]:
# Save raw box as .csv
# File name: <crawl timestamp>_<season year>_<season type>.csv inside outPathRaw
rawCsvPath = outPathRaw + '_'.join([dateOfCrawl, seasonYear, seasonType]) + '.csv'
df_box.to_csv(rawCsvPath, index=False, encoding='utf-8', float_format='%.3f')
df_box

Unnamed: 0,Team,Match Up,Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,DEN,DEN @ LAC,2018-10-17,W,240,107,33,87,0.379,8,...,0.786,14,42,56,20,6,8,11,22,9
1,LAC,LAC vs. DEN,2018-10-17,L,240,98,35,87,0.402,8,...,0.833,9,38,47,21,3,9,14,32,-9
2,PHX,PHX vs. DAL,2018-10-17,W,240,121,44,81,0.543,19,...,0.875,6,38,44,35,5,2,12,22,21
3,DAL,DAL @ PHX,2018-10-17,L,240,100,38,88,0.432,10,...,0.7,10,28,38,28,7,5,9,16,-21
4,SAC,SAC vs. UTA,2018-10-17,L,240,117,49,95,0.516,7,...,0.667,5,32,37,17,8,3,9,27,-6
5,UTA,UTA @ SAC,2018-10-17,W,240,123,41,79,0.519,13,...,0.737,5,39,44,21,8,4,17,19,6
6,MIN,MIN @ SAS,2018-10-17,L,240,108,39,91,0.429,6,...,0.889,14,32,46,20,9,2,11,27,-4
7,SAS,SAS vs. MIN,2018-10-17,W,240,112,40,93,0.43,11,...,0.724,19,33,52,22,3,4,12,22,4
8,NOP,NOP @ HOU,2018-10-17,W,240,131,52,98,0.531,10,...,0.773,14,40,54,36,8,3,12,25,19
9,HOU,HOU vs. NOP,2018-10-17,L,240,112,39,92,0.424,16,...,0.75,8,29,37,21,8,7,12,22,-19


## Arranged Box Acquisition

In [8]:
# Remove the 'MIN' column from the box scores
df_box = df_box.drop('MIN', axis=1)

In [9]:
# Create 'Score' and 'Home/Away' columns
matchUps = df_box['Match\xa0Up']

# 'Score': "<opponent><opponent points>-<own points><team>", where the opponent
# is the last 3 characters of the match-up text and the opponent's points are
# the team's points minus its +/- value
score = [
    match[-3:] + str(pts-pm) + '-' + str(pts) + team
    for team, match, pts, pm in zip(df_box['Team'], matchUps, df_box['PTS'], df_box['+/-'])
]

# 'Home/Away': a match-up containing '@' is an away game, anything else a home game
homeAway = ['Away' if '@' in match else 'Home' for match in matchUps]

df_box['Score'] = score
df_box['Home/Away'] = homeAway
# The source columns are no longer needed once the derived columns exist
df_box = df_box.drop(columns=['Match\xa0Up', '+/-'])
df_box

Unnamed: 0,Team,Date,W/L,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,Score,Home/Away
0,DEN,2018-10-17,W,107,33,87,0.379,8,24,0.333,...,14,42,56,20,6,8,11,22,LAC98-107DEN,Away
1,LAC,2018-10-17,L,98,35,87,0.402,8,28,0.286,...,9,38,47,21,3,9,14,32,DEN107-98LAC,Home
2,PHX,2018-10-17,W,121,44,81,0.543,19,34,0.559,...,6,38,44,35,5,2,12,22,DAL100-121PHX,Home
3,DAL,2018-10-17,L,100,38,88,0.432,10,33,0.303,...,10,28,38,28,7,5,9,16,PHX121-100DAL,Away
4,SAC,2018-10-17,L,117,49,95,0.516,7,19,0.368,...,5,32,37,17,8,3,9,27,UTA123-117SAC,Home
5,UTA,2018-10-17,W,123,41,79,0.519,13,27,0.481,...,5,39,44,21,8,4,17,19,SAC117-123UTA,Away
6,MIN,2018-10-17,L,108,39,91,0.429,6,19,0.316,...,14,32,46,20,9,2,11,27,SAS112-108MIN,Away
7,SAS,2018-10-17,W,112,40,93,0.43,11,25,0.44,...,19,33,52,22,3,4,12,22,MIN108-112SAS,Home
8,NOP,2018-10-17,W,131,52,98,0.531,10,25,0.4,...,14,40,54,36,8,3,12,25,HOU112-131NOP,Away
9,HOU,2018-10-17,L,112,39,92,0.424,16,48,0.333,...,8,29,37,21,8,7,12,22,NOP131-112HOU,Home


Preferred column order:<br/>
Team, Date, W/L, Home/Away, Score, FG%, FGM, FGA, 3P%, 3PM, 3PA, FT%, FTM, FTA, REB, OREB, DREB, AST, STL, BLK, TOV, PF, PTS

In [10]:
# Re-arrange the columns into the preferred order
columnOrder = [
    'Team', 'Date', 'W/L', 'Home/Away', 'Score',
    'FG%', 'FGM', 'FGA',
    '3P%', '3PM', '3PA',
    'FT%', 'FTM', 'FTA',
    'REB', 'OREB', 'DREB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
df_box = df_box[columnOrder]
df_box

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,FTA,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS
0,DEN,2018-10-17,W,Away,LAC98-107DEN,0.379,33,87,0.333,8,...,42,56,14,42,20,6,8,11,22,107
1,LAC,2018-10-17,L,Home,DEN107-98LAC,0.402,35,87,0.286,8,...,24,47,9,38,21,3,9,14,32,98
2,PHX,2018-10-17,W,Home,DAL100-121PHX,0.543,44,81,0.559,19,...,16,44,6,38,35,5,2,12,22,121
3,DAL,2018-10-17,L,Away,PHX121-100DAL,0.432,38,88,0.303,10,...,20,38,10,28,28,7,5,9,16,100
4,SAC,2018-10-17,L,Home,UTA123-117SAC,0.516,49,95,0.368,7,...,18,37,5,32,17,8,3,9,27,117
5,UTA,2018-10-17,W,Away,SAC117-123UTA,0.519,41,79,0.481,13,...,38,44,5,39,21,8,4,17,19,123
6,MIN,2018-10-17,L,Away,SAS112-108MIN,0.429,39,91,0.316,6,...,27,46,14,32,20,9,2,11,27,108
7,SAS,2018-10-17,W,Home,MIN108-112SAS,0.43,40,93,0.44,11,...,29,52,19,33,22,3,4,12,22,112
8,NOP,2018-10-17,W,Away,HOU112-131NOP,0.531,52,98,0.4,10,...,22,54,14,40,36,8,3,12,25,131
9,HOU,2018-10-17,L,Home,NOP131-112HOU,0.424,39,92,0.333,16,...,24,37,8,29,21,8,7,12,22,112


In [11]:
# Order the rows by the 'Date' column (ascending; original index labels are kept)
df_box = df_box.sort_values('Date')
df_box

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,FTA,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS
25,BOS,2018-10-16,W,Home,PHI87-105BOS,0.433,42,97,0.297,11,...,14,55,12,43,21,7,5,15,20,105
23,OKC,2018-10-16,L,Away,GSW108-100OKC,0.363,33,91,0.27,10,...,37,45,16,29,21,12,6,15,21,100
22,GSW,2018-10-16,W,Home,OKC100-108GSW,0.442,42,95,0.269,7,...,18,58,17,41,28,7,7,21,29,108
24,PHI,2018-10-16,L,Away,BOS105-87PHI,0.391,34,87,0.192,5,...,24,47,6,41,18,8,5,16,20,87
21,MEM,2018-10-17,L,Away,IND111-83MEM,0.298,25,84,0.345,10,...,28,28,7,21,16,11,3,10,18,83
20,IND,2018-10-17,W,Home,MEM83-111IND,0.566,47,83,0.385,10,...,13,57,13,44,29,2,7,20,24,111
19,CHA,2018-10-17,L,Home,MIL113-112CHA,0.446,41,92,0.421,16,...,22,41,9,32,21,8,9,11,19,112
18,MIL,2018-10-17,W,Away,CHA112-113MIL,0.494,42,85,0.412,14,...,20,57,11,46,26,5,4,21,25,113
17,BKN,2018-10-17,L,Away,DET103-100BKN,0.488,40,82,0.185,5,...,22,39,5,34,28,9,5,19,23,100
16,DET,2018-10-17,W,Home,BKN100-103DET,0.424,39,92,0.25,6,...,22,46,14,32,21,5,5,17,20,103


In [12]:
# Save arranged box as .csv
# File name: <crawl timestamp>_<season year>_<season type>.csv inside outPathArranged
arrangedCsvPath = outPathArranged + '_'.join([dateOfCrawl, seasonYear, seasonType]) + '.csv'
df_box.to_csv(arrangedCsvPath, index=False, encoding='utf-8', float_format='%.3f')

## End of Scraping

In [13]:
# Report how many rows ended up in the arranged box-score table
numRows = len(df_box)
print('----- Number of data scraped: %d -----' % numRows)

----- Number of data scraped: 26 -----
