## Crawl NBA games' stats (~4h)

In [1]:
!pip install tqdm # progress bar



In [2]:
from io import StringIO

import chromedriver_binary  # imported for its side effect (makes chromedriver available)
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from tqdm import tqdm

# Headless Chrome session, shared through the global `browser` by the Selenium-based
# crawlers defined below.
opts = webdriver.ChromeOptions()
# Pass the switch explicitly: the `Options.headless` setter is deprecated in Selenium 4
# and absent from recent releases, while the command-line argument works on old and new.
opts.add_argument('--headless')
browser = webdriver.Chrome(options=opts)
browser.maximize_window()
# Placeholder; the main crawl cell rebinds `df` to the table of crawled games.
df = pd.DataFrame()

---

1. Crawl teams and games data from [basketball-reference.com](https://www.basketball-reference.com) 
- Check the terms of use: https://www.sports-reference.com/termsofuse.html <br>
(use of the data is welcome, but you must explicitly credit SRL as the source of the data, must not use this data to compete with their services, must not submit viruses, trojans, etc., and must not violate any express restrictions) 
2. Crawl preseason elo from [projects.fivethirtyeight.com](https://projects.fivethirtyeight.com)
- Check the terms of use: https://disneytermsofuse.com/english <br>
(you are not allowed to introduce a virus or other harmful component; you may receive advertisements, promotions, etc.) 

### Get statistics of all NBA teams from last season

Crawl old stat data from last season to create:
- <b>"preseason_data.csv"</b>: Teams' average stats from last season
- <b>"full_name.txt"</b>: All NBA teams' name and their abbreviations

In [3]:
def get_old_stat(year):
    """Save last season's per-game team stats and the team abbreviation list.

    Downloads the league page of season ``year - 1`` from basketball-reference.com,
    reads its ``per_game-team`` table and writes two files to the working directory:

    - ``preseason_data.csv``: the table without its last row and with ``*`` removed
      from the team names.
    - ``full_name.txt``: one ``<abbreviation>, <team name>`` line per link found in
      the table.

    Args:
        year: Season being prepared; the page of ``year - 1`` is fetched.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page does not contain the ``per_game-team`` table.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year - 1}.html'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')

    tbl = soup.find("table", {"id": "per_game-team"})
    if tbl is None:
        raise ValueError(f"table 'per_game-team' not found at {url}")

    # read_html expects a file-like object; passing literal HTML is deprecated.
    df = pd.read_html(StringIO(str(tbl)))[0]
    df['Team'] = df['Team'].str.replace('*', '', regex=False)
    # Drop the trailing row (presumably the league-average summary -- TODO confirm).
    df = df.drop(df.tail(1).index)
    df.to_csv('preseason_data.csv', header=True, index=False)

    with open('full_name.txt', 'w') as f:
        # The i-th link of the table is paired with the i-th team row.
        for i, link in enumerate(tbl.find_all('a')):
            # href is split on '/' and the third piece is used as the abbreviation
            short = link.get('href').split('/')[2]
            f.write(short + ', ' + df['Team'][i] + '\n')

### Get NBA Team Elo

Crawl NBA teams' elo from last season: <b>Data available from Season 2016-2017 -> now</b> <br>

Season 2015-2016, url: 'https://projects.fivethirtyeight.com/2016-nba-picks/'    //(<b>"nba-picks"</b>)<br>
Season 2016-2017 -> now, url: 'https://projects.fivethirtyeight.com/${year}-nba-predictions/' <br>

Then, create <b>"preseason_elo.csv"</b>: All NBA Team's elo from last season

In [4]:
def get_elo(year):
    """Save last season's team Elo values from FiveThirtyEight to ``preseason_elo.csv``.

    Opens the FiveThirtyEight page for ``year - 1`` in the shared ``browser``, reads
    the team name and the text of the first ``td`` of every table row, replaces each
    team name by the abbreviation listed in ``full_name.txt`` and writes the result as
    ``Name, Elo`` lines.

    Args:
        year: Season being prepared. ``2017`` is special-cased: it uses the
            "2016-nba-picks" page, whose table has a different element id.

    Raises:
        FileNotFoundError: If ``full_name.txt`` does not exist yet (it is written by
            ``get_old_stat``).
    """
    if year == 2017:
        url = 'https://projects.fivethirtyeight.com/2016-nba-picks/'
        table_id = 'teams-table'
    else:
        url = f'https://projects.fivethirtyeight.com/{year - 1}-nba-predictions/'
        table_id = 'standings-table'
    browser.get(url)

    # find_element(By...) replaces the find_element_by_* helpers removed in Selenium 4.3.
    table = browser.find_element(By.ID, table_id)
    body = table.find_element(By.TAG_NAME, 'tbody')
    rows = body.find_elements(By.TAG_NAME, 'tr')
    team_names = [r.find_element(By.CLASS_NAME, 'team').text for r in rows]

    # abbreviation -> full team name, as written by get_old_stat
    full_names = {}
    with open('full_name.txt', 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on them
            parts = line.split(',')  # [short_name, full_name]
            full_names[parts[0]] = parts[1].strip()

    # Swap each scraped name for the abbreviation of the first full name containing it;
    # names without a match are left as scraped.
    for i, scraped in enumerate(team_names):
        for short, full in full_names.items():
            if scraped in full:
                team_names[i] = short
                break

    with tqdm(rows) as pbar:
        pbar.set_description("Get Elo")
        # The first cell of each row is taken as the Elo value.
        team_elos = [r.find_element(By.TAG_NAME, 'td').text for r in pbar]

    with open('preseason_elo.csv', 'w') as f:
        f.write('Name, Elo\n')
        for n, e in zip(team_names, team_elos):
            f.write(n + ', ' + e + '\n')

### Get column descriptions

Crawl data to create <b>"raw_description.txt"</b>: Get stats' names and their descriptions

In [5]:
def get_description(year):
    """Write ``raw_description.txt`` with one ``<column> : <description>`` line per stat.

    Opens the basketball-reference.com league page of season ``year - 1`` in the
    shared ``browser`` and reads the header cells of the ``per_game-team`` table.
    Column names are the header texts prefixed with ``H_`` / ``A_``; descriptions are
    the headers' ``data-tip`` attributes prefixed with ``Home `` / ``Away ``. A
    ``Date`` entry comes first in both lists.

    Args:
        year: Season being prepared; the page of ``year - 1`` is read.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year - 1}.html'
    browser.get(url)

    # find_element(By...) replaces the find_element_by_* helpers removed in Selenium 4.3.
    table = browser.find_element(By.ID, 'per_game-team')
    header = table.find_element(By.TAG_NAME, 'thead')
    col_tags = header.find_elements(By.TAG_NAME, 'th')
    date = ['Date']

    # create cols: all home columns first, then all away columns
    cols = [c.text for c in col_tags]
    col_table = date + [prefix + c for prefix in ('H_', 'A_') for c in cols]

    # create descriptions
    with tqdm(col_tags) as pbar:
        pbar.set_description("Get Description")
        desc = [c.get_attribute("data-tip") for c in pbar]
    # A header without a data-tip yields None, which cannot be concatenated below;
    # fall back to the header's own text in that case.
    desc = [c if d is None else d for d, c in zip(desc, cols)]
    # Team description missing
    desc[1] = 'Team'

    description = date + [prefix + d for prefix in ('Home ', 'Away ') for d in desc]

    with open('raw_description.txt', 'w') as f:
        for c, d in zip(col_table, description):
            f.write(c + ' : ' + d + '\n')

### Get URL to every match in a month

Collect the box-score URL of every game listed on a schedule page (one page per month of a season)

In [6]:
def get_html(url):
    """Load a schedule page and collect the box-score links it lists.

    Args:
        url: Address of the page to open in the shared ``browser``.

    Returns:
        A ``(links, tree)`` tuple: ``links`` is the list of ``href`` values taken from
        the first child element of every ``data-stat="box_score_text"`` cell, and
        ``tree`` is the page source parsed with BeautifulSoup.
    """
    browser.get(url)

    # find_elements(By...) replaces the find_elements_by_* helpers removed in Selenium 4.3.
    cells = browser.find_elements(By.XPATH, '//*[@data-stat="box_score_text"]')
    links = []
    for cell in cells:
        if cell.text == ' ':
            continue
        children = cell.find_elements(By.XPATH, './/*')
        # A cell with no child element, or whose first child has no href, has nothing
        # to follow; skip it rather than raising IndexError or returning None.
        href = children[0].get_attribute('href') if children else None
        if href:
            links.append(href)

    tree = BeautifulSoup(browser.page_source, 'html.parser')
    return links, tree

### Get column names

Generate column names for dataframe (column name = symbol('H' or 'A') + stat_name) <br>
H_: Home team's stats <br>
A_: Away team's stats <br>

In [7]:
def get_info(url):
    """Build the column names of the game table from one box-score page.

    Opens ``url`` in the shared ``browser``, takes the text of the first link inside
    the ``line_score`` element as a team code and reads the header cells of that
    team's ``box-<team>-game-basic`` table.

    Args:
        url: Address of a box-score page.

    Returns:
        ``['Date']`` followed by the stat names prefixed with ``H_`` and then the same
        names prefixed with ``A_``. The stat names are ``'Team'`` plus the header
        texts from the fourth header cell onward.
    """
    browser.get(url)

    # find_element(By...) replaces the find_element_by_* helpers removed in Selenium 4.3.
    team = browser.find_element(By.ID, 'line_score')\
                .find_element(By.TAG_NAME, 'a').text

    table = browser.find_element(By.ID, f'box-{team}-game-basic')

    header = table.find_element(By.TAG_NAME, 'thead')
    col_tags = header.find_elements(By.TAG_NAME, 'th')

    # create cols: the first three header cells are skipped and replaced by 'Team'
    stat_names = ['Team'] + [c.text for c in col_tags][3:]
    home_cols = ['H_' + c for c in stat_names]
    away_cols = ['A_' + c for c in stat_names]

    return ['Date'] + home_cols + away_cols

### Get Data from every match

Crawl 1 game's stats based on column names above

In [8]:
def get_data(url):
    """Read one game's row of stats from its box-score page.

    Opens ``url`` in the shared ``browser``. The date is the part after the first
    ``', '`` in the text of the first ``div`` inside ``scorebox_meta``. Team codes
    are the link texts of the ``line_score`` element in reverse order, and each
    team's stats are the ``td`` texts of the footer of its ``box-<team>-game-basic``
    table.

    Args:
        url: Address of a box-score page.

    Returns:
        A flat list: the date, then for each team (reversed ``line_score`` order) its
        code followed by its footer cell texts.

    Raises:
        ValueError: If the ``scorebox_meta`` text contains no ``', '`` separator.
    """
    browser.get(url)

    # find_element(By...) replaces the find_element_by_* helpers removed in Selenium 4.3.
    meta_text = browser.find_element(By.CLASS_NAME, 'scorebox_meta')\
                       .find_element(By.TAG_NAME, 'div').text
    # Only the date part is kept; the leading time part is discarded.
    _, date = meta_text.split(', ', 1)

    anchors = browser.find_element(By.ID, 'line_score')\
                     .find_elements(By.TAG_NAME, 'a')
    teams = [a.text for a in anchors][::-1]  # reversed so the home team comes first

    data_table = [date]
    for team in teams:
        table = browser.find_element(By.ID, f'box-{team}-game-basic')
        footer = table.find_element(By.TAG_NAME, 'tfoot')
        data_table.append(team)
        data_table.extend(td.text for td in footer.find_elements(By.TAG_NAME, 'td'))

    return data_table

### Main 

Init current year: 2021 <br>
Generate the n most recent years (in this case: n = 5, i.e. 2017-2021) <br>
Crawl the preseason stats and Elo for the first of these years (current year - (n - 1) = 2017), taken from the season before it <br>
Crawl all games' data during n years

In [9]:
main_url = 'https://www.basketball-reference.com'
cur_year = 2021
n = 5
# The n most recent seasons, oldest first.
years = [cur_year - i for i in range(n - 1, -1, -1)]

cols = []       # column names, filled from the first box score that is visited
game_rows = []  # one list of values per crawled game

# Preseason files are based on the season before the first crawled year.
get_old_stat(years[0])
get_elo(years[0])
get_description(years[0])

try:
    for year in years:
        print(year)
        year_url = f'/leagues/NBA_{year}_games.html'
        urls, tree = get_html(main_url + year_url)

        # Links to the monthly schedule pages of this season.
        month_filter = tree.find('div', class_=['filter'])
        if month_filter is None:
            raise ValueError(f'no month filter found on {main_url + year_url}')

        for i, tag in enumerate(month_filter.find_all('a')):
            if i: # first link same with main page // no need to get html
                urls, tree = get_html(main_url + tag['href'])

            with tqdm(urls) as pbar:
                pbar.set_description("Processing %s" % tag.text)
                for game_url in pbar:
                    if not cols:
                        cols = get_info(game_url)

                    row = get_data(game_url)
                    # Fail at the offending game rather than when the frame is built.
                    if len(row) != len(cols):
                        raise ValueError(
                            f'{game_url}: got {len(row)} values for {len(cols)} columns')
                    game_rows.append(row)
finally:
    # Build the frame once from the collected rows: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call. Doing it in `finally`
    # keeps the games crawled so far available in `df` if the crawl is interrupted.
    df = pd.DataFrame(game_rows, columns=cols)


Get Elo: 100%|█████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 64.78it/s]
Get Description: 100%|████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 211.84it/s]


2017


Processing October: 100%|██████████████████████████████████████████████████████████████| 45/45 [01:36<00:00,  2.14s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 229/229 [08:24<00:00,  2.20s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 232/232 [08:35<00:00,  2.22s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 223/223 [08:19<00:00,  2.24s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 165/165 [05:48<00:00,  2.11s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 241/241 [09:34<00:00,  2.38s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 140/140 [04:50<00:00,  2.07s/it]
Processing May: 100%|██████████████████████████████████████████████████████████████████| 29/29 [01:01<00:00,  2.13s/it]
Processing June: 100%|██████████████████

2018


Processing October: 100%|████████████████████████████████████████████████████████████| 104/104 [03:45<00:00,  2.17s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 213/213 [07:32<00:00,  2.12s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 227/227 [08:03<00:00,  2.13s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 216/216 [07:24<00:00,  2.06s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 160/160 [05:38<00:00,  2.11s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 222/222 [07:47<00:00,  2.11s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 136/136 [04:44<00:00,  2.09s/it]
Processing May: 100%|██████████████████████████████████████████████████████████████████| 31/31 [01:01<00:00,  2.00s/it]
Processing June: 100%|██████████████████

2019


Processing October: 100%|████████████████████████████████████████████████████████████| 110/110 [03:51<00:00,  2.11s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 219/219 [07:47<00:00,  2.13s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 219/219 [07:49<00:00,  2.14s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 221/221 [08:08<00:00,  2.21s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 158/158 [05:47<00:00,  2.20s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 224/224 [08:00<00:00,  2.15s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 127/127 [04:26<00:00,  2.10s/it]
Processing May: 100%|██████████████████████████████████████████████████████████████████| 29/29 [01:01<00:00,  2.12s/it]
Processing June: 100%|██████████████████

2020


Processing October 2019: 100%|█████████████████████████████████████████████████████████| 68/68 [02:25<00:00,  2.14s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 215/215 [08:18<00:00,  2.32s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 220/220 [09:19<00:00,  2.54s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 222/222 [08:32<00:00,  2.31s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 168/168 [06:50<00:00,  2.44s/it]
Processing March: 100%|████████████████████████████████████████████████████████████████| 78/78 [03:16<00:00,  2.52s/it]
Processing July: 100%|███████████████████████████████████████████████████████████████████| 8/8 [00:18<00:00,  2.28s/it]
Processing August: 100%|█████████████████████████████████████████████████████████████| 123/123 [04:48<00:00,  2.35s/it]
Processing September: 100%|█████████████

2021


Processing December: 100%|█████████████████████████████████████████████████████████████| 67/67 [02:48<00:00,  2.51s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 222/222 [09:08<00:00,  2.47s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 212/212 [08:51<00:00,  2.51s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 204/204 [09:34<00:00,  2.81s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 240/240 [09:37<00:00,  2.41s/it]
Processing May: 100%|████████████████████████████████████████████████████████████████| 173/173 [06:51<00:00,  2.38s/it]
Processing June: 100%|█████████████████████████████████████████████████████████████████| 45/45 [01:40<00:00,  2.23s/it]
Processing July: 100%|███████████████████████████████████████████████████████████████████| 8/8 [00:15<00:00,  1.88s/it]


In [10]:
# Sanity check: (number of rows, number of columns) of the crawled table
df.shape

(6247, 43)

Save df to <b>"raw_data.csv"</b>: All NBA games' stats during n years

In [11]:
# Order the games from latest to oldest. `df` carries a 0..n-1 index in crawl order, so
# sorting the index in descending order gives the same result as slicing with [::-1]
# on the first run, and does not flip the order back if this cell is run again.
df = df.sort_index(ascending=False)
df.to_csv('raw_data.csv', index=False)
# Preview goes last: an expression in the middle of a cell is not displayed.
df.head()

In [12]:
# Shut down the Chrome session that was opened at the top of the notebook
browser.quit()