### Import necessary libraries

In [1]:
# !pip install tqdm

In [2]:
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
from tqdm import tqdm
import requests


# Shared Selenium session: the scraping functions below all read the
# module-level `browser` created here.
opts = webdriver.ChromeOptions()
opts.headless = True  # run Chrome without opening a visible window
browser = webdriver.Chrome(options=opts)
browser.maximize_window()
# Empty placeholder; the main cell rebinds `df` to the scraped game table.
df = pd.DataFrame()

### Get NBA Team Names and their abbreviations

In [3]:
def get_full_name():
    """Scrape NBA team abbreviations and full names into 'full_name.txt'.

    Loads the Wikipedia abbreviation page in the shared `browser`, reads the
    first two cells of every row of the first <tbody> (skipping the first
    row), renames the abbreviation "CHA" to "CHO", and writes one
    "<abbreviation>, <name>" line per team.

    Raises:
        ValueError: if no scraped abbreviation equals "CHA".
    """
    url = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations'
    browser.get(url)

    table_body = browser.find_element_by_tag_name('tbody')
    rows = table_body.find_elements_by_tag_name('tr')[1:]  # drop the first row
    abbreviations, full_names = [], []

    with tqdm(rows) as progress:
        progress.set_description("Get Full Name")
        for row in progress:
            cells = row.find_elements_by_tag_name('td')
            abbreviations.append(cells[0].text)
            full_names.append(cells[1].text)

    # NOTE(review): presumably "CHO" is the code the box-score source uses
    # for this team -- confirm.
    cha_position = abbreviations.index("CHA")
    abbreviations[cha_position] = 'CHO'

    with open('full_name.txt', 'w') as f:
        f.writelines(abbr + ', ' + name + '\n'
                     for abbr, name in zip(abbreviations, full_names))

### Get NBA Team Elo

In [4]:
def get_elo(year):
    """Scrape each team's Elo from FiveThirtyEight into 'preseason_elo.csv'.

    Args:
        year: season year interpolated into the predictions-page URL.

    Every row of the first <tbody> yields a team code (its "data-team"
    attribute) and an Elo (the text of its first <td>). The file gets a
    "Name, Elo" header followed by one "<code>, <elo>" line per team.
    """
    url = f'https://projects.fivethirtyeight.com/{year}-nba-predictions'
    browser.get(url)

    table_body = browser.find_element_by_tag_name('tbody')
    rows = table_body.find_elements_by_tag_name('tr')
    raw_codes = [row.get_attribute("data-team") for row in rows]

    # Codes shorter than three characters are completed with the first
    # character of the row's link text.
    team_names = [
        code + row.find_element_by_tag_name('a').text[0] if len(code) < 3 else code
        for row, code in zip(rows, raw_codes)
    ]

    team_elos = []
    with tqdm(rows) as progress:
        progress.set_description("Get Elo")
        for row in progress:
            team_elos.append(row.find_element_by_tag_name('td').text)

    with open('preseason_elo.csv', 'w') as f:
        f.write('Name, Elo\n')
        f.writelines(name + ', ' + elo + '\n'
                     for name, elo in zip(team_names, team_elos))

### Get statistics of all NBA teams from last season

In [5]:
def get_old_stat(year):
    """Save the previous season's per-game team table to 'preseason_data.csv'.

    Args:
        year: season year; the page for `year - 1` is downloaded.

    The table with id "per_game-team" is parsed with pandas, asterisks are
    removed from the 'Team' column, and the last row is dropped before
    writing the CSV (with a header row, without the index).
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year - 1}.html'
    response = requests.get(url)

    parsed_page = BeautifulSoup(response.content, 'html.parser')
    stats_table = parsed_page.find("table", {"id": "per_game-team"})

    team_stats = pd.read_html(str(stats_table))[0]
    team_stats['Team'] = team_stats['Team'].str.replace('*', '', regex=False)
    # NOTE(review): the final row is presumably a league-wide summary line
    # rather than a team -- confirm against the page.
    team_stats = team_stats.drop(team_stats.tail(1).index)
    team_stats.to_csv('preseason_data.csv', header=True, index=False)

### Get column descriptions

In [6]:
def get_description(year):
    """Write a "<column> : <description>" glossary to 'description.txt'.

    Args:
        year: season year; the page for `year - 1` is loaded.

    Column keys are the header texts of the "per_game-team" table, once with
    the 'H_' prefix and once with 'A_', preceded by 'Date'. Descriptions are
    the headers' "data-tip" attributes, prefixed with 'Home ' / 'Away ' in
    the same order.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year - 1}.html'
    browser.get(url)

    stats_table = browser.find_element_by_id('per_game-team')
    header_cells = stats_table.find_element_by_tag_name('thead')\
                              .find_elements_by_tag_name('th')

    # Column keys: 'Date', then all home-prefixed labels, then all away ones.
    labels = [cell.text for cell in header_cells]
    col_table = ['Date']
    for prefix in ('H_', 'A_'):
        col_table.extend(prefix + label for label in labels)

    # Tooltip text of every header cell.
    tips = []
    with tqdm(header_cells) as progress:
        progress.set_description("Get Description")
        for cell in progress:
            tips.append(cell.get_attribute("data-tip"))
    # The second header has no usable tooltip, so it is filled in by hand.
    tips[1] = 'Team'

    description = ['Date']
    for side in ('Home ', 'Away '):
        description.extend(side + tip for tip in tips)

    with open('description.txt', 'w') as f:
        for key, text in zip(col_table, description):
            f.write(key + ' : ' + text + '\n')

### Get URL to every match in a month

In [7]:
def get_html(url):
    """Load a schedule page and return its box-score links plus parsed HTML.

    Args:
        url: address of the page to open in the shared `browser`.

    Returns:
        (links, tree): `links` holds the href of the first descendant of each
        element whose data-stat is "box_score_text" and whose text is not a
        single space; `tree` is the BeautifulSoup parse of the page source.
    """
    browser.get(url)

    box_score_cells = browser.find_elements_by_xpath('//*[@data-stat="box_score_text"]')
    links = []
    for cell in box_score_cells:
        if cell.text == ' ':
            continue  # cell carries no box-score link
        first_child = cell.find_elements_by_xpath('.//*')[0]
        links.append(first_child.get_attribute('href'))

    tree = BeautifulSoup(browser.page_source, 'html.parser')
    return links, tree

### Get column names

In [8]:
def get_info(url):
    """Build the column names of the game table from one box-score page.

    Args:
        url: address of a box-score page.

    Returns:
        ['Date'] followed by 'Team' and the basic box-score headers (from the
        fourth <th> onward), first prefixed with 'H_' and then with 'A_'.
    """
    browser.get(url)

    # The first link in the line score names the team whose basic box-score
    # table is used as the header source.
    first_team = browser.find_element_by_id('line_score')\
                        .find_element_by_tag_name('a').text
    box_table = browser.find_element_by_id(f'box-{first_team}-game-basic')

    header_cells = box_table.find_element_by_tag_name('thead')\
                            .find_elements_by_tag_name('th')
    stat_names = ['Team'] + [cell.text for cell in header_cells[3:]]

    col_table = ['Date']
    for prefix in ('H_', 'A_'):
        col_table.extend(prefix + stat for stat in stat_names)

    return col_table

### Get Data from every match

In [9]:
def get_data(url):
    """Scrape one game's date and both teams' total lines.

    Args:
        url: address of a box-score page.

    Returns:
        A flat list: the date, then for each team (line-score order reversed,
        which the original author notes puts the home team first) its
        abbreviation followed by the <td> texts of its basic box-score footer.

    Raises:
        ValueError: if the scorebox text does not contain ', '.
    """
    browser.get(url)

    when = browser.find_element_by_class_name('scorebox_meta')\
                  .find_element_by_tag_name('div').text
    # Text before the first ', ' is the tip-off time; only the date is kept.
    _, game_date = when.split(', ', 1)

    team_links = browser.find_element_by_id('line_score')\
                        .find_elements_by_tag_name('a')
    teams = [link.text for link in team_links]
    teams.reverse()  # home first

    box_tables = [browser.find_element_by_id(f'box-{team}-game-basic')
                  for team in teams]

    data_table = [game_date]
    for team, box_table in zip(teams, box_tables):
        totals_row = box_table.find_element_by_tag_name('tfoot')
        data_table.append(team)
        data_table.extend(cell.text
                          for cell in totals_row.find_elements_by_tag_name('td'))

    return data_table

### Main 

In [10]:
# Scrape every game of the configured seasons into `df`.
#
# Rows are accumulated in a plain list and turned into a DataFrame once at
# the end: `DataFrame.append` copied the whole frame for every game
# (quadratic cost) and no longer exists in pandas >= 2.0.
main_url = 'https://www.basketball-reference.com'
years = [2019, 2020, 2021]
cols = []  # column names, filled from the first box score visited
rows = []  # one list of values per game, in scrape order

# Reference files are built from the first season in `years`.
get_full_name()
get_elo(years[0])
get_old_stat(years[0])
get_description(years[0])

for year in years:
    print(year)
    year_url = f'/leagues/NBA_{year}_games.html'
    urls, tree = get_html(main_url + year_url)

    # Named `month_filter` so the builtin `filter` is not shadowed.
    month_filter = tree.find('div', class_=['filter'])

    for i, tag in enumerate(month_filter.find_all('a')):
        if i: # first link same with main page // no need to get html
            urls, tree = get_html(main_url + tag['href'])

        with tqdm(urls) as pbar:
            pbar.set_description("Processing %s" % tag.text)
            for link in pbar:
                if not cols:
                    cols = get_info(link)
                rows.append(get_data(link))

# Build the frame in one step; pandas raises ValueError here if any row's
# length differs from len(cols).
df = pd.DataFrame(rows, columns=cols)

Get Full Name: 100%|███████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 44.66it/s]
Get Elo: 100%|█████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 69.48it/s]
Get Description: 100%|████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 205.15it/s]


2019


Processing October: 100%|████████████████████████████████████████████████████████████| 110/110 [02:47<00:00,  1.52s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 219/219 [05:56<00:00,  1.63s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 219/219 [06:18<00:00,  1.73s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 221/221 [06:12<00:00,  1.68s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 158/158 [04:16<00:00,  1.62s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 224/224 [06:06<00:00,  1.64s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 127/127 [03:34<00:00,  1.69s/it]
Processing May: 100%|██████████████████████████████████████████████████████████████████| 29/29 [00:45<00:00,  1.58s/it]
Processing June: 100%|██████████████████

2020


Processing October 2019: 100%|█████████████████████████████████████████████████████████| 68/68 [01:56<00:00,  1.71s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 215/215 [06:00<00:00,  1.68s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 220/220 [06:04<00:00,  1.65s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 222/222 [06:11<00:00,  1.67s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 168/168 [04:37<00:00,  1.65s/it]
Processing March: 100%|████████████████████████████████████████████████████████████████| 78/78 [02:02<00:00,  1.57s/it]
Processing July: 100%|███████████████████████████████████████████████████████████████████| 8/8 [00:12<00:00,  1.57s/it]
Processing August: 100%|█████████████████████████████████████████████████████████████| 123/123 [03:26<00:00,  1.68s/it]
Processing September: 100%|█████████████

2021


Processing December: 100%|█████████████████████████████████████████████████████████████| 67/67 [01:59<00:00,  1.79s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 222/222 [06:22<00:00,  1.72s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 212/212 [06:24<00:00,  1.81s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 204/204 [06:06<00:00,  1.80s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 240/240 [06:57<00:00,  1.74s/it]
Processing May: 100%|████████████████████████████████████████████████████████████████| 173/173 [04:52<00:00,  1.69s/it]
Processing June: 100%|█████████████████████████████████████████████████████████████████| 45/45 [01:23<00:00,  1.85s/it]
Processing July: 100%|███████████████████████████████████████████████████████████████████| 8/8 [00:14<00:00,  1.84s/it]


In [11]:
# Sanity check: (number of games, number of columns) of the scraped table.
df.shape

(3626, 43)

In [12]:
# Reverse the row order so the most recently scraped game comes first
# (latest -> oldest). Re-running this cell flips the order back again.
df = df.iloc[::-1]
df.head()
# header=False: the CSV is written without a column-name row.
df.to_csv('raw_data.csv', header=False, index=False)

$R_i$: Elo of a team in $i^{th}$ match

$R_{i + 1} = R_i + k \, (S_{team} - E_{team})$

$S_{team} = 1$ for a win, $0$ for a loss (and $0.5$ if the scores are tied)

$E_{team} = \frac{1}{1 + 10^{\frac{oppElo - teamElo}{400}}}$

$k = 20 \frac{(MOV_{winner} + 3)^{0.8}}{7.5 + 0.006(\text{Elo_diff}_{winner})}$

$MOV_{winner} = winner_{score} - loser_{score}$

$\text{Elo_diff}_{winner} = winner_{elo} - loser_{elo}$

In [13]:
def K(MOV, elo_diff):
    """Return the margin-of-victory K-factor and its negation.

    Args:
        MOV: home score minus away score.
        elo_diff: home Elo minus away Elo.

    Both inputs are flipped to the winner's point of view (a MOV of zero or
    less is treated as an away win) before applying
    20 * (MOV + 3) ** 0.8 / (7.5 + 0.006 * elo_diff).

    Returns:
        (k, -k)
    """
    # +1 when the home side won, -1 otherwise.
    if MOV > 0:
        sign = 1
    else:
        sign = -1

    winner_margin = sign * MOV
    winner_elo_edge = sign * elo_diff

    numerator = 20 * (winner_margin + 3) ** (0.8)
    denominator = 7.5 + 0.006 * (winner_elo_edge)
    factor = numerator / denominator

    return factor, -factor

In [14]:
def S(MOV):
    """Return the (home, away) game outcome for a margin of victory.

    Args:
        MOV: home score minus away score.

    Returns:
        (1, 0) when MOV is positive, (0, 1) when it is negative, and
        (.5, .5) otherwise.
    """
    if MOV > 0:
        return 1, 0
    if MOV < 0:
        return 0, 1
    return .5, .5

In [15]:
def elo_prediction(home_elo,away_elo):
    """Return the home side's expected score, 1 / (1 + 10 ** ((away - home) / 400)).

    Args:
        home_elo: rating of the home team.
        away_elo: rating of the away team.
    """
    exponent = (away_elo - home_elo) / (400.)
    return 1./(1 + 10 ** exponent)

In [16]:
def score_prediction(home_elo,away_elo):
    """Return the Elo gap (home minus away) divided by 28.

    Args:
        home_elo: rating of the home team.
        away_elo: rating of the away team.
    """
    elo_gap = home_elo - away_elo
    return elo_gap/28.

In [17]:
def elo_update(home_score, away_score, home_elo, away_elo):
    """Return the Elo changes (home_delta, away_delta) produced by one game.

    Args:
        home_score: points scored by the home team.
        away_score: points scored by the away team.
        home_elo: home team's rating before the game.
        away_elo: away team's rating before the game.

    Returns:
        (k * (S_home - E_home), k * (S_away - E_away)), i.e. the amounts to
        add to each team's rating according to R_new = R_old + k * (S - E).
        Because E_away = 1 - E_home and S_away = 1 - S_home, the two deltas
        always sum to zero.
    """
    home_advan = 100. #home advantage
    # The home-court bonus is applied only for this computation; the
    # caller's rating value is not modified.
    home_elo += home_advan
    E_home = elo_prediction(home_elo, away_elo)
    E_away = 1 - E_home
    elo_diff = home_elo - away_elo
    MOV = home_score - away_score # margin of victory

    # S() takes the margin of victory, not the two scores; the previous
    # two-argument call raised TypeError on every invocation.
    S_home, S_away = S(MOV)

    # Use the same positive K for both sides. K() also returns -k, but
    # (S_away - E_away) is already the negative of the home term, so
    # multiplying it by -k would hand both teams the same change.
    k, _ = K(MOV, elo_diff)

    return k * (S_home - E_home), k * (S_away - E_away)