### Import necessary libraries

In [1]:
# !pip install tqdm

In [2]:
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
from tqdm import tqdm


# Configure Chrome to run headless and start the single browser session that
# the scraping functions below reach through the module-level name `browser`.
opts = webdriver.ChromeOptions()
opts.headless = True
browser = webdriver.Chrome(options=opts)
browser.maximize_window()
# Empty placeholder; the main cell rebinds `df` to the scraped table.
df = pd.DataFrame()

### Get URL to every match in a month

In [3]:
def get_html(url):
    """Load a schedule page and collect the box-score links it contains.

    Parameters
    ----------
    url : str
        Address of the page to open in the shared ``browser`` session.

    Returns
    -------
    links : list
        ``href`` attribute of the first descendant element of every
        ``data-stat="box_score_text"`` cell that has one.
    tree : BeautifulSoup
        The page source parsed with ``html.parser``.
    """
    browser.get(url)

    links = []
    for cell in browser.find_elements_by_xpath('//*[@data-stat="box_score_text"]'):
        if cell.text == ' ':
            continue
        children = cell.find_elements_by_xpath('.//*')
        # A cell without any child element carries no link; indexing [0] on
        # it would raise IndexError, so skip it.
        if not children:
            continue
        links.append(children[0].get_attribute('href'))

    tree = BeautifulSoup(browser.page_source, 'html.parser')
    return links, tree

### Get column-names and description

In [4]:
def get_info(url):
    """Build the output column names from one box-score page.

    Opens ``url``, reads the header cells of the basic box-score table of
    the first team linked in the line score, and derives one ``H_``-prefixed
    and one ``A_``-prefixed column per header (plus ``Date`` and ``Time``).
    A matching ``name : description`` listing, taken from each header's
    ``data-tip`` attribute, is written to ``description.txt``.

    Returns
    -------
    list of str
        The column names, in output order.
    """
    browser.get(url)

    line_score = browser.find_element_by_id('line_score')
    team = line_score.find_element_by_tag_name('a').text

    box_table = browser.find_element_by_id(f'box-{team}-game-basic')
    header_cells = (box_table.find_element_by_tag_name('thead')
                             .find_elements_by_tag_name('th'))
    # The first three header cells are not per-stat columns; drop them.
    stat_cells = header_cells[3:]

    leading = ['Date', 'Time']

    # Column names: the same stat list once per side.
    names = ['Name'] + [cell.text for cell in stat_cells]
    col_table = (leading
                 + ['H_' + name for name in names]
                 + ['A_' + name for name in names])

    # Human-readable descriptions, aligned one-to-one with col_table.
    tips = ['Team'] + [cell.get_attribute("data-tip") for cell in stat_cells]
    description = (leading
                   + ['Home ' + tip for tip in tips]
                   + ['Away ' + tip for tip in tips])

    with open('description.txt', 'w') as out:
        out.writelines(col + ' : ' + text + '\n'
                       for col, text in zip(col_table, description))

    return col_table

### Get Data from every match

In [5]:
def get_data(url):
    """Scrape one game's team totals into a flat row.

    Opens ``url`` and returns ``[date, time, <team>, <totals...>,
    <team>, <totals...>]``, where the teams appear in the reverse of their
    order in the line score and the totals are the ``td`` texts of each
    team's basic box-score table footer.
    """
    browser.get(url)

    meta = browser.find_element_by_class_name('scorebox_meta')
    tip_off = meta.find_element_by_tag_name('div').text
    # The text before the first ', ' is the time; the remainder is the date.
    time, date = tip_off.split(', ', 1)

    team_links = (browser.find_element_by_id('line_score')
                         .find_elements_by_tag_name('a'))
    # Reverse the line-score order so the home team comes first.
    teams = list(reversed([anchor.text for anchor in team_links]))

    # Locate both tables before reading either one.
    tables = [browser.find_element_by_id(f'box-{name}-game-basic')
              for name in teams]

    row = [date, time]
    for name, table in zip(teams, tables):
        totals = (table.find_element_by_tag_name('tfoot')
                       .find_elements_by_tag_name('td'))
        row.append(name)
        row.extend(cell.text for cell in totals)

    return row

### Main 

In [6]:
# Scrape every game of a season, month by month, into `df`.
main_url = 'https://www.basketball-reference.com'
years = [2019, 2020, 2021]
cols = []
rows = []  # one list per game; turned into a DataFrame once, after the loop
for year in years:
    print(year)
    year_url = f'/leagues/NBA_{year}_games.html'
    urls, tree = get_html(main_url + year_url)

    # Named `month_filter` so the builtin `filter` is not shadowed.
    month_filter = tree.find('div', class_=['filter'])

    for i, tag in enumerate(month_filter.find_all('a')):
        if i: # first link same with main page // no need to get html
            link = main_url + tag['href']
            urls, tree = get_html(link)

        with tqdm(urls) as pbar:
            pbar.set_description("Processing %s" % tag.text)
            for link in pbar:
                if not cols:
                    # Column names are derived once, from the first game seen.
                    cols = get_info(link)

                row = get_data(link)
                # Fail on the offending game right away rather than when the
                # frame is assembled at the very end of a long scrape.
                if len(row) != len(cols):
                    raise ValueError(
                        f'{link}: expected {len(cols)} values, got {len(row)}')
                rows.append(row)
    # NOTE: this stops after the first entry in `years`; later seasons are
    # only scraped if the list is edited (or this break is removed).
    break

# Build the frame in one go: growing a DataFrame row by row is quadratic, and
# DataFrame.append no longer exists in pandas 2.0+.
df = pd.DataFrame(rows, columns=cols)

2019


Processing October: 100%|████████████████████████████████████████████████████████████| 110/110 [04:10<00:00,  2.27s/it]
Processing November: 100%|███████████████████████████████████████████████████████████| 219/219 [08:56<00:00,  2.45s/it]
Processing December: 100%|███████████████████████████████████████████████████████████| 219/219 [09:45<00:00,  2.68s/it]
Processing January: 100%|████████████████████████████████████████████████████████████| 221/221 [08:10<00:00,  2.22s/it]
Processing February: 100%|███████████████████████████████████████████████████████████| 158/158 [06:00<00:00,  2.28s/it]
Processing March: 100%|██████████████████████████████████████████████████████████████| 224/224 [08:47<00:00,  2.36s/it]
Processing April: 100%|██████████████████████████████████████████████████████████████| 127/127 [04:41<00:00,  2.22s/it]
Processing May: 100%|██████████████████████████████████████████████████████████████████| 29/29 [01:39<00:00,  3.44s/it]
Processing June: 100%|██████████████████

In [7]:
# (rows, columns) of the scraped table
df.shape

(1312, 44)

In [8]:
# Append the rows to data.csv with no header row and no index column.
# Because of mode='a', re-running this cell writes the same rows again.
df.to_csv('data.csv', mode = 'a', header=False, index=False)

$R_i$: Elo of a team in $i^{th}$ match

$R_{i + 1} = R_i + k \cdot (S_{team} - E_{team})$

$S_{team} = 1$ for a win, $0$ for a loss, $0.5$ for a tie

$E_{team} = \frac{1}{1 + 10^{\frac{oppElo - teamElo}{400}}}$

$k = 20 \frac{(MOV_{winner} + 3)^{0.8}}{7.5 + 0.006(\text{Elo_diff}_{winner})}$

$MOV_{winner} = winner_{score} - loser_{score}$

$\text{Elo_diff}_{winner} = winner_{elo} - loser_{elo}$

In [None]:
def K(MOV, elo_diff):
    """Return the margin-of-victory K-factor as the pair ``(k, -k)``.

    ``k = 20 * (margin + 3) ** 0.8 / (7.5 + 0.006 * diff)``.

    When ``MOV`` is positive, ``margin`` and ``diff`` are ``MOV`` and
    ``elo_diff`` as given; otherwise both are negated, so the formula is
    always evaluated from the winner's point of view.

    Parameters
    ----------
    MOV : number
        Signed margin of victory.
    elo_diff : number
        Signed Elo difference, using the same orientation as ``MOV``.

    Returns
    -------
    tuple
        ``(k, -k)``.
    """
    positive_margin = MOV > 0
    margin = MOV if positive_margin else -MOV
    winner_elo_diff = elo_diff if positive_margin else -elo_diff

    k = 20 * (margin + 3) ** 0.8 / (7.5 + 0.006 * winner_elo_diff)

    return k, -k

In [None]:
def S(MOV):
    """Map a signed margin to the outcome scores ``(S_home, S_away)``.

    A positive margin gives ``(1, 0)``, a negative margin gives ``(0, 1)``,
    and anything else gives ``(.5, .5)``.
    """
    if MOV > 0:
        return 1, 0
    if MOV < 0:
        return 0, 1
    return .5, .5

In [None]:
def elo_prediction(home_elo,away_elo):
    """Expected score of the home side under the logistic Elo model.

    Computes ``1 / (1 + 10 ** ((away_elo - home_elo) / 400))``.
    """
    exponent = (away_elo - home_elo) / (400.)
    return 1./(1 + 10 ** exponent)

In [None]:
def score_prediction(home_elo,away_elo):
    """Predicted point margin for the home side: the Elo gap divided by 28."""
    elo_gap = home_elo - away_elo
    return elo_gap/28.

In [None]:
def elo_update(home_score, away_score, home_elo, away_elo, home_advan=100.):
    """Return the Elo rating changes ``(home_change, away_change)`` for a game.

    The home rating is raised by ``home_advan`` before the expected score and
    the K-factor are computed. The change is ``k * (S - E)`` for the home
    side and its exact negative for the away side, so the update is zero-sum.

    Parameters
    ----------
    home_score, away_score : number
        Final scores of the two teams.
    home_elo, away_elo : number
        Ratings of the two teams before the game.
    home_advan : number, optional
        Rating bonus given to the home side (default 100).

    Returns
    -------
    tuple of float
        ``(home_change, away_change)``, to be added to the respective ratings.
    """
    adjusted_home_elo = home_elo + home_advan
    E_home = elo_prediction(adjusted_home_elo, away_elo)
    elo_diff = adjusted_home_elo - away_elo
    MOV = home_score - away_score # margin of victory

    # S takes the margin only; calling it with both scores raises TypeError.
    S_home, _ = S(MOV)

    # Use the unsigned K-factor (first element). The direction of the update
    # already comes from (S - E); multiplying the away term by the negated
    # factor as well would hand both teams the same signed change.
    k, _ = K(MOV, elo_diff)

    home_change = k * (S_home - E_home)
    return home_change, -home_change