### Import necessary libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
## 1. Parse HTML

# Start one headless Chrome session; the scraping functions in this file use
# this module-level `browser`.
opts = webdriver.ChromeOptions()
# Pass the flag as an argument: the `headless` property setter is deprecated
# in newer Selenium 4 releases and later removed, while the plain argument
# is accepted by every version.
opts.add_argument('--headless')
browser = webdriver.Chrome(options=opts)
browser.maximize_window()
# Start with an empty frame for the scraped results.
df = pd.DataFrame()

### Get the URL of every match in a month

In [2]:
def get_html(url):
    """Load a schedule page and collect the box-score links it contains.

    The page is opened in the module-level ``browser``.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(links, tree)`` tuple: ``links`` is a list of ``href`` values
        taken from the cells marked ``data-stat="box_score_text"``, and
        ``tree`` is the page source parsed with BeautifulSoup.
    """
    # Imported here so the function does not rely on the legacy
    # ``find_elements_by_*`` helpers, which Selenium 4.3 removed.
    from selenium.webdriver.common.by import By

    browser.get(url)

    cells = browser.find_elements(By.XPATH, '//*[@data-stat="box_score_text"]')
    links = []
    for cell in cells:
        # Cells whose text is a single space are skipped (presumably the
        # header cells -- confirm against the page markup).
        if cell.text == ' ':
            continue
        children = cell.find_elements(By.XPATH, './/*')
        # A cell without any child element has no link to follow; skip it
        # instead of failing with IndexError.
        if not children:
            continue
        href = children[0].get_attribute('href')
        if href is not None:
            links.append(href)

    tree = BeautifulSoup(browser.page_source, 'html.parser')
    return links, tree

### Get column names and descriptions

In [3]:
def get_info(url):
    """Build the column names for a match row and write their descriptions.

    Opens ``url`` in the module-level ``browser``, reads the header of the
    ``box-<team>-game-basic`` table of the first team listed in the
    ``line_score`` element, and overwrites ``description.txt`` with one
    ``column : description`` line per column.

    Args:
        url: Address of a single match page.

    Returns:
        A list of column names: ``'Name'`` followed by the header labels,
        once with the ``A_`` prefix and once with the ``H_`` prefix.
    """
    # Imported here so the function does not rely on the legacy
    # ``find_element_by_*`` helpers, which Selenium 4.3 removed.
    from selenium.webdriver.common.by import By

    browser.get(url)

    team_links = browser.find_element(By.ID, 'line_score')\
                        .find_elements(By.TAG_NAME, 'a')
    teams = [t.text for t in team_links]

    table = browser.find_element(By.ID, f'box-{teams[0]}-game-basic')
    header = table.find_element(By.TAG_NAME, 'thead')
    # The first three header cells are dropped (presumably group headings
    # and the player-name heading -- confirm against the page markup).
    col_tags = header.find_elements(By.TAG_NAME, 'th')[3:]

    cols = ['Name'] + [c.text for c in col_tags]
    # 'Name' needs its own description entry; without it every description
    # is paired with the wrong column when the two lists are zipped.
    # A missing data-tip attribute becomes an empty description.
    desc = ['Team Name'] + [c.get_attribute('data-tip') or '' for c in col_tags]

    col_table = [prefix + c for prefix in ('A_', 'H_') for c in cols]
    description = [prefix + d for prefix in ('Away ', 'Home ') for d in desc]

    with open('description.txt', 'w', encoding='utf-8') as f:
        f.writelines(f'{c} : {d}\n' for c, d in zip(col_table, description))

    return col_table

### Get data from every match

In [4]:
def get_data(url):
    """Collect the table-footer values of every team on a match page.

    Opens ``url`` in the module-level ``browser``. For each team linked in
    the ``line_score`` element, reads the ``<td>`` cells in the ``<tfoot>``
    of that team's ``box-<team>-game-basic`` table.

    Args:
        url: Address of a single match page.

    Returns:
        A flat list: for each team in page order, the team's link text
        followed by the text of its footer cells.
    """
    # Imported here so the function does not rely on the legacy
    # ``find_element_by_*`` helpers, which Selenium 4.3 removed.
    from selenium.webdriver.common.by import By

    browser.get(url)

    team_links = browser.find_element(By.ID, 'line_score')\
                        .find_elements(By.TAG_NAME, 'a')
    teams = [t.text for t in team_links]

    data_table = []
    for team in teams:
        table = browser.find_element(By.ID, f'box-{team}-game-basic')
        footer = table.find_element(By.TAG_NAME, 'tfoot')
        data_table.append(team)
        data_table.extend(
            cell.text for cell in footer.find_elements(By.TAG_NAME, 'td')
        )

    return data_table

### Main 

In [None]:
# Walk the season schedule pages month by month and collect one row per
# match into `df`.
main_url = 'https://www.basketball-reference.com'
years = [2017, 2018, 2019, 2020, 2021]

cols = None  # column names, taken from the first match that is scraped
rows = []    # one list of values per match

try:
    for y in years:
        year_url = f'/leagues/NBA_{y}_games.html'

        urls, tree = get_html(main_url + year_url)

        # Named `month_filter` so the builtin `filter` is not shadowed.
        month_filter = tree.find('div', class_=['filter'])

        for i, tag in enumerate(month_filter.find_all('a')):
            print(i)
            if i:  # first link is the same as the main page, already loaded
                link = main_url + tag['href']
                urls, tree = get_html(link)

            for j, l in enumerate(urls):
                print(round(j / len(urls) * 100, 2), '%')
                # Column names are fetched once; this assumes every match
                # page exposes the same columns.
                if cols is None:
                    cols = get_info(l)
                # Append a new row per match. Indexing the frame by the
                # month number `i` made every match of a month overwrite
                # the same row.
                rows.append(get_data(l))
        # NOTE(review): this stops after the first entry in `years`;
        # remove it to scrape every listed season.
        break
finally:
    # Build the frame even if scraping is interrupted, so the rows that
    # were already collected are not lost.
    df = pd.DataFrame(rows, columns=cols)

0
0.0 %
2.22 %
4.44 %
6.67 %
8.89 %
11.11 %
13.33 %
15.56 %
17.78 %
20.0 %
22.22 %
24.44 %
26.67 %
28.89 %
31.11 %
33.33 %
35.56 %
37.78 %
40.0 %
42.22 %
44.44 %
46.67 %
48.89 %
51.11 %
53.33 %
55.56 %
57.78 %
60.0 %
62.22 %
64.44 %
66.67 %
68.89 %
71.11 %
73.33 %
75.56 %
77.78 %
80.0 %
82.22 %
84.44 %
86.67 %
88.89 %
91.11 %
93.33 %
95.56 %
97.78 %
1
0.0 %
0.44 %
0.87 %
1.31 %
1.75 %
2.18 %
2.62 %
3.06 %
3.49 %
3.93 %
4.37 %
4.8 %
5.24 %
5.68 %
6.11 %
6.55 %
6.99 %
7.42 %
7.86 %
8.3 %
8.73 %
9.17 %
9.61 %
10.04 %
10.48 %
10.92 %
11.35 %
11.79 %
12.23 %
12.66 %
13.1 %
13.54 %
13.97 %
14.41 %
14.85 %
15.28 %
15.72 %
16.16 %
16.59 %
17.03 %
17.47 %
17.9 %
18.34 %
18.78 %
19.21 %
19.65 %
20.09 %
20.52 %
20.96 %
21.4 %
21.83 %
22.27 %
22.71 %
23.14 %
23.58 %
24.02 %
24.45 %
24.89 %
25.33 %
25.76 %
26.2 %
26.64 %
27.07 %
27.51 %
27.95 %
28.38 %
28.82 %
29.26 %
29.69 %
30.13 %
30.57 %
31.0 %
31.44 %
31.88 %
32.31 %
32.75 %
33.19 %
33.62 %
34.06 %
34.5 %
34.93 %
35.37 %
35.81 %
36.24 %
36.68 

In [None]:
# Append the frame to data.csv without a header line or an index column.
df.to_csv(
    'data.csv',
    mode='a',
    header=False,
    index=False,
)