![](pngs/nba_html_crawler.png)
1. Define Foundation
2. Get Player Alphabet Urls
3. Get Player Urls
4. Save Player Html Pages
5. Get Award Pages
6. Save Award Html Pages
7. Get ALL Team Season Urls
8. Save ALL Team Season Html Pages 
---

### ➤ 1 Define Foundation 

In [1]:
import io
import os
import sys
import time as t
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# --- Remote endpoints: every page is addressed relative to the site root ---
MAIN_URL = "https://www.basketball-reference.com/"
ALPHABET_URL = urljoin(MAIN_URL, "players/")
ALL_NBA_URL = urljoin(MAIN_URL, "awards/all_league.html")
ALL_DEFENSIVE_URL = urljoin(MAIN_URL, "awards/all_defense.html")
MVP_URL = urljoin(MAIN_URL, "awards/mvp.html")
SEASON_URL = urljoin(MAIN_URL, "leagues/")

# --- Local storage: root folder plus one sub-folder per kind of saved HTML page ---
DATA_PATH = r"C:\Users\knaue\Documents\Data\NBA"
PLAYER_HTML_PATH, AWARD_HTML_PATH, SEASON_HTML_PATH = (
    os.path.join(DATA_PATH, folder)
    for folder in ("PLAYER_HTML", "AWARD_HTML", "SEASON_HTML")
)

# --- CSV files that hold the collected URLs between the crawl steps ---
ALPHABET_PATH, PLAYER_PATH, AWARD_PATH, SEASON_PATH = (
    os.path.join(DATA_PATH, file_name)
    for file_name in ("Alphabet_Urls.csv", "Player_Urls.csv", "Award_Urls.csv", "Season_Urls.csv")
)

# Parser backend handed to every BeautifulSoup(...) call
PARSER = 'lxml'
# None disables the player filter; any other value keeps only the players
# whose 'Active' column equals it when the player pages are saved.
ONLY_ACTIVE_PLAYER = None

In [3]:
def filter_out_comment(soup: BeautifulSoup) -> BeautifulSoup:
    """Return a new soup in which the HTML comment delimiters are removed.

    The document is serialised to text, every ``<!--`` and ``-->`` is deleted,
    and the result is parsed again with ``PARSER``. Markup that was wrapped in
    a comment is therefore parsed as regular markup in the returned soup.
    """
    markup = str(soup)
    for delimiter in ('<!--', '-->'):
        markup = markup.replace(delimiter, '')
    return BeautifulSoup(markup, PARSER)

def request_data(url: str, sleep_time_sec: float = 1.0, with_comment: bool = True,
                 timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and return it as a parsed BeautifulSoup document.

    Args:
        url: Address of the page to download.
        sleep_time_sec: Seconds to wait BEFORE the request is sent; used to
            throttle consecutive calls.
        with_comment: If True, the page is parsed as received. If False, the
            comment delimiters are stripped via ``filter_out_comment`` so that
            markup inside HTML comments becomes regular markup.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot block the crawl forever.

    Returns:
        The parsed page.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.

    NOTE: the HTTP status code is not checked; an error page returned by the
    server is parsed and returned like any other page.
    """
    t.sleep(sleep_time_sec)

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, PARSER)
    return soup if with_comment else filter_out_comment(soup)

In [4]:
def season_to_int(cell_value: str):
    """Convert a season label such as ``"2020-21"`` to its end year (``2021``).

    The first two characters are read as the century prefix and the last two
    as the year within the century. A label ending in ``"00"`` rolls over
    into the next century, e.g. ``"1999-00"`` -> ``2000``.
    """
    century_prefix, year_suffix = cell_value[:2], cell_value[-2:]
    if year_suffix != "00":
        return int(century_prefix + year_suffix)
    # "..-00": the season ends in the first year of the following century
    return (int(century_prefix) + 1) * 100

### ➤ 2 Get Player Alphabet Urls

In [5]:
# Download the player index page and keep only the block that holds the letter links
content = request_data(ALPHABET_URL, 1.0, False).find("div", id="div_alphabet")

# link text -> absolute URL of the page the link points to
alphabet_dict = {
    anchor.get_text(): urljoin(ALPHABET_URL, anchor['href'])
    for anchor in content.find_all("a")
}

# One row per link text (used as index), with a single "url" column
df_alphabet_urls = pd.DataFrame.from_dict(alphabet_dict, orient="index", columns=["url"])
df_alphabet_urls.to_csv(ALPHABET_PATH, encoding="utf-8-sig")
print("Saved to: ", ALPHABET_PATH)

Saved to:  C:\Users\knaue\Documents\Data\NBA\Alphabet_Urls.csv


### ➤ 3 Get Player Urls

In [6]:
# Read the letter pages collected in the previous step and build one table
# with every player, the player's page URL and the local file path for it.
df_alphabet_urls = pd.read_csv(ALPHABET_PATH, encoding="utf-8-sig")
letter_urls = df_alphabet_urls["url"].values
player_frames = []

for i, url in enumerate(letter_urls, start=1):
    sys.stdout.write(f"\r{i}/{len(letter_urls)}...")

    content = request_data(url, 4.0, False)
    content = content.find("table", id="players")
    # read_html expects a file-like object; passing literal HTML is deprecated
    df = pd.read_html(io.StringIO(str(content)))[0]

    # A literal "*" in the player name sets the Hall_of_Fame flag and is then
    # removed from the name. Plain-text matching avoids regex escaping.
    df['Hall_of_Fame'] = df['Player'].str.contains("*", regex=False)
    df['Player'] = df['Player'].str.replace("*", "", regex=False)

    # Keep only links whose target contains "players". The lists built from
    # them must have exactly one entry per table row, otherwise the column
    # assignments below raise.
    player_links = [a for a in content.find_all("a") if "players" in a['href']]

    # A link directly preceded by a <strong> element is flagged as active
    df['Active'] = [a.previous_element.name == "strong" for a in player_links]
    df['Url'] = [urljoin(ALPHABET_URL, a['href']) for a in player_links]

    player_frames.append(df)

dfs = pd.concat(player_frames, ignore_index=True)
# File name = URL with "/" -> "{" and ":" -> "}"
dfs['Path'] = dfs['Url'].apply(lambda cell: os.path.join(PLAYER_HTML_PATH, cell.replace("/", "{").replace(":", "}")))
dfs.to_csv(PLAYER_PATH, index=False, encoding="utf-8-sig")
print("\nSaved to: ", PLAYER_PATH)

25/25...
Saved to:  C:\Users\knaue\Documents\Data\NBA\Player_Urls.csv


### ➤ 4 Save Player Html Pages

In [8]:
# Download every player page listed in the player CSV and store it as a file.
df_player_urls = pd.read_csv(PLAYER_PATH, usecols=["Active", "Url"], encoding="utf-8-sig")

# Optional filter: None keeps all players
if ONLY_ACTIVE_PLAYER is not None:
    df_player_urls = df_player_urls[df_player_urls['Active'] == ONLY_ACTIVE_PLAYER]

# open(..., "w") does not create missing folders
os.makedirs(PLAYER_HTML_PATH, exist_ok=True)

player_urls = df_player_urls["Url"].values
for i, url in enumerate(player_urls, start=1):
    sys.stdout.write(f"\r{i}/{len(player_urls)}...")

    content = request_data(url=url, sleep_time_sec=4.0, with_comment=False)
    # File name = URL with "/" -> "{" and ":" -> "}"
    file_name = url.replace("/", "{").replace(":", "}")
    player_path = os.path.join(PLAYER_HTML_PATH, file_name)

    # The with-statement closes the file; no explicit close() needed
    with open(player_path, "w", encoding='utf-8-sig') as f:
        f.write(str(content))

print("\nSaved to: ", PLAYER_HTML_PATH, "...")

5023/5023...
Saved to:  C:\Users\knaue\Documents\Data\NBA\PLAYER_HTML ...


### ➤ 5 Get Award Pages

In [11]:
# MVP Voting: one row per season with the link to that season's voting page
content = request_data(url=MVP_URL, sleep_time_sec=4.0, with_comment=False)
table = content.find("table", id="mvp_NBA")
# read_html expects a file-like object; passing literal HTML is deprecated
df_table = pd.read_html(io.StringIO(str(table)))[0]
df_table = df_table.droplevel(0, axis=1)  # drop the upper level of the two-level header
df_table['Season'] = df_table['Season'].apply(season_to_int)

# Absolute URL of the link inside every "voting" cell of the table body.
# The list must have one entry per table row, otherwise the assignment raises.
votings = [
    urljoin(MAIN_URL, td.a['href'])
    for td in table.find("tbody").find_all("td", class_="center", attrs={"data-stat": "voting"})
]
df_table['Voting_Url'] = votings

# .copy() makes the selection an independent frame, so adding a column below
# does not trigger pandas' SettingWithCopyWarning
df_table = df_table[['Season', 'Voting_Url']].copy()
# File name = URL with "/" -> "{" and ":" -> "}"
df_table['Voting_Path'] = df_table['Voting_Url'].apply(lambda cell: os.path.join(AWARD_HTML_PATH, cell.replace("/", "{").replace(":", "}")))

# All NBA and All Defensive: appended as extra rows, with a label in the 'Season' column
for label, award_url in (("All_NBA", ALL_NBA_URL), ("All_DEFENSIVE", ALL_DEFENSIVE_URL)):
    award_path = os.path.join(AWARD_HTML_PATH, award_url.replace("/", "{").replace(":", "}"))
    df_table.loc[len(df_table)] = [label, award_url, award_path]

df_table.to_csv(AWARD_PATH, index=False, encoding="utf-8-sig")
print("Saved to: ", AWARD_PATH)

Saved to:  C:\Users\knaue\Documents\Data\NBA\Award_Urls.csv


### ➤ 6 Save Award Html Pages 

In [12]:
# Download every award page listed in the award CSV and store it under the
# path recorded next to its URL.
df_award = pd.read_csv(AWARD_PATH, encoding="utf-8-sig")

# open(..., "w") does not create missing folders
os.makedirs(AWARD_HTML_PATH, exist_ok=True)

award_targets = df_award[['Voting_Url', 'Voting_Path']].values
for i, (url, path) in enumerate(award_targets, start=1):
    sys.stdout.write(f"\r{i}/{len(df_award)}...")

    content = request_data(url=url, sleep_time_sec=4.0, with_comment=False)
    # The with-statement closes the file; no explicit close() needed
    with open(path, "w", encoding='utf-8-sig') as f:
        f.write(str(content))

print("\nSaved to: ", AWARD_HTML_PATH, "...")

69/69...
Saved to:  C:\Users\knaue\Documents\Data\NBA\AWARD_HTML ...


### ➤ 7 Get ALL Team Season Urls

In [13]:
# Build the table of NBA seasons with the URLs of each season's summary,
# standings and playoff-standings pages.
content = request_data(SEASON_URL, with_comment=False)
content = content.find("table", id="stats")
# read_html expects a file-like object; passing literal HTML is deprecated
df = pd.read_html(io.StringIO(str(content)))[0]
df = df.droplevel(0, axis=1)  # drop the upper level of the two-level header
df = df.drop(columns=['MVP', 'Rookie of the Year', 'Points', 'Rebounds', 'Assists', 'Win Shares'])
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning
df = df[df['Lg'] == 'NBA'].copy()

seasons = []
for season in df['Season'].values:
    # find() returns the FIRST text node equal to the season label.
    # NOTE(review): if the same label occurs in several rows of the table,
    # the link of the first occurrence is used - verify it is the NBA row.
    season_text = content.find(string=season)
    seasons.append(urljoin(SEASON_URL, season_text.parent['href']))

df['Url_Season_Summary'] = seasons
# ".../X.html" -> ".../X_standings.html"
df['Url_Season_Standings'] = df['Url_Season_Summary'].apply(lambda cell: cell[:-len(".html")] + "_standings.html")
# Same URL with "leagues" swapped for "playoffs" (plain-text replacement)
df['Url_Playoff_Standings'] = df['Url_Season_Standings'].str.replace("leagues", "playoffs", regex=False)
df['Season'] = df['Season'].apply(season_to_int)

df.to_csv(SEASON_PATH, index=False, encoding="utf-8-sig")
print("Saved to: ", SEASON_PATH)

Saved to:  C:\Users\knaue\Documents\Data\NBA\Season_Urls.csv


### ➤ 8 Save ALL Team Season Html Pages 

In [14]:
# Download every season summary, standings and playoff-standings page listed
# in the season CSV and store it as a file.
url_columns = ['Url_Season_Summary', 'Url_Season_Standings', 'Url_Playoff_Standings']
df = pd.read_csv(SEASON_PATH, usecols=url_columns, encoding="utf-8-sig")

# Unique URLs per column, concatenated column after column
unique_urls = np.concatenate([df[column].unique() for column in url_columns], axis=0)

# open(..., "w") does not create missing folders
os.makedirs(SEASON_HTML_PATH, exist_ok=True)

for i, url in enumerate(unique_urls, start=1):
    sys.stdout.write(f"\r{i}/{len(unique_urls)}...")

    content = request_data(url=url, sleep_time_sec=4.0, with_comment=False)
    # "/" and ":" cannot be used in the file name, so map "/" -> "{" and ":" -> "}"
    file_name = url.replace("/", "{").replace(":", "}")
    season_path = os.path.join(SEASON_HTML_PATH, file_name)

    # The with-statement closes the file; no explicit close() needed
    with open(season_path, "w", encoding='utf-8-sig') as f:
        f.write(str(content))

print("\nSaved to: ", SEASON_HTML_PATH, "...")

219/219...
Saved to:  C:\Users\knaue\Documents\Data\NBA\SEASON_HTML ...
