![](pngs/nba_html_crawler.png)
1. Define Foundation
2. Get Season Urls (1/2)
2. Get Season Urls (2/2)
3. Save ALL Team Seasons Html Pages 
---

### ➤ 1 Define Foundation 

In [1]:
import os
import sys
import time as t
from io import StringIO
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site; hrefs scraped from pages are resolved against it.
MAIN_URL = r"https://www.basketball-reference.com/"
# League index page; derived from MAIN_URL so the two can never drift apart.
SEASON_URL = urljoin(MAIN_URL, "leagues/")

# Local output directory. Set the NBA_DATA_PATH environment variable to run the
# notebook on another machine; the default keeps the original location.
DATA_PATH = os.environ.get("NBA_DATA_PATH", r"C:\Users\knaue\Documents\Data\NBA")
# Directory that receives the raw HTML pages.
SEASON_HTML_PATH = os.path.join(DATA_PATH, "SEASON_HTML")
# CSV with one row per season plus the URL of its summary page.
SEASON_PATH = os.path.join(DATA_PATH, "Season_Urls.csv")
# CSV with one row per team and season plus the related page URLs.
SEASON_DETAIL_PATH = os.path.join(DATA_PATH, "Season_Detail_Urls.csv")

# Parser backend handed to every BeautifulSoup(...) call in this notebook.
PARSER = 'lxml'

In [3]:
# Some information is hidden inside HTML comments --> turn the comments into normal html code
def filter_out_comment(soup: BeautifulSoup) -> BeautifulSoup:
    """Return a re-parsed copy of ``soup`` with all comment delimiters removed.

    The document is serialised to text, every ``<!--`` and ``-->`` marker is
    deleted, and the remaining markup is parsed again with ``PARSER``. Markup
    that was wrapped in a comment therefore becomes part of the regular tree.

    Args:
        soup: Parsed document whose comments should be unwrapped.

    Returns:
        A new ``BeautifulSoup`` object built from the stripped markup.
    """
    markup = str(soup)
    for delimiter in ('<!--', '-->'):
        markup = markup.replace(delimiter, '')
    return BeautifulSoup(markup, PARSER)

def request_data(url: str, sleep_time_sec: float = 1.0, with_comment: bool = True) -> BeautifulSoup:
    """Download ``url`` and parse the response body with ``PARSER``.

    Args:
        url: Address of the page to download.
        sleep_time_sec: Seconds to wait *before* the request is sent; used to
            throttle consecutive calls.
        with_comment: If True the parsed page is returned unchanged, so HTML
            comments stay comments. If False the comment delimiters are
            stripped via ``filter_out_comment`` so commented-out markup
            becomes regular, searchable markup.

    Returns:
        The parsed page.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within 30 seconds.
    """
    t.sleep(sleep_time_sec)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Fail loudly on error responses instead of parsing/saving an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, PARSER)

    if with_comment:
        return soup
    return filter_out_comment(soup)

### ➤ 2 Get Season Urls (1/2)

In [4]:
# Download the league index page with comment delimiters stripped and locate
# the table with id "stats".
content = request_data(SEASON_URL, with_comment=False)
stats_table = content.find("table", id="stats")
if stats_table is None:
    raise RuntimeError(f"No table with id 'stats' found at {SEASON_URL}")

# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
df = pd.read_html(StringIO(str(stats_table)))[0]
# The parsed header has two levels; keep only the lower one.
df = df.droplevel(0, axis=1)

# Look up each season label inside the table and resolve the href of the
# element that encloses it against SEASON_URL.
seasons = []
for season_label in df['Season'].values:
    season_node = stats_table.find(string=season_label)
    seasons.append(urljoin(SEASON_URL, season_node.parent['href']))

df['Url_Season_Summary'] = seasons
# Make sure the target directory exists before writing the CSV.
os.makedirs(os.path.dirname(SEASON_PATH), exist_ok=True)
df.to_csv(SEASON_PATH, index=False, encoding="utf-8-sig")
print("Saved to: ", SEASON_PATH)

Saved to:  C:\Users\knaue\Documents\Data\NBA\Season_Urls.csv


### ➤ 2 Get Season Urls (2/2)

In [5]:
# Columns in exactly the order the loop below unpacks them. `usecols` alone
# does not control column order, so the frame is re-indexed explicitly.
season_columns = ['Season', 'Lg', 'Champion', 'Url_Season_Summary']
df_season_urls = pd.read_csv(SEASON_PATH, usecols=season_columns, encoding="utf-8-sig")
df_season_urls = df_season_urls[season_columns]

# Columns kept from every per-season team table.
team_columns = ['Team', 'Season', 'Lg', 'Playoffs', 'Champion']
season_frames = []

season_rows = df_season_urls.itertuples(index=False, name=None)
for i, (season, lg, champ, url) in enumerate(season_rows, start=1):
    sys.stdout.write(f"\r{i}/{len(df_season_urls)}...")

    content = request_data(url, sleep_time_sec=4.0, with_comment=False)
    table = content.find("table", id="per_game-team")
    if table is None:
        raise RuntimeError(f"No table with id 'per_game-team' found at {url}")

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]
    # Drop the table's last row (presumably a league-wide summary row -- TODO confirm).
    # .copy() so the column assignments below do not write into a slice.
    df = df.iloc[:-1].copy()

    df['Season'] = season
    df['Lg'] = lg
    # Team names containing '*' are recorded as Playoffs=True; the marker is
    # then removed from the name. Literal matching, no regex needed.
    df['Playoffs'] = df['Team'].str.contains("*", regex=False)
    df['Team'] = df['Team'].str.replace("*", "", regex=False)

    # A missing champion arrives as NaN from the CSV.
    if pd.isna(champ):
        df['Champion'] = None
    else:
        # Literal match so characters in a team name are never read as regex.
        df['Champion'] = df['Team'].str.contains(champ, regex=False)

    df = df.loc[:, team_columns].copy()
    df['Url_Season_Summary'] = url
    standings_link = content.find("a", string="Standings")
    df['Url_Season_Standings'] = urljoin(SEASON_URL, standings_link['href'])
    # The playoff URL is the standings URL with "leagues" swapped for "playoffs".
    df['Url_Playoff_Standings'] = df['Url_Season_Standings'].str.replace("leagues", "playoffs", regex=False)

    # Find each team name on the page and resolve the href of the element
    # that encloses it against MAIN_URL.
    teams_url = []
    for team_name in df['Team'].values:
        team_node = content.find(string=team_name)
        teams_url.append(urljoin(MAIN_URL, team_node.parent['href']))

    df['Url_Team_Season_Summary'] = teams_url
    season_frames.append(df)

dfs = pd.concat(season_frames, ignore_index=True)
dfs.to_csv(SEASON_DETAIL_PATH, index=False, encoding="utf-8-sig")
print("\nSaved to: ", SEASON_DETAIL_PATH)

85/85...
Saved to:  C:\Users\knaue\Documents\Data\NBA\Season_Detail_Urls.csv


### ➤ 3 Save ALL Team Seasons Html Pages 

In [8]:
df_season_detail_urls = pd.read_csv(SEASON_DETAIL_PATH, usecols=['Url_Season_Summary', 'Url_Season_Standings', 'Url_Playoff_Standings'], encoding="utf-8-sig")

# Each URL is repeated once per team row in the CSV; download every page only once.
unique_url_season_sum = df_season_detail_urls['Url_Season_Summary'].unique()
unique_url_season_sta = df_season_detail_urls['Url_Season_Standings'].unique()
unique_url_playoff_sta = df_season_detail_urls['Url_Playoff_Standings'].unique()
unique_urls = np.concatenate((unique_url_season_sum, unique_url_season_sta, unique_url_playoff_sta), axis=0)

# open(..., "w") does not create missing directories, so create the target first.
os.makedirs(SEASON_HTML_PATH, exist_ok=True)

for i, url in enumerate(unique_urls, start=1):
    sys.stdout.write(f"\r{i}/{len(unique_urls)}...")

    content = request_data(url=url, sleep_time_sec=4.0, with_comment=True)
    # "/" and ":" cannot be used in the file name -> encode them as "/" = "{" and ":" = "}".
    file_name = url.replace("/", "{").replace(":", "}")
    season_path = os.path.join(SEASON_HTML_PATH, file_name)

    # The with-block closes the file; no explicit close() is needed.
    with open(season_path, "w", encoding='utf-8-sig') as f:
        f.write(str(content))

print("\nSaved to: ", SEASON_HTML_PATH, "...")

228/228...
Saved to:  C:\Users\knaue\Documents\Data\NBA\SEASON_HTML ...
