In [34]:
import os
import sys
import requests

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from bs4 import BeautifulSoup


# Make the parent of the notebook's folder importable by putting it first on sys.path.
# NOTE(review): the notebook location is a machine-specific absolute path; this
# cell will point at a non-existent folder on any other machine.
current_dir = os.path.split(
    os.path.abspath('/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Ball-Dont-Lie-API/nba-games.ipynb')
)[0]
project_root = os.path.join(current_dir, os.pardir)
sys.path.insert(0, project_root)

In [10]:
# Database settings are read from a .env file rather than written in the notebook.
# NOTE(review): the .env location is a machine-specific absolute path.
dotenv_path = ("/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Ball-Dont-Lie-API/.env")
load_dotenv(dotenv_path)

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
TABLE_NAME = "Advanced_Stats"

# Include the port in the URL when DB_PORT is set; when it is unset or empty the
# URL is host-only and the driver's default port applies.
# NOTE(review): assumes DB_HOST does not already carry a ":port" suffix — confirm.
DB_ADDRESS = f"{DB_HOST}:{DB_PORT}" if DB_PORT else DB_HOST

# SQLAlchemy URL layout: dialect+driver://user:password@host[:port]/database
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_ADDRESS}/{DB_NAME}')

In [15]:
# Fetch the 2024 league summary page.
base_url = "https://www.basketball-reference.com/leagues/NBA_2024.html"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(base_url, timeout=30)
# Fail here on an HTTP error (e.g. rate limiting) instead of parsing an error page below.
response.raise_for_status()

In [20]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [109]:
def get_advanced_stats(soup):
    """Extract the rows of the ``advanced-team`` table as lists of cell text.

    Args:
        soup: Parsed page (a BeautifulSoup-style object) to search.

    Returns:
        list[list[str]]: One list per ``<tr>`` inside the table's ``<tbody>``.
        Each inner list holds the whitespace-stripped text of the row's
        ``<th>``/``<td>`` cells, in order, with every empty cell dropped.

    Raises:
        ValueError: If the page contains no table with id ``advanced-team``.
    """
    table = soup.find("table", {"id": "advanced-team"})
    if table is None:
        raise ValueError('No table with id "advanced-team" found in the page')

    rows = []
    for row in table.find("tbody").find_all("tr"):
        cell_texts = (cell.get_text(strip=True) for cell in row.find_all(["th", "td"]))
        # Build a filtered list instead of removing items from the list being
        # iterated: in-place removal skips the element after each removal, so
        # consecutive or trailing empty cells would be left behind.
        rows.append([text for text in cell_texts if text])
    return rows

def advanced_stats_headers():
    """Return the flat column names used for the advanced-stats table.

    The four-factor columns appear twice in the table (once for offense, once
    for defense) under the same short names, so each group is prefixed with
    "Offensive " / "Defensive " to keep the names unique. No blank spacer
    names are included in the result.

    Returns:
        list[str]: A new list of 28 column names on every call.
    """
    leading_columns = [
        "Rk", "Team", "Age", "W", "L", "PW", "PL", "MOV", "SOS", "SRS",
        "ORtg", "DRtg", "NRtg", "Pace", "FTr", "3PAr", "TS%",
    ]
    offense_factors = ["eFG%", "TOV%", "ORB%", "FT/FGA"]
    defense_factors = ["eFG%", "TOV%", "DRB%", "FT/FGA"]
    trailing_columns = ["Arena", "Attend.", "Attend./G"]

    return (
        leading_columns
        + [f"Offensive {factor}" for factor in offense_factors]
        + [f"Defensive {factor}" for factor in defense_factors]
        + trailing_columns
    )

def convert_string_to_ints(df, year):
    """Clean the scraped string columns of ``df`` and tag the rows with a season.

    Despite the name, numeric columns are converted to ``float``, not ``int``.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame of scraped cell text. "Team" and "Arena" are treated as
            text columns; every other column is a conversion candidate.
        year: Value written to the new ``season`` column for every row.

    Returns:
        The same ``df`` object, where:
        * "Team" has every literal ``*`` removed,
        * "Arena" is untouched,
        * every other column has commas removed and is cast to float when the
          whole column converts cleanly; otherwise it is left unchanged,
        * a ``season`` column holds ``year``.
    """
    for col in df.columns:
        if col == "Team":
            # regex=False: "*" is a literal character here, not a regex quantifier.
            df[col] = df[col].str.replace("*", "", regex=False)
        elif col != "Arena":
            try:
                df[col] = df[col].str.replace(",", "", regex=False).astype(float)
            except (AttributeError, TypeError, ValueError):
                # Non-string column (.str unavailable) or non-numeric text:
                # keep the column as it is. Narrower than a bare except so
                # KeyboardInterrupt/SystemExit still propagate.
                continue

    df["season"] = year
    return df

In [None]:
# Seasons to scrape, as the strings used in the page URL (2000 through 2024).
years = [str(year) for year in range(2000, 2025)]

# The column names are the same for every season, so build them once.
adjusted_headers = advanced_stats_headers()

# Collect one frame per season and concatenate once at the end; concatenating
# inside the loop re-copies all earlier rows on every iteration.
season_frames = []
# NOTE(review): requests are sent back-to-back with no delay — confirm this is
# acceptable under the site's rate limits.
for year in years:
    base_url = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"
    try:
        # The fetch is inside the try so one failed season is reported and
        # skipped instead of aborting the whole run.
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        rows = get_advanced_stats(soup)
        df = pd.DataFrame(rows, columns=adjusted_headers)
        season_frames.append(convert_string_to_ints(df, int(year)))
    except Exception as error:
        # Name the season so a failure can be traced to its page.
        print(f"Season {year}: the error is {error}")

# An empty frame is kept as the result when no season could be scraped.
all_data = pd.concat(season_frames) if season_frames else pd.DataFrame()