In [34]:
import os
import sys
import requests
import re

import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sqlalchemy import create_engine
from bs4 import BeautifulSoup


current_dir = os.path.dirname(os.path.abspath('/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Ball-Dont-Lie-API/nba-games.ipynb'))
project_root = os.path.join(current_dir, '..')
sys.path.insert(0, project_root)

In [10]:
dotenv_path = ("/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Ball-Dont-Lie-API/.env")
load_dotenv(dotenv_path)

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
TABLE_NAME = "NBA_Games"

DB_USER = os.getenv("DB_USER")

engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}')

In [15]:
base_url = "https://www.basketball-reference.com/leagues/NBA_2024.html"
response = requests.get(base_url)

In [20]:
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
def get_advanced_stats(soup):
    table = soup.find("table", {"id": "advanced-team"})
    rows = []
    
    for row in table.find("tbody").find_all("tr"):
        cells = row.find_all(["th", "td"])
        formatted_cell = [cell.get_text(strip=True) for cell in cells]
        for clean_cell in formatted_cell:
            if len(clean_cell) == 0:
                formatted_cell.remove(clean_cell)
        rows.append(formatted_cell)
    return rows

def advanced_stats_headers():
    ## Advanced Stats Exraction
    headers = [
        "Rk", "Team", "Age", "W", "L", "PW", "PL", "MOV", "SOS", "SRS",
        "ORtg", "DRtg", "NRtg", "Pace", "FTr", "3PAr", "TS%", "",
        "eFG%", "TOV%", "ORB%", "FT/FGA", "", "eFG%", "TOV%", "DRB%", "FT/FGA",
        "Arena", "Attend.", "Attend./G"
    ]

    # Higher-level headers
    offensive_four_factors = ["eFG%", "TOV%", "ORB%", "FT/FGA"]
    defensive_four_factors = ["eFG%", "TOV%", "DRB%", "FT/FGA"]

    # Modify headers by prefixing higher-level headers
    adjusted_headers = []
    offense_added, defense_added = False, False

    for header in headers:
        if header in offensive_four_factors and not offense_added:
            adjusted_headers.append(f"Offensive {header}")
            offensive_four_factors.remove(header)
        elif header in defensive_four_factors and not defense_added:
            adjusted_headers.append(f"Defensive {header}")
            if header == "FT/FGA":
                defense_added = True
        elif header == "":
            continue
        else:
            adjusted_headers.append(header)
    return adjusted_headers

def convert_string_to_ints(df):
    
    for col in df.columns:
        if col != "Team" and col != "Arena":
            try:
                df[col] = df[col].str.replace(",", "").astype(float)
            except:
                continue
        elif col == "Team":
            df[col] = df[col].str.replace("*", "")
    
    return df


In [105]:
rows = get_advanced_stats(soup)
adjusted_headers = advanced_stats_headers()
df = pd.DataFrame(rows, columns=adjusted_headers)
df = convert_string_to_ints(df)

In [108]:
df

Unnamed: 0,Rk,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,Offensive TOV%,Offensive ORB%,Offensive FT/FGA,Defensive eFG%,Defensive TOV%,Defensive DRB%,Defensive FT/FGA,Arena,Attend.,Attend./G
0,1,Boston Celtics,28.2,64,18,66,16,11.34,-0.6,10.75,...,10.8,24.9,0.18,0.523,10.8,76.3,0.145,TD Garden,785396,19156
1,2,Oklahoma City Thunder,23.4,57,25,58,24,7.41,-0.05,7.36,...,11.4,21.1,0.198,0.53,13.6,73.8,0.202,Paycom Center,715509,17451
2,3,Minnesota Timberwolves,27.2,56,26,57,25,6.45,-0.07,6.39,...,13.0,23.2,0.209,0.515,12.9,76.9,0.197,Target Center,738984,18024
3,4,Denver Nuggets,27.1,57,25,54,28,5.26,-0.03,5.23,...,11.5,25.5,0.17,0.526,11.2,75.6,0.195,Ball Arena,807062,19684
4,5,New York Knicks,26.4,50,32,53,29,4.59,-0.23,4.36,...,11.9,29.4,0.192,0.543,12.3,76.1,0.176,Madison Square Garden (IV),808885,19729
5,6,New Orleans Pelicans,26.0,49,33,52,30,4.41,0.05,4.46,...,11.8,24.4,0.202,0.541,12.9,76.9,0.186,Smoothie King Center,692054,17301
6,7,Los Angeles Clippers,30.4,51,31,49,33,3.28,0.13,3.41,...,12.0,24.4,0.211,0.541,11.7,74.8,0.183,Crypto.com Arena,776782,18946
7,8,Philadelphia 76ers,28.4,47,35,49,33,3.05,-0.54,2.51,...,10.7,24.8,0.218,0.537,13.1,74.3,0.215,Wells Fargo Center,821714,20042
8,9,Phoenix Suns,29.3,49,33,49,33,3.06,0.02,3.08,...,13.4,25.0,0.219,0.536,11.3,75.6,0.178,Footprint Center,699911,17071
9,10,Indiana Pacers,25.3,47,35,48,34,3.05,-0.3,2.75,...,11.3,23.8,0.173,0.556,12.1,74.1,0.228,Gainbridge Fieldhouse,677554,16526


In [None]:
years = [str(year) for year in range(1970, 2025)]
pages = range(1, 4)



all_data= pd.DataFrame()
for year in years:
  for page in pages:
    base_url = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"
    response = requests.get(base_url)
    try:
      year = int(year)
      df = extract_nba_stats(response.text, year)
      all_data = pd.concat([all_data, df])
    except Exception as error:
      print(f" The error is {error}")

In [11]:

try:
    teams = client.nba.teams.list()
except AuthenticationError as e:
    print(f"Invalid API key. Status: {e.status_code}, Details: {e.response_data}")
except RateLimitError as e:
    print(f"Rate limit exceeded. Status: {e.status_code}, Details: {e.response_data}")
except ValidationError as e:
    print(f"Invalid request parameters. Status: {e.status_code}, Details: {e.response_data}")
except NotFoundError as e:
    print(f"Resource not found. Status: {e.status_code}, Details: {e.response_data}")
except ServerError as e:
    print(f"API server error. Status: {e.status_code}, Details: {e.response_data}")
except BallDontLieException as e:
    print(f"General API error. Status: {e.status_code}, Details: {e.response_data}")
except Exception as e:
    print(f"Unexpected error: {str(e)}")

In [14]:
teams

ListResponse(data=[NBATeam(id=1, conference='East', division='Southeast', city='Atlanta', name='Hawks', full_name='Atlanta Hawks', abbreviation='ATL'), NBATeam(id=2, conference='East', division='Atlantic', city='Boston', name='Celtics', full_name='Boston Celtics', abbreviation='BOS'), NBATeam(id=3, conference='East', division='Atlantic', city='Brooklyn', name='Nets', full_name='Brooklyn Nets', abbreviation='BKN'), NBATeam(id=4, conference='East', division='Southeast', city='Charlotte', name='Hornets', full_name='Charlotte Hornets', abbreviation='CHA'), NBATeam(id=5, conference='East', division='Central', city='Chicago', name='Bulls', full_name='Chicago Bulls', abbreviation='CHI'), NBATeam(id=6, conference='East', division='Central', city='Cleveland', name='Cavaliers', full_name='Cleveland Cavaliers', abbreviation='CLE'), NBATeam(id=7, conference='West', division='Southwest', city='Dallas', name='Mavericks', full_name='Dallas Mavericks', abbreviation='DAL'), NBATeam(id=8, conference='We