In [7]:
# notebooks/web_scraping.ipynb

# Import necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape player statistics from Basketball Reference
def scrape_basketball_reference(url, table_id):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')

        # Debug: print all available table IDs
        available_tables = [tbl.get('id') for tbl in soup.find_all('table')]
        print("Available table IDs for URL", url, ":", available_tables)

        # Find the table with the specified ID
        table = soup.find('table', {'id': table_id})
        if table is not None:
            df = pd.read_html(str(table))[0]
            return df
        else:
            print(f"No matching table found for the URL: {url}")
            return pd.DataFrame()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return pd.DataFrame()

# URLs for WNBA and NBA player statistics from Basketball Reference
wnba_url = "https://www.basketball-reference.com/wnba/years/2023_per_game.html"
nba_url = "https://www.basketball-reference.com/leagues/NBA_2023_per_game.html"

# Table IDs to look for
wnba_table_id = "per_game"
nba_table_id = "per_game_stats"

# Scraping data
wnba_data = scrape_basketball_reference(wnba_url, wnba_table_id)
nba_data = scrape_basketball_reference(nba_url, nba_table_id)

# Save the data to CSV files only if scraping was successful
if not wnba_data.empty:
    wnba_data.to_csv('../data/wnba_data.csv', index=False)
    print("WNBA data saved to CSV file.")
else:
    print("Failed to scrape WNBA data. Previous data is retained.")

if not nba_data.empty:
    nba_data.to_csv('../data/nba_data.csv', index=False)
    print("NBA data saved to CSV file.")
else:
    print("Failed to scrape NBA data. Previous data is retained.")


Available table IDs for URL https://www.basketball-reference.com/wnba/years/2023_per_game.html : ['per_game']


  df = pd.read_html(str(table))[0]


Available table IDs for URL https://www.basketball-reference.com/leagues/NBA_2023_per_game.html : ['per_game_stats']
WNBA data saved to CSV file.
NBA data saved to CSV file.


  df = pd.read_html(str(table))[0]
