In [2]:
import os
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import random

# Download per-game player statistics for each NBA season in
# [start_year, end_year] from basketball-reference.com and store every season
# in its own "<year>_nba_stats.csv" file in the current directory. Seasons
# whose CSV file already exists are skipped, so the script can be re-run to
# fill in only the seasons that are still missing.

# Set headers for HTTP requests
headers = {"User-Agent": "Mozilla/5.0"}

# Set range of years to scrape data for
start_year = 1991
end_year = 2024

# Seconds to wait for the server before giving up on a request, so that a
# stalled connection cannot hang the whole run.
request_timeout = 30

# Becomes True once a request has been sent. The politeness delay is applied
# only *between* requests: never before the first one or after the last one.
request_sent = False

for year in range(start_year, end_year + 1):
    filename = f"{year}_nba_stats.csv"

    # Check if the file already exists in the current directory
    if os.path.isfile(filename):
        print(f"{filename} already exists, skipping download.")
        continue

    # Add a delay between requests to avoid getting blocked
    if request_sent:
        time.sleep(random.uniform(4, 6))
    request_sent = True

    # Scrape data from website
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    try:
        source = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as exc:
        # A network failure for one season must not abort the remaining ones.
        print(f"Failed to retrieve data for {year}. Error: {exc}")
        continue

    # Check if the request was successful
    if source.status_code != 200:
        print(f"Failed to retrieve data for {year}. Status code: {source.status_code}")
        continue

    soup = BeautifulSoup(source.content, "html.parser")

    # Find the table containing player statistics, plus the parts of it that
    # are needed below; any missing piece is treated as "table not found"
    # instead of raising AttributeError on None.
    table = soup.find("table", id="per_game_stats")
    thead = table.find("thead") if table is not None else None
    tbody = table.find("tbody") if table is not None else None
    header_row = thead.find("tr") if thead is not None else None

    if header_row is None or tbody is None:
        print(f"Could not find table for {year}.")
        continue

    # Extract column names from table header. The first <th> is excluded
    # because the data rows are built from <td> cells only.
    header = [th.get_text() for th in header_row.find_all("th")][1:]

    # Extract player statistics from table rows, ignoring rows that contain
    # no <td> cells at all (they carry no player data).
    player_stats = []
    for row in tbody.find_all("tr"):
        player_data = [td.get_text() for td in row.find_all("td")]
        if player_data:
            player_stats.append(player_data)

    if not player_stats:
        # Do not write a header-only CSV: the existence check above would
        # then skip this season on every later run.
        print(f"No player rows found for {year}, nothing written.")
        continue

    # Create the DataFrame
    stats = pd.DataFrame(player_stats, columns=header)

    # Replace blank cells with NaN, drop rows that are entirely NaN, then
    # fill the remaining NaN values with 0.0
    stats = stats.replace("", np.nan).dropna(how="all").fillna(value=0.0)

    # Add a column for the year
    stats["Year"] = year

    # Write stats DataFrame to CSV file
    stats.to_csv(filename, index=False)
    print(f"Successfully wrote NBA stats for {year} to {filename}")

Successfully wrote NBA stats for 1991 to 1991_nba_stats.csv
Successfully wrote NBA stats for 1992 to 1992_nba_stats.csv
Successfully wrote NBA stats for 1993 to 1993_nba_stats.csv
Successfully wrote NBA stats for 1994 to 1994_nba_stats.csv
Successfully wrote NBA stats for 1995 to 1995_nba_stats.csv
Successfully wrote NBA stats for 1996 to 1996_nba_stats.csv
Successfully wrote NBA stats for 1997 to 1997_nba_stats.csv
Successfully wrote NBA stats for 1998 to 1998_nba_stats.csv
Successfully wrote NBA stats for 1999 to 1999_nba_stats.csv
Successfully wrote NBA stats for 2000 to 2000_nba_stats.csv
Successfully wrote NBA stats for 2001 to 2001_nba_stats.csv
Successfully wrote NBA stats for 2002 to 2002_nba_stats.csv
Successfully wrote NBA stats for 2003 to 2003_nba_stats.csv
Successfully wrote NBA stats for 2004 to 2004_nba_stats.csv
Successfully wrote NBA stats for 2005 to 2005_nba_stats.csv
Successfully wrote NBA stats for 2006 to 2006_nba_stats.csv
Successfully wrote NBA stats for 2007 to