In [1]:
import os
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import random

# Scrape per-game NBA player statistics from basketball-reference.com, one
# season per request, and store each season as "<year>_nba_stats.csv" in the
# current working directory. Seasons whose CSV already exists are skipped.

# Set headers for HTTP requests
headers = {"User-Agent": "Mozilla/5.0"}

# Set range of years to scrape data for (both ends inclusive)
start_year = 1991
end_year = 1992

# Seconds to wait for the server before giving up on a request. Without a
# timeout, requests.get() can block forever on a stalled connection.
request_timeout = 30

for year in range(start_year, end_year + 1):
    filename = f"{year}_nba_stats.csv"

    # Check if the file already exists in the current directory
    if os.path.isfile(filename):
        print(f"{filename} already exists, skipping download.")
        continue

    # Scrape data from website
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    try:
        source = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as exc:
        # A network failure for one season should not abort the remaining ones.
        print(f"Failed to retrieve data for {year}: {exc}")
        source = None

    # Add a delay after every request attempt (successful or not) to avoid
    # getting blocked; backing off matters most after a failed request.
    time.sleep(random.uniform(4, 6))

    if source is None:
        continue

    # Check if the request was successful
    if source.status_code != 200:
        print(f"Failed to retrieve data for {year}. Status code: {source.status_code}")
        continue

    soup = BeautifulSoup(source.content, "html.parser")
    rows = soup.find_all("tr")

    # The column names come from the first <tr> carrying the "thead" class.
    header_row = next(
        (row for row in rows if "thead" in row.get("class", [])),
        None,
    )
    if header_row is None:
        print(f"Could not find header row in HTML table for {year}.")
        continue

    # Drop the first header cell so the header lines up with the <td> cells
    # collected below (presumably the first column is rendered as <th> in the
    # data rows and is therefore absent from them -- confirm against the page).
    header = [th.get_text() for th in header_row.find_all("th")][1:]
    if not header:
        # Nothing left to use as column names; skip this season.
        continue

    # Extract data: one list of cell texts per "full_table" row
    player_stats = [
        [td.get_text() for td in row.find_all("td")]
        for row in rows
        if "full_table" in row.get("class", [])
    ]
    if not player_stats:
        # Writing an empty CSV would make the isfile() check above skip this
        # season on every later run, so leave no file behind instead.
        print(f"No player rows found in HTML table for {year}, nothing written.")
        continue

    stats = pd.DataFrame(player_stats, columns=header)

    # Replace blank cells with NaN
    stats.replace("", np.nan, inplace=True)

    # Drop rows with all NaN values
    stats.dropna(how="all", inplace=True)

    # Fill remaining NaN values with 0.0
    stats.fillna(value=0.0, inplace=True)

    # Add a column for the year
    stats["Year"] = year

    # Write stats DataFrame to CSV file
    stats.to_csv(filename, index=False)
    print(f"Successfully wrote NBA stats for {year} to {filename}")

Successfully wrote NBA stats for 1991 to 1991_nba_stats.csv
Successfully wrote NBA stats for 1992 to 1992_nba_stats.csv
