In [2]:
"""
NBA Team Statistics Fetcher and Processor

This script fetches base and advanced team statistics for specified NBA seasons
using the nba_api library. It performs the following steps:

1. Defines the range of seasons to process.
2. Sets up relative paths for saving raw and cleaned data based on a
   predefined project structure ('../../Project-8---Sports-Analysis/data/').
3. Creates necessary directories if they don't exist.
4. Iterates through each specified season:
    a. Creates a season-specific subdirectory within the raw data path.
    b. Fetches 'Base' team statistics (totals) using nba_api.
    c. Saves the raw base stats to a CSV file in the season's raw data directory.
    d. Fetches 'Advanced' team statistics (totals) using nba_api.
    e. Saves the raw advanced stats to a CSV file.
    f. Pauses briefly between API calls to respect potential rate limits.
    g. Selects relevant columns from both base and advanced stats.
    h. Merges the selected dataframes.
    i. Calculates additional stats (e.g., 2-point attempts).
    j. Renames columns for clarity in the cleaned dataset.
    k. Reorders columns for the final clean output.
    l. Saves the processed (cleaned) data for the season to a CSV file in the
       main cleaned data directory.
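5. Includes commented-out optional hooks (the `all_seasons_data` list and its
   append call) for collecting each season's cleaned dataframe so the seasons
   can be combined later; the concatenation and save steps are not included.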

Dependencies:
- pandas
- nba_api (Install via: pip install nba_api)
- os, time, warnings (standard libraries)

Expected Project Structure for Output:
The script assumes it is run from a location where the following relative paths
are valid:
- ../../Project-8---Sports-Analysis/data/rawdata/
- ../../Project-8---Sports-Analysis/data/cleandata/

Make sure these base directories exist or can be created by the script.
"""

# Install nba_api if you haven't already
# pip install nba_api

import pandas as pd
from nba_api.stats.endpoints import leaguedashteamstats
import time
import os # Used for directory creation and path manipulation
import warnings

# Keep the cell output readable: FutureWarning notices (e.g. from pandas or
# nba_api) are ignored from this point on.
warnings.simplefilter('ignore', FutureWarning)

# --- Configuration ---

# Season range, inclusive on both ends. Each season is identified by the
# calendar year in which it starts (2021 -> the 2021-22 season).
start_year = 2021
end_year = 2024

# Season labels in the "YYYY-YY" form passed to nba_api (e.g. "2021-22"):
# the starting year, a dash, then the last two digits of the following year.
seasons = [
    f"{year}-{(year + 1) % 100:02d}"
    for year in range(start_year, end_year + 1)
]

# Output locations. Both are relative paths, so they resolve against the
# current working directory of the running script/notebook; adjust them if
# the 'Project-8---Sports-Analysis/data/' tree lives elsewhere.
base_save_path = '../../Project-8---Sports-Analysis/data/rawdata/'  # raw API output
clean_save_path = '../../Project-8---Sports-Analysis/data/cleandata/'  # processed data

# --- Setup Output Directories ---

# Create the main raw and clean data directories if they don't exist.
# os.makedirs also creates any missing parent directories, and exist_ok=True
# prevents an error if the directory already exists.
try:
    os.makedirs(base_save_path, exist_ok=True)
    os.makedirs(clean_save_path, exist_ok=True)
except OSError as e:
    print(f"Error creating base directories: {e}")
    print("Script cannot continue without output directories. Exiting.")
    # Raise SystemExit directly with a non-zero status. The bare `exit()`
    # helper is installed by the `site` module for interactive use, is not
    # guaranteed to be available, and with no argument reports success
    # (status 0) even though this is a failure path.
    raise SystemExit(1) from e
else:
    # Only report the directories once both were created/confirmed.
    print(f"Ensured raw data directory exists: {os.path.abspath(base_save_path)}")
    print(f"Ensured clean data directory exists: {os.path.abspath(clean_save_path)}")

# (Optional) List to store individual season dataframes if combining later
# all_seasons_data = []

print(f"\nFetching data for seasons: {seasons}")

# Seasons that could not be fully processed. They are reported in the closing
# summary so that a failure is not lost among the per-season log lines.
failed_seasons = []

# --- Main Processing Loop (Iterate through Seasons) ---
#
# For each season: fetch the 'Base' and 'Advanced' team totals, save both raw
# tables under <base_save_path>/<yy-yy>/, then merge a subset of their columns
# into one cleaned table saved as <clean_save_path>/<yy-yy>.csv. A failure in
# one season is logged and recorded; the remaining seasons are still processed.

for season_index, season in enumerate(seasons):
    # Short season folder name (e.g., '21-22' from '2021-22') for organization
    season_folder_name = f"{season[2:4]}-{season[-2:]}"
    # Full path to the season-specific raw data directory
    season_raw_save_dir = os.path.join(base_save_path, season_folder_name)

    # Create the season-specific directory for raw data
    try:
        os.makedirs(season_raw_save_dir, exist_ok=True)
        print(f"\nProcessing {season}... Saving raw data to: {season_raw_save_dir}")
    except OSError as e:
        print(f"Error creating directory {season_raw_save_dir}: {e}")
        print(f"Skipping season {season} due to directory creation error.")
        failed_seasons.append(season)
        continue  # Skip to the next season if directory creation fails

    try:
        # --- Fetch Base Stats ---
        print(f"  Fetching Base stats for {season}...")
        base_stats = leaguedashteamstats.LeagueDashTeamStats(
            season=season,
            measure_type_detailed_defense='Base',  # Specify 'Base' stats
            per_mode_detailed='Totals'  # Get season totals, not per game
        )
        base_df = base_stats.get_data_frames()[0]  # The stats are in the first DataFrame

        # --- Save Raw Base Stats ---
        base_filename = os.path.join(season_raw_save_dir, 'base_stats.csv')
        base_df.to_csv(base_filename, index=False)
        print(f"    Saved raw base stats to {base_filename}")
        # Pause briefly after API call to avoid hitting rate limits
        time.sleep(1)

        # --- Fetch Advanced Stats ---
        print(f"  Fetching Advanced stats for {season}...")
        advanced_stats = leaguedashteamstats.LeagueDashTeamStats(
            season=season,
            measure_type_detailed_defense='Advanced',  # Specify 'Advanced' stats
            per_mode_detailed='Totals'  # Get season totals
        )
        advanced_df = advanced_stats.get_data_frames()[0]

        # --- Save Raw Advanced Stats ---
        advanced_filename = os.path.join(season_raw_save_dir, 'advanced_stats.csv')
        advanced_df.to_csv(advanced_filename, index=False)
        print(f"    Saved raw advanced stats to {advanced_filename}")
        # Pause briefly after API call
        time.sleep(1)

        # --- Process Data for Clean Output ---
        print(f"  Processing and cleaning data for {season}...")

        # Columns kept from the base stats: identifiers, wins, and shooting
        # percentages/attempts. .copy() avoids SettingWithCopyWarning later.
        base_subset = base_df[[
            'TEAM_ID', 'TEAM_NAME', 'W', 'FG_PCT', 'FG3_PCT', 'FGA', 'FG3A'
        ]].copy()

        # Columns kept from the advanced stats: identifiers, ratings, and
        # efficiency metrics.
        advanced_subset = advanced_df[[
            'TEAM_ID', 'TEAM_NAME', 'OFF_RATING', 'DEF_RATING', 'TS_PCT',
            'EFG_PCT', 'TM_TOV_PCT'
        ]].copy()

        # Merge the selected base and advanced stats on the team identifiers.
        # validate='one_to_one' makes pandas raise if either table has a
        # duplicated team key, instead of silently multiplying rows; the
        # error is handled by the per-season except below.
        season_df = pd.merge(
            base_subset,
            advanced_subset,
            on=['TEAM_ID', 'TEAM_NAME'],
            how='inner',
            validate='one_to_one',
        )

        # Calculate missing statistics required for analysis
        # FG2A (2-point attempts) = Total Attempts (FGA) - 3-point Attempts (FG3A)
        season_df['FG2A'] = season_df['FGA'] - season_df['FG3A']

        # Add a column to identify the season for each row, useful when combining later
        season_df['SEASON'] = season

        # Rename columns to be more descriptive or standardized for the clean dataset
        season_df = season_df.rename(columns={
            'W': 'Wins', 'OFF_RATING': 'ORTG', 'DEF_RATING': 'DRTG',
            'TS_PCT': 'TS_Percent', 'FG3_PCT': 'FG3_Percent', 'FG_PCT': 'FG_Percent',
            'FG3A': 'Team_3s_Attempted', 'FG2A': 'Team_2s_Attempted',  # FG2A calculated above
            'EFG_PCT': 'eFG_Percent', 'TM_TOV_PCT': 'TOV_Percent'
        })

        # Select and reorder columns for the final clean DataFrame output
        # (FGA is dropped here; it was only needed to derive FG2A).
        season_df = season_df[[
            'SEASON', 'TEAM_NAME', 'TEAM_ID', 'Wins', 'ORTG', 'DRTG',
            'eFG_Percent', 'TS_Percent', 'TOV_Percent', 'FG3_Percent',
            'FG_Percent', 'Team_3s_Attempted', 'Team_2s_Attempted'
        ]]

        # --- Save Clean Season DataFrame ---
        # Use the short season name for the clean file, saved in the main clean directory
        clean_filename = os.path.join(clean_save_path, f"{season_folder_name}.csv")
        season_df.to_csv(clean_filename, index=False)
        print(f"    Saved clean data for {season} to {clean_filename}")

        # Optional: Append the processed DataFrame to a list for later concatenation
        # all_seasons_data.append(season_df)

        print(f"  Successfully processed and saved data for {season}")

    # Deliberately broad: any failure while fetching, processing, or saving one
    # season is logged and recorded, and the loop moves on to the next season.
    except Exception as e:
        print(f"  ERROR: Could not fetch, process, or save data for season {season}: {e}")
        failed_seasons.append(season)

    # Longer delay between seasons (polite to the API). Not needed after the
    # last season because no further request follows.
    if season_index < len(seasons) - 1:
        time.sleep(2)

print("\nScript finished.")
if failed_seasons:
    # Make partial failure explicit instead of implying every season was saved.
    print(f"WARNING: {len(failed_seasons)} of {len(seasons)} season(s) failed: {failed_seasons}")
print("Individual cleaned season files saved in:", os.path.abspath(clean_save_path))
print("Raw data saved in subdirectories within:", os.path.abspath(base_save_path))

Ensured raw data directory exists: /home/grenadi3/Project-8---Sports-Analysis/data/rawdata
Ensured clean data directory exists: /home/grenadi3/Project-8---Sports-Analysis/data/cleandata

Fetching data for seasons: ['2021-22', '2022-23', '2023-24', '2024-25']

Processing 2021-22... Saving raw data to: ../../Project-8---Sports-Analysis/data/rawdata/21-22
  Fetching Base stats for 2021-22...
    Saved raw base stats to ../../Project-8---Sports-Analysis/data/rawdata/21-22/base_stats.csv
  Fetching Advanced stats for 2021-22...
    Saved raw advanced stats to ../../Project-8---Sports-Analysis/data/rawdata/21-22/advanced_stats.csv
  Processing and cleaning data for 2021-22...
    Saved clean data for 2021-22 to ../../Project-8---Sports-Analysis/data/cleandata/21-22.csv
  Successfully processed and saved data for 2021-22

Processing 2022-23... Saving raw data to: ../../Project-8---Sports-Analysis/data/rawdata/22-23
  Fetching Base stats for 2022-23...
    Saved raw base stats to ../../Project

In [3]:

import pandas as pd
# Make sure to import the correct endpoint for player stats
from nba_api.stats.endpoints import leaguedashplayerstats 
import time
import os 
import warnings

# Silence FutureWarning notices for the rest of the session
warnings.simplefilter('ignore', FutureWarning)

# Season range, inclusive on both ends; each season is identified by the
# calendar year in which it starts
start_year = 2021
end_year = 2024

# Season labels such as "2021-22": the starting year, a dash, then the last
# two digits of the following year
seasons = [
    "{}-{}".format(year, str(year + 1)[-2:])
    for year in range(start_year, end_year + 1)
]

# Output locations. These are relative paths, so they resolve against the
# current working directory of the running script/notebook. If the notebook
# lives inside the project tree (e.g. Project-8---Sports-Analysis/notebooks/),
# the leading '../../' may need to be adjusted.
base_save_path = '../../Project-8---Sports-Analysis/data/rawdata/'  # raw API output
clean_save_path = '../../Project-8---Sports-Analysis/data/cleandata/'  # cleaned dataframes

# --- Create Base Directories ---
# Ensure the raw and clean output directories exist before any data is fetched.

# NOTE: because both configured paths end with a slash, os.path.dirname()
# only strips that trailing slash; it does NOT step up to the parent 'data'
# directory. These names are kept in case later cells reference them.
raw_base_dir = os.path.dirname(base_save_path)  # '../../Project-8---Sports-Analysis/data/rawdata'
clean_base_dir = os.path.dirname(clean_save_path)  # '../../Project-8---Sports-Analysis/data/cleandata'

try:
    # os.makedirs creates any missing parent directories (including 'data'),
    # so one call per target is enough; exist_ok=True makes it a no-op when
    # the directory is already present.
    os.makedirs(base_save_path, exist_ok=True)
    os.makedirs(clean_save_path, exist_ok=True)
except OSError as e:
    print(f"Error creating base directories: {e}")
    print("Script cannot continue without output directories. Exiting.")
    # Raise SystemExit directly with a non-zero status. The bare `exit()`
    # helper is installed by the `site` module for interactive use, is not
    # guaranteed to be available, and with no argument reports success
    # (status 0) even though this is a failure path.
    raise SystemExit(1) from e
else:
    print(f"Ensured raw data directory exists: {os.path.abspath(base_save_path)}")
    print(f"Ensured clean data directory exists: {os.path.abspath(clean_save_path)}")


print(f"\nFetching player minutes data for seasons: {seasons}")

# Seasons that could not be fully processed. They are reported in the closing
# summary so that a failure is not lost among the per-season log lines.
failed_seasons = []

# For each season: fetch the 'Base' player totals, save the raw table under
# <base_save_path>/<yy-yy>/, then save a reduced games/minutes table to
# <clean_save_path>. A failure in one season is logged and recorded; the
# remaining seasons are still processed.
for season_index, season in enumerate(seasons):
    # Create season-specific directory name (e.g., '21-22')
    season_folder_name = f"{season[2:4]}-{season[-2:]}"
    # Full path for the season's raw data
    season_raw_save_dir = os.path.join(base_save_path, season_folder_name)

    # Create the season-specific raw directory
    try:
        os.makedirs(season_raw_save_dir, exist_ok=True)
        print(f"\nProcessing {season}... Saving raw player data to: {season_raw_save_dir}")
    except OSError as e:
        print(f"Error creating directory {season_raw_save_dir}: {e}")
        failed_seasons.append(season)
        continue  # Skip to the next season if directory creation fails

    try:
        # --- Fetch Player Stats (including Minutes) ---
        print(f"  Fetching Base player stats for {season}...")
        player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
            season=season,
            measure_type_detailed_defense='Base',  # Base stats include MIN
            per_mode_detailed='Totals'  # Get total minutes for the season
        )
        player_stats_df = player_stats.get_data_frames()[0]

        # --- Save Raw Player Stats ---
        raw_filename = os.path.join(season_raw_save_dir, f'player_minutes_raw_{season_folder_name}.csv')
        player_stats_df.to_csv(raw_filename, index=False)
        print(f"    Saved raw player stats to {raw_filename}")
        # Pause briefly after the API call
        time.sleep(1)

        # --- Create Clean Player Minutes DataFrame ---
        # Select relevant columns
        # Note: Includes TEAM_ABBREVIATION. Players traded mid-season might have multiple rows
        # or a row with TEAM_ABBREVIATION = 'TOT' representing their total across teams
        # (not verified here - check the raw file before aggregating by player).
        clean_df = player_stats_df[[
            'PLAYER_ID',
            'PLAYER_NAME',
            'TEAM_ID',
            'TEAM_ABBREVIATION',
            'GP',  # Games Played
            'MIN'  # Minutes Played
        ]].copy()  # Use copy to avoid SettingWithCopyWarning

        # Add Season Column
        clean_df['SEASON'] = season

        # --- Save Clean Player Minutes DataFrame ---
        clean_filename = os.path.join(clean_save_path, f"player_minutes_{season_folder_name}.csv")
        clean_df.to_csv(clean_filename, index=False)
        print(f"    Saved clean player minutes data for {season} to {clean_filename}")

        print(f"  Successfully processed and saved player minutes for {season}")

    # Deliberately broad: any failure while fetching, processing, or saving one
    # season is logged and recorded, and the loop moves on to the next season.
    except Exception as e:
        print(f"  Could not fetch, process, or save player data for season {season}: {e}")
        failed_seasons.append(season)

    # Longer delay between seasons. Not needed after the last season because
    # no further request follows.
    if season_index < len(seasons) - 1:
        time.sleep(2)

print("\nScript finished. Individual raw and cleaned player minutes files saved.")
if failed_seasons:
    # Make partial failure explicit instead of implying every season was saved.
    print(f"WARNING: {len(failed_seasons)} of {len(seasons)} season(s) failed: {failed_seasons}")

Ensured raw data directory exists: /home/grenadi3/Project-8---Sports-Analysis/data/rawdata
Ensured clean data directory exists: /home/grenadi3/Project-8---Sports-Analysis/data/cleandata

Fetching player minutes data for seasons: ['2021-22', '2022-23', '2023-24', '2024-25']

Processing 2021-22... Saving raw player data to: ../../Project-8---Sports-Analysis/data/rawdata/21-22
  Fetching Base player stats for 2021-22...
    Saved raw player stats to ../../Project-8---Sports-Analysis/data/rawdata/21-22/player_minutes_raw_21-22.csv
    Saved clean player minutes data for 2021-22 to ../../Project-8---Sports-Analysis/data/cleandata/player_minutes_21-22.csv
  Successfully processed and saved player minutes for 2021-22

Processing 2022-23... Saving raw player data to: ../../Project-8---Sports-Analysis/data/rawdata/22-23
  Fetching Base player stats for 2022-23...
    Saved raw player stats to ../../Project-8---Sports-Analysis/data/rawdata/22-23/player_minutes_raw_22-23.csv
    Saved clean playe