In [9]:
import pandas as pd

In [4]:
# --- Season window ---

# Both bounds are the calendar year in which a season begins, so 2021 stands
# for the 2021-22 season and 2024 for the 2024-25 season.
start_year = 2021
end_year = 2024  # inclusive: range() below is called with end_year + 1

# --- Short-format season labels ---

# Each label is 'YY-YY': the last two digits of the starting year, a hyphen,
# then the last two digits of the following year (e.g., "21-22").
seasons_short_format = [
    "-".join((str(year)[-2:], str(year + 1)[-2:]))
    for year in range(start_year, end_year + 1)
]

# With the bounds above this produces ['21-22', '22-23', '23-24', '24-25'].
# Uncomment to verify:
# print(f"Generated short-format seasons: {seasons_short_format}")

In [5]:
import pandas as pd
import os # Import os to potentially check file existence if needed

# Relies on 'seasons_short_format' (e.g., ['21-22', '22-23', ...]) being defined by an earlier cell.

# --- Configuration ---
# Base paths are relative to the directory the notebook is executed from.
# Adjust '../' if the notebook sits at a different depth relative to the 'data' folder.
RAW_DATA_DIR = '../data/rawdata/'
CLEAN_DATA_DIR = '../data/cleandata/'

# --- Load Non-Seasonal Player Data ---

# os.path.join keeps the path construction cross-platform.
player_length_file = os.path.join(RAW_DATA_DIR, 'player_length.csv')
print(f"Attempting to load player data from: {player_length_file}")

try:
    # Baseline player measurement table (one CSV shared by all seasons).
    player_length_data = pd.read_csv(player_length_file)
    print("Successfully loaded 'player_length.csv'.")
    print("\n--- Sample of player_length_data ---")
    print(player_length_data.head())
except FileNotFoundError:
    print(f"ERROR: File not found at {player_length_file}")
    print("Please ensure the relative path is correct based on script location.")
    player_length_data = None  # Signals the failure to the seasonal loader below
except Exception as e:
    print(f"ERROR: Failed to load {player_length_file}: {e}")
    player_length_data = None

# --- Load Seasonal Data (Team and Player Minutes) ---

# Parallel lists: entry i of each list belongs to the same season.
all_team_data = []
all_player_minute_data = []

print("\nLoading seasonal team and player minute data...")
# At cell (module) level globals() is the namespace earlier cells wrote into.
if 'seasons_short_format' in globals() and player_length_data is not None:
    for season in seasons_short_format:
        print(f"  Processing season: {season}")

        # Relative paths to the cleaned seasonal files.
        team_data_path = os.path.join(CLEAN_DATA_DIR, f'{season}.csv')
        player_minute_path = os.path.join(CLEAN_DATA_DIR, f'player_minutes_{season}.csv')

        try:
            print(f"    Attempting to load team data: {team_data_path}")
            team_data = pd.read_csv(team_data_path)
            print("      Successfully loaded team data.")

            print(f"    Attempting to load player minute data: {player_minute_path}")
            player_minute_data = pd.read_csv(player_minute_path)
            print("      Successfully loaded player minute data.")
        except FileNotFoundError as e:
            # A missing file only skips this season; the remaining seasons still load.
            print(f"    WARNING: File not found for season {season}. Details: {e}")
            print(f"    Skipping missing file(s) for season {season}.")
            continue
        except Exception as e:
            # Any other read/parse failure also skips just this season.
            print(f"    ERROR: Failed to load or process data for season {season}: {e}")
            continue

        # Append only after BOTH reads succeeded. Appending the team frame before
        # the minutes read would leave the two lists with different lengths (and
        # therefore misaligned by season) whenever the minutes file fails to load.
        all_team_data.append(team_data)
        all_player_minute_data.append(player_minute_data)

    print("\nFinished loading seasonal data.")

    # Optional: Concatenate lists into single DataFrames after the loop
    # if all_team_data:
    #     combined_team_data = pd.concat(all_team_data, ignore_index=True)
    #     print(f"\nCombined team data shape: {combined_team_data.shape}")
    # if all_player_minute_data:
    #     combined_player_minute_data = pd.concat(all_player_minute_data, ignore_index=True)
    #     print(f"Combined player minute data shape: {combined_player_minute_data.shape}")

elif player_length_data is None:
    print("Skipping seasonal data loading because player_length_data failed to load.")
else:
    print("ERROR: The 'seasons_short_format' list is not defined. Cannot load seasonal data.")

# Data is now loaded:
# - player_length_data (DataFrame or None)
# - all_team_data (list of DataFrames, one per fully loaded season)
# - all_player_minute_data (list of DataFrames, same length and order as all_team_data)

Attempting to load player data from: ../data/rawdata/player_length.csv
Successfully loaded 'player_length.csv'.

--- Sample of player_length_data ---
   Index               Player Team Position    Height Wingspan  Length
0      1             Mo Bamba  NaN        C  6'11.25"    7'10"   10.75
1      2  Talen Horton-Tucker  CHI       SG    6'2.5"  7'1.25"   10.75
2      3       Jalen Williams  OKC       SG    6'4.5"  7'2.25"    9.75
3      4       Isaiah Stewart  DET        C   6'7.25"  7'4.75"    9.50
4      5  Robert Williams III  POR        C      6'8"   7'5.5"    9.50

Loading seasonal team and player minute data...
  Processing season: 21-22
    Attempting to load team data: ../data/cleandata/21-22.csv
      Successfully loaded team data.
    Attempting to load player minute data: ../data/cleandata/player_minutes_21-22.csv
      Successfully loaded player minute data.
  Processing season: 22-23
    Attempting to load team data: ../data/cleandata/22-23.csv
      Successfully loaded te

In [6]:
# -*- coding: utf-8 -*-
"""
Processes NBA player minutes and physical measurements (height, wingspan)
to calculate minute-weighted average height and wingspan for each team
per season, updating the team's cleaned data file.

Assumptions:
- Script is run from a directory where '../data/' points to the data folder.
  Expected structure:
    Project-Root/
    ├── execution_location/ (e.g., notebooks/)
    │   └── this_script.py (or notebook)
    └── data/
        ├── rawdata/
        │   └── player_length.csv
        └── cleandata/
            ├── YY-YY.csv (e.g., 21-22.csv - input team data)
            └── player_minutes_YY-YY.csv (input player minutes)
            (Script will overwrite YY-YY.csv files in cleandata)

Inputs:
- data/rawdata/player_length.csv: Contains player names, height, wingspan.
- data/cleandata/{season}.csv: Contains team stats for a given season.
- data/cleandata/player_minutes_{season}.csv: Contains player minutes per team.

Outputs:
- Overwrites data/cleandata/{season}.csv files, adding columns:
  'Weighted_Avg_Height' and 'Weighted_Avg_Wingspan'.
"""

from pathlib import Path
import pandas as pd
import numpy as np
import warnings

# Suppress potential warnings (e.g., from string manipulations if needed)
warnings.simplefilter(action='ignore', category=FutureWarning) # Example warning suppression

# --- Helper Function: Parse Measurement Strings ---
def parse_to_inches(measurement_str: str) -> float:
    """
    Converts a feet-and-inches string (e.g., "6'11.25\"" or "7'5\"") into
    total inches.

    Besides the plain ASCII marks, typographic foot marks (’ ‘ ′) are treated
    as an apostrophe, and inch marks written as a double quote, a curly double
    quote, a double prime, or two apostrophes ("6'4''") are discarded, so
    values copied from formatted sources still parse.

    Args:
        measurement_str: The string representation of the measurement.

    Returns:
        The measurement in total inches as a float, or np.nan when the input
        is missing/blank, does not have exactly one foot mark, contains a
        non-numeric part, or evaluates to a non-finite number.
    """
    # Missing (None/NaN) or blank input has nothing to parse.
    if pd.isna(measurement_str) or not str(measurement_str).strip():
        return np.nan

    normalized = str(measurement_str)
    # Fold typographic foot marks into the ASCII apostrophe first, so that a
    # doubled typographic mark is recognised as an inch mark in the next step.
    for foot_mark in ("\u2019", "\u2018", "\u2032"):
        normalized = normalized.replace(foot_mark, "'")
    # Remove inch marks; "''" must go before the split so it is not mistaken
    # for two separate foot marks.
    for inch_mark in ("''", '"', "\u201d", "\u201c", "\u2033"):
        normalized = normalized.replace(inch_mark, "")
    normalized = normalized.strip()

    try:
        # Exactly one apostrophe is required; any other count makes the
        # two-name unpacking raise ValueError.
        feet_str, inches_str = normalized.split("'")
        feet = float(feet_str)
        # "6'" (feet only) counts as zero inches.
        inches = float(inches_str) if inches_str else 0.0
    except ValueError:
        # Unexpected layout or non-numeric text.
        return np.nan

    total_inches = (feet * 12) + inches
    # float() accepts "inf"/"nan"; such values are not usable measurements.
    return total_inches if np.isfinite(total_inches) else np.nan

# --- Step 1: Define Project Directories using pathlib ---
# The project root is taken to be the parent of the directory the notebook is
# run from (e.g., the notebook runs inside 'notebooks/'), and both data
# folders hang off '<project root>/data'.
cwd = Path.cwd()
project_root = cwd.parent
rawdata_dir = project_root.joinpath("data", "rawdata")
cleandata_dir = project_root.joinpath("data", "cleandata")

# Echo the resolved locations, one per line, with the values aligned in a column.
print(
    "\n".join(
        f"{label:<19}{location}"
        for label, location in (
            ("Notebook CWD:", cwd),
            ("Project root:", project_root),
            ("Raw data dir:", rawdata_dir),
            ("Clean data dir:", cleandata_dir),
        )
    )
)

# Warn, without stopping, when either data directory is absent.
if not (rawdata_dir.is_dir() and cleandata_dir.is_dir()):
    print("\nWarning: One or both data directories may not exist at the expected location.")
    # Consider raising instead if the directories are essential:
    # raise FileNotFoundError("Required data directories not found.")

# --- Step 2: Load & Preprocess Player Measurement Data (Once) ---
player_length_file = rawdata_dir / "player_length.csv"

# Every later step needs this file, so stop immediately if it is missing.
if not player_length_file.exists():
    raise FileNotFoundError(f"CRITICAL: Could not find required input file {player_length_file}")

print(f"\nLoading player measurement data from: {player_length_file}")
pl_raw_df = pd.read_csv(player_length_file)

# Preprocess the data:
# 1. Convert the height and wingspan strings to numeric inches.
pl_raw_df["Height_inches"] = pl_raw_df["Height"].apply(parse_to_inches)
pl_raw_df["Wingspan_inches"] = pl_raw_df["Wingspan"].apply(parse_to_inches)
# 2. Normalize player names (lowercase, stripped) so they can serve as a merge key.
pl_raw_df["Player_norm"] = pl_raw_df["Player"].str.lower().str.strip()

# 3. Keep only the merge key and the numeric measurements, and drop rows that
#    lack any of them. A missing name is dropped as well: pandas matches NaN
#    keys to each other in a merge, so a nameless row could attach its
#    measurements to a nameless minutes record.
player_length_processed = (
    pl_raw_df[["Player_norm", "Height_inches", "Wingspan_inches"]]
    .dropna(subset=["Player_norm", "Height_inches", "Wingspan_inches"])
)

# 4. Keep one row per normalized name. A repeated key would fan out the later
#    minutes-to-measurements left merge and count that player's minutes once
#    per repeated row in the weighted averages.
#    NOTE(review): the first record wins; confirm that is the preferred
#    measurement when a name appears more than once.
duplicate_name_count = int(player_length_processed.duplicated(subset="Player_norm").sum())
if duplicate_name_count:
    print(f"Warning: dropping {duplicate_name_count} repeated player name record(s); keeping the first for each name.")
    player_length_processed = player_length_processed.drop_duplicates(subset="Player_norm", keep="first")

print(f"Processing complete. Found {len(player_length_processed)} players with valid height & wingspan records.")
# print(player_length_processed.head()) # Optional: uncomment to view sample processed data

# --- Step 3: Loop Through Seasons, Calculate Weighted Averages, and Update Team Files ---
# Seasons are identified by the short 'YY-YY' label that also names the CSV files.
start_year = 2021
end_year = 2024
seasons_to_process = [f"{str(y)[-2:]}-{str(y+1)[-2:]}" for y in range(start_year, end_year + 1)]
print(f"\nSeasons to process: {seasons_to_process}")

# Columns this step writes into every team file.
weighted_avg_columns = ["Weighted_Avg_Height", "Weighted_Avg_Wingspan"]

# Seasons that were skipped or failed, so the closing message reports the real outcome.
unprocessed_seasons = []

# Process each season individually
for season in seasons_to_process:
    print(f"\n➡️ Processing Season: {season}")

    # Input files for this season (the team file is also the output file).
    team_file = cleandata_dir / f"{season}.csv"
    mins_file = cleandata_dir / f"player_minutes_{season}.csv"

    # Both files are required; name whichever is absent and move on.
    missing_files = [path.name for path in (team_file, mins_file) if not path.exists()]
    if missing_files:
        print(f"  Skipping {season}: Missing required file(s): {', '.join(missing_files)}")
        unprocessed_seasons.append(season)
        continue

    try:
        team_df = pd.read_csv(team_file)
        mins_df = pd.read_csv(mins_file)
        print(f"  Loaded team_df ({team_df.shape}) and mins_df ({mins_df.shape})")

        # The team file is overwritten in place below, so on a second run it
        # already contains the weighted-average columns. Remove them first;
        # otherwise the merge further down would emit '_x'/'_y' suffixed
        # duplicates instead of refreshing the two columns.
        team_df = team_df.drop(columns=weighted_avg_columns, errors="ignore")

        # --- Data Preparation for Merging ---
        # Same normalization as the measurement table's merge key.
        mins_df["PLAYER_NAME_norm"] = mins_df["PLAYER_NAME"].str.lower().str.strip()

        # --- Merging and Filtering ---
        # Left merge: every minutes record is kept, measurements are attached where a name matches.
        merged_df = pd.merge(
            mins_df,
            player_length_processed,
            left_on="PLAYER_NAME_norm",
            right_on="Player_norm",
            how="left"
        )

        # Rows usable for weighting need minutes, both measurements, and MIN > 0.
        valid_player_data = (
            merged_df
            .dropna(subset=["MIN", "Height_inches", "Wingspan_inches"])
            .query("MIN > 0")
            .copy()  # independent frame, so the column assignments below do not warn
        )
        print(f"  Found {len(valid_player_data)} player records with valid minutes and measurements for weighting.")

        # --- Weighted Average Calculation ---
        if valid_player_data.empty:
            # Nothing to weight: the columns are still written, filled with NaN.
            print("  No valid player data found for weighting → filling team averages with NaN for this season.")
            team_df["Weighted_Avg_Height"] = np.nan
            team_df["Weighted_Avg_Wingspan"] = np.nan
        else:
            # Numerators of the weighted means: minutes * measurement per player row.
            valid_player_data["minutes_x_height"] = valid_player_data["MIN"] * valid_player_data["Height_inches"]
            valid_player_data["minutes_x_wingspan"] = valid_player_data["MIN"] * valid_player_data["Wingspan_inches"]

            # Per-team sums of minutes and of both products.
            team_aggregates = (
                valid_player_data
                .groupby("TEAM_ID")
                .agg(
                    Sum_MIN=("MIN", "sum"),
                    Sum_Height_x_Minutes=("minutes_x_height", "sum"),
                    Sum_Wingspan_x_Minutes=("minutes_x_wingspan", "sum")
                )
                .reset_index()  # TEAM_ID back to a regular column for the merge
            )

            # Weighted mean = sum(minutes * measurement) / sum(minutes).
            # Sum_MIN is positive for every group because only MIN > 0 rows remain.
            team_aggregates["Weighted_Avg_Height"] = team_aggregates["Sum_Height_x_Minutes"] / team_aggregates["Sum_MIN"]
            team_aggregates["Weighted_Avg_Wingspan"] = team_aggregates["Sum_Wingspan_x_Minutes"] / team_aggregates["Sum_MIN"]
            print(f"  Calculated weighted averages for {len(team_aggregates)} teams.")

            # --- Merging Results back to Team Data ---
            # Left merge keeps every team row; teams without usable players get NaN.
            team_df = team_df.merge(
                team_aggregates[["TEAM_ID"] + weighted_avg_columns],
                on="TEAM_ID",
                how="left"
            )

        # --- Save Updated Team File ---
        # Overwrites the season's team file with the two extra columns included.
        print(f"  Saving updated data with weighted averages to: {team_file.name}")
        team_df.to_csv(team_file, index=False)
        print(f"  ✅ Successfully saved updated {team_file.name}")

    except pd.errors.EmptyDataError:
        print(f"  ERROR: One of the input files for season {season} is empty. Skipping.")
        unprocessed_seasons.append(season)
    except KeyError as e:
        print(f"  ERROR: Missing expected column in input file for season {season}: {e}. Skipping.")
        unprocessed_seasons.append(season)
    except Exception as e:
        # Any other failure is reported and the loop carries on with the next season.
        print(f"  ERROR: An unexpected error occurred processing season {season}: {e}")
        unprocessed_seasons.append(season)

# --- Completion ---
# Announce success only when every season was actually written.
if unprocessed_seasons:
    print(
        f"\n⚠️ Finished with problems: {len(unprocessed_seasons)} of {len(seasons_to_process)} "
        f"season(s) were not updated: {', '.join(unprocessed_seasons)}"
    )
else:
    print("\n🎉 All seasons processed successfully!")

Notebook CWD:      /home/grenadi3/Project-8---Sports-Analysis/data
Project root:      /home/grenadi3/Project-8---Sports-Analysis
Raw data dir:      /home/grenadi3/Project-8---Sports-Analysis/data/rawdata
Clean data dir:    /home/grenadi3/Project-8---Sports-Analysis/data/cleandata

Loading player measurement data from: /home/grenadi3/Project-8---Sports-Analysis/data/rawdata/player_length.csv
Processing complete. Found 505 players with valid height & wingspan records.

Seasons to process: ['21-22', '22-23', '23-24', '24-25']

➡️ Processing Season: 21-22
  Loaded team_df ((30, 15)) and mins_df ((605, 7))
  Found 330 player records with valid minutes and measurements for weighting.
  Calculated weighted averages for 30 teams.
  Saving updated data with weighted averages to: 21-22.csv
  ✅ Successfully saved updated 21-22.csv

➡️ Processing Season: 22-23
  Loaded team_df ((30, 15)) and mins_df ((539, 7))
  Found 386 player records with valid minutes and measurements for weighting.
  Calculat