In [2]:
import pandas as pd

In [15]:
# First and last season start years (inclusive) covered by this notebook.
start_year = 2021
end_year = 2024

# Season labels in 'YY-YY' form: the last two digits of the starting year,
# a dash, then the last two digits of the following year.
season_start_years = range(start_year, end_year + 1)
seasons = ["-".join((str(y)[-2:], str(y + 1)[-2:])) for y in season_start_years]


In [21]:
# Raw player measurements are read once; the path does not depend on the season,
# so a plain string literal is used (the f-prefix had no placeholders).
player_length_data = pd.read_csv('../data/rawdata/player_length.csv')

# Read each season's cleaned team file and player-minutes file.
# NOTE: both names are rebound on every iteration, so after the loop only the
# frames for the last entry in `seasons` remain; earlier seasons are discarded.
for season in seasons:
    team_data = pd.read_csv(f'../data/cleandata/{season}.csv')
    player_minute_data = pd.read_csv(f'../data/cleandata/player_minutes_{season}.csv')

# Last expression of the cell: render the measurements table.
player_length_data

Unnamed: 0,Index,Player,Team,Position,Height,Wingspan,Length
0,1,Mo Bamba,,C,"6'11.25""","7'10""",10.75
1,2,Talen Horton-Tucker,CHI,SG,"6'2.5""","7'1.25""",10.75
2,3,Jalen Williams,OKC,SG,"6'4.5""","7'2.25""",9.75
3,4,Isaiah Stewart,DET,C,"6'7.25""","7'4.75""",9.50
4,5,Robert Williams III,POR,C,"6'8""","7'5.5""",9.50
...,...,...,...,...,...,...,...
500,501,Desmond Bane,MEM,SG,"6'5""","6'4.25""",-0.75
501,502,Kyle Kuzma,MIL,PF,"6'8""","6'7""",-1.00
502,503,Kelly Olynyk,NOP,C,"6'10.75""","6'9.75""",-1.00
503,504,Tyler Herro,MIA,SG,"6'4.5""","6'3.25""",-1.25


In [2]:
import pandas as pd
import numpy as np # For NaN handling
import os
import re # For parsing height/wingspan potentially

# --- 1. Preprocessing Player Length Data (Outside Loop) ---

def parse_to_inches(measurement_str):
    """Convert a feet-and-inches string to total inches.

    Accepts values written like ``6'11.25"`` or ``7'5"``: feet, an apostrophe,
    then optional inches with an optional trailing double quote. Typographic
    marks (U+2019, U+2032 for feet; U+201D, U+2033 for inches) are treated the
    same as their ASCII equivalents.

    Parameters
    ----------
    measurement_str : str or missing value
        The raw measurement cell.

    Returns
    -------
    float
        ``feet * 12 + inches``, or ``np.nan`` when the value is missing, empty,
        not a string, or cannot be parsed as numbers.
    """
    if pd.isna(measurement_str) or measurement_str == '':
        return np.nan
    if not isinstance(measurement_str, str):
        # Non-string cells carry no feet/inches markers to split on.
        return np.nan

    # Map typographic feet/inch marks onto the ASCII characters parsed below.
    text = measurement_str
    for fancy, plain in (('\u2019', "'"), ('\u2032', "'"), ('\u201d', '"'), ('\u2033', '"')):
        text = text.replace(fancy, plain)

    # Drop the inch mark and surrounding whitespace, then split feet from inches.
    text = text.replace('"', '').strip()
    parts = text.split("'")
    try:
        feet = float(parts[0])
        # A bare feet value such as "6'" has an empty inches part: count it as 0.
        inches = float(parts[1]) if len(parts) > 1 and parts[1] else 0
    except ValueError:
        # Unparseable text is reported as missing rather than raising.
        return np.nan
    return (feet * 12) + inches

# Define paths - adjust if needed
# Relative paths: they resolve against the notebook's working directory.
player_length_file = '../data/rawdata/player_length.csv' # Path to your length data
clean_data_path = '../data/cleandata/' # Path for loading/saving processed files

try:
    player_length_data = pd.read_csv(player_length_file)
    print(f"Loaded player length data: {player_length_data.shape}")

    # Convert the feet/inches strings into numeric inches (NaN when unparseable).
    player_length_data['Height_inches'] = player_length_data['Height'].apply(parse_to_inches)
    player_length_data['Wingspan_inches'] = player_length_data['Wingspan'].apply(parse_to_inches)

    # Normalize player names for merging (lowercase, strip whitespace).
    # Accents, suffixes and nicknames are not handled here, so some names may
    # still fail to match the minutes data.
    player_length_data['Player_norm'] = player_length_data['Player'].str.lower().str.strip()

    # Keep only the merge key and the numeric measurements, drop rows where any
    # of them is missing, and keep a single row per normalized name: a repeated
    # key would duplicate rows in the later left merge and double-count minutes.
    player_length_processed = (
        player_length_data[['Player_norm', 'Height_inches', 'Wingspan_inches']]
        .dropna()
        .drop_duplicates(subset='Player_norm', keep='first')
    )
    print(f"Processed player length data (valid measurements): {player_length_processed.shape}")

except FileNotFoundError:
    print(f"Error: Player length file not found at {player_length_file}")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down, and in a script it would end the process with a success status.
    raise
except Exception as e:
    print(f"Error processing player length data: {e}")
    # Stop here with the original traceback; later cells need player_length_processed.
    raise


# --- 2. Loop Through Seasons ---
# For every season: load the team file and the player-minutes file, attach the
# player measurements by normalized name, compute minutes-weighted average
# height and wingspan per TEAM_ID, and write the team file back with two extra
# columns.

# Define the seasons to process (using 'YY-YY' format)
start_year = 2021
end_year = 2024 # Adjust if needed
seasons = [f"{str(year)[-2:]}-{str(year+1)[-2:]}" for year in range(start_year, end_year + 1)]

print(f"\nStarting seasonal processing for: {seasons}")

# Columns this cell adds to each season's team file.
weighted_avg_columns = ['Weighted_Avg_Height', 'Weighted_Avg_Wingspan']
# The same names as they appear after an earlier run of this cell, including the
# '_x' / '_y' suffixed variants pandas creates when merging overlapping columns.
previous_run_columns = {
    name + suffix for name in weighted_avg_columns for suffix in ('', '_x', '_y')
}

for season in seasons:
    print(f"\nProcessing season: {season}")
    try:
        # Load seasonal data
        team_data_path = os.path.join(clean_data_path, f'{season}.csv')
        player_data_path = os.path.join(clean_data_path, f'player_minutes_{season}.csv')

        if not os.path.exists(team_data_path) or not os.path.exists(player_data_path):
            print(f"  Skipping season {season}: Data file(s) not found.")
            continue

        team_data = pd.read_csv(team_data_path)
        player_minute_data = pd.read_csv(player_data_path)

        # This cell saves its result over the file it just read. On a re-run the
        # file already contains the weighted columns, and merging them in again
        # would produce '_x'/'_y' duplicates instead of refreshed values, so any
        # columns left by a previous run are removed first.
        stale_columns = [col for col in team_data.columns if col in previous_run_columns]
        if stale_columns:
            print(f"  Dropping columns left by a previous run: {stale_columns}")
            team_data = team_data[[col for col in team_data.columns if col not in previous_run_columns]]

        print(f"  Loaded team data ({team_data.shape}) and player minutes ({player_minute_data.shape})")

        # Normalize player names in minutes data for merging
        player_minute_data['PLAYER_NAME_norm'] = player_minute_data['PLAYER_NAME'].str.lower().str.strip()

        # Merge player minutes with processed length data
        player_merged = pd.merge(
            player_minute_data,
            player_length_processed,
            left_on='PLAYER_NAME_norm',
            right_on='Player_norm',
            how='left' # Keep all player minute records, add measurements if found
        )
        print(f"  Merged player minutes with length data: {player_merged.shape}")

        # Drop rows where measurements or minutes are missing (needed for weighted avg)
        player_merged_valid = player_merged.dropna(subset=['MIN', 'Height_inches', 'Wingspan_inches']).copy()
        # Also ensure minutes are positive
        player_merged_valid = player_merged_valid[player_merged_valid['MIN'] > 0]

        print(f"  Filtered to valid players with minutes & measurements: {player_merged_valid.shape}")

        if player_merged_valid.empty:
            print(f"  No valid player data with measurements for season {season} after merge. Adding NaN columns.")
            team_data['Weighted_Avg_Height'] = np.nan
            team_data['Weighted_Avg_Wingspan'] = np.nan
        else:
            # Calculate weighted components
            player_merged_valid['minutes_x_height'] = player_merged_valid['MIN'] * player_merged_valid['Height_inches']
            player_merged_valid['minutes_x_wingspan'] = player_merged_valid['MIN'] * player_merged_valid['Wingspan_inches']

            # Aggregate by team
            team_agg = player_merged_valid.groupby('TEAM_ID').agg(
                Sum_MIN=('MIN', 'sum'),
                Sum_HxM=('minutes_x_height', 'sum'),
                Sum_WxM=('minutes_x_wingspan', 'sum')
            ).reset_index()

            # Weighted average = sum(minutes * measurement) / sum(minutes).
            # Every contributing row has MIN > 0 (filtered above), so each team's
            # Sum_MIN is strictly positive and the division is safe.
            team_agg['Weighted_Avg_Height'] = team_agg['Sum_HxM'] / team_agg['Sum_MIN']
            team_agg['Weighted_Avg_Wingspan'] = team_agg['Sum_WxM'] / team_agg['Sum_MIN']
            print(f"  Calculated weighted averages for {team_agg.shape[0]} teams.")

            # Merge back into the original team_data for the season
            team_data = pd.merge(
                team_data,
                team_agg[['TEAM_ID'] + weighted_avg_columns],
                on='TEAM_ID',
                how='left' # Keep all original teams, add averages if calculated
            )
            print(f"  Merged weighted averages back into team data: {team_data.shape}")


        # --- Optional: Save the updated team_data ---
        # This will overwrite the previous clean file with the new columns
        updated_team_data_path = os.path.join(clean_data_path, f'{season}.csv')
        team_data.to_csv(updated_team_data_path, index=False)
        print(f"  Saved updated team data (with weighted averages) to {updated_team_data_path}")

    except Exception as e:
        # Best effort per season: report the failure and continue with the next one.
        print(f"  An error occurred processing season {season}: {e}")

print("\nFinished processing all seasons.")

Loaded player length data: (505, 7)
Processed player length data (valid measurements): (505, 3)

Starting seasonal processing for: ['21-22', '22-23', '23-24', '24-25']

Processing season: 21-22
  Loaded team data ((30, 13)) and player minutes ((605, 7))
  Merged player minutes with length data: (605, 11)
  Filtered to valid players with minutes & measurements: (330, 11)
  Calculated weighted averages for 30 teams.
  Merged weighted averages back into team data: (30, 15)
  Saved updated team data (with weighted averages) to ../data/cleandata/21-22.csv

Processing season: 22-23
  Loaded team data ((30, 13)) and player minutes ((539, 7))
  Merged player minutes with length data: (539, 11)
  Filtered to valid players with minutes & measurements: (386, 11)
  Calculated weighted averages for 30 teams.
  Merged weighted averages back into team data: (30, 15)
  Saved updated team data (with weighted averages) to ../data/cleandata/22-23.csv

Processing season: 23-24
  Loaded team data ((30, 13)