Step 1: Pulling in data from nba_api

In [None]:
!pip install nba_api pandas numpy unidecode beautifulsoup4 lxml requests
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguedashplayerstats, leaguedashteamstats
from nba_api.stats.static import teams
import time
import os

# Create the working directories up front; exist_ok=True keeps the cell safe to re-run
for directory in ("data/raw", "data/processed", "outputs/visualizations"):
    os.makedirs(directory, exist_ok=True)

print("Directories created successfully.")

In [None]:
# Function to collect ALL seasons with retry logic

def collect_player_stats_season(season, min_games=40, min_mpg=20, timeout=60):
    """
    Fetch one season of per-player defensive stats and apply playing-time filters.

    Two LeagueDashPlayerStats requests are issued (traditional per-game numbers,
    then the 'Advanced' measure type) and joined on PLAYER_ID.

    Parameters:
    - season: season label passed to the endpoint, e.g. '2023-24'
    - min_games: minimum GP required to keep a player
    - min_mpg: minimum MIN (per-game minutes) required to keep a player
    - timeout: timeout value forwarded to both API requests

    Returns:
    - DataFrame with PLAYER_ID, PLAYER_NAME, TEAM_ABBREVIATION, GP, MIN, STL,
      BLK, DEF_RATING, DREB_PCT and SEASON for the players passing both filters
    """
    print(f"Fetching data for {season}...")

    # Arguments common to both requests
    shared_kwargs = dict(
        season=season,
        season_type_all_star='Regular Season',
        timeout=timeout,
    )

    # Request 1: traditional box-score stats, per game
    box_frame = leaguedashplayerstats.LeagueDashPlayerStats(
        per_mode_detailed='PerGame', **shared_kwargs
    ).get_data_frames()[0]

    # Pause between the two requests (rate limiting)
    time.sleep(2)

    # Request 2: advanced stats, the source of DEF_RATING and DREB_PCT
    adv_frame = leaguedashplayerstats.LeagueDashPlayerStats(
        measure_type_detailed_defense='Advanced', **shared_kwargs
    ).get_data_frames()[0]

    # Keep only the columns the pipeline uses from each response
    box_part = box_frame[['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ABBREVIATION',
                          'GP', 'MIN', 'STL', 'BLK']].copy()
    adv_part = adv_frame[['PLAYER_ID', 'DEF_RATING', 'DREB_PCT']].copy()

    # Left join: every traditional row survives, advanced columns may be NaN
    combined = box_part.merge(adv_part, on='PLAYER_ID', how='left')

    # Playing-time filters
    enough_games = combined['GP'] >= min_games
    enough_minutes = combined['MIN'] >= min_mpg
    kept = combined[enough_games & enough_minutes].copy()

    # Tag rows so seasons can be stacked later
    kept['SEASON'] = season

    print(f"  Players before filter: {len(combined)}")
    print(f"  Players after filter: {len(kept)}")

    return kept


def collect_all_player_stats(start_season='2014-15', end_season='2023-24', 
                              min_games=40, min_mpg=20, max_retries=3):
    """
    Gather filtered player stats for every season in an inclusive range.

    Each season is fetched through collect_player_stats_season and attempted up
    to `max_retries` times; the wait between attempts grows linearly
    (5s, 10s, ...).

    Returns:
    - (DataFrame with all fetched seasons stacked, list of seasons that never
      succeeded); the DataFrame is None when no season could be fetched
    """
    # Expand the start/end labels into one 'YYYY-YY' string per season
    first_year = int(start_season.split('-')[0])
    last_year = int(end_season.split('-')[0])
    seasons = [f"{yr}-{str(yr + 1)[-2:]}" for yr in range(first_year, last_year + 1)]

    print(f"Collecting data for {len(seasons)} seasons: {seasons[0]} to {seasons[-1]}\n")

    season_frames = []
    failed_seasons = []

    for season in seasons:
        fetched = False
        attempt = 0

        while attempt < max_retries:
            try:
                season_frames.append(
                    collect_player_stats_season(season, min_games, min_mpg)
                )
                fetched = True
                time.sleep(3)  # breathing room before the next season
                break
            except Exception as e:
                print(f"  ERROR on attempt {attempt + 1}/{max_retries} for {season}: {e}")
                if attempt < max_retries - 1:
                    wait_time = 5 * (attempt + 1)  # linear backoff
                    print(f"  Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
            attempt += 1

        if not fetched:
            print(f"  FAILED to fetch {season} after {max_retries} attempts")
            failed_seasons.append(season)

    # Nothing came back at all: signal that with None
    if not season_frames:
        print("No data collected!")
        return None, failed_seasons

    # Stack the per-season frames into one table and report
    df_all = pd.concat(season_frames, ignore_index=True)

    print(f"\n{'='*50}")
    print(f"Total players collected: {len(df_all)}")
    print(f"Seasons: {df_all['SEASON'].nunique()}")
    if failed_seasons:
        print(f"Failed seasons: {failed_seasons}")
    print(f"{'='*50}")

    return df_all, failed_seasons

# Run the multi-season pull; `failed` lists the seasons that exhausted their retries
df_historical, failed = collect_all_player_stats()

# Persist only when at least one season came back
if df_historical is not None:
    df_historical.to_csv('data/raw/player_stats_historical.csv', index=False)
    print("\nSaved to: data/raw/player_stats_historical.csv")

# Surface partial failures so they can be retried by hand
if df_historical is not None and failed:
    print(f"\nWARNING: Failed to collect data for: {failed}\n"
          "You can manually retry these seasons if needed.")

Collecting data for 10 seasons: 2014-15 to 2023-24

Fetching data for 2014-15...
  Players before filter: 492
  Players after filter: 219
Fetching data for 2015-16...
  Players before filter: 476
  Players after filter: 219
Fetching data for 2016-17...
  Players before filter: 486
  Players after filter: 222
Fetching data for 2017-18...
  Players before filter: 540
  Players after filter: 224
Fetching data for 2018-19...
  Players before filter: 530
  Players after filter: 230
Fetching data for 2019-20...
  Players before filter: 529
  Players after filter: 212
Fetching data for 2020-21...
  Players before filter: 540
  Players after filter: 219
Fetching data for 2021-22...
  Players before filter: 605
  Players after filter: 238
Fetching data for 2022-23...
  Players before filter: 539
  Players after filter: 226
Fetching data for 2023-24...
  Players before filter: 572
  Players after filter: 222

Total players collected: 2231
Seasons: 10

Saved to: data/raw/player_stats_historical.csv

In [None]:
# Function to collect NBA team defensive stats with flexible game filter

def collect_team_stats_season(season, timeout=60, min_games=70):
    """
    Collect NBA team defensive stats for one season.

    Rows are first restricted to NBA franchises using the static team list
    shipped with nba_api, then to teams with at least `min_games` games played.
    When no team has reached `min_games` yet (a season still in progress), the
    games-played threshold is skipped so the function does not return an empty
    table.

    Parameters:
    - season: season label passed to the endpoint, e.g. '2023-24'
    - timeout: timeout value forwarded to the API request
    - min_games: games-played threshold applied to completed seasons

    Returns:
    - DataFrame with TEAM_ID, TEAM_NAME, GP, W, L, W_PCT, DEF_RATING,
      DEF_RANK (1 = lowest DEF_RATING) and SEASON; at most 30 rows
    """
    print(f"Fetching team stats for {season}...")

    team_stats = leaguedashteamstats.LeagueDashTeamStats(
        season=season,
        season_type_all_star='Regular Season',
        measure_type_detailed_defense='Advanced',
        timeout=timeout
    )
    df_team = team_stats.get_data_frames()[0]

    # Select relevant columns
    cols = ['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'DEF_RATING']
    df_selected = df_team[cols].copy()

    # Keep NBA franchises by ID rather than inferring the league from GP alone
    nba_team_ids = {team['id'] for team in teams.get_teams()}
    df_nba = df_selected[df_selected['TEAM_ID'].isin(nba_team_ids)]

    # Games-played threshold for completed seasons
    df_filtered = df_nba[df_nba['GP'] >= min_games]
    if df_filtered.empty and not df_nba.empty:
        # Season in progress: nobody has reached the threshold yet, so applying
        # it would silently wipe out every team
        print(f"  No team has played {min_games}+ games yet; skipping the games-played filter")
        df_filtered = df_nba

    # Safety cap: never keep more than 30 teams
    df_filtered = df_filtered.nlargest(30, 'GP').copy()

    # Calculate defensive rank (lower DEF_RATING is better)
    df_filtered['DEF_RANK'] = df_filtered['DEF_RATING'].rank(method='min').astype(int)

    # Add season
    df_filtered['SEASON'] = season

    print(f"  NBA teams collected: {len(df_filtered)} (GP range: {df_filtered['GP'].min()}-{df_filtered['GP'].max()})")

    return df_filtered


def collect_all_team_stats(start_season='2014-15', end_season='2023-24', max_retries=3):
    """
    Gather team defensive stats for every season in an inclusive range.

    Each season is fetched through collect_team_stats_season and attempted up to
    `max_retries` times, waiting 5s, 10s, ... between attempts.

    Returns:
    - (DataFrame with all fetched seasons stacked, list of seasons that never
      succeeded); the DataFrame is None when no season could be fetched
    """
    first_year = int(start_season.split('-')[0])
    last_year = int(end_season.split('-')[0])
    seasons = [f"{yr}-{str(yr + 1)[-2:]}" for yr in range(first_year, last_year + 1)]

    print(f"Collecting team data for {len(seasons)} seasons\n")

    season_frames = []
    failed_seasons = []

    for season in seasons:
        fetched = False
        attempt = 0

        while attempt < max_retries:
            try:
                season_frames.append(collect_team_stats_season(season))
                fetched = True
                time.sleep(2)  # rate limiting between seasons
                break
            except Exception as e:
                print(f"  ERROR on attempt {attempt + 1}/{max_retries} for {season}: {e}")
                if attempt < max_retries - 1:
                    wait_time = 5 * (attempt + 1)  # linear backoff
                    print(f"  Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
            attempt += 1

        if not fetched:
            print(f"  FAILED to fetch {season} after {max_retries} attempts")
            failed_seasons.append(season)

    if not season_frames:
        print("No data collected!")
        return None, failed_seasons

    df_all = pd.concat(season_frames, ignore_index=True)

    print(f"\n{'='*50}")
    print(f"Total NBA team-seasons collected: {len(df_all)}")
    print(f"Expected: {len(seasons) * 30} (30 teams × {len(seasons)} seasons)")
    print(f"\nGames played by season:")
    # One line per season, in order of first appearance
    for season in df_all['SEASON'].unique():
        season_gp = df_all.loc[df_all['SEASON'] == season, 'GP']
        print(f"  {season}: {season_gp.min()}-{season_gp.max()} games")
    if failed_seasons:
        print(f"\nFailed seasons: {failed_seasons}")
    print(f"{'='*50}")

    return df_all, failed_seasons


# Pull every season of team defense; `failed_teams` lists seasons that ran out of retries
df_teams, failed_teams = collect_all_team_stats()

if df_teams is not None:
    # Persist the combined table for the cleaning stage
    df_teams.to_csv('data/raw/team_stats_historical.csv', index=False)
    print("\nSaved to: data/raw/team_stats_historical.csv")

    # Sanity check: the five best-ranked defenses of the most recent season
    print("\nSample: Top 5 defenses (2023-24):")
    df_sample = df_teams.loc[df_teams['SEASON'] == '2023-24'].nsmallest(5, 'DEF_RANK')
    print(df_sample.loc[:, ['TEAM_NAME', 'DEF_RATING', 'DEF_RANK', 'W', 'L', 'GP']].to_string(index=False))

if df_teams is not None and failed_teams:
    print(f"\nWARNING: Failed to collect data for: {failed_teams}")

Collecting team data for 10 seasons

Fetching team stats for 2014-15...
  NBA teams collected: 30 (GP range: 82-82)
Fetching team stats for 2015-16...
  NBA teams collected: 30 (GP range: 82-82)
Fetching team stats for 2016-17...
  NBA teams collected: 30 (GP range: 82-82)
Fetching team stats for 2017-18...
  NBA teams collected: 30 (GP range: 82-82)
Fetching team stats for 2018-19...
  NBA teams collected: 30 (GP range: 82-82)
Fetching team stats for 2019-20...
  NBA teams collected: 30 (GP range: 70-83)
Fetching team stats for 2020-21...
  NBA teams collected: 30 (GP range: 72-72)
Fetching team stats for 2021-22...
  NBA teams collected: 30 (GP range: 86-90)
Fetching team stats for 2022-23...
  NBA teams collected: 30 (GP range: 87-90)
Fetching team stats for 2023-24...
  NBA teams collected: 30 (GP range: 87-90)

Total NBA team-seasons collected: 300
Expected: 300 (30 teams × 10 seasons)

Games played by season:
  2014-15: 82-82 games
  2015-16: 82-82 games
  2016-17: 82-82 games
  

In [None]:
# Collect current season data (2025-26) with retry logic

def collect_current_season_data(max_retries=3, current_season='2025-26'):
    """
    Collect current-season player and team data for predictions.

    Early in a season few games have been played, so the player filters are
    relaxed to 20 GP / 15 MPG.

    Parameters:
    - max_retries: attempts allowed for each of the two fetches
    - current_season: season label to fetch (defaults to '2025-26')

    Returns:
    - (players DataFrame, teams DataFrame); (None, None) if the player fetch
      fails, (players, None) if only the team fetch fails

    Side effects:
    - writes data/raw/player_stats_current.csv as soon as players are fetched
    - writes data/raw/team_stats_current.csv when the team fetch succeeds
    """
    def _fetch_with_retries(label, fetch):
        """Call fetch() up to max_retries times (5s, 10s, ... between tries); None if all fail."""
        for attempt in range(max_retries):
            try:
                return fetch()
            except Exception as e:
                print(f"  ERROR fetching {label} (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    wait_time = 5 * (attempt + 1)
                    print(f"  Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
        return None

    print(f"Fetching current season data: {current_season}")
    print("Note: Using relaxed filters for early season (20 GP, 15 MPG)\n")

    # Collect player stats with retry
    df_players = _fetch_with_retries(
        'players',
        lambda: collect_player_stats_season(
            season=current_season,
            min_games=20,  # Relaxed for early season
            min_mpg=15     # Relaxed for early season
        ),
    )
    if df_players is None:
        print("FAILED to collect current season player data")
        return None, None

    # Save players right away so a later team failure cannot lose them
    df_players.to_csv('data/raw/player_stats_current.csv', index=False)

    time.sleep(3)

    # Collect team stats with retry
    df_teams = _fetch_with_retries(
        'teams',
        lambda: collect_team_stats_season(current_season),
    )
    if df_teams is None:
        print("FAILED to collect current season team data")
        return df_players, None

    df_teams.to_csv('data/raw/team_stats_current.csv', index=False)

    print(f"\n✓ Current season players: {len(df_players)}")
    if len(df_teams) > 0:
        print(f"✓ Current season teams: {len(df_teams)}")
    else:
        # An empty team table is written (later cells read the file) but flagged
        print("⚠ Current season teams: 0 - team table is empty, team-based features cannot be merged")
    print("\nSaved current season data to data/raw/")

    return df_players, df_teams


# Collect current season
# Either result may be None: a failed player fetch yields (None, None),
# a failed team fetch yields (players, None).
df_current_players, df_current_teams = collect_current_season_data()

Fetching current season data: 2025-26
Note: Using relaxed filters for early season (20 GP, 15 MPG)

Fetching data for 2025-26...
  Players before filter: 491
  Players after filter: 129
Fetching team stats for 2025-26...
  NBA teams collected: 0 (GP range: nan-nan)

✓ Current season players: 129
✓ Current season teams: 0

Saved current season data to data/raw/


In [None]:
# Summary of collected data
# Reports what each collection step produced and whether the next stage can start.

print("="*60)
print("DATA COLLECTION SUMMARY")
print("="*60)

# Check if data exists
if df_historical is not None:
    print(f"\n1. Historical Player Stats (2014-15 to 2023-24)")
    print(f"   Total records: {len(df_historical)}")
    print(f"   Unique players: {df_historical['PLAYER_NAME'].nunique()}")
    print(f"   Seasons covered: {df_historical['SEASON'].nunique()}")
    print(f"   Avg players per season: {len(df_historical) / df_historical['SEASON'].nunique():.0f}")
    print(f"   File: data/raw/player_stats_historical.csv")
    
    # Show sample of stats
    print(f"\n   Sample stats ranges:")
    print(f"   - Games Played: {df_historical['GP'].min():.0f} to {df_historical['GP'].max():.0f}")
    print(f"   - Minutes: {df_historical['MIN'].min():.1f} to {df_historical['MIN'].max():.1f}")
    print(f"   - Steals: {df_historical['STL'].min():.1f} to {df_historical['STL'].max():.1f}")
    print(f"   - Blocks: {df_historical['BLK'].min():.1f} to {df_historical['BLK'].max():.1f}")
else:
    print("\n1. Historical Player Stats: NOT COLLECTED")

if df_teams is not None:
    print(f"\n2. Historical Team Stats (2014-15 to 2023-24)")
    print(f"   Total records: {len(df_teams)}")
    print(f"   Teams per season: {len(df_teams) / df_teams['SEASON'].nunique():.0f}")
    print(f"   File: data/raw/team_stats_historical.csv")
    
    # Show defensive rating range
    print(f"\n   Defensive Rating range:")
    print(f"   - Best: {df_teams['DEF_RATING'].min():.1f}")
    print(f"   - Worst: {df_teams['DEF_RATING'].max():.1f}")
else:
    print("\n2. Historical Team Stats: NOT COLLECTED")

# Current-season team data counts only when the table exists AND has rows
has_current_teams = df_current_teams is not None and len(df_current_teams) > 0

if df_current_players is not None:
    # The current-season cell collects 2025-26, so label it as such
    print(f"\n3. Current Season Data (2025-26)")
    print(f"   Players: {len(df_current_players)}")
    print(f"   Teams: {len(df_current_teams) if df_current_teams is not None else 0}")
    print(f"   Files: data/raw/player_stats_current.csv")
    print(f"          data/raw/team_stats_current.csv")
else:
    print("\n3. Current Season Data: NOT COLLECTED")

print("\n" + "="*60)

# Check if we have everything we need
if df_historical is not None and df_teams is not None and df_current_players is not None:
    if has_current_teams:
        print("✓ DATA COLLECTION COMPLETE! Ready for feature engineering.")
    else:
        # Do not report a clean run when the current-season team table is missing or empty
        print("⚠ DATA COLLECTED, but current-season team stats are missing or empty -")
        print("  team-based features for the current season will need a fallback.")
else:
    print("⚠ INCOMPLETE DATA - Check errors above and retry failed seasons")

print("="*60)

DATA COLLECTION SUMMARY

1. Historical Player Stats (2014-15 to 2023-24)
   Total records: 2231
   Unique players: 598
   Seasons covered: 10
   Avg players per season: 223
   File: data/raw/player_stats_historical.csv

   Sample stats ranges:
   - Games Played: 40 to 84
   - Minutes: 20.0 to 38.7
   - Steals: 0.1 to 2.4
   - Blocks: 0.0 to 3.7

2. Historical Team Stats (2014-15 to 2023-24)
   Total records: 300
   Teams per season: 30
   File: data/raw/team_stats_historical.csv

   Defensive Rating range:
   - Best: 98.2
   - Worst: 118.9

3. Current Season Data (2024-25)
   Players: 129
   Teams: 0
   Files: data/raw/player_stats_current.csv
          data/raw/team_stats_current.csv

✓ DATA COLLECTION COMPLETE! Ready for feature engineering.


Implementation + Cleaning

In [None]:
# Load all datasets

import pandas as pd
import numpy as np

# Read every raw CSV produced by the collection stage (plus the DPOY vote file)
df_player_hist = pd.read_csv('data/raw/player_stats_historical.csv')
df_player_curr = pd.read_csv('data/raw/player_stats_current.csv')
df_team_hist = pd.read_csv('data/raw/team_stats_historical.csv')
df_team_curr = pd.read_csv('data/raw/team_stats_current.csv')
df_dpoy = pd.read_csv('data/raw/dpoy_votes.csv')

print("Data loaded successfully!")
# One "label: row count" line per reported dataset
print("\n".join(
    f"{label}: {len(frame)}"
    for label, frame in (
        ("Historical players", df_player_hist),
        ("Current players", df_player_curr),
        ("Historical teams", df_team_hist),
        ("DPOY votes", df_dpoy),
    )
))

Data loaded successfully!
Historical players: 2231
Current players: 129
Historical teams: 300
DPOY votes: 92


In [None]:
# Data Cleaning Function

def clean_player_data(df):
    """
    Clean a player-season table.

    Steps, in order:
    - drop rows without DEF_RATING
    - keep one row per (PLAYER_NAME, SEASON): the one with the highest GP
    - coerce the stat columns to numeric (unparseable values become NaN)
    - drop every row that still contains a null

    Returns a new DataFrame ordered by GP descending; the input is not modified.
    """
    print(f"Starting rows: {len(df)}")

    # Rows without a defensive rating are unusable downstream
    with_rating = df.dropna(subset=['DEF_RATING'])
    print(f"After dropping missing DEF_RATING: {len(with_rating)}")

    # Traded players: after sorting by GP, the first row per player-season wins
    one_row_each = (
        with_rating
        .sort_values('GP', ascending=False)
        .drop_duplicates(subset=['PLAYER_NAME', 'SEASON'], keep='first')
    )
    print(f"After handling traded players: {len(one_row_each)}")

    # Force the stat columns to numeric dtypes
    stat_columns = ('GP', 'MIN', 'STL', 'BLK', 'DEF_RATING', 'DREB_PCT')
    coerced = one_row_each.assign(**{
        name: pd.to_numeric(one_row_each[name], errors='coerce')
        for name in stat_columns
    })

    # Anything that failed coercion, or was null to begin with, goes
    cleaned = coerced.dropna()
    print(f"Final rows after cleaning: {len(cleaned)}")

    return cleaned

# Clean both tables with the same routine (working on copies of the loaded frames)
df_player_hist_clean, df_player_curr_clean = (
    clean_player_data(loaded.copy()) for loaded in (df_player_hist, df_player_curr)
)

print("\nHistorical data sample:")
print(df_player_hist_clean.head())

In [None]:
# Feature Engineering Function

def engineer_features(df):
    """
    Add the engineered columns to a copy of `df`:
    - steals_p36 / blocks_p36: STL and BLK rescaled to 36 minutes
    - stocks_p36: steals_p36 + blocks_p36
    - availability_score: (GP / 82) * (MIN / 36)
    """
    enriched = df.copy().assign(
        # Per 36 minute stats
        steals_p36=lambda d: (d['STL'] / d['MIN']) * 36,
        blocks_p36=lambda d: (d['BLK'] / d['MIN']) * 36,
        # Defensive "stocks" = steals + blocks
        stocks_p36=lambda d: d['steals_p36'] + d['blocks_p36'],
        # Share of an 82-game season times share of a 36-minute workload
        availability_score=lambda d: (d['GP'] / 82) * (d['MIN'] / 36),
    )

    print(f"Features engineered for {len(enriched)} players")
    return enriched

# Derive the engineered columns for the historical and the current table
df_player_hist_features, df_player_curr_features = (
    engineer_features(cleaned) for cleaned in (df_player_hist_clean, df_player_curr_clean)
)

print("\nNew features created:")
print(df_player_hist_features.loc[:, ['PLAYER_NAME', 'SEASON', 'steals_p36',
                                      'blocks_p36', 'stocks_p36',
                                      'availability_score']].head())

In [None]:
# Merge with Team Defensive Stats

def merge_team_stats(df_players, df_teams):
    """
    Attach team defensive stats to each player-season row.

    Team rows are keyed by SEASON and a TEAM_ABBREVIATION derived from
    TEAM_NAME; the team's DEF_RATING arrives as DEF_RATING_team next to the
    player's own DEF_RATING, together with DEF_RANK.

    Parameters:
    - df_players: player table with SEASON and TEAM_ABBREVIATION
    - df_teams: team table with SEASON, TEAM_NAME, DEF_RATING, DEF_RANK
      (left unmodified)

    Returns:
    - left-merged DataFrame; players without a matching team keep NaN in
      DEF_RANK / DEF_RATING_team
    """
    print("Player team abbreviations sample:", df_players['TEAM_ABBREVIATION'].unique()[:5])
    print("Team names sample:", df_teams['TEAM_NAME'].unique()[:5])

    # Full team name -> abbreviation used in the player table
    team_abbrev_map = {
        'Atlanta Hawks': 'ATL', 'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BKN',
        'Charlotte Hornets': 'CHA', 'Chicago Bulls': 'CHI', 'Cleveland Cavaliers': 'CLE',
        'Dallas Mavericks': 'DAL', 'Denver Nuggets': 'DEN', 'Detroit Pistons': 'DET',
        'Golden State Warriors': 'GSW', 'Houston Rockets': 'HOU', 'Indiana Pacers': 'IND',
        'LA Clippers': 'LAC', 'Los Angeles Lakers': 'LAL', 'Memphis Grizzlies': 'MEM',
        'Miami Heat': 'MIA', 'Milwaukee Bucks': 'MIL', 'Minnesota Timberwolves': 'MIN',
        'New Orleans Pelicans': 'NOP', 'New York Knicks': 'NYK', 'Oklahoma City Thunder': 'OKC',
        'Orlando Magic': 'ORL', 'Philadelphia 76ers': 'PHI', 'Phoenix Suns': 'PHX',
        'Portland Trail Blazers': 'POR', 'Sacramento Kings': 'SAC', 'San Antonio Spurs': 'SAS',
        'Toronto Raptors': 'TOR', 'Utah Jazz': 'UTA', 'Washington Wizards': 'WAS',
        'New Jersey Nets': 'NJN', 'Charlotte Bobcats': 'CHA', 'Seattle SuperSonics': 'SEA',
        'New Orleans Hornets': 'NOH', 'New Orleans/Oklahoma City Hornets': 'NOK',
        # Historical spelling of the Clippers' name; presumably what older seasons
        # return - harmless if it never appears (TODO confirm against the API)
        'Los Angeles Clippers': 'LAC',
    }

    # Surface names the map does not know instead of silently producing NaN keys
    unmapped_names = sorted(set(df_teams['TEAM_NAME'].dropna()) - set(team_abbrev_map))
    if unmapped_names:
        print(f"WARNING: no abbreviation mapping for team names: {unmapped_names}")

    # Build the keyed table on a copy so the caller's frame is not mutated
    df_teams_keyed = df_teams.assign(
        TEAM_ABBREVIATION=df_teams['TEAM_NAME'].map(team_abbrev_map)
    )

    # Now merge on SEASON and TEAM_ABBREVIATION
    df_merged = df_players.merge(
        df_teams_keyed[['SEASON', 'TEAM_ABBREVIATION', 'DEF_RATING', 'DEF_RANK']],
        on=['SEASON', 'TEAM_ABBREVIATION'],
        how='left',
        suffixes=('', '_team')
    )

    print(f"Merged {len(df_merged)} player-seasons")
    print(f"Missing team matches: {df_merged['DEF_RANK'].isna().sum()}")

    return df_merged

# Attach team defense to the historical and the current player tables
df_hist_with_team, df_curr_with_team = (
    merge_team_stats(player_frame, team_frame)
    for player_frame, team_frame in (
        (df_player_hist_features, df_team_hist),
        (df_player_curr_features, df_team_curr),
    )
)

print("\nMerged data sample:")
print(df_hist_with_team.loc[:, ['PLAYER_NAME', 'SEASON', 'TEAM_ABBREVIATION',
                                'DEF_RANK']].head())

In [None]:
# Create Team-Based Interaction Features

def create_interaction_features(df):
    """
    Add features that blend individual production with team defense.

    New columns (added to a copy of `df`):
    - team_def_quality: (31 - DEF_RANK) / 30, so rank 1 maps to 1.0
    - defense_impact: stocks_p36 * team_def_quality
    - elite_defender_context: 1 when stocks_p36 is above the table's 75th
      percentile AND the team ranks in the top 10, else 0
    """
    # Threshold is taken over the whole table passed in
    stocks_cutoff = df['stocks_p36'].quantile(0.75)

    enriched = df.copy().assign(
        team_def_quality=lambda d: (31 - d['DEF_RANK']) / 30,
        defense_impact=lambda d: d['stocks_p36'] * d['team_def_quality'],
        elite_defender_context=lambda d: (
            (d['stocks_p36'] > stocks_cutoff) & (d['DEF_RANK'] <= 10)
        ).astype(int),
    )

    print(f"Interaction features created for {len(enriched)} players")
    return enriched

# Build the interaction features on the historical table
df_hist_full = create_interaction_features(df_hist_with_team)

print("\nInteraction features sample:")
print(df_hist_full.head(10).loc[:, ['PLAYER_NAME', 'SEASON', 'DEF_RANK',
                                    'team_def_quality', 'defense_impact',
                                    'elite_defender_context']])

In [None]:
# Merge with DPOY Voting Data

def merge_dpoy_votes(df_players, df_votes):
    """
    Attach DPOY vote shares to player-season rows.

    Rows are matched on (PLAYER_NAME, SEASON) with a left join; players with no
    matching vote row, or with a non-numeric share, end up with 0.
    """
    vote_columns = df_votes[['PLAYER_NAME', 'SEASON', 'DPOY_VOTE_SHARE']]
    df_merged = df_players.merge(vote_columns, on=['PLAYER_NAME', 'SEASON'], how='left')

    # Coerce to numeric first, then treat every missing share as "no votes"
    df_merged['DPOY_VOTE_SHARE'] = (
        pd.to_numeric(df_merged['DPOY_VOTE_SHARE'], errors='coerce').fillna(0)
    )

    share = df_merged['DPOY_VOTE_SHARE']
    print(f"Players with votes > 0: {(share > 0).sum()}")
    print(f"Players with votes = 0: {(share == 0).sum()}")

    return df_merged

# Label every historical player-season with its DPOY vote share (0 when none)
df_training = merge_dpoy_votes(df_hist_full, df_dpoy)

print("\nVote distribution:")
print(df_training['DPOY_VOTE_SHARE'].describe())

print("\nTop DPOY candidates in data:")
print(df_training.nlargest(10, 'DPOY_VOTE_SHARE').loc[:, ['PLAYER_NAME', 'SEASON',
                                                          'DPOY_VOTE_SHARE', 'DEF_RANK']])

In [None]:
# Data Preparation & Column Selection

def prepare_final_dataset(df):
    """
    Reduce `df` to the modelling columns and drop incomplete rows.

    Returns a new DataFrame holding identifiers, raw and engineered features and
    the DPOY_VOTE_SHARE target, with every row containing a null removed.
    """
    identifier_cols = ['PLAYER_NAME', 'SEASON', 'TEAM_ABBREVIATION']
    playing_time_cols = ['GP', 'MIN']
    per36_cols = ['steals_p36', 'blocks_p36', 'stocks_p36']
    player_defense_cols = ['DEF_RATING', 'DREB_PCT']
    team_cols = ['DEF_RANK', 'team_def_quality']
    derived_cols = ['availability_score', 'defense_impact', 'elite_defender_context']
    target_col = ['DPOY_VOTE_SHARE']

    final_cols = (identifier_cols + playing_time_cols + per36_cols
                  + player_defense_cols + team_cols + derived_cols + target_col)

    # Column selection followed by a blanket null drop
    df_final = df.loc[:, final_cols].dropna()

    print(f"Final dataset shape: {df_final.shape}")
    return df_final

# Narrow the labelled table down to the modelling columns
df_final = prepare_final_dataset(df_training)

print("\nFinal dataset info:")
# DataFrame.info() writes its report itself and returns None, which print() then echoes
print(df_final.info())

print("\nFinal dataset sample:")
print(df_final.head())

In [None]:
# Chronological split: everything up to 2022-23 trains, 2023-24 is held out.
# Season labels share the 'YYYY-YY' shape, so string comparison orders them.
df_train = df_final.loc[lambda d: d['SEASON'] <= '2022-23'].copy()
df_test = df_final.loc[lambda d: d['SEASON'] == '2023-24'].copy()

print(f"Training set: {len(df_train)} players")
print(f"Test set: {len(df_test)} players")

# Write the full table and both splits for the modelling stage
df_final.to_csv('data/processed/training_data_full.csv', index=False)
df_train.to_csv('data/processed/training_data.csv', index=False)
df_test.to_csv('data/processed/test_data.csv', index=False)

print("Using estimated rankings based on 2024-25 season projections")

# Current-season rows go through the SAME helpers as the training data
# (clean_player_data -> engineer_features -> create_interaction_features), so
# the two pipelines cannot drift apart.
df_curr = pd.read_csv('data/raw/player_stats_current.csv')

# Clean, then engineer the per-36 and availability features
df_curr = engineer_features(clean_player_data(df_curr))

# ESTIMATE team defensive rankings based on 2024-25 final standings
estimated_team_ranks = {
    'OKC': 1, 'ORL': 2, 'HOU': 3, 'BOS': 4, 'MEM': 5,
    'MIA': 6, 'LAL': 7, 'MIL': 8, 'DEN': 9, 'NYK': 10,
    'MIN': 11, 'LAC': 12, 'PHI': 13, 'CLE': 14, 'GSW': 15,
    'DAL': 16, 'SAC': 17, 'IND': 18, 'PHX': 19, 'NOP': 20,
    'CHI': 21, 'ATL': 22, 'POR': 23, 'SAS': 24, 'TOR': 25,
    'BKN': 26, 'DET': 27, 'UTA': 28, 'WAS': 29, 'CHA': 30
}

# Map estimated rankings
df_curr['DEF_RANK'] = df_curr['TEAM_ABBREVIATION'].map(estimated_team_ranks)

# Report teams missing from the estimate before they receive the fallback rank
unranked_teams = sorted(df_curr.loc[df_curr['DEF_RANK'].isna(), 'TEAM_ABBREVIATION'].unique())
if unranked_teams:
    print(f"WARNING: no estimated rank for {unranked_teams}; assigning rank 15")

# If any teams are missing, assign a mid-table rank (15)
df_curr['DEF_RANK'] = df_curr['DEF_RANK'].fillna(15)

print(f"Assigned estimated DEF_RANK for {len(df_curr)} players")

# Interaction features (team_def_quality, defense_impact, elite_defender_context)
df_curr = create_interaction_features(df_curr)

# Select final columns
final_cols = ['PLAYER_NAME', 'SEASON', 'TEAM_ABBREVIATION', 'GP', 'MIN',
              'steals_p36', 'blocks_p36', 'stocks_p36', 'DEF_RATING', 'DREB_PCT',
              'DEF_RANK', 'team_def_quality', 'availability_score', 'defense_impact', 'elite_defender_context']

df_curr_final = df_curr[final_cols].dropna()

# Save
df_curr_final.to_csv('data/processed/current_season_data.csv', index=False)

print(f"\nCurrent season (2025-26) saved: {len(df_curr_final)} players")
print("\nTop 10 defensive candidates:")
print(df_curr_final.nlargest(10, 'defense_impact')[['PLAYER_NAME', 'TEAM_ABBREVIATION', 'stocks_p36', 'DEF_RANK']])

Model Training + Output

In [None]:
!pip install scikit-learn

In [None]:
# Load Data & Quick EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Read the processed tables written by the preparation stage
df_train = pd.read_csv('data/processed/training_data.csv')
df_test = pd.read_csv('data/processed/test_data.csv')
df_current = pd.read_csv('data/processed/current_season_data.csv')

# Row counts, one line per split
print("\n".join(
    f"{label}: {len(frame)} players"
    for label, frame in (("Train", df_train), ("Test", df_test), ("Current", df_current))
))

# Target distribution: how many training rows received any votes at all
print("\nDPOY Vote Share Distribution:")
print(df_train['DPOY_VOTE_SHARE'].describe())
print(f"\nPlayers with votes > 0: {df_train['DPOY_VOTE_SHARE'].gt(0).sum()}")

In [None]:
# Prepare Features & Target

# Model inputs: every column except the identifiers and the target
feature_cols = [
    # playing time
    'GP', 'MIN',
    # individual production per 36 minutes
    'steals_p36', 'blocks_p36', 'stocks_p36',
    # individual defensive metrics
    'DEF_RATING', 'DREB_PCT',
    # team context
    'DEF_RANK', 'team_def_quality',
    # derived / interaction features
    'availability_score', 'defense_impact', 'elite_defender_context'
]

# If the current-season table is empty, re-read it from disk and force the
# feature columns to numeric before dropping incomplete rows
if not len(df_current):
    df_current = pd.read_csv('data/processed/current_season_data.csv')
    df_current[feature_cols] = df_current[feature_cols].apply(pd.to_numeric, errors='coerce')
    df_current = df_current.dropna(subset=feature_cols)
    print(f"Reloaded: {len(df_current)} players")

# Training inputs / target
X_train, y_train = df_train[feature_cols], df_train['DPOY_VOTE_SHARE']

# Held-out inputs / target
X_test, y_test = df_test[feature_cols], df_test['DPOY_VOTE_SHARE']

# Current season: inputs only, there is no target yet
X_current = df_current[feature_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_current shape: {X_current.shape}")

In [None]:
# Scale Features & Train Model

# Scale features: the scaler is fitted on the training split only and then
# applied unchanged to the test and current-season inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_current_scaled = scaler.transform(X_current)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Feature importance = coefficient MAGNITUDE on the standardized inputs.
# Ranking by the signed value would hide strongly negative coefficients.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': model.coef_
})
importance_order = feature_importance['coefficient'].abs().sort_values(ascending=False).index
feature_importance = feature_importance.reindex(importance_order)

# NOTE: stocks_p36 is the sum of steals_p36 and blocks_p36, and team_def_quality
# is a linear function of DEF_RANK, so individual coefficients should be read
# with care.
print("Feature Importance (Top 5):")
print(feature_importance.head())

In [None]:
# Validate on Test Set (2023-24)

# Predict the held-out season; vote shares live in [0, 1], so clip to that range
y_pred_test = np.clip(model.predict(X_test_scaled), 0, 1)

# Fit quality on the held-out season
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)

print("Model Performance on 2023-24 Season:")
print(f"R² Score: {r2:.3f}")
print(f"Mean Absolute Error: {mae:.3f}")

# Put predictions next to the actual shares for side-by-side inspection
df_test_results = df_test.assign(predicted_vote_share=y_pred_test)

print("\nTop 10 Predicted DPOY Candidates (2023-24):")
top_predictions = df_test_results.nlargest(10, 'predicted_vote_share').loc[
    :, ['PLAYER_NAME', 'TEAM_ABBREVIATION', 'predicted_vote_share', 'DPOY_VOTE_SHARE']
]
print(top_predictions.to_string(index=False))

print("\nActual Top Vote Getters (2023-24):")
actual_top = df_test_results.nlargest(5, 'DPOY_VOTE_SHARE').loc[
    :, ['PLAYER_NAME', 'DPOY_VOTE_SHARE', 'predicted_vote_share']
]
print(actual_top.to_string(index=False))

In [None]:
# Generate 2025-26 Predictions

# Score the current season and clip to the valid vote-share range
y_pred_current = np.clip(model.predict(X_current_scaled), 0, 1)

# Store the raw prediction alongside the player rows
df_current['predicted_vote_share'] = y_pred_current

# Rescale so the predictions across all players sum to 1
df_current['probability'] = df_current['predicted_vote_share'].div(
    df_current['predicted_vote_share'].sum()
)

print("2025-26 DPOY Predictions Generated!")

# Ten highest-probability candidates with their key drivers
print("\nTOP 10 DPOY CANDIDATES FOR 2025-26 SEASON:")
top_10 = df_current.nlargest(10, 'probability').loc[
    :, ['PLAYER_NAME', 'TEAM_ABBREVIATION', 'probability',
        'stocks_p36', 'DEF_RANK', 'defense_impact']
]
print(top_10.to_string(index=False))

In [None]:
# Apply Winner-Takes-All Transform

# Raising the predicted share to a power > 1 concentrates mass on the leaders
CONCENTRATION_POWER = 3  # Higher = more concentrated (try 3-5)

df_current['transformed_score'] = df_current['predicted_vote_share'].pow(CONCENTRATION_POWER)

# Renormalize so the transformed scores sum to 1
df_current['probability'] = df_current['transformed_score'].div(
    df_current['transformed_score'].sum()
)

# Highest probability first, with a fresh 0..n-1 index
df_current = df_current.sort_values('probability', ascending=False).reset_index(drop=True)

print("TOP 10 AFTER TRANSFORMATION:")
print(df_current.head(10).loc[:, ['PLAYER_NAME', 'probability']].to_string(index=False))

# How much of the total mass sits with the leaders
top_3_share = df_current['probability'].head(3).sum()
top_5_share = df_current['probability'].head(5).sum()
print(f"\nTop 3 players: {top_3_share:.1%} of total probability")
print(f"Top 5 players: {top_5_share:.1%} of total probability")
print("\n(Real DPOY voting: Top 3 usually get 70-90% of votes)")

In [None]:
# Save predictions.csv (DELIVERABLE - Correct Format)

# Two columns (player_name, probability), limited to the first ten rows of the
# probability-sorted table
predictions_final = (
    df_current[['PLAYER_NAME', 'probability']]
    .rename(columns={'PLAYER_NAME': 'player_name'})
    .head(10)
)

# Write the deliverable
predictions_final.to_csv('outputs/predictions.csv', index=False)

print("Format: 2 columns (player_name, probability)")
print(f"Total players saved: {len(predictions_final)}")

In [None]:
# Create Visualizations for Slides

import matplotlib.pyplot as plt
import seaborn as sns
import os

# Shared setup for every figure in this cell: plot theme and output folder.
sns.set_style("whitegrid")
os.makedirs('outputs/visualizations', exist_ok=True)

# ============================================================
# VISUALIZATION 1: Feature Importance
# ============================================================
# Horizontal bars of the model coefficients, sorted ascending;
# negative coefficients are drawn in red, the rest in green.
coef_sorted = feature_importance.sort_values('coefficient')
bar_colors = (
    coef_sorted['coefficient']
    .lt(0)
    .map({True: '#d62728', False: '#2ca02c'})
    .tolist()
)

fig_importance, ax_importance = plt.subplots(figsize=(10, 6))
ax_importance.barh(coef_sorted['feature'], coef_sorted['coefficient'], color=bar_colors)
ax_importance.set_xlabel('Coefficient Value', fontsize=12)
ax_importance.set_title('DPOY Model: Feature Importance', fontsize=16, fontweight='bold', pad=20)
# Zero reference line separating negative from positive coefficients
ax_importance.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
fig_importance.tight_layout()
fig_importance.savefig('outputs/visualizations/1_feature_importance.png', dpi=300, bbox_inches='tight')
print("✅ Saved: 1_feature_importance.png")
plt.show()

# ============================================================
# VISUALIZATION 2: Top 10 Predictions for 2025-26
# ============================================================
# Horizontal bar chart of the saved predictions, first row at the top,
# with each bar annotated by its probability as a percentage.
top_10_viz = predictions_final.head(10)
colors_gradient = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(top_10_viz)))

fig_top10, ax_top10 = plt.subplots(figsize=(12, 7))
bars = ax_top10.barh(top_10_viz['player_name'], top_10_viz['probability'], color=colors_gradient)
ax_top10.set_xlabel('Predicted Probability (Vote Share)', fontsize=12)
ax_top10.set_title('Top 10 DPOY Candidates - 2025-26 Season', fontsize=16, fontweight='bold', pad=20)
# barh draws the first row at the bottom; flip so the first row is on top
ax_top10.invert_yaxis()

# Add percentage labels just past the end of each bar
for bar, prob in zip(bars, top_10_viz['probability']):
    label_y = bar.get_y() + bar.get_height()/2
    ax_top10.text(prob + 0.005, label_y, f'{prob*100:.1f}%',
                  va='center', fontsize=10, fontweight='bold')

fig_top10.tight_layout()
fig_top10.savefig('outputs/visualizations/2_top_10_predictions.png', dpi=300, bbox_inches='tight')
print("✅ Saved: 2_top_10_predictions.png")
plt.show()

# ============================================================
# VISUALIZATION 3: Model Performance (Predicted vs Actual 2023-24)
# ============================================================
# Scatter of actual vs predicted values with a y = x reference line;
# points on the dashed line are perfect predictions.
diag_max = max(y_test.max(), y_pred_test.max())  # upper end of the reference line

fig_perf, ax_perf = plt.subplots(figsize=(8, 8))
ax_perf.scatter(y_test, y_pred_test, alpha=0.6, s=100, edgecolors='black', linewidth=0.5)
ax_perf.plot([0, diag_max], [0, diag_max],
             'r--', linewidth=2, label='Perfect Prediction')
ax_perf.set_xlabel('Actual Vote Share (2023-24)', fontsize=12)
ax_perf.set_ylabel('Predicted Vote Share', fontsize=12)
ax_perf.set_title(f'Model Performance on 2023-24 Season\nR² = {r2:.3f}',
                  fontsize=16, fontweight='bold', pad=20)
ax_perf.legend(fontsize=11)
ax_perf.grid(True, alpha=0.3)
fig_perf.tight_layout()
fig_perf.savefig('outputs/visualizations/3_model_performance.png', dpi=300, bbox_inches='tight')
print("✅ Saved: 3_model_performance.png")
plt.show()

# ============================================================
# VISUALIZATION 4: Probability Distribution (Before vs After Transform)
# ============================================================
# Side-by-side histograms of per-player probability: the plain normalization
# of the raw predictions (left) against the current 'probability' column (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# "Before" values are recomputed here from the raw predictions
share_total = df_current['predicted_vote_share'].sum()
raw_probs = df_current['predicted_vote_share'] / share_total

# One entry per panel: (axes, values, bar colour, panel title)
panel_specs = [
    (ax1, raw_probs, 'skyblue', 'Before Winner-Takes-All Transform'),
    (ax2, df_current['probability'], 'lightcoral', 'After Winner-Takes-All Transform'),
]
for panel_ax, panel_probs, panel_color, panel_title in panel_specs:
    panel_mean = panel_probs.mean()
    panel_ax.hist(panel_probs, bins=30, color=panel_color, edgecolor='black', alpha=0.7)
    panel_ax.set_xlabel('Probability', fontsize=11)
    panel_ax.set_ylabel('Number of Players', fontsize=11)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    # Dashed marker at the mean probability of this panel
    panel_ax.axvline(panel_mean, color='red', linestyle='--', linewidth=2,
                     label=f'Mean: {panel_mean:.4f}')
    panel_ax.legend()

fig.suptitle('Probability Distribution Comparison', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
fig.savefig('outputs/visualizations/4_probability_distribution.png', dpi=300, bbox_inches='tight')
print("✅ Saved: 4_probability_distribution.png")
plt.show()