In [1]:
import os
import time
import random
import pandas as pd
from tqdm.notebook import tqdm
from requests.exceptions import ReadTimeout, ConnectTimeout
import urllib3
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog, shotchartdetail, leaguedashplayerstats, playergamelogs

# ==========================================
# 1. Environment & Parameters Setup
# ==========================================
# Seasons to crawl; each string is passed unchanged to the nba_api endpoints.
seasons = ['2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24', '2024-25']
topNPlayers = 150  # Top N scorers per season
datasetDir = 'dataset'

# Create the output directory. exist_ok=True makes the call idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(datasetDir, exist_ok=True)

# Output files: one row per player-game, and one row per shot attempt.
gamesCsvPath = os.path.join(datasetDir, 'games.csv')
shotsCsvPath = os.path.join(datasetDir, 'shots.csv')

In [2]:
# ==========================================
# 2. Helper Functions
# ==========================================

def fetchWithRetry(apiFunc, maxRetries=3, **kwargs):
    """Call an API endpoint with a timeout, a short random delay and retries on timeouts.

    Args:
        apiFunc: Callable invoked as ``apiFunc(**kwargs)`` (the callers pass nba_api
            endpoint classes).
        maxRetries: Maximum number of attempts.
        **kwargs: Forwarded to ``apiFunc``. ``timeout`` defaults to 25 seconds and is
            only set when the caller did not supply one.

    Returns:
        The value returned by ``apiFunc``, or None when every attempt hit a
        timeout/connection-reset error or a non-retryable exception was raised.
    """
    # Respect an explicit timeout from the caller instead of overwriting it.
    kwargs.setdefault('timeout', 25)
    for attempt in range(maxRetries):
        try:
            # Small random pause before every request to pace the calls.
            time.sleep(random.uniform(0.3, 0.6))
            return apiFunc(**kwargs)
        except (ReadTimeout, ConnectTimeout, urllib3.exceptions.ReadTimeoutError, ConnectionResetError):
            # NOTE(review): requests may wrap a connection reset in
            # requests.exceptions.ConnectionError, which is not in this tuple -- confirm
            # whether that case should be retried as well.
            if attempt < maxRetries - 1:
                time.sleep(5)  # Wait longer on error, but not after the final attempt
        except Exception as exc:
            # Not a transient network error: retrying will not help, but say why we gave up
            # so that a systematic failure (e.g. a wrong keyword argument) is visible.
            print(f"fetchWithRetry: {getattr(apiFunc, '__name__', apiFunc)} failed without retry: {exc!r}")
            break
    return None

def getTopScorers(season, topN=100):
    """Return the top ``topN`` players of a season ranked by the 'PTS' column.

    Args:
        season: Season string forwarded to ``LeagueDashPlayerStats``.
        topN: Number of players to keep after sorting by 'PTS' in descending order.

    Returns:
        A list of ``{'PLAYER_ID': ..., 'PLAYER_NAME': ...}`` dicts, or an empty list
        when the request or the processing fails (a warning is printed in that case).
    """
    try:
        stats = leaguedashplayerstats.LeagueDashPlayerStats(season=season, per_mode_detailed='PerGame', timeout=30)
        df = stats.get_data_frames()[0]
        topRows = df.sort_values(by='PTS', ascending=False).head(topN)
        return topRows[['PLAYER_ID', 'PLAYER_NAME']].to_dict('records')
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still stop
        # the notebook, and report the failure: an empty list silently removes a whole
        # season of players from the task list.
        print(f"Warning: could not fetch top scorers for {season}: {exc!r}")
        return []

def cleanDuplicates(filepath, subsetCols):
    """Remove duplicate rows from a CSV file and rewrite the file in place.

    Rows are considered duplicates when they agree on every column in
    ``subsetCols``; the last occurrence is the one that is kept. Nothing happens
    when ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        return

    print(f"üßπ Cleaning duplicates: {filepath} ...", end='\r')
    # low_memory=False reads each column in a single pass, which avoids Dtype warnings.
    deduped = (
        pd.read_csv(filepath, low_memory=False)
        .drop_duplicates(subset=subsetCols, keep='last')
    )
    deduped.to_csv(filepath, index=False)
    print(f"‚úÖ Cleanup Complete: {filepath}        ")

In [3]:
# ==========================================
# 3. Prepare Tasks & Check Resume
# ==========================================
print("Step 1/3: Building and filtering task list...")

# A. Create All Target Tasks (Player x Season)
# A player who is a top scorer in any listed season is crawled for every listed season.
targetPlayerIds = {}
for season in seasons:
    seasonScorers = getTopScorers(season, topNPlayers)
    if not seasonScorers:
        # getTopScorers returns [] on failure; make the gap visible instead of
        # silently building a smaller task list.
        print(f"Warning: no top scorers returned for {season}; its players are missing from the task list")
    for p in seasonScorers:
        targetPlayerIds[p['PLAYER_ID']] = p['PLAYER_NAME']

# Task tuple layout: (player id as str, player name, season as str)
allTasks = [
    (str(pid), pname, str(season))
    for pid, pname in targetPlayerIds.items()
    for season in seasons
]

# B. Read Completed Progress
# A (player, season) pair counts as done when games.csv already holds a row for it.
processedTasks = set()
if os.path.exists(gamesCsvPath):
    try:
        # Force read as string to avoid int/float mismatch
        dfExist = pd.read_csv(gamesCsvPath, usecols=['Player_ID', 'Season'], dtype=str, low_memory=False)
        # Only the distinct pairs matter, so normalise those with vectorized string
        # operations instead of iterating over every game row.
        donePairs = dfExist.dropna(subset=['Player_ID', 'Season']).drop_duplicates()
        # Handle formatted strings like '201939.0'
        pidClean = donePairs['Player_ID'].astype(str).str.split('.').str[0].str.strip()
        seasonClean = donePairs['Season'].astype(str).str.strip()
        processedTasks.update(zip(pidClean, seasonClean))
        print(f"üîÑ Locked {len(processedTasks)} completed tasks (Skipping automatically)")
    except Exception as e:
        print(f"‚ö†Ô∏è Error reading old file (checking again): {e}")

# C. Filter Remaining Tasks
tasksToRun = [t for t in allTasks if (t[0], t[2]) not in processedTasks]
print(f"üöÄ Total Tasks: {len(allTasks)} | Pending: {len(tasksToRun)}")

Step 1/3: Building and filtering task list...
üîÑ Locked 1891 completed tasks (Skipping automatically)
üöÄ Total Tasks: 3321 | Pending: 1434


In [None]:
# ==========================================
# 4. Execute Crawler
# ==========================================

# Columns taken from the advanced game log. The list is fixed so that every frame
# appended to games.csv has the same columns, even when the advanced request fails.
advCols = ['GAME_ID', 'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO',
           'OREB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'USG_PCT', 'PACE', 'PIE']
# Columns written to shots.csv.
shotCols = ['Player_ID', 'GAME_ID', 'LOC_X', 'LOC_Y', 'SHOT_MADE_FLAG', 'SHOT_TYPE', 'ACTION_TYPE']

# CSV paths for which a schema-mismatch warning was already printed (warn once per file).
_schemaWarned = set()


def _appendCsvAligned(df, csvPath):
    """Append ``df`` to ``csvPath`` with its columns matched to the file's header.

    A new or empty file is written with a header. For an existing file the frame is
    reindexed to the header's columns first: missing columns are filled with NaN and
    columns the file does not have are left out (reported once per file), so a row can
    never be shifted under the wrong column name.
    """
    if os.path.exists(csvPath) and os.path.getsize(csvPath) > 0:
        headerCols = list(pd.read_csv(csvPath, nrows=0).columns)
        extraCols = [c for c in df.columns if c not in headerCols]
        if extraCols and csvPath not in _schemaWarned:
            _schemaWarned.add(csvPath)
            print(f"Warning: {csvPath} has no column(s) {extraCols}; they are not written. "
                  f"Rebuild the file to include them.")
        df.reindex(columns=headerCols).to_csv(csvPath, mode='a', header=False, index=False)
    else:
        df.to_csv(csvPath, mode='w', header=True, index=False)


def _crawlPlayerSeason(pid, pName, season):
    """Fetch, label and checkpoint one (player, season) task.

    Returns:
        True when game rows were produced and appended to games.csv, False when the
        base game log could not be fetched or was empty (nothing is written then).
    """
    # --- A. Fetch Base Stats ---
    baseApi = fetchWithRetry(playergamelog.PlayerGameLog, player_id=pid, season=season)
    if not baseApi:
        return False
    dfBase = baseApi.get_data_frames()[0]
    if dfBase.empty:
        return False

    # --- B. Fetch Advanced Stats & Merge ---
    advApi = fetchWithRetry(
        playergamelogs.PlayerGameLogs,
        player_id_nullable=pid, season_nullable=season,
        measure_type_player_game_logs_nullable='Advanced'
    )
    dfMerged = dfBase
    dfAdv = advApi.get_data_frames()[0] if advApi else None
    if dfAdv is not None and not dfAdv.empty and 'GAME_ID' in dfAdv.columns:
        dfBase['Game_ID'] = dfBase['Game_ID'].astype(str)
        dfAdv['GAME_ID'] = dfAdv['GAME_ID'].astype(str)
        validCols = [c for c in advCols if c in dfAdv.columns]
        # One advanced row per game, so the left merge cannot multiply base rows.
        dfAdvSlim = dfAdv[validCols].drop_duplicates(subset='GAME_ID')
        # Merge Base & Advanced
        dfMerged = pd.merge(dfBase, dfAdvSlim, left_on='Game_ID', right_on='GAME_ID', how='left')

    # Always expose the full advanced schema (NaN where unavailable). Without this a
    # failed advanced request yields fewer columns than the header already in games.csv.
    baseCols = list(dfBase.columns)
    dfMerged = dfMerged.reindex(columns=baseCols + [c for c in advCols if c not in baseCols])

    # --- C. Clean & Label Generation ---
    try:
        dfMerged['GAME_DATE'] = pd.to_datetime(dfMerged['GAME_DATE'])
    except Exception as exc:
        # Best effort as before, but no longer swallows KeyboardInterrupt, and the
        # fallback is visible: unparsed dates are sorted as raw values below.
        print(f"Warning: could not parse GAME_DATE for {pName} ({season}); sorting raw values instead: {exc!r}")

    dfMerged = dfMerged.sort_values('GAME_DATE').reset_index(drop=True)
    dfMerged['TARGET_PTS'] = dfMerged['PTS'].shift(-1)  # Next Game Points
    dfMerged['Player_ID'] = pid
    dfMerged['Player_Name'] = pName
    dfMerged['Season'] = season
    dfMerged = dfMerged.dropna(subset=['TARGET_PTS'])  # Remove last game (no label)

    # --- D. Fetch Shot Charts ---
    shotApi = fetchWithRetry(
        shotchartdetail.ShotChartDetail,
        team_id=0, player_id=pid,
        context_measure_simple='FGA', season_nullable=season
    )
    dfShotsOut = None
    if shotApi:
        dfShots = shotApi.get_data_frames()[0]
        if not dfShots.empty:
            dfShotsOut = dfShots.reindex(columns=shotCols)
            # Stamp the player id from the task, as done for the game rows, instead of
            # depending on the API frame having a column named exactly 'Player_ID'.
            dfShotsOut['Player_ID'] = pid

    # --- E. Incremental Save (Checkpoint) ---
    # Shots first, games last: games.csv is what the resume step reads, so a task is
    # only seen as done once its shots are on disk. An interruption in between makes
    # the task run again and append its shots a second time.
    if dfShotsOut is not None:
        _appendCsvAligned(dfShotsOut, shotsCsvPath)
    _appendCsvAligned(dfMerged, gamesCsvPath)
    return True


if not tasksToRun:
    print("üéâ All tasks completed! Proceeding to cleanup.")
else:
    # dynamic_ncols=True adapts to window width
    with tqdm(total=len(tasksToRun), desc="Initializing", dynamic_ncols=True, unit="task") as pbar:
        for pid, pName, season in tasksToRun:
            taskKey = (str(pid), str(season))
            # Double Check: skip anything finished earlier in this kernel session
            if taskKey not in processedTasks:
                pbar.set_description(f"Fetching: {pName} ({season})")
                # NOTE(review): tasks with no game log return False and are not recorded
                # anywhere, so they are requested again on every run.
                if _crawlPlayerSeason(pid, pName, season):
                    processedTasks.add(taskKey)
            # Update Progress (exactly once per task, whatever the outcome)
            pbar.update(1)

Initializing:   0%|          | 0/1434 [00:00<?, ?task/s]

  try: dfMerged['GAME_DATE'] = pd.to_datetime(dfMerged['GAME_DATE'])


KeyboardInterrupt: 

In [None]:
# ==========================================
# 5. Final Cleanup & Deduplication
# ==========================================
print("\nüèÅ Crawling Finished. Starting Final Cleanup...")

# Each job pairs a CSV file with the columns that identify one row in it:
# game data first, then shot data.
# NOTE(review): the shot key has no player or event column, so two distinct shots in
# the same game with equal coordinates and shot type collapse into one row -- confirm
# this is intended.
dedupeJobs = [
    (gamesCsvPath, ['Game_ID', 'Player_ID']),
    (shotsCsvPath, ['GAME_ID', 'LOC_X', 'LOC_Y', 'SHOT_TYPE']),
]
for csvPath, keyCols in dedupeJobs:
    cleanDuplicates(csvPath, subsetCols=keyCols)

print("‚ú® All Done! Ready for model training.")


üèÅ Crawling Finished. Starting Final Cleanup...
‚úÖ Cleanup Complete: dataset\games.csv        
‚úÖ Cleanup Complete: dataset\shots.csv        
‚ú® All Done! Ready for model training.
