# API Call That Gives the CSV with EVERYTHING we need, no FUNCTION 

In [3]:
from nba_api.stats.endpoints import PlayByPlayV2, boxscoresummaryv2
from nba_api.live.nba.endpoints import boxscore
import pandas as pd
import numpy as np

# =================================================================
# SECTION 1: INITIALIZE GAME AND GET TEAM INFORMATION
# =================================================================

# Game ID
game_id = "0022200001"

# Get team names from boxscore
box = boxscore.BoxScore(game_id)
data = box.get_dict()

# Extract team names
home_team_city = data['game']['homeTeam']['teamCity']
home_team_name = data['game']['homeTeam']['teamName']
away_team_city = data['game']['awayTeam']['teamCity']
away_team_name = data['game']['awayTeam']['teamName']

# Create full team names
full_home_team = f"{home_team_city} {home_team_name}"
full_away_team = f"{away_team_city} {away_team_name}"

# =================================================================
# SECTION 2: GET TEAM WINNING PERCENTAGES BEFORE THE GAME
# =================================================================

# Get team winning percentages before the game
box_score = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id)
line_score = box_score.line_score.get_data_frame()

# Extract win-loss records and calculate percentages
win_percentages = {}

for _, row in line_score.iterrows():
    team_abbr = row['TEAM_ABBREVIATION']
    wins_losses = row['TEAM_WINS_LOSSES']
    
    # Parse the W-L format (e.g., "24-8" into wins and losses)
    wins, losses = map(int, wins_losses.split('-'))
    
    # Calculate winning percentage
    win_pct = wins / (wins + losses) if (wins + losses) > 0 else 0
    
    win_percentages[team_abbr] = {
        'record': wins_losses,
        'wins': wins,
        'losses': losses,
        'win_percentage': win_pct,
        'full_team_name': full_home_team if team_abbr == data['game']['homeTeam']['teamTricode'] else full_away_team
    }

print("Team winning percentages before the game:")
print(win_percentages)

# =================================================================
# SECTION 3: GET PLAY-BY-PLAY DATA AND PREPARE DATAFRAME
# =================================================================

# Make the API request for play-by-play data
pbp = PlayByPlayV2(game_id=game_id)

# Convert to DataFrame
pbp_df = pbp.get_data_frames()[0]

# Select relevant columns for analysis
relevant_columns = [
    'GAME_ID',          # Needed to identify the game
    'EVENTNUM',         # To maintain event sequence
    'PERIOD',           # Quarter/overtime information
    'PCTIMESTRING',     # Time remaining in period
    'SCORE',            # Current score
    'SCOREMARGIN',      # Score differential
]

# Create filtered DataFrame
filtered_pbp = pbp_df[relevant_columns]

# =================================================================
# SECTION 4: CLEAN AND TRANSFORM DATA
# =================================================================

# Initialize first row if data is missing
if pd.isna(filtered_pbp['SCORE'].iloc[0]):
    filtered_pbp['SCORE'].iloc[0] = '0 - 0'
if pd.isna(filtered_pbp['SCOREMARGIN'].iloc[0]):
    filtered_pbp['SCOREMARGIN'].iloc[0] = 0

# Forward fill missing scores with the last valid score
filtered_pbp['SCORE'] = filtered_pbp['SCORE'].fillna(method='ffill')
filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].fillna(method='ffill')
filtered_pbp['HOME_TEAM'] = full_home_team
filtered_pbp['AWAY_TEAM'] = full_away_team

# Handle 'TIE' values and convert to numeric
filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].replace('TIE', '0')
filtered_pbp['SCOREMARGIN'] = pd.to_numeric(filtered_pbp['SCOREMARGIN'], errors='coerce')

# Split score into home and away scores
filtered_pbp[['AWAY_SCORE', 'HOME_SCORE']] = filtered_pbp['SCORE'].str.split(' - ', expand=True).astype(int)

# Calculate score difference and verify margin
filtered_pbp['SCORE_DIFF'] = filtered_pbp['HOME_SCORE'] - filtered_pbp['AWAY_SCORE']
filtered_pbp['MARGIN_CHECK'] = filtered_pbp['SCOREMARGIN'] == filtered_pbp['SCORE_DIFF']

# Flag when home team is leading
filtered_pbp['IS_HOME_LEADING'] = (filtered_pbp['SCORE_DIFF'] > 0).astype(int)

# =================================================================
# SECTION 5: ADD TEAM WIN PERCENTAGES TO DATAFRAME
# =================================================================

# Add team win percentages to the dataframe
for team_abbr, team_data in win_percentages.items():
    if team_abbr == data['game']['homeTeam']['teamTricode']:
        filtered_pbp['HOME_TEAM_WIN_PCT'] = team_data['win_percentage']
    else:
        filtered_pbp['AWAY_TEAM_WIN_PCT'] = team_data['win_percentage']

# =================================================================
# SECTION 6: DETERMINE GAME OUTCOME
# =================================================================

# Get the final game state from the last period
final_row = filtered_pbp.loc[filtered_pbp['PERIOD'].idxmax()]
if 'HOME_SCORE' not in final_row or 'AWAY_SCORE' not in final_row:
    # If we don't have separate HOME_SCORE and AWAY_SCORE yet, calculate them
    final_scores = final_row['SCORE'].split(' - ')
    final_home_score = int(final_scores[0])
    final_away_score = int(final_scores[1])
else:
    final_home_score = final_row['HOME_SCORE']
    final_away_score = final_row['AWAY_SCORE']

# Determine if home team won (1 if yes, 0 if no)
home_team_won = 1 if final_home_score > final_away_score else 0

# Add a column to indicate if the home team won
filtered_pbp['HOME_TEAM_WON'] = home_team_won

# =================================================================
# SECTION 7: CONVERT PCTIMESTRING TO SECONDS ELAPSED
# =================================================================

def convert_to_seconds_elapsed(time_str):
    # split the time string into minutes and seconds
    minutes, seconds = map(int, time_str.split(':'))
    seconds_remaining = (minutes * 60) + seconds
    # Time elapsed is the difference between the amount of time in a quarter and the time remaining
    seconds_elapsed =  720 - seconds_remaining
    return seconds_elapsed

filtered_pbp['seconds_elapsed'] = filtered_pbp['PCTIMESTRING'].apply(convert_to_seconds_elapsed)

# =================================================================
# SECTION 8: CREATE 30 SECOND SNAPSHOTS
# =================================================================

def create_30sec_snapshots(df):
    # Create a list to store our snapshot dataframes
    snapshots = []
    
    # Group by GAME_ID and PERIOD
    for (game_id, period), group in df.groupby(['GAME_ID', 'PERIOD']):
        # Create time bins for 30-second intervals (0, 30, 60, ..., 720)
        # Each quarter has 720 seconds total
        time_bins = np.arange(0, 721, 30)
        
        # Create a new dataframe to fill with snapshots
        snapshot_df = pd.DataFrame(time_bins, columns=['seconds_elapsed'])
        snapshot_df['GAME_ID'] = game_id
        snapshot_df['PERIOD'] = period
        
        # Merge with the original data to get the closest observation before each snapshot
        # First sort the group by seconds_elapsed
        group_sorted = group.sort_values('seconds_elapsed')
        
        # For each 30-second mark, find the latest event before it
        for i, seconds in enumerate(time_bins):
            # Get entries before or at this time mark
            events_before = group_sorted[group_sorted['seconds_elapsed'] <= seconds]
            
            if not events_before.empty:
                # Get the latest event
                latest_event = events_before.iloc[-1]
                
                # Update the snapshot with the latest scores and other info
                for col in ['AWAY_SCORE', 'HOME_SCORE', 'SCORE_DIFF', 'IS_HOME_LEADING', 
                           'HOME_TEAM', 'AWAY_TEAM', 'SCORE', 'SCOREMARGIN']:
                    if col in latest_event:
                        snapshot_df.loc[i, col] = latest_event[col]
            
        snapshots.append(snapshot_df)
    
    # Combine all snapshots into one dataframe
    result = pd.concat(snapshots, ignore_index=True)
    
    # Convert seconds_elapsed back to PCTIMESTRING format if needed
    result['PCTIMESTRING'] = result['seconds_elapsed'].apply(
        lambda x: f"{int((720-x)//60):d}:{int((720-x)%60):02d}")
    
    return result

# Apply the function to create 30-second snapshots
snapshots_df = create_30sec_snapshots(filtered_pbp)
    
    

# =================================================================
# SECTION 9: SAVE TO CSV
# =================================================================

# Save to CSV
filtered_pbp.to_csv('nba_game_analytics.csv', index=False)

Team winning percentages before the game:
{'BOS': {'record': '1-0', 'wins': 1, 'losses': 0, 'win_percentage': 1.0, 'full_team_name': 'Boston Celtics'}, 'PHI': {'record': '0-1', 'wins': 0, 'losses': 1, 'win_percentage': 0.0, 'full_team_name': 'Philadelphia 76ers'}}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pbp['SCORE'].iloc[0] = '0 - 0'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pbp['SCOREMARGIN'].iloc[0] = 0
  filtered_pbp['SCORE'] = filtered_pbp['SCORE'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pbp['SCORE'] = filtered_pbp['SCORE'].fillna(method='ffill')
  filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].fillna(method='ffill')
A value is tryin

### All Data Retrieval Organized Into Functions

In [None]:
# =================================================================
# SECTION 1: INITIALIZE GAME AND GET TEAM INFORMATION
# =================================================================

def initialize_game(game_id):
    """
    Get team information for a given game ID.
    
    Args:
        game_id (str): NBA game ID
        
    Returns:
        tuple: Full home team name, full away team name
    """
    # Get team names from boxscore
    box = boxscore.BoxScore(game_id)
    data = box.get_dict()

    # Extract team names
    home_team_city = data['game']['homeTeam']['teamCity']
    home_team_name = data['game']['homeTeam']['teamName']
    away_team_city = data['game']['awayTeam']['teamCity']
    away_team_name = data['game']['awayTeam']['teamName']

    # Create full team names
    full_home_team = f"{home_team_city} {home_team_name}"
    full_away_team = f"{away_team_city} {away_team_name}"
    
    return full_home_team, full_away_team, data

# =================================================================
# SECTION 2: GET TEAM WINNING PERCENTAGES BEFORE THE GAME
# =================================================================

def get_team_win_percentages(game_id, data, full_home_team, full_away_team):
    """
    Get team winning percentages before the game.
    
    Args:
        game_id (str): NBA game ID
        data (dict): Boxscore data
        full_home_team (str): Full home team name
        full_away_team (str): Full away team name
        
    Returns:
        dict: Dictionary with team winning percentages
    """
    # Get team winning percentages before the game
    box_score = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id)
    line_score = box_score.line_score.get_data_frame()

    # Extract win-loss records and calculate percentages
    win_percentages = {}

    for _, row in line_score.iterrows():
        team_abbr = row['TEAM_ABBREVIATION']
        wins_losses = row['TEAM_WINS_LOSSES']
        
        # Parse the W-L format (e.g., "24-8" into wins and losses)
        wins, losses = map(int, wins_losses.split('-'))
        
        # Calculate winning percentage
        win_pct = wins / (wins + losses) if (wins + losses) > 0 else 0
        
        win_percentages[team_abbr] = {
            'record': wins_losses,
            'wins': wins,
            'losses': losses,
            'win_percentage': win_pct,
            'full_team_name': full_home_team if team_abbr == data['game']['homeTeam']['teamTricode'] else full_away_team
        }

 
    return win_percentages

# =================================================================
# SECTION 3: GET PLAY-BY-PLAY DATA AND PREPARE DATAFRAME
# =================================================================

def get_play_by_play_data(game_id, full_home_team, full_away_team):
    """
    Get play-by-play data and prepare the dataframe.
    
    Args:
        game_id (str): NBA game ID
        full_home_team (str): Full home team name
        full_away_team (str): Full away team name
        
    Returns:
        DataFrame: Filtered play-by-play data
    """
    # Make the API request for play-by-play data
    pbp = PlayByPlayV2(game_id=game_id)

    # Convert to DataFrame
    pbp_df = pbp.get_data_frames()[0]

    # Select relevant columns for analysis
    relevant_columns = [
        'GAME_ID',          # Needed to identify the game
        'EVENTNUM',         # To maintain event sequence
        'PERIOD',           # Quarter/overtime information
        'PCTIMESTRING',     # Time remaining in period
        'SCORE',            # Current score
        'SCOREMARGIN',      # Score differential
    ]

    # Create filtered DataFrame
    filtered_pbp = pbp_df[relevant_columns]
    
    return filtered_pbp

# =================================================================
# SECTION 4: CLEAN AND TRANSFORM DATA
# =================================================================

def clean_and_transform_data(filtered_pbp, full_home_team, full_away_team):
    """
    Clean and transform the play-by-play data.
    
    Args:
        filtered_pbp (DataFrame): Filtered play-by-play data
        full_home_team (str): Full home team name
        full_away_team (str): Full away team name
        
    Returns:
        DataFrame: Cleaned and transformed play-by-play data
    """
    # Initialize first row if data is missing
    if pd.isna(filtered_pbp['SCORE'].iloc[0]):
        filtered_pbp['SCORE'].iloc[0] = '0 - 0'
    if pd.isna(filtered_pbp['SCOREMARGIN'].iloc[0]):
        filtered_pbp['SCOREMARGIN'].iloc[0] = 0

    # Forward fill missing scores with the last valid score
    filtered_pbp['SCORE'] = filtered_pbp['SCORE'].fillna(method='ffill')
    filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].fillna(method='ffill')
    filtered_pbp['HOME_TEAM'] = full_home_team
    filtered_pbp['AWAY_TEAM'] = full_away_team

    # Handle 'TIE' values and convert to numeric
    filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].replace('TIE', '0')
    filtered_pbp['SCOREMARGIN'] = pd.to_numeric(filtered_pbp['SCOREMARGIN'], errors='coerce')

    # Split score into home and away scores
    filtered_pbp[['AWAY_SCORE', 'HOME_SCORE']] = filtered_pbp['SCORE'].str.split(' - ', expand=True).astype(int)

    # Calculate score difference and verify margin
    filtered_pbp['SCORE_DIFF'] = filtered_pbp['HOME_SCORE'] - filtered_pbp['AWAY_SCORE']
    filtered_pbp['MARGIN_CHECK'] = filtered_pbp['SCOREMARGIN'] == filtered_pbp['SCORE_DIFF']

    # Flag when home team is leading
    filtered_pbp['IS_HOME_LEADING'] = (filtered_pbp['SCORE_DIFF'] > 0).astype(int)
    
    return filtered_pbp

# =================================================================
# SECTION 5: ADD TEAM WIN PERCENTAGES TO DATAFRAME
# =================================================================

def add_win_percentages_to_df(filtered_pbp, win_percentages, data):
    """
    Add team win percentages to the dataframe.
    
    Args:
        filtered_pbp (DataFrame): Filtered play-by-play data
        win_percentages (dict): Dictionary with team winning percentages
        data (dict): Boxscore data
        
    Returns:
        DataFrame: DataFrame with added win percentages
    """
    # Add team win percentages to the dataframe
    for team_abbr, team_data in win_percentages.items():
        if team_abbr == data['game']['homeTeam']['teamTricode']:
            filtered_pbp['HOME_TEAM_WIN_PCT'] = team_data['win_percentage']
        else:
            filtered_pbp['AWAY_TEAM_WIN_PCT'] = team_data['win_percentage']
    
    return filtered_pbp

# =================================================================
# SECTION 6: DETERMINE GAME OUTCOME
# =================================================================

def determine_game_outcome(filtered_pbp):
    """
    Determine the game outcome.
    
    Args:
        filtered_pbp (DataFrame): Filtered play-by-play data
        
    Returns:
        DataFrame: DataFrame with added game outcome
    """
    # Get the final game state from the last period
    final_row = filtered_pbp.loc[filtered_pbp['PERIOD'].idxmax()]
    if 'HOME_SCORE' not in final_row or 'AWAY_SCORE' not in final_row:
        # If we don't have separate HOME_SCORE and AWAY_SCORE yet, calculate them
        final_scores = final_row['SCORE'].split(' - ')
        final_home_score = int(final_scores[0])
        final_away_score = int(final_scores[1])
    else:
        final_home_score = final_row['HOME_SCORE']
        final_away_score = final_row['AWAY_SCORE']

    # Determine if home team won (1 if yes, 0 if no)
    home_team_won = 1 if final_home_score > final_away_score else 0

    # Add a column to indicate if the home team won
    filtered_pbp['HOME_TEAM_WON'] = home_team_won
    
    return filtered_pbp

# =================================================================
# SECTION 7: CONVERT PCTIMESTRING TO SECONDS ELAPSED
# =================================================================

def convert_to_seconds_elapsed(time_str):
    """
    Convert PCTIMESTRING to seconds elapsed.
    
    Args:
        time_str (str): Time string in MM:SS format
        
    Returns:
        int: Seconds elapsed in the quarter
    """
    # split the time string into minutes and seconds
    minutes, seconds = map(int, time_str.split(':'))
    seconds_remaining = (minutes * 60) + seconds
    # Time elapsed is the difference between the amount of time in a quarter and the time remaining
    seconds_elapsed =  720 - seconds_remaining
    return seconds_elapsed

def add_seconds_elapsed(filtered_pbp):
    """
    Add seconds elapsed column to the dataframe.
    
    Args:
        filtered_pbp (DataFrame): Filtered play-by-play data
        
    Returns:
        DataFrame: DataFrame with added seconds elapsed
    """
    filtered_pbp['seconds_elapsed'] = filtered_pbp['PCTIMESTRING'].apply(convert_to_seconds_elapsed)
    return filtered_pbp

# =================================================================
# SECTION 8: CREATE 30 SECOND SNAPSHOTS
# =================================================================

def create_30sec_snapshots(df):
    """
    Create 30-second snapshots from play-by-play data.
    
    Args:
        df (DataFrame): Play-by-play data
        
    Returns:
        DataFrame: 30-second snapshots with all required columns
    """
    # Create a list to store our snapshot dataframes
    snapshots = []
    
    # Group by GAME_ID and PERIOD
    for (game_id, period), group in df.groupby(['GAME_ID', 'PERIOD']):
        # Create time bins for 30-second intervals (0, 30, 60, ..., 720)
        # Each quarter has 720 seconds total
        time_bins = np.arange(0, 721, 30)
        
        # Create a new dataframe to fill with snapshots
        snapshot_df = pd.DataFrame(time_bins, columns=['seconds_elapsed'])
        snapshot_df['GAME_ID'] = game_id
        snapshot_df['PERIOD'] = period
        
        # Merge with the original data to get the closest observation before each snapshot
        # First sort the group by seconds_elapsed
        group_sorted = group.sort_values('seconds_elapsed')
        
        # For each 30-second mark, find the latest event before it
        for i, seconds in enumerate(time_bins):
            # Get entries before or at this time mark
            events_before = group_sorted[group_sorted['seconds_elapsed'] <= seconds]
            
            if not events_before.empty:
                # Get the latest event
                latest_event = events_before.iloc[-1]
                
                # Update the snapshot with the latest scores and other info
                columns_to_copy = [
                    'AWAY_SCORE', 'HOME_SCORE', 'SCORE_DIFF', 'IS_HOME_LEADING', 
                    'HOME_TEAM', 'AWAY_TEAM', 'HOME_TEAM_WIN_PCT', 'AWAY_TEAM_WIN_PCT', 'HOME_TEAM_WON'
                ]
                
                for col in columns_to_copy:
                    if col in latest_event:
                        snapshot_df.loc[i, col] = latest_event[col]
            
        snapshots.append(snapshot_df)
    
    # Combine all snapshots into one dataframe
    result = pd.concat(snapshots, ignore_index=True)
    
    # Convert seconds_elapsed back to PCTIMESTRING format if needed
    result['PCTIMESTRING'] = result['seconds_elapsed'].apply(
        lambda x: f"{int((720-x)//60):d}:{int((720-x)%60):02d}")
    
    # Ensure all required columns are present
    required_columns = [
        'seconds_elapsed', 'GAME_ID', 'PERIOD', 
        'AWAY_SCORE', 'HOME_SCORE', 'SCORE_DIFF', 'IS_HOME_LEADING',
        'HOME_TEAM_WIN_PCT', 'AWAY_TEAM_WIN_PCT', 'HOME_TEAM_WON'
    ]
    
    # Fill missing required columns with data from the original dataframe
    for col in required_columns:
        if col not in result.columns and col in df.columns:
            # For columns that should be constant for the whole game
            if col in ['HOME_TEAM_WIN_PCT', 'AWAY_TEAM_WIN_PCT', 'HOME_TEAM_WON']:
                result[col] = df[col].iloc[0]
    
    # Select only the required columns in the correct order
    # Include any other columns that exist but ensure required ones are present
    all_columns = required_columns + [col for col in result.columns if col not in required_columns]
    result = result[all_columns].copy()
    
    return result

# =================================================================
# SECTION 9: SAVE TO CSV
# =================================================================

def save_to_csv(df, filename='nba_game_analytics.csv'):
    """
    Save dataframe to CSV.
    
    Args:
        df (DataFrame): Dataframe to save
        filename (str): Filename to save to
    """
    df.to_csv(filename, index=False)

# =================================================================
# MAIN FUNCTION
# =================================================================

def process_nba_game(game_id):
    """
    Process an NBA game and create analytics data.
    
    Args:
        game_id (str): NBA game ID
        
    Returns:
        tuple: Filtered play-by-play data, 30-second snapshots
    """
    # Initialize game
    full_home_team, full_away_team, data = initialize_game(game_id)
    
    # Get team winning percentages
    win_percentages = get_team_win_percentages(game_id, data, full_home_team, full_away_team)
    
    # Get play-by-play data
    filtered_pbp = get_play_by_play_data(game_id, full_home_team, full_away_team)
    
    # Clean and transform data
    filtered_pbp = clean_and_transform_data(filtered_pbp, full_home_team, full_away_team)
    
    # Add team win percentages to dataframe
    filtered_pbp = add_win_percentages_to_df(filtered_pbp, win_percentages, data)
    
    # Determine game outcome
    filtered_pbp = determine_game_outcome(filtered_pbp)
    
    # Add seconds elapsed
    filtered_pbp = add_seconds_elapsed(filtered_pbp)
    
    # Create 30-second snapshots
    snapshots_df = create_30sec_snapshots(filtered_pbp)
    
    # Save to CSV
    save_to_csv(filtered_pbp)
    
    return filtered_pbp, snapshots_df

# Example usage
if __name__ == "__main__":
    game_id = "0022200001"
    filtered_pbp, snapshots_df = process_nba_game(game_id)

Team winning percentages before the game:
{'BOS': {'record': '1-0', 'wins': 1, 'losses': 0, 'win_percentage': 1.0, 'full_team_name': 'Boston Celtics'}, 'PHI': {'record': '0-1', 'wins': 0, 'losses': 1, 'win_percentage': 0.0, 'full_team_name': 'Philadelphia 76ers'}}


  filtered_pbp['SCORE'] = filtered_pbp['SCORE'].fillna(method='ffill')
  filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].fillna(method='ffill')
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]


In [5]:
# =================================================================
# SECTION 9: SAVE TO CSV
# =================================================================

def save_to_csv(df, filename='nba_game_analytics.csv'):
    """
    Save dataframe to CSV.
    
    Args:
        df (DataFrame): Dataframe to save
        filename (str): Filename to save to
    """
    df.to_csv(filename, index=False)

# =================================================================
# MAIN FUNCTION
# =================================================================

def process_nba_game(game_id):
    """
    Process an NBA game and create analytics data.
    
    Args:
        game_id (str): NBA game ID
        
    Returns:
        tuple: Filtered play-by-play data, 30-second snapshots
    """
    # Initialize game
    full_home_team, full_away_team, data = initialize_game(game_id)
    
    # Get team winning percentages
    win_percentages = get_team_win_percentages(game_id, data, full_home_team, full_away_team)
    
    # Get play-by-play data
    filtered_pbp = get_play_by_play_data(game_id, full_home_team, full_away_team)
    
    # Clean and transform data
    filtered_pbp = clean_and_transform_data(filtered_pbp, full_home_team, full_away_team)
    
    # Add team win percentages to dataframe
    filtered_pbp = add_win_percentages_to_df(filtered_pbp, win_percentages, data)
    
    # Determine game outcome
    filtered_pbp = determine_game_outcome(filtered_pbp)
    
    # Add seconds elapsed
    filtered_pbp = add_seconds_elapsed(filtered_pbp)
    
    # Create 30-second snapshots
    snapshots_df = create_30sec_snapshots(filtered_pbp)
    
    # Save both dataframes to CSV
    save_to_csv(filtered_pbp, 'nba_game_analytics.csv')
    save_to_csv(snapshots_df, 'nba_game_snapshots.csv')  # Save snapshots to a separate file
    
    return filtered_pbp, snapshots_df

# Example usage
if __name__ == "__main__":
    game_id = "0022200001"
    filtered_pbp, snapshots_df = process_nba_game(game_id)

Team winning percentages before the game:
{'BOS': {'record': '1-0', 'wins': 1, 'losses': 0, 'win_percentage': 1.0, 'full_team_name': 'Boston Celtics'}, 'PHI': {'record': '0-1', 'wins': 0, 'losses': 1, 'win_percentage': 0.0, 'full_team_name': 'Philadelphia 76ers'}}


  filtered_pbp['SCORE'] = filtered_pbp['SCORE'].fillna(method='ffill')
  filtered_pbp['SCOREMARGIN'] = filtered_pbp['SCOREMARGIN'].fillna(method='ffill')
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]
  snapshot_df.loc[i, col] = latest_event[col]


In [29]:
import warnings
# Suppress all warnings globally
warnings.filterwarnings('ignore')

def process_all_2022_23_games(start_from_game=0, retry_attempts=3, retry_delay=5):
    """
    Process all games of the 2022-23 NBA season in batches of 300 games with 10-second delays between batches.
    
    Args:
        start_from_game: Index of game to start from (for resuming after failures)
        retry_attempts: Number of times to retry a game before skipping
        retry_delay: Seconds to wait between retry attempts
    
    Returns:
        str: Filename of the created CSV file
    """
    from nba_api.stats.endpoints import leaguegamefinder
    from nba_api.stats.library.parameters import SeasonType
    import time
    import pandas as pd
    from contextlib import redirect_stdout
    from io import StringIO
    import json
    import random
    
    # Get all games from 2022-23 regular season
    print("Fetching 2022-23 season games from NBA API...")
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable="2022-23", 
        season_type_nullable=SeasonType.regular
    )
    
    # Get unique game IDs in chronological order
    all_games_df = gamefinder.get_data_frames()[0].sort_values('GAME_DATE')
    game_ids = all_games_df['GAME_ID'].unique().tolist()
    
    # Fix any malformed game IDs before starting
    fixed_count = 0
    for i, game_id in enumerate(game_ids):
        if game_id.startswith('20') and len(game_id) == 10:
            # This is a malformed ID - should start with 00 not 20
            game_ids[i] = '00' + game_id[2:]
            fixed_count += 1
    
    if fixed_count > 0:
        print(f"Fixed {fixed_count} malformed game IDs")
    
    # Skip games that have already been processed if resuming
    if start_from_game > 0:
        print(f"Resuming from game {start_from_game + 1}")
        game_ids = game_ids[start_from_game:]
    
    print(f"Processing {len(game_ids)} games in batches of 300 with 10-second delays between batches...")
    
    all_snapshots = []
    successful_games = 0
    failed_games = []
    
    # Process in batches of 300
    for batch_num in range(0, len(game_ids), 300):
        # This slice handles partial batches - if fewer than 300 games are left,
        # batch_games will just contain those remaining games
        batch_games = game_ids[batch_num:batch_num + 300]
        batch_num_display = batch_num // 300 + 1
        total_batches = (len(game_ids) + 299) // 300  # Ceiling division
        
        print(f"\nBatch {batch_num_display}/{total_batches}: Processing {len(batch_games)} games...")
        batch_successful = 0
        
        for i, game_id in enumerate(batch_games):
            overall_game_num = start_from_game + batch_num + i + 1
            
            # Fix malformed IDs - ensure they start with '00' not '20'
            if game_id.startswith('20') and len(game_id) == 10:
                fixed_game_id = '00' + game_id[2:]
                print(f"  ℹ️ Fixing malformed ID: {game_id} → {fixed_game_id}")
                game_id = fixed_game_id
            
            # Add random delay (0.2-0.7 seconds) between individual games to avoid consistent request patterns
            time.sleep(random.uniform(0.2, 0.7))
            
            # Try multiple times before giving up on a game
            success = False
            for attempt in range(retry_attempts):
                try:
                    # Suppress verbose output from process_nba_game
                    with redirect_stdout(StringIO()):
                        filtered_pbp, snapshots_df = process_nba_game(game_id)
                    
                    success = True
                    break
                    
                except json.JSONDecodeError:
                    if attempt < retry_attempts - 1:
                        print(f"  ⚠️ Game {overall_game_num} (ID: {game_id}): JSON error, retrying in {retry_delay} seconds... (attempt {attempt+1}/{retry_attempts})")
                        time.sleep(retry_delay + random.uniform(1, 3))  # Add randomness to delay
                    else:
                        print(f"  ❌ Game {overall_game_num} (ID: {game_id}): Failed after {retry_attempts} attempts")
                        failed_games.append(game_id)
                        
                except Exception as e:
                    if attempt < retry_attempts - 1:
                        print(f"  ⚠️ Game {overall_game_num} (ID: {game_id}): Error: {str(e)[:60]}... retrying in {retry_delay} seconds")
                        time.sleep(retry_delay + random.uniform(1, 3))
                    else:
                        print(f"  ❌ Game {overall_game_num} (ID: {game_id}): Failed due to {type(e).__name__}")
                        failed_games.append(game_id)
            
            if success and snapshots_df is not None and not snapshots_df.empty:
                all_snapshots.append(snapshots_df)
                successful_games += 1
                batch_successful += 1
                print(f"  ✓ Game {overall_game_num}: {game_id}")
            elif success:
                print(f"  ⚠️ Game {overall_game_num}: {game_id} (processed but no data)")
                
            # Every 30 games within a batch, save intermediate results and print progress
            if (i + 1) % 30 == 0 or i == len(batch_games) - 1:
                print(f"  Progress: {i + 1}/{len(batch_games)} games in current batch, {batch_successful} successful")
                
                # Save intermediate results
                if all_snapshots:
                    intermediate_df = pd.concat(all_snapshots, ignore_index=True)
                    temp_filename = f"nba_2022_23_interim_snapshots_{overall_game_num}.csv"
                    save_to_csv(intermediate_df, temp_filename)
                    print(f"  💾 Saved intermediate results ({len(intermediate_df):,} snapshots) to {temp_filename}")
        
        # 10-second delay between batches (except after last batch)
        if batch_num + 300 < len(game_ids):
            print(f"  Batch complete: {batch_successful}/{len(batch_games)} successful")
            delay_time = 15 + random.uniform(5, 10)  # Increased delay with randomness
            print(f"  Waiting {delay_time:.1f} seconds before next batch...")
            time.sleep(delay_time)
    
    # Save final results
    if all_snapshots:
        combined_snapshots = pd.concat(all_snapshots, ignore_index=True)
        filename = 'nba_2022_23_all_games_snapshots.csv'
        save_to_csv(combined_snapshots, filename)
        
        print(f"\n🎉 SUCCESS: {successful_games}/{start_from_game + len(game_ids)} games processed")
        print(f"💾 Saved {len(combined_snapshots):,} snapshots to {filename}")
        
        if failed_games:
            failed_filename = 'failed_game_ids.txt'
            with open(failed_filename, 'w') as f:
                for game_id in failed_games:
                    f.write(f"{game_id}\n")
            print(f"❗ {len(failed_games)} games failed. Their IDs are saved in {failed_filename}")
            
        return filename
    else:
        print("❌ No games processed successfully!")
        return None


if __name__ == "__main__":
    # Process all games of 2022-23 season starting from the first game
    print("Processing ALL games of 2022-23 NBA season (from the beginning)")
    
    # Start from game 0 (first game of the season)
    filename = process_all_2022_23_games(start_from_game=0, retry_attempts=3, retry_delay=8)
    
    if filename:
        print(f"\n📊 Your snapshot CSV is ready: {filename}")
        print("You can now use this file for your PSTAT 131 project!")

Processing ALL games of 2022-23 NBA season (from the beginning)
Fetching 2022-23 season games from NBA API...
Fixed 480 malformed game IDs
Processing 1710 games in batches of 300 with 10-second delays between batches...

Batch 1/6: Processing 300 games...
  ✓ Game 1: 0022200002
  ✓ Game 2: 0022200001
  ✓ Game 3: 0022200006
  ✓ Game 4: 0022200012
  ✓ Game 5: 0022200003
  ✓ Game 6: 0022200010
  ✓ Game 7: 0022200011
  ✓ Game 8: 0022200014
  ✓ Game 9: 0022200007
  ✓ Game 10: 0022200009
  ✓ Game 11: 0022200013
  ✓ Game 12: 0022200005
  ✓ Game 13: 0022200004
  ✓ Game 14: 0022200008
  ✓ Game 15: 0022200015
  ✓ Game 16: 0022200016
  ✓ Game 17: 0022200025
  ✓ Game 18: 0022200021
  ✓ Game 19: 0022200024
  ✓ Game 20: 0022200022
  ✓ Game 21: 0022200020
  ✓ Game 22: 0022200027
  ✓ Game 23: 0022200018
  ✓ Game 24: 0022200023
  ✓ Game 25: 0022200026
  ✓ Game 26: 0022200017
  ✓ Game 27: 0022200019
  ✓ Game 28: 0022200035
  ✓ Game 29: 0022200028
  ✓ Game 30: 0022200036
  Progress: 30/300 games in curre

In [26]:
first_300 = pd.read_csv(r"..\Data Retrieval\nba_2022_23_first_300_games_snapshots.csv")

for columns in first_300.columns:
    print(columns)
    print(f' {columns} has {first_300[columns].isna().sum()} missing values')

# For a specific column (e.g., AWAY_SCORE)
first_missing_index = first_300[first_300['AWAY_SCORE'].isna()].index[0]
print(f"First missing AWAY_SCORE at index: {first_missing_index}")

seconds_elapsed
 seconds_elapsed has 0 missing values
GAME_ID
 GAME_ID has 0 missing values
PERIOD
 PERIOD has 0 missing values
AWAY_SCORE
 AWAY_SCORE has 350 missing values
HOME_SCORE
 HOME_SCORE has 350 missing values
SCORE_DIFF
 SCORE_DIFF has 350 missing values
IS_HOME_LEADING
 IS_HOME_LEADING has 350 missing values
HOME_TEAM_WIN_PCT
 HOME_TEAM_WIN_PCT has 350 missing values
AWAY_TEAM_WIN_PCT
 AWAY_TEAM_WIN_PCT has 350 missing values
HOME_TEAM_WON
 HOME_TEAM_WON has 350 missing values
HOME_TEAM
 HOME_TEAM has 350 missing values
AWAY_TEAM
 AWAY_TEAM has 350 missing values
PCTIMESTRING
 PCTIMESTRING has 0 missing values
First missing AWAY_SCORE at index: 600
