# Data Preparation

### Data Loading and Merging

In [52]:
import pandas as pd

# Read the CSV files
attendance = pd.read_csv('PublicData/game_info.csv') #contains the attendancce column
games = pd.read_csv('PublicData/game.csv') #contains game information (which teams, who won, etc.)
stadion_capacities = pd.read_csv('CreatedData/stadion_capacity.csv') #contains stadium capacities and when the stadium was used

# Merge attendance and games on game_id
games = pd.merge(attendance, games, on='game_id', how='inner')

games.drop(columns=['game_date_y'], inplace=True) # drop second game date
games.rename(columns={'game_date_x': 'game_date'}, inplace=True) 
games['game_date'] = pd.to_datetime(games['game_date'])

# Filter for games from 1996-97 season onwards
# NBA season starts in October, so keep games from October 1996 onwards
games_1996 = games[
    ((games['game_date'].dt.year == 1996) & (games['game_date'].dt.month >= 10)) |
    (games['game_date'].dt.year > 1996)
]

# Add a year column
games_1996.loc[:, 'game_year'] = games_1996['game_date'].dt.year

# Save to CSV
# games_1996.to_csv('nba_games_since_1996.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_1996.loc[:, 'game_year'] = games_1996['game_date'].dt.year


In [53]:
# Create a function to determine if a game is in first or second half of season
def get_season_half(date):
    # NBA season typically starts in October and ends in April
    # If month is October through December, it's first half
    # If month is January through June, it's second half
    if date.month >= 10:
        return 'First Half'
    else:
        return 'Second Half'

# Function to match game with correct arena capacity
def get_arena_capacity(row, stadiums_df):
    game_date = row['game_date']
    team_id = row['team_id_home']
    
    # Get all stadiums for this team
    team_stadiums = stadiums_df[stadiums_df['team_id'] == team_id]
    
    # Find the correct stadium based on date
    for _, stadium in team_stadiums.iterrows():
        start_year = stadium['start_year']
        end_year = stadium['end_year']
        
        # Handle the mid-year season transition
        if game_date.month >= 10:
            game_year = game_date.year + 1
        else:
            game_year = game_date.year
            
        if start_year <= game_year <= end_year:
            return stadium['capacity']
    
    return None

In [54]:
# Add season half (First or Second Half)
games_1996.loc[:, 'season_half'] = games_1996['game_date'].apply(get_season_half)

# Get arena capacity for each game
games_1996.loc[:, 'arena_capacity'] = games_1996.apply(lambda x: get_arena_capacity(x, stadion_capacities), axis=1)

# Calculate occupancy rate
games_1996.loc[:, 'occupancy_rate'] = (games_1996['attendance'] / games_1996['arena_capacity']) * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_1996.loc[:, 'season_half'] = games_1996['game_date'].apply(get_season_half)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_1996.loc[:, 'arena_capacity'] = games_1996.apply(lambda x: get_arena_capacity(x, stadion_capacities), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_1

In [55]:
# Move "arena_capacity" and "occupancy_rate" right after "attendance" and disregard columns like fg % -> shorten the dataframe
new_order = [
    "game_id", "game_date", "attendance", 
    "arena_capacity", "occupancy_rate", 
    "game_time", "game_year", "season_id",
    "team_id_home", "team_abbreviation_home", "matchup_home", "wl_home",
    "team_id_away", "team_abbreviation_away",
    "season_type", "season_half"
]

master_short_data = games_1996[new_order]
master_short_data.to_csv('Master_Short_Data.csv', index=False)

### Filtering and Cleaning

In [56]:
Master = master_short_data

# Daten laden
df = pd.read_csv('Master_Short_Data.csv')
df['game_date'] = pd.to_datetime(df['game_date'])

# Extract date-based features
df['game_year'] = df['game_date'].dt.year
df['game_month'] = df['game_date'].dt.month
df['game_dayofweek'] = df['game_date'].dt.dayofweek  # Monday=0, Sunday=6
df['game_hour'] = pd.to_datetime(df['game_time'], errors='coerce').dt.hour


df = df.dropna(subset=['arena_capacity'])
df = df.dropna(subset=['attendance'])

# Liste aller Teams, die mindestens einmal als Heimteam vorkommen
home_teams = df['team_abbreviation_home'].unique()

# Nur Einträge behalten, bei denen das Auswärtsteam auch in den Heimteams vorkommt
df = df[df['team_abbreviation_away'].isin(home_teams)]

df.to_csv('Master_Short_Data.csv', index=False) 

  df['game_hour'] = pd.to_datetime(df['game_time'], errors='coerce').dt.hour


In [46]:
df = pd.read_csv('Master_Short_Data.csv')

# drop all rows between 2020 march and 2021 september
df = df[(df['game_date'] < '2020-03-01') | (df['game_date'] > '2021-10-15')]

df.to_csv('Master_Short_Data.csv', index=False) 

### Adding New Features

### Adding Records of Games to Master Short 

In [47]:
def calculate_team_records(df):
    # Create copy to avoid modifying original
    df = df.copy()
    
    # Sort by season and date to ensure correct order
    # df = df.sort_values(['season_id', 'game_date'])
    
    # Initialize new columns
    df['home_wins'] = 0
    df['home_losses'] = 0
    df['home_win_streak'] = 0
    df['away_wins'] = 0
    df['away_losses'] = 0
    df['away_win_streak'] = 0
    
    # Dictionary to store team records for each season
    # Format: {season_id: {team_id: {'wins': x, 'losses': y, 'streak': z}}}
    season_records = {}
    
    # Process each game
    for idx, row in df.iterrows():
        season = row['season_id']
        home_team = row['team_id_home']
        away_team = row['team_id_away']
        
        # Initialize season records if needed
        if season not in season_records:
            season_records[season] = {}
            
        # Get or initialize home team records
        if home_team not in season_records[season]:
            home_record = {'wins': 0, 'losses': 0, 'streak': 0}
        else:
            home_record = season_records[season][home_team].copy()
            
        # Get or initialize away team records
        if away_team not in season_records[season]:
            away_record = {'wins': 0, 'losses': 0, 'streak': 0}
        else:
            away_record = season_records[season][away_team].copy()
        
        # Add result of current game
        if row['wl_home'] == 'W':
            home_record['wins'] += 1
            away_record['losses'] += 1
            home_record['streak'] = max(1, home_record['streak'] + 1)
            away_record['streak'] = min(-1, away_record['streak'] - 1)
        else:
            home_record['losses'] += 1
            away_record['wins'] += 1
            home_record['streak'] = min(-1, home_record['streak'] - 1)
            away_record['streak'] = max(1, away_record['streak'] + 1)
        
        # Store current records including this game
        df.at[idx, 'home_wins'] = home_record['wins']
        df.at[idx, 'home_losses'] = home_record['losses']
        df.at[idx, 'home_win_streak'] = home_record['streak']
        df.at[idx, 'away_wins'] = away_record['wins']
        df.at[idx, 'away_losses'] = away_record['losses']
        df.at[idx, 'away_win_streak'] = away_record['streak']
        
        # Store updated records for next games
        season_records[season][home_team] = home_record
        season_records[season][away_team] = away_record
    
    return df

In [48]:
calculate_team_records(df).to_csv('Master_Short_Data.csv', index=False)

### Adding Rivalries to Dataset

In [49]:
import pandas as pd

# Load Master Short Data and rivalries
master_df = pd.read_csv('Master_Short_Data.csv')
rivalries_df = pd.read_csv('CreatedData/rivalries.csv')

# Prepare set of rivalry pairs (both directions)
rival_pairs = set()
for _, row in rivalries_df.iterrows():
    t1 = row['team_id']
    t2 = row['rival_team_id']
    rival_pairs.add((t1, t2))
    rival_pairs.add((t2, t1))

def is_rival_game(row):
    return int((row['team_id_home'], row['team_id_away']) in rival_pairs)

# Add rival_game column (1 = yes, 0 = no)
master_df['rival_game'] = master_df.apply(is_rival_game, axis=1)

# Save updated dataframe
master_df.to_csv('Master_Short_Data.csv', index=False)

### Adding defending champions

In [50]:
import pandas as pd

# Load champions data if not already loaded
champions_df = pd.read_csv('CreatedData/champions.csv')

# Function to get defending champion for a given team, year, and season_half
def get_defending_champ(team_id, game_year, season_half):
    if season_half == 'Second Half':
        champ_year = game_year - 1
    else:
        champ_year = game_year
    champ_row = champions_df[champions_df['year'] == champ_year]
    if not champ_row.empty and champ_row.iloc[0]['team_id'] == team_id:
        return 1
    return 0

# Add defending champion columns for home and away teams
master_df['defending_champ_home'] = master_df.apply(
    lambda row: get_defending_champ(row['team_id_home'], row['game_year'], row['season_half']), axis=1
)
master_df['defending_champ_away'] = master_df.apply(
    lambda row: get_defending_champ(row['team_id_away'], row['game_year'], row['season_half']), axis=1
)

# Save updated dataframe
master_df.to_csv('Master_Short_Data.csv', index=False)

### Adding MVPs to Dataset

In [51]:
import pandas as pd

# Load Master Short Data and MVPs
master_df = pd.read_csv('Master_Short_Data.csv')
mvps_df = pd.read_csv('CreatedData/mvps.csv')

# Prepare MVP mapping for both season splits
mvp_map = {}
for _, row in mvps_df.iterrows():
    mvp_team = row['team_id']
    mvp_year = row['year']
    # Assign MVP to previous year + Second Half and current year + First Half
    mvp_map[(mvp_year - 1, 'Second Half', mvp_team)] = 1
    mvp_map[(mvp_year, 'First Half', mvp_team)] = 1

def has_mvp(row):
    key = (row['game_year'], row['season_half'], row['team_id_home'])
    return mvp_map.get(key, 0)

# Add MVP column for home team
master_df['home_mvp'] = master_df.apply(has_mvp, axis=1)

# Add MVP column for away team
def has_mvp_away(row):
    key = (row['game_year'], row['season_half'], row['team_id_away'])
    return mvp_map.get(key, 0)

master_df['away_mvp'] = master_df.apply(has_mvp_away, axis=1)

# Save updated dataframe
master_df.to_csv('Master_Short_Data.csv', index=False)

### Adding attendance of the previous season to Dataset

In [39]:
import pandas as pd
import numpy as np

# Ensure game_date is datetime
master_df['game_date'] = pd.to_datetime(master_df['game_date'])

def get_prev_season_avg(row, df):
    team = row['team_id_home']
    season = row['game_year']
    half = row['season_half']
    
    if half == 'Second Half':
        # Previous season: previous year Second Half + current year First Half
        mask = (
            ((df['team_id_home'] == team) & 
             (((df['game_year'] == season - 1) & (df['season_half'] == 'Second Half')) |
              ((df['game_year'] == season) & (df['season_half'] == 'First Half'))))
        )
    else:  # First Half
        # Previous season: two years back Second Half + one year back First Half
        mask = (
            ((df['team_id_home'] == team) & 
             (((df['game_year'] == season - 2) & (df['season_half'] == 'Second Half')) |
              ((df['game_year'] == season - 1) & (df['season_half'] == 'First Half'))))
        )
    prev_games = df.loc[mask, 'attendance']
    if len(prev_games) == 0:
        return np.nan
    return prev_games.mean()

master_df['attendance_prev_season_avg'] = master_df.apply(lambda row: get_prev_season_avg(row, master_df), axis=1)

# Save to CSV
master_df.to_csv('Master_Short_Data.csv', index=False)

### Adding laged attendance (two games) to Dataset

In [40]:
# Add lag features for attendance (previous 1 and 2 games for each home team)
df = pd.read_csv('Master_Short_Data.csv')
df = df.sort_values(['team_id_home', 'game_date'])

df['attendance_lag1'] = df.groupby('team_id_home')['attendance'].shift(1)
df['attendance_lag2'] = df.groupby('team_id_home')['attendance'].shift(2)


df = df.sort_values(['game_date'])

df.to_csv('Master_Short_Data.csv', index=False)