# NBA Data Feature Engineering and Preprocessing

This notebook performs comprehensive feature engineering and data preprocessing on NBA data including:
- Player information and demographics
- Career statistics
- Game-by-game box scores
- Team information

## Data Sources:
- `active_players.csv`: Current NBA player information
- `career_stats.csv`: Player career statistics by season
- `boxscores_*.csv`: Game-by-game box scores for multiple seasons
- `nba_teams.csv`: Team information

## 1. Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os
from pathlib import Path
import glob

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

## 2. Data Loading and Initial Exploration

In [2]:
def find_backend_dir(start_path=None):
    """
    Walk up directories from start_path (or cwd) until a folder named 'backend' is found.
    Returns the absolute path to the 'backend' folder.
    """
    if start_path is None:
        start_path = os.getcwd()
    curr_path = os.path.abspath(start_path)
    while True:
        # Check if 'backend' exists in this directory
        candidate = os.path.join(curr_path, "backend")
        if os.path.isdir(candidate):
            return candidate
        # If at filesystem root, stop
        parent = os.path.dirname(curr_path)
        if curr_path == parent:
            break
        curr_path = parent
    raise FileNotFoundError(f"No 'backend' directory found upward from {start_path}")

# Find the backend directory and CSV folder
backend_dir = find_backend_dir()
csv_dir = os.path.join(backend_dir, "CSVs")

In [3]:
# Load all datasets
print("Loading datasets...")

# Load player data
players_df = pd.read_csv(os.path.join(csv_dir, 'active_players.csv'))
print(f"Players data shape: {players_df.shape}")

# Load career stats
career_stats_df = pd.read_csv(os.path.join(csv_dir, 'career_stats.csv'))
print(f"Career stats shape: {career_stats_df.shape}")

# Load teams data
teams_df = pd.read_csv(os.path.join(csv_dir, 'nba_teams.csv'))
print(f"Teams data shape: {teams_df.shape}")

# Load box scores (we'll load a few recent seasons for demonstration)
boxscore_files = glob.glob(os.path.join(csv_dir, 'boxscores_*.csv'))
print(f"Found {len(boxscore_files)} boxscore files")

recent_boxscores = []
for file in boxscore_files:
    season = file.split('_')[-1].replace('.csv', '')
    df = pd.read_csv(file)
    df['season'] = season
    recent_boxscores.append(df)
boxscores_df = pd.concat(recent_boxscores, ignore_index=True)
print(f"Recent boxscores shape: {boxscores_df.shape}")

Loading datasets...
Players data shape: (572, 15)
Career stats shape: (3637, 27)
Teams data shape: (30, 6)
Found 22 boxscore files
Recent boxscores shape: (167718, 28)


In [4]:
# Display basic information about each dataset
print("=== PLAYERS DATA ===")
print(players_df.info())
print("\nFirst few rows:")
print(players_df.head())

print("\n=== CAREER STATS DATA ===")
print(career_stats_df.info())
print("\nFirst few rows:")
print(career_stats_df.head())

print("\n=== TEAMS DATA ===")
print(teams_df.info())
print("\nFirst few rows:")
print(teams_df.head())

print("\n=== BOXSCORES DATA ===")
print(boxscores_df.info())
print("\nFirst few rows:")
print(boxscores_df.head())

=== PLAYERS DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PERSON_ID           572 non-null    int64  
 1   DISPLAY_FIRST_LAST  572 non-null    object 
 2   SCHOOL              571 non-null    object 
 3   TEAM_ID             572 non-null    int64  
 4   TEAM_ABBREVIATION   533 non-null    object 
 5   DRAFT_YEAR          572 non-null    object 
 6   DRAFT_ROUND         571 non-null    object 
 7   DRAFT_NUMBER        570 non-null    object 
 8   FROM_YEAR           572 non-null    int64  
 9   TO_YEAR             572 non-null    int64  
 10  POSITION            572 non-null    object 
 11  HEIGHT              572 non-null    object 
 12  WEIGHT              570 non-null    float64
 13  BIRTHDATE           572 non-null    object 
 14  CURRENT_AGE         572 non-null    int64  
dtypes: float64(1), int64(5), object(9)
m

## 3. Data Cleaning and Preprocessing

In [5]:
# Clean players data
print("Cleaning players data...")

# Convert birthdate to datetime
players_df['BIRTHDATE'] = pd.to_datetime(players_df['BIRTHDATE'])

# Handle missing values
players_df['SCHOOL'] = players_df['SCHOOL'].fillna('Unknown')
players_df['DRAFT_YEAR'] = pd.to_numeric(players_df['DRAFT_YEAR'], errors='coerce')
players_df['DRAFT_ROUND'] = pd.to_numeric(players_df['DRAFT_ROUND'], errors='coerce')
players_df['DRAFT_NUMBER'] = pd.to_numeric(players_df['DRAFT_NUMBER'], errors='coerce')

# Create draft status feature
players_df['DRAFT_STATUS'] = players_df['DRAFT_YEAR'].apply(
    lambda x: 'Drafted' if pd.notna(x) and x != 'Undrafted' else 'Undrafted'
)

# Extract height in inches
def height_to_inches(height_str):
    if pd.isna(height_str) or height_str == '':
        return np.nan
    try:
        feet, inches = map(int, height_str.split('-'))
        return feet * 12 + inches
    except:
        return np.nan

players_df['HEIGHT_INCHES'] = players_df['HEIGHT'].apply(height_to_inches)

# Create position categories
def categorize_position(position):
    if pd.isna(position):
        return 'Unknown'
    position = str(position).upper()
    if 'GUARD' in position:
        return 'Guard'
    elif 'FORWARD' in position:
        return 'Forward'
    elif 'CENTER' in position:
        return 'Center'
    else:
        return 'Other'

players_df['POSITION_CATEGORY'] = players_df['POSITION'].apply(categorize_position)

print(f"Players data cleaned. Shape: {players_df.shape}")
print("\nPosition distribution:")
print(players_df['POSITION_CATEGORY'].value_counts())

Cleaning players data...
Players data cleaned. Shape: (572, 18)

Position distribution:
POSITION_CATEGORY
Guard      287
Forward    231
Center      54
Name: count, dtype: int64


In [6]:
# Clean career stats data
print("Cleaning career stats data...")

# Convert season to year format
career_stats_df['SEASON_YEAR'] = career_stats_df['SEASON_ID'].str[:4].astype(int)

# Calculate per-game statistics
career_stats_df['PPG'] = career_stats_df['PTS'] / career_stats_df['GP']
career_stats_df['RPG'] = career_stats_df['REB'] / career_stats_df['GP']
career_stats_df['APG'] = career_stats_df['AST'] / career_stats_df['GP']
career_stats_df['SPG'] = career_stats_df['STL'] / career_stats_df['GP']
career_stats_df['BPG'] = career_stats_df['BLK'] / career_stats_df['GP']
career_stats_df['TOPG'] = career_stats_df['TOV'] / career_stats_df['GP']
career_stats_df['MPG'] = career_stats_df['MIN'] / career_stats_df['GP']

# Calculate efficiency metrics
career_stats_df['TS_PCT'] = career_stats_df['PTS'] / (2 * (career_stats_df['FGA'] + 0.44 * career_stats_df['FTA']))
career_stats_df['AST_TO_RATIO'] = career_stats_df['AST'] / career_stats_df['TOV'].replace(0, 1)

# Handle infinite values
career_stats_df = career_stats_df.replace([np.inf, -np.inf], np.nan)

print(f"Career stats cleaned. Shape: {career_stats_df.shape}")

Cleaning career stats data...
Career stats cleaned. Shape: (3637, 37)


In [7]:
# Clean boxscores data
print("Cleaning boxscores data...")

# Convert date to datetime
boxscores_df['Date'] = pd.to_datetime(boxscores_df['Date'])

# Convert numeric columns
numeric_columns = ['Minutes', 'Points', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 
                   'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF', '+/-', 'SPI']

for col in numeric_columns:
    if col in boxscores_df.columns:
        boxscores_df[col] = pd.to_numeric(boxscores_df[col], errors='coerce')

# Handle percentage columns that might have '-' values
percentage_columns = ['FG%', '3P%', 'FT%']
for col in percentage_columns:
    if col in boxscores_df.columns:
        boxscores_df[col] = boxscores_df[col].replace('-', np.nan)
        boxscores_df[col] = pd.to_numeric(boxscores_df[col], errors='coerce')

# Extract game result (W/L)
boxscores_df['GAME_RESULT'] = boxscores_df['Result'].str[0]

# Create season year from season string
boxscores_df['SEASON_YEAR'] = boxscores_df['season'].str[:4].astype(int)

print(f"Boxscores cleaned. Shape: {boxscores_df.shape}")
print(f"Date range: {boxscores_df['Date'].min()} to {boxscores_df['Date'].max()}")

Cleaning boxscores data...
Boxscores cleaned. Shape: (167718, 30)
Date range: 2003-10-29 00:00:00 to 2025-04-13 00:00:00


## 4. Feature Engineering

In [8]:
# Create player-level features
print("Creating player-level features...")

# Age features
players_df['AGE'] = (pd.Timestamp.now() - players_df['BIRTHDATE']).dt.days / 365.25
players_df['EXPERIENCE_YEARS'] = pd.Timestamp.now().year - players_df['FROM_YEAR']

# Height and weight features
players_df['BMI'] = (players_df['WEIGHT'] * 0.453592) / ((players_df['HEIGHT_INCHES'] * 0.0254) ** 2)

# Draft features
players_df['DRAFT_POSITION'] = players_df['DRAFT_ROUND'] * 30 + players_df['DRAFT_NUMBER']
players_df['TOP_10_PICK'] = (players_df['DRAFT_POSITION'] <= 10).astype(int)
players_df['LOTTERY_PICK'] = (players_df['DRAFT_POSITION'] <= 14).astype(int)

# Team features
players_df['HAS_TEAM'] = (players_df['TEAM_ID'] != 0).astype(int)

print(f"Player features created. Shape: {players_df.shape}")

Creating player-level features...
Player features created. Shape: (572, 25)


In [9]:
# Create career-level features
print("Creating career-level features...")

# Calculate career averages for each player
career_averages = career_stats_df.groupby('PLAYER_ID').agg({
    'PPG': 'mean', 'RPG': 'mean', 'APG': 'mean', 'SPG': 'mean', 'BPG': 'mean',
    'TOPG': 'mean', 'MPG': 'mean', 'TS_PCT': 'mean', 'AST_TO_RATIO': 'mean',
    'GP': 'sum', 'PTS': 'sum', 'REB': 'sum', 'AST': 'sum', 'STL': 'sum', 'BLK': 'sum',
    'SEASON_YEAR': ['min', 'max', 'count']
}).round(3)

# Flatten column names
career_averages.columns = ['_'.join(col).strip() for col in career_averages.columns]
career_averages = career_averages.reset_index()

# Rename columns for clarity
career_averages = career_averages.rename(columns={
    'SEASON_YEAR_min': 'FIRST_SEASON',
    'SEASON_YEAR_max': 'LAST_SEASON',
    'SEASON_YEAR_count': 'SEASONS_PLAYED'
})

# Calculate career totals and averages
career_averages['CAREER_PTS'] = career_averages['PTS_sum']
career_averages['CAREER_REB'] = career_averages['REB_sum']
career_averages['CAREER_AST'] = career_averages['AST_sum']
career_averages['CAREER_STL'] = career_averages['STL_sum']
career_averages['CAREER_BLK'] = career_averages['BLK_sum']

print(f"Career features created. Shape: {career_averages.shape}")
print("\nCareer averages sample:")
print(career_averages.head())

Creating career-level features...
Career features created. Shape: (568, 24)

Career averages sample:
   PLAYER_ID  PPG_mean  RPG_mean  APG_mean  SPG_mean  BPG_mean  TOPG_mean  MPG_mean  TS_PCT_mean  AST_TO_RATIO_mean  GP_sum  PTS_sum  REB_sum  AST_sum  STL_sum  BLK_sum  FIRST_SEASON  LAST_SEASON  SEASONS_PLAYED  CAREER_PTS  CAREER_REB  CAREER_AST  CAREER_STL  CAREER_BLK
0       2544    27.006     7.547     7.426     1.477     0.730      3.509    37.581        0.592              2.118    1562    42184    11731    11584     2345     1150          2003         2024              22       42184       11731       11584        2345        1150
1     101108    16.964     4.413     9.206     1.994     0.167      2.279    33.637        0.580              4.091    1354    23011     5978    12499     2717      222          2005         2024              20       23011        5978       12499        2717         222
2     200768    12.425     3.927     5.553     1.207     0.294      2.052    29.544

In [10]:
# Create game-level features
print("Creating game-level features...")

# Efficiency metrics
boxscores_df['GAME_EFFICIENCY'] = boxscores_df['Points'] + (boxscores_df['REB'] * 1.2) + (boxscores_df['AST'] * 1.5) + (boxscores_df['STL'] * 2) + (boxscores_df['BLK'] * 2) - (boxscores_df['TO'] * 1)

# Shooting efficiency
boxscores_df['TRUE_SHOOTING_PCT'] = boxscores_df['Points'] / (2 * (boxscores_df['FGA'] + 0.44 * boxscores_df['FTA']))

# Usage rate approximation
boxscores_df['USAGE_RATE'] = (boxscores_df['FGA'] + 0.44 * boxscores_df['FTA'] + boxscores_df['TO']) / boxscores_df['Minutes'] * 100

# Game context features
boxscores_df['IS_WIN'] = (boxscores_df['GAME_RESULT'] == 'W').astype(int)
boxscores_df['IS_HOME'] = boxscores_df['Result'].str.contains('vs.').astype(int)

# Season and month features
boxscores_df['MONTH'] = boxscores_df['Date'].dt.month
boxscores_df['DAY_OF_WEEK'] = boxscores_df['Date'].dt.dayofweek
boxscores_df['IS_PLAYOFFS'] = (boxscores_df['MONTH'] >= 4).astype(int)  # Simplified playoff detection

print(f"Game features created. Shape: {boxscores_df.shape}")

Creating game-level features...
Game features created. Shape: (167718, 38)


In [11]:
# Create rolling average features for players
print("Creating rolling average features...")

# Sort by player and date
boxscores_df = boxscores_df.sort_values(['PERSON_ID', 'Date'])

# Calculate rolling averages for key metrics
rolling_features = ['Points', 'REB', 'AST', 'STL', 'BLK', 'TO', 'Minutes', 'GAME_EFFICIENCY']

for feature in rolling_features:
    if feature in boxscores_df.columns:
        # 5-game rolling average
        boxscores_df[f'{feature}_5G_AVG'] = boxscores_df.groupby('PERSON_ID')[feature].rolling(
            window=5, min_periods=1).mean().reset_index(0, drop=True)
        
        # 10-game rolling average
        boxscores_df[f'{feature}_10G_AVG'] = boxscores_df.groupby('PERSON_ID')[feature].rolling(
            window=10, min_periods=1).mean().reset_index(0, drop=True)

print(f"Rolling features created. Shape: {boxscores_df.shape}")

Creating rolling average features...
Rolling features created. Shape: (167718, 54)


## 5. Data Merging and Integration

In [12]:
# Merge all datasets
print("Merging datasets...")

# Merge players with career averages
players_with_career = players_df.merge(
    career_averages, 
    left_on='PERSON_ID', 
    right_on='PLAYER_ID', 
    how='left'
)

# Merge with teams data
players_with_career = players_with_career.merge(
    teams_df, 
    left_on='TEAM_ID', 
    right_on='id', 
    how='left', 
    suffixes=('', '_team')
)

# Merge boxscores with player info
boxscores_with_players = boxscores_df.merge(
    players_with_career[['PERSON_ID', 'DISPLAY_FIRST_LAST', 'POSITION_CATEGORY', 
                         'AGE', 'EXPERIENCE_YEARS', 'HEIGHT_INCHES', 'WEIGHT', 'BMI',
                         'DRAFT_POSITION', 'TOP_10_PICK', 'LOTTERY_PICK']], 
    on='PERSON_ID', 
    how='left'
)

print(f"Final merged dataset shape: {boxscores_with_players.shape}")
print(f"Number of unique players: {boxscores_with_players['PERSON_ID'].nunique()}")
print(f"Date range: {boxscores_with_players['Date'].min()} to {boxscores_with_players['Date'].max()}")

Merging datasets...
Final merged dataset shape: (167718, 64)
Number of unique players: 568
Date range: 2003-10-29 00:00:00 to 2025-04-13 00:00:00


## 6. Feature Selection and Final Dataset

In [13]:
# Select final features for modeling
print("Selecting final features...")

# Define feature columns
feature_columns = [
    # Player demographics
    'AGE', 'EXPERIENCE_YEARS', 'HEIGHT_INCHES', 'WEIGHT', 'BMI',
    'DRAFT_POSITION', 'TOP_10_PICK', 'LOTTERY_PICK',
    
    # Position
    'POSITION_CATEGORY',
    
    # Career averages
    'PPG_mean', 'RPG_mean', 'APG_mean', 'SPG_mean', 'BPG_mean',
    'TOPG_mean', 'MPG_mean', 'TS_PCT_mean', 'AST_TO_RATIO_mean',
    'SEASONS_PLAYED',
    
    # Game context
    'IS_HOME', 'IS_WIN', 'MONTH', 'DAY_OF_WEEK', 'IS_PLAYOFFS',
    
    # Rolling averages
    'Points_5G_AVG', 'REB_5G_AVG', 'AST_5G_AVG', 'STL_5G_AVG', 'BLK_5G_AVG',
    'TO_5G_AVG', 'Minutes_5G_AVG', 'GAME_EFFICIENCY_5G_AVG',
    'Points_10G_AVG', 'REB_10G_AVG', 'AST_10G_AVG', 'STL_10G_AVG', 'BLK_10G_AVG',
    'TO_10G_AVG', 'Minutes_10G_AVG', 'GAME_EFFICIENCY_10G_AVG',
    
    # Current game stats
    'Minutes', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%',
    'OREB', 'DREB', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF',
    
    # Efficiency metrics
    'GAME_EFFICIENCY', 'TRUE_SHOOTING_PCT', 'USAGE_RATE'
]

# Create final dataset
final_features = [col for col in feature_columns if col in boxscores_with_players.columns]
final_dataset = boxscores_with_players[['PERSON_ID', 'DISPLAY_FIRST_LAST', 'Date', 'season', 'Points'] + final_features].copy()

# Remove rows with too many missing values
final_dataset = final_dataset.dropna(thresh=len(final_features) * 0.7)

print(f"Final dataset shape: {final_dataset.shape}")
print(f"Features included: {len(final_features)}")
print(f"Missing values per column:")
print(final_dataset[final_features].isnull().sum().sort_values(ascending=False).head(10))

Selecting final features...
Final dataset shape: (167718, 56)
Features included: 51
Missing values per column:
FT%                        61956
3P%                        36475
DRAFT_POSITION             20084
FG%                         5412
TRUE_SHOOTING_PCT           4769
USAGE_RATE                   418
WEIGHT                        80
BMI                           80
FTA                            0
GAME_EFFICIENCY_10G_AVG        0
dtype: int64


In [16]:
# Create a mapping dictionary
position_map = {
    'Guard': 1,
    'Forward': 3,
    'Center': 5
}

# Encode the position numerically, assign 0 or NaN to other/unrecognized
final_dataset['POSITION_CATEGORY'] = final_dataset['POSITION_CATEGORY'].map(position_map).fillna(0).astype(int)

In [20]:
# Export processed data
print("Exporting processed data...")

# Export final dataset
final_dataset.to_csv(os.path.join(csv_dir, 'processed_nba_data.csv'), index=False)
print(f"Final dataset exported to: processed_nba_data.csv")

# Export feature summary
feature_summary = pd.DataFrame({
    'Feature': final_features,
    'Type': [final_dataset[col].dtype for col in final_features],
    'Missing_Count': [final_dataset[col].isnull().sum() for col in final_features],
    'Missing_Percent': [final_dataset[col].isnull().sum() / len(final_dataset) * 100 for col in final_features],
    'Correlation_with_Points': [correlations.get(col, 0) for col in final_features]
})

feature_summary = feature_summary.sort_values('Correlation_with_Points', ascending=False)
feature_summary.to_csv(os.path.join(csv_dir, 'feature_summary.csv'), index=False)
print(f"Feature summary exported to: feature_summary.csv")

print("\n=== FEATURE SUMMARY ===")
print(feature_summary.head(20))

Exporting processed data...
Final dataset exported to: processed_nba_data.csv
Feature summary exported to: feature_summary.csv

=== FEATURE SUMMARY ===
                    Feature     Type  Missing_Count  Missing_Percent  Correlation_with_Points
31                      FGM    int64              0         0.000000                 0.961814
48          GAME_EFFICIENCY  float64              0         0.000000                 0.887861
32                      FGA    int64              0         0.000000                 0.883602
14            Points_5G_AVG  float64              0         0.000000                 0.811599
22           Points_10G_AVG  float64              0         0.000000                 0.778277
21   GAME_EFFICIENCY_5G_AVG  float64              0         0.000000                 0.751938
29  GAME_EFFICIENCY_10G_AVG  float64              0         0.000000                 0.727191
30                  Minutes    int64              0         0.000000                 0.717469
37