In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def find_backend_dir(start_path=None):
    """
    Locate the nearest 'backend' directory at or above a starting directory.

    Starting at start_path (the current working directory when omitted), each
    directory on the way up to the filesystem root is checked for a child
    folder named 'backend'; the first match is returned.

    Args:
        start_path: Directory to start searching from. Defaults to os.getcwd().

    Returns:
        Absolute path of the first 'backend' directory found.

    Raises:
        FileNotFoundError: If the root is reached without finding one.
    """
    if start_path is None:
        start_path = os.getcwd()

    level = os.path.abspath(start_path)
    reached_root = False
    while not reached_root:
        backend = os.path.join(level, "backend")
        if os.path.isdir(backend):
            return backend
        # dirname() of the filesystem root is the root itself, which ends the walk
        above = os.path.dirname(level)
        reached_root = above == level
        level = above

    raise FileNotFoundError(f"No 'backend' directory found upward from {start_path}")

# Resolve the backend directory by walking up from the current working
# directory, then point csv_dir at its "CSVs" subfolder. Every CSV read or
# written by this notebook goes through csv_dir.
backend_dir = find_backend_dir()
csv_dir = os.path.join(backend_dir, "CSVs")

In [3]:
# Load the game-level dataset. "Date" is parsed to datetimes because the season
# labelling below reads .year / .month from it. The path is built with
# os.path.join, matching how the export cell builds its paths.
df = pd.read_csv(os.path.join(csv_dir, "final_nba_dataset.csv"), parse_dates=["Date"])

In [4]:
def get_season_id(date):
    """
    Map a game date to its NBA season label, e.g. 2024-01-15 -> '2023-24'.

    A date in October or later belongs to the season starting that year;
    earlier months belong to the season that started the previous autumn.

    Exception: October 2020. The 2019-20 season was suspended and its Finals
    were played into October 2020, while the 2020-21 season did not start until
    December 2020, so October 2020 games are labelled '2019-20'.

    Args:
        date: Any object exposing integer .year and .month
            (pandas.Timestamp, datetime.date, datetime.datetime).

    Returns:
        Season label formatted as '<start_year>-<last two digits of end year>'.
    """
    # Years whose October games still belong to the season that started the
    # previous autumn.
    late_finish_years = {2020}

    year = date.year
    opens_new_season = date.month >= 10 and not (
        date.month == 10 and year in late_finish_years
    )
    start_year = year if opens_new_season else year - 1
    return f"{start_year}-{str(start_year + 1)[-2:]}"
# Label every game row with its season by mapping get_season_id over the dates.
df["SEASON_ID"] = df["Date"].map(get_season_id)

In [5]:
# Distinct season labels present in the data, in order of first appearance
season_ids = df.loc[:, "SEASON_ID"].unique()
print("Season IDs in dataset:", season_ids)

Season IDs in dataset: ['2003-04' '2004-05' '2005-06' '2006-07' '2007-08' '2008-09' '2009-10'
 '2010-11' '2011-12' '2012-13' '2013-14' '2014-15' '2015-16' '2016-17'
 '2017-18' '2018-19' '2019-20' '2020-21' '2021-22' '2022-23' '2023-24'
 '2024-25']


In [6]:
# Per-game numeric columns
per_game_numeric = [
    'Points', 'Minutes',
    'FGM', 'FGA', 'FG%',
    '3PM', '3PA', '3P%',
    'FTM', 'FTA', 'FT%',
    'OREB', 'DREB', 'REB',
    'AST', 'TO', 'STL', 'BLK', 'PF',
    'GAME_EFFICIENCY', 'TRUE_SHOOTING_PCT', 'USAGE_RATE',
]

# Rolling-average columns: every base stat at the 5-game window, then the 10-game window
_rolling_base_stats = ('Points', 'REB', 'AST', 'STL', 'BLK', 'TO', 'Minutes', 'GAME_EFFICIENCY')
rolling = [
    f"{stat}_{window}G_AVG"
    for window in (5, 10)
    for stat in _rolling_base_stats
]

# Boolean flag columns
bools = [
    'IS_STARTER',
    'IS_HOME',
    'IS_WIN',
    'IS_PLAYOFFS',
    'IS_B2B',
]

# Workload columns
workload = ['GAMES_LAST_30D']

# One-hot SEASON_* indicator columns
seasonal_onehots = ['SEASON_Autumn', 'SEASON_Winter', 'SEASON_Spring']

# Player attributes treated as constant within a season; the aggregation
# keeps the 'last' value seen for each of them.
static_feats = [
    'AGE', 'EXPERIENCE_YEARS',
    'HEIGHT_INCHES', 'WEIGHT', 'BMI',
    'DRAFT_POSITION', 'TOP_10_PICK', 'LOTTERY_PICK',
    'POSITION_CATEGORY',
]

In [7]:
# Column -> aggregation used when collapsing games into one row per player-season:
#   per-game stats, rolling averages, flags and workload -> 'mean'
#   season one-hots                                       -> 'max' ('any' would also work)
#   static player attributes                              -> 'last'
_mean_cols = per_game_numeric + rolling + bools + workload
agg_dict = {
    **dict.fromkeys(_mean_cols, 'mean'),
    **dict.fromkeys(seasonal_onehots, 'max'),
    **dict.fromkeys(static_feats, 'last'),
}

In [8]:
# Collapse the game-level rows into one row per (player, season) using agg_dict.
# The rows are sorted by Date first (stable, so same-day ties keep file order)
# because groupby keeps the original row order inside each group: without the
# sort, the 'last' aggregations would return whatever row happens to come last
# in the CSV rather than the chronologically latest one.
season_feats = (
    df.sort_values("Date", kind="stable")
    .groupby(['PERSON_ID', 'SEASON_ID'])
    .agg(agg_dict)
    .reset_index()
)

In [9]:
# Stats for which a "next_<stat>" target column is created
target_stats = [
    "Points",
    "FTM", "FTA",
    "FGM", "FGA",
    "TO", "STL", "BLK", "PF",
    "USAGE_RATE",
    "OREB", "DREB", "AST", "REB",
    "Minutes",
    "3PM", "3PA",
    "3P%", "FT%", "FG%",
    "GAME_EFFICIENCY",
]

In [10]:
# Attach, for every player-season row, the same player's value of each target
# stat from their following row as "next_<stat>". A player's final row gets NaN.
#
# The sort + groupby does not depend on the stat, so it is built once instead
# of once per target. sort_values returns a new frame, so adding columns to
# season_feats inside the loop does not affect it; the shifted Series keeps the
# original index and therefore aligns back onto season_feats on assignment.
#
# NOTE(review): "following row" is the next season the player appears in, not
# necessarily the consecutive one - a player who missed a season gets targets
# from a later year. Confirm that is the intended definition of "next season".
_by_player = (
    season_feats
    .sort_values(["PERSON_ID", "SEASON_ID"])
    .groupby("PERSON_ID")
)
for stat in target_stats:
    season_feats[f"next_{stat}"] = _by_player[stat].shift(-1)

In [None]:
# --- Archive all seasons ---
# One CSV per season, written with every column currently in season_feats
# (features and next_* targets alike).
for season_id, season_df in season_feats.groupby("SEASON_ID"):
    season_df.to_csv(os.path.join(csv_dir, f"season_features_{season_id}.csv"), index=False)

# --- Latest season features for live inference ---
# Feature columns only (no next_* targets). The latest season is the largest
# SEASON_ID by string comparison.
features_only_cols = [col for col in season_feats.columns if not col.startswith("next_")]
latest_season = season_feats["SEASON_ID"].max()
latest_season_features = season_feats.loc[
    season_feats["SEASON_ID"] == latest_season, features_only_cols
]
latest_season_features.to_csv(os.path.join(csv_dir, "latest_season_features_for_inference.csv"), index=False)

# --- Training / evaluation rows ---
# Keep only player-seasons where every next_* target is present. The result is
# bound to its own name and season_feats is left untouched: rebinding
# season_feats here would make a second run of this cell archive the truncated
# frame and pick an earlier "latest" season for the inference file.
target_cols = [f"next_{s}" for s in target_stats]
labeled_feats = (
    season_feats
    .dropna(subset=target_cols)
    .astype({"PERSON_ID": int})
)

# Season held out for evaluation; every earlier season (by string comparison of
# SEASON_ID) is used for training.
TEST_SEASON_ID = "2023-24"
train = labeled_feats[labeled_feats["SEASON_ID"] < TEST_SEASON_ID]
test = labeled_feats[labeled_feats["SEASON_ID"] == TEST_SEASON_ID]

train.to_csv(os.path.join(csv_dir, "train_season_features.csv"), index=False)
test.to_csv(os.path.join(csv_dir, "test_season_features.csv"), index=False)


Latest season: 2024-25
