In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def find_backend_dir(start_path=None):
    """
    Locate the nearest 'backend' directory at or above a starting location.

    Starting at start_path (the current working directory when None), each
    directory on the way up to the filesystem root is checked for a child
    folder named 'backend'.

    Returns:
        Absolute path of the first 'backend' folder found.

    Raises:
        FileNotFoundError: if the root is reached without finding one.
    """
    start_path = os.getcwd() if start_path is None else start_path
    here = os.path.abspath(start_path)
    previous = None
    # At the filesystem root dirname() returns its argument unchanged, so the
    # loop ends once a step up no longer moves us.
    while here != previous:
        backend = os.path.join(here, "backend")
        if os.path.isdir(backend):
            return backend
        previous, here = here, os.path.dirname(here)
    raise FileNotFoundError(f"No 'backend' directory found upward from {start_path}")

# Resolve the backend folder by searching upward from the working directory,
# then point at the "CSVs" folder inside it.
backend_dir = find_backend_dir(start_path=os.getcwd())
csv_dir = os.path.join(backend_dir, "CSVs")

In [3]:
# Load the per-game dataset. "Date" is parsed to datetimes because the season
# labelling step reads .year / .month from it. The path is built with
# os.path.join like every other path in this notebook.
df = pd.read_csv(os.path.join(csv_dir, "final_nba_dataset.csv"), parse_dates=["Date"])

In [4]:
# Preview the loaded per-game frame (bare expression -> rich notebook display).
df

Unnamed: 0,PERSON_ID,DISPLAY_FIRST_LAST,Date,season,Points,AGE,EXPERIENCE_YEARS,HEIGHT_INCHES,WEIGHT,BMI,...,TO,STL,BLK,PF,GAME_EFFICIENCY,TRUE_SHOOTING_PCT,USAGE_RATE,DAYS_SINCE_LAST_GAME,IS_B2B,GAMES_LAST_30D
0,2544.0,LeBron James,2003-10-29,2003-04,25,40.514716,22,81,250.0,26.789705,...,4,0,2,3,45.7,0.586304,60.285714,0.0,False,13.0
1,2544.0,LeBron James,2003-10-30,2003-04,21,40.514716,22,81,250.0,26.789705,...,1,0,7,1,60.4,0.522908,51.414634,1.0,True,12.0
2,2544.0,LeBron James,2003-11-01,2003-04,8,40.514716,22,81,250.0,26.789705,...,2,0,2,3,23.8,0.310559,38.153846,2.0,False,11.0
3,2544.0,LeBron James,2003-11-05,2003-04,7,40.514716,22,81,250.0,26.789705,...,2,3,2,1,38.7,0.305944,32.780488,4.0,False,10.0
4,2544.0,LeBron James,2003-11-07,2003-04,23,40.514716,22,81,250.0,26.789705,...,0,0,7,2,47.5,0.545541,47.909091,2.0,False,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167713,1642530.0,Yuki Kawamura,2025-02-03,2024-25,0,24.177960,1,68,159.0,24.175599,...,0,0,0,0,2.7,,0.000000,17.0,False,2.0
167714,1642530.0,Yuki Kawamura,2025-02-05,2024-25,0,24.177960,1,68,159.0,24.175599,...,0,0,0,0,1.5,0.000000,25.000000,2.0,False,12.0
167715,1642530.0,Yuki Kawamura,2025-02-12,2024-25,0,24.177960,1,68,159.0,24.175599,...,0,0,0,0,0.0,,0.000000,7.0,False,9.0
167716,1642530.0,Yuki Kawamura,2025-04-10,2024-25,0,24.177960,1,68,159.0,24.175599,...,0,0,0,0,0.0,,0.000000,57.0,False,15.0


In [5]:
def get_season_id(date):
    """
    Map a game date to its NBA season label, e.g. 2003-10-29 -> "2003-04".

    Parameters:
        date: any object exposing integer ``year`` and ``month`` attributes
            (datetime.date, datetime.datetime, pandas.Timestamp).

    Returns:
        str of the form "YYYY-YY" (start year, two-digit end year).
    """
    y, m = date.year, date.month
    # The 2019-20 season was suspended and its playoffs ran into October 2020;
    # the 2020-21 season did not tip off until December 2020. Without this
    # override those October games would be filed under "2020-21".
    if (y, m) == (2020, 10):
        return "2019-20"
    # Otherwise a season starts in October: Oct-Dec dates open a season,
    # Jan-Sep dates belong to the season that started the previous year.
    start_year = y if m >= 10 else y - 1
    return f"{start_year}-{str(start_year + 1)[-2:]}"
# Label every game row with the season it belongs to.
df["SEASON_ID"] = df["Date"].map(get_season_id)

In [6]:
# Per-game box-score and efficiency columns.
per_game_numeric = [
    'Points', 'Minutes',
    'FGM', 'FGA', 'FG%',
    '3PM', '3PA', '3P%',
    'FTM', 'FTA', 'FT%',
    'OREB', 'DREB', 'REB',
    'AST', 'TO', 'STL', 'BLK', 'PF',
    'GAME_EFFICIENCY', 'TRUE_SHOOTING_PCT', 'USAGE_RATE',
]

# Columns named "<stat>_<N>G_AVG" for N in (5, 10): all 5-game names first,
# then all 10-game names, each in the same stat order.
rolling = [
    f'{stat}_{window}G_AVG'
    for window in (5, 10)
    for stat in ('Points', 'REB', 'AST', 'STL', 'BLK', 'TO', 'Minutes', 'GAME_EFFICIENCY')
]

# Flag columns.
bools = ['IS_STARTER', 'IS_HOME', 'IS_WIN', 'IS_PLAYOFFS', 'IS_B2B']

workload = ['GAMES_LAST_30D']

seasonal_onehots = ['SEASON_Autumn', 'SEASON_Winter', 'SEASON_Spring']

# These are effectively static for a season but you can keep the 'last' value
static_feats = [
    'AGE', 'EXPERIENCE_YEARS',
    'HEIGHT_INCHES', 'WEIGHT', 'BMI',
    'DRAFT_POSITION', 'TOP_10_PICK', 'LOTTERY_PICK',
    'POSITION_CATEGORY',
]

In [7]:
# Column -> aggregation used when collapsing game rows into one row per
# player-season. Groups are merged in order, so a column listed in more than
# one group takes the aggregation of the last group that names it.
agg_dict = {
    **{col: 'mean' for col in per_game_numeric + rolling + bools + workload},
    **{col: 'max' for col in seasonal_onehots},  # or 'any'
    **{col: 'last' for col in static_feats},
}

In [8]:
# Collapse game-level rows into one row per (player, season) using agg_dict,
# then turn the group keys back into ordinary columns.
season_keys = ['PERSON_ID', 'SEASON_ID']
per_player_season = df.groupby(season_keys)
season_feats = per_player_season.agg(agg_dict).reset_index()

In [9]:
# Season-level stats for which a "next_<stat>" target column is created.
target_stats = [
    'Points',
    'FTM', 'FTA', 'FGM', 'FGA',
    'TO', 'STL', 'BLK', 'PF',
    'USAGE_RATE',
    'OREB', 'DREB', 'AST', 'REB',
    'Minutes',
    '3PM', '3PA',
    '3P%', 'FT%', 'FG%',
    'GAME_EFFICIENCY',
]

In [10]:
# Order each player's rows by season once and reuse that grouping for every stat.
# shift(-1) pulls the value from the player's following row, so a player's last
# available season gets NaN. Assignment aligns on the original row index.
by_player = season_feats.sort_values(["PERSON_ID", "SEASON_ID"]).groupby("PERSON_ID")
for stat in target_stats:
    season_feats[f"next_{stat}"] = by_player[stat].shift(-1)

In [11]:
# --- Archive all seasons ---
# One CSV per season, including the next_* target columns.
for season_id, season_df in season_feats.groupby("SEASON_ID"):
    archive_path = os.path.join(csv_dir, f"season_features_{season_id}.csv")
    season_df.to_csv(archive_path, index=False)

# --- Latest season features for live inference ---
# Only feature columns are exported; the next_* targets are left out.
features_only_cols = [name for name in season_feats.columns if not name.startswith("next_")]
latest_season = season_feats["SEASON_ID"].max()
is_latest = season_feats["SEASON_ID"] == latest_season
inference_path = os.path.join(csv_dir, "latest_season_features_for_inference.csv")
season_feats.loc[is_latest, features_only_cols].to_csv(inference_path, index=False)


In [12]:
# Keep only player-seasons where every next_* target is present. Rows with no
# following season (NaN targets) are removed from season_feats from here on.
label_cols = [f'next_{stat_name}' for stat_name in target_stats]
season_feats = season_feats.dropna(subset=label_cols)

In [13]:
# Split by season: the held-out season is the test set and everything before it
# is training data. Season labels compare as strings ("YYYY-YY").
TEST_SEASON = "2023-24"
season_ids = season_feats["SEASON_ID"]
train = season_feats[season_ids < TEST_SEASON]
test = season_feats[season_ids == TEST_SEASON]

In [14]:
# Persist the splits next to the other generated CSVs. Paths are built with
# os.path.join, matching the rest of the notebook.
train.to_csv(os.path.join(csv_dir, "train_season_features.csv"), index=False)
test.to_csv(os.path.join(csv_dir, "test_season_features.csv"), index=False)

In [None]:
# This cell repeats the inference export done before the dropna step. By now
# `season_feats` only holds rows that have next_* targets, so its maximum
# SEASON_ID is no longer the most recent season: every player's final season was
# dropped. Taking the max from `season_feats` here would overwrite the inference
# file with the previous season's features.
features_only_cols = [col for col in season_feats.columns if not col.startswith("next_")]

# Take the latest season from the per-game frame, which is never filtered.
latest_season = df["SEASON_ID"].max()
latest_season_features = season_feats.loc[
    season_feats["SEASON_ID"] == latest_season, features_only_cols
]

if latest_season_features.empty:
    # Expected after the dropna step: leave the file written earlier untouched.
    print(
        f"Skipping inference export: season_feats has no rows for {latest_season} "
        "(rows without next_* targets were dropped); keeping the existing file."
    )
else:
    latest_season_features.to_csv(
        os.path.join(csv_dir, "latest_season_features_for_inference.csv"), index=False
    )
