# CS4042 Data Engineering Group Project

In [1]:
import pandas as pd
import numpy as np
import os

### Preprocessing

In [2]:
sources = ['Player-Advanced-Stats', 'Player-Per-Game-Stats', 'Player-Shooting-Stats', 'Player-Totals-Stats']

# Dictionary for each folder
advanced = {}
per_game = {}
shooting = {}
totals = {}

# File-name suffix -> dictionary that collects that kind of table.
# Anything that matches none of these suffixes is treated as a totals table.
suffix_routes = (
    ('Advanced', advanced),
    ('PG', per_game),
    ('Shooting', shooting),
)

for src in sources:

    src_path = os.path.join('Datasets', src)

    # Sorted so the dictionaries are filled in the same order on every machine
    # (os.listdir makes no ordering guarantee).
    for file in sorted(os.listdir(src_path)):

        file_path = os.path.join(src_path, file)
        stem, ext = os.path.splitext(file)

        # Only real CSV files are tables. Skipping everything else keeps stray
        # entries such as '.ipynb_checkpoints' or '.DS_Store' from crashing
        # read_csv or being stored as a totals table.
        if ext.lower() != '.csv' or not os.path.isfile(file_path):
            continue

        name = stem[4:] # Name of file without its first four characters and extension

        # Assign DataFrame to correct dict
        target = totals
        for suffix, tables in suffix_routes:
            if name.endswith(suffix):
                target = tables
                break
        target[name] = pd.read_csv(file_path)


In [3]:
# Short aliases for the 2022-23 advanced and shooting tables.
# They point at the same DataFrame objects that are stored in the dictionaries.
a = advanced['2022-23-Player-Stats-Advanced'] # For Data Wrangler. TODO: delete later
s = shooting['2022-23-Player-Stats-Shooting']

Replace the column headers of each shooting table with the values from its first row, then drop that row. Run this cell ONLY once, because every run removes the first row again. If it breaks, re-run the notebook from the top.

In [4]:
# Promote the first row of every shooting table to be its column header,
# drop that row, and make the games-played column numeric.
for df in shooting.values():
    # Label 0 is the row that holds the header values. Once it has been
    # dropped the header is already in place, so a second run must not
    # promote again: that would overwrite the header with a player row
    # and then fail on the drop.
    if 0 in df.index:
        df.columns = df.iloc[0]
        df.drop(index=0, inplace=True)
    # Safe to repeat: casting an already-float column is a no-op.
    df['G'] = df['G'].astype(float)

Remove any player who has played fewer than 5 games.

In [5]:
# Drop, in place, every row whose games-played value ('G') is below 5.
# The rule is the same for all four collections, so they are walked in one pass.
for collection in (advanced, per_game, shooting, totals):
    for df in collection.values():
        too_few_games = df.loc[df['G'] < 5].index
        df.drop(too_few_games, inplace=True)

In [6]:
from functools import reduce

# Removes padding on column names
def clean(df):
    """Return a copy of ``df`` with whitespace stripped from both ends of each column label.

    The frame passed in is not modified.
    """
    stripped_labels = df.columns.str.strip()
    tidy = df.copy()
    tidy.columns = stripped_labels
    return tidy

# Returns one row, per player, per season.
def row_return(df, total_pattern=r"TOT|\d+TM"):
    """Reduce a season table to at most one row per player.

    A player whose name appears once keeps that row. A player whose name
    appears several times keeps only the combined-teams row, i.e. the row
    whose ``Team`` value fully matches ``total_pattern``. A repeated player
    with no such row is dropped entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Player`` and ``Team`` columns.
    total_pattern : str, optional
        Regular expression that the whole ``Team`` value must match to count
        as the combined-teams row. The default accepts ``TOT`` as well as
        ``2TM``, ``3TM``, ... labels.
        NOTE(review): confirm which label the CSVs in Datasets/ actually use.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is left untouched.
    """
    appearances = df["Player"].value_counts()
    repeated = df["Player"].isin(appearances[appearances > 1].index)
    # astype(str) turns missing team values into text so the match yields a
    # plain boolean instead of NaN.
    is_combined = df["Team"].astype(str).str.fullmatch(total_pattern)
    return df[(~repeated) | is_combined].copy()

# Merge tables
def table_merge(base, other, how="left"):
    """Merge onto ``base`` the columns of ``other`` that it does not already have.

    The two frames are joined on the ``Player`` column. Columns of ``other``
    whose label already exists in ``base`` are left out, so the merge never
    produces suffixed duplicates of shared columns.

    Parameters
    ----------
    base, other : pandas.DataFrame
        Both must contain a ``Player`` column.
    how : str, optional
        Join type passed to ``pandas.merge`` (default ``"left"``).

    Returns
    -------
    pandas.DataFrame
        The merged frame.

    Raises
    ------
    KeyError
        If ``Player`` is missing from either frame.
    """
    key = "Player"
    if key not in base.columns or key not in other.columns:
        raise KeyError(f"{key} must be a column in both DataFrames")
    # Select by boolean mask instead of by a list of labels. When a label
    # occurs more than once in `other`, label-list selection returns every
    # matching column once per occurrence and so multiplies those columns.
    # The mask takes each column exactly once, in its original position.
    is_new = ~other.columns.isin(base.columns) | (other.columns == key)
    return pd.merge(base, other.loc[:, is_new], on=key, how=how)

# Create Master table per season
def build(season):
    """Assemble the master table for one season.

    Looks up the season's advanced, per-game, shooting and totals tables,
    passes each through ``clean`` and ``row_return``, then left-merges the
    per-game, shooting and totals tables (in that order) onto the advanced
    table with ``table_merge``. A ``Season`` column holding ``season`` is
    added to the result.

    Raises ``KeyError`` if any of the four tables is missing for ``season``.
    """
    def prepared(tables, suffix):
        # Fetch "<season>-Player-Stats-<suffix>" and reduce it to one row per player.
        return row_return(clean(tables[f"{season}-Player-Stats-{suffix}"]))

    # All four tables are prepared before any merging starts.
    adv = prepared(advanced, "Advanced")
    pg = prepared(per_game, "PG")
    shoot_df = prepared(shooting, "Shooting")
    tots = prepared(totals, "Totals")

    master = adv
    for extra in (pg, shoot_df, tots):
        master = table_merge(master, extra, how="left")

    master["Season"] = season
    return master

# Build master tables for all seasons
seasons = ["2022-23", "2023-24", "2024-25", "2025-26"]

# One master table per season, keyed by the season label and kept in list order.
masters_by_season = dict(zip(seasons, map(build, seasons)))

# Stack the per-season tables into a single frame with a fresh 0..n-1 index.
all_seasons_master = pd.concat(list(masters_by_season.values()), ignore_index=True)

# Quick View Examples

#print("Shapes by season:")
#for season, df in masters_by_season.items():
#    print(season, df.shape)
#
#print("\nPreview 2025-26 master:")
#display(masters_by_season["2025-26"].head())
#
#print("\nPreview all_seasons_master:")
#display(all_seasons_master.head())


# Save as .csv inside 'Master-Stats' folder.
folder = "Master-Stats"
os.makedirs(folder, exist_ok=True)


def _save_master(frame, filename):
    """Write ``frame`` to ``folder/filename`` without its index, report the path and return it."""
    target = os.path.join(folder, filename)
    frame.to_csv(target, index=False)
    print(f"Saved: {target}")
    return target


# One CSV per season first ...
for season, df in masters_by_season.items():
    filename = f"NBA-{season}-Master-Stats.csv"
    filepath = _save_master(df, filename)

# ... then the combined table covering every season.
all_seasons_filename = "NBA-All-Seasons-Master-Stats.csv"
all_seasons_filepath = _save_master(all_seasons_master, all_seasons_filename)

Saved: Master-Stats\NBA-2022-23-Master-Stats.csv
Saved: Master-Stats\NBA-2023-24-Master-Stats.csv
Saved: Master-Stats\NBA-2024-25-Master-Stats.csv
Saved: Master-Stats\NBA-2025-26-Master-Stats.csv
Saved: Master-Stats\NBA-All-Seasons-Master-Stats.csv
