# CS4042 Data Engineering Group Project

In [42]:
import pandas as pd
import numpy as np
import os

### Preprecessing

In [43]:
sources = ['Player-Advanced-Stats', 'Player-Per-Game-Stats', 'Player-Shooting-Stats', 'Player-Totals-Stats']

# Dictionary for each folder
advanced = {}
per_game = {}
shooting = {}
totals = {}

for src in sources:

    src_path = os.path.join('Datasets', src)

    for file in os.listdir(src_path):
        
        file_path = os.path.join(src_path, file)
        name = file[4:-4] # Name of file

        # Assign DataFrame to correct dict
        if name[-1] == 'd':
            advanced[name] = pd.read_csv(file_path)
        elif name[-1] == 'G':
            per_game[name] = pd.read_csv(file_path)
        elif name[-1] == 'g':
            shooting[name] = pd.read_csv(file_path)
        else:
            totals[name] = pd.read_csv(file_path)



In [44]:
a = advanced['2022-23-Player-Stats-Advanced'] # For Data Wrangler, DELET LATER
s = shooting['2022-23-Player-Stats-Shooting']
p = per_game['2022-23-Player-Stats-PG']
t = totals['2022-23-Player-Stats-Totals']

### Dirty Data - Ensure the quality of the pipeline

This section aims to create data quality issues within the tables to show that our pipeline can handle these concerns without problem. 

In [45]:
# Adds 'junk' column and fills it with empty strings.
def junk_col(df, col_name='junk'):
    if df.columns[-1] != col_name:
        df[col_name] = ''

# Add a fake leauge average summary row
def avg_row(df):
    if df.empty:
        return
    
    new_row = {col: np.nan for col in df.columns}
    if len(df.columns) > 1:
        second_col = df.columns[1]
        new_row[second_col] = "League Average"
    
    new_index = df.index.max() + 1 if len(df.index) > 0 else 0
    df.loc[new_index] = new_row

# Randomly duplicate rows
def dup_rows(df, n_dups=10, random_state=42):
    if len(df) == 0:
        return
    
    rng = np.random.default_rng(random_state)
    idx_pool = df.index[:-1] if len(df) > 1 else df.index  
    
    if len(idx_pool) == 0:
        return
    
    n_dups = min(n_dups, len(idx_pool))
    dup_indices = rng.choice(idx_pool, size=n_dups, replace=False)

    for idx in dup_indices:
        new_index = df.index.max() + 1
        df.loc[new_index] = df.loc[idx]

# add extra spaces
def add_whitespace(df, columns=None):
    if columns is None:
        columns = df.select_dtypes(include=["object"]).columns

    for col in columns:
        df[col] = df[col].apply(lambda v: f"  {v}  " if isinstance(v, str) else v)

# Convert Numeric values to strings
def numeric_to_string(df, columns=None):
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    for col in columns:
        df[col] = df[col].apply(lambda v: f" {v} " if pd.notna(v) else v)


In [46]:
def data_dirtier(advanced, per_game, shooting, totals, seed=42):
    for df in advanced.values():
        junk_col(df)
        dup_rows(df, random_state=seed)
        avg_row(df)
        avg_row(df)
        add_whitespace(df)

    for df in per_game.values():
        junk_col(df)
        dup_rows(df, random_state=seed)
        avg_row(df)
        add_whitespace(df)

    for df in shooting.values():
        junk_col(df)
        dup_rows(df, random_state=seed)
        avg_row(df)
        add_whitespace(df)
        numeric_to_string(df)  

    for df in totals.values():
        junk_col(df)
        dup_rows(df, random_state=seed)
        avg_row(df)
        add_whitespace(df)


data_dirtier(advanced, per_game, shooting, totals, seed=42)



### Clean Data - Fix Tables
This section aims to standardise the tables. Ensures there is only one header row, all datatypes are correct for columns, and removes redunant columns

In the case where there are two heading rows, merge them (only happens in shooting df)

In [47]:
def concatinate_headings(top_head, sub_head):
    heading = top_head + "_" + sub_head
    return heading

def clean_heading(heading, type):
    # Remove whitespace
    heading = heading.strip()

    # Replace spaces in text
    heading = heading.replace(" ", "_")

    # Remove trailing numbers in top_head
    if type == "top":
        if "." in heading:
            before, after = heading.rsplit(".", 1) # Splits at last "."
            if after.isdigit(): # if digits follow
                heading = before

    return heading

def logic(df):
    for column in df:
        # Break if there is only one heading
        if "Rk" in str(column):
            break
        # Skip if top heading doesn't have a value
        elif "Unnamed" in str(column):
            continue
        else:
            topheading = clean_heading(column, "top")
            subheading = clean_heading(df[column][0], "sub")
            output = concatinate_headings(topheading, subheading)
            df.loc[0, column] = output.strip()

# Apply changes
for df in shooting.values():
    logic(df)


Remove top headings and replace with concatinated headings done above (only happens in shooting)

In [48]:
def drop_irregular_headings(df):
    if df.columns[0] == "Unnamed: 0":
        df.columns = df.iloc[0]
        df.drop(index=0, inplace=True)
        df.reset_index(drop=True, inplace=True) # drop ensures previous index isn't repeated in the dataframe as a seperate column

# Apply changes
for df in shooting.values():
    drop_irregular_headings(df)

Ensure columns have correct data types (types only incorrect in shooting)
By default, that table's columns are all string objects

In [49]:
def check_type_is_digit(df):
    for column in df.columns:
        value = str(df[column][0]).strip()

        # Skip empty cells
        if value == None:
            continue

        if value == "":
            continue

        if value == "nan":
            continue

        # Convert strings to floats if they are a number or decimal
        try:
            float(value)
            df[column] = df[column].astype(float)
        except:
            continue

for df in shooting.values():
    check_type_is_digit(df)

Drop the last columns after Awards as they don't contain relevant data

In [50]:
for df in advanced.values():
    while df.columns[-1].strip() != "Awards":
        df.drop(df.columns[-1], axis=1, inplace=True)
for df in per_game.values():
    while df.columns[-1].strip() != "Awards":
        df.drop(df.columns[-1], axis=1, inplace=True)
for df in shooting.values():
    while df.columns[-1].strip() != "Awards":
        df.drop(df.columns[-1], axis=1, inplace=True)
for df in totals.values():
    while df.columns[-1].strip() != "Awards":
        df.drop(df.columns[-1], axis=1, inplace=True)

Drop rows where the majority of the cells are empty (irrelevant data) as well as average rows.

In [51]:
for df in advanced.values():
    df.columns = df.columns.str.strip()
    df.dropna(thresh=int(df.shape[1] * 0.4), inplace=True)
    df.drop(df[df['Player'].str.strip() == "League Average"].index, inplace=True)

for df in per_game.values():
    df.columns = df.columns.str.strip()
    df.dropna(thresh=int(df.shape[1] * 0.4), inplace=True)
    df.drop(df[df['Player'].str.strip() == "League Average"].index, inplace=True)

for df in shooting.values():
    df.columns = df.columns.str.strip()
    df.dropna(thresh=int(df.shape[1] * 0.4), inplace=True)
    df.drop(df[df['Player'].str.strip() == "League Average"].index, inplace=True)

for df in totals.values():
    df.columns = df.columns.str.strip()
    df.dropna(thresh=int(df.shape[1] * 0.4), inplace=True)
    df.drop(df[df['Player'].str.strip() == "League Average"].index, inplace=True)


Drop the final rows before relevant data

In [52]:
def check_if_numeric(df):
    try:
        value = int(df.iloc[-1, 0])
    except:
        return False
    else:
        return True
    
for df in advanced.values():
    while not check_if_numeric(df):
        df.drop(df.index[-1], axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)
for df in per_game.values():
    while not check_if_numeric(df):
        df.drop(df.index[-1], axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)
for df in shooting.values():
    while not check_if_numeric(df):
        df.drop(df.index[-1], axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)
for df in totals.values():
    while not check_if_numeric(df):
        df.drop(df.index[-1], axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)

### Clean Data - Making Data Analysis-Worthy

Check for duplicate rows and remove them

In [53]:
def remove_duplicates(df):
    df.drop_duplicates(inplace=True)

"""
Optionally, drop only by subset (eg. player, team).
This way if there is an error with the data, where the wrong team or player was placed in a cell, those will get dropped as well.
Even if other columns have different values.

def remove_duplicates(df):
    df.drop_duplicates(subset=["Player", "Team"], inplace=True)
"""

for df in advanced.values():
    remove_duplicates(df)
for df in per_game.values():
    remove_duplicates(df)
for df in shooting.values():
    remove_duplicates(df)
for df in totals.values():
    remove_duplicates(df)

Strip all data of whitespace

In [54]:
def strip_whitespace(df):
    df.iloc[0:] = df.iloc[0:].map(
        # Strips if value is a string object only
        lambda value: value.strip() if isinstance(value, str) else value
    )

for df in advanced.values():
    strip_whitespace(df)
for df in per_game.values():
    strip_whitespace(df)
for df in shooting.values():
    strip_whitespace(df)
for df in totals.values():
    strip_whitespace(df)

Convert common cell placeholders to NaN

In [55]:
def convert_to_nan(df):
    df.iloc[0:] = df.iloc[0:].map(
        # Convers to NaN if...
        lambda value: np.nan

        # Empty string
        if (isinstance(value, str) and value.strip() == "")

        # Other common placeholders
        or (isinstance(value, str) and "n/a" in str(value).lower())
        or (isinstance(value, str) and "null" in str(value).lower())
        or (isinstance(value, str) and str(value) == "?")
        or (isinstance(value, str) and "unknown" in str(value).lower())

        # Otherwise keep value
        else value
    )

for df in advanced.values():
    convert_to_nan(df)
for df in per_game.values():
    convert_to_nan(df)
for df in shooting.values():
    convert_to_nan(df)
for df in totals.values():
    convert_to_nan(df)

Remove any player whos played less than 5 games.

In [56]:
for df in advanced.values():
    df.drop(df[df['G'] < 5].index, inplace=True)
for df in per_game.values():
    df.drop(df[df['G'] < 5].index, inplace=True)
for df in shooting.values():
    df.drop(df[df['G'] < 5].index, inplace=True)
for df in totals.values():
    df.drop(df[df['G'] < 5].index, inplace=True)

In [57]:
from functools import reduce

# Removes padding on column names
def clean(df):
    df = df.copy()
    df.columns = df.columns.str.strip()
    return df

# Returns one row, per player, per season.
def row_return(df):
    df = df.copy()
    counts = df["Player"].value_counts()
    multi = df["Player"].isin(counts[counts > 1].index)
    return df[(~multi) | (df["Team"] == "TOT")].copy()

# Merge tables
def table_merge(base, other, how="left"):
    key = "Player"
    if key not in base.columns or key not in other.columns:
        raise KeyError(f"{key} must be a column in both DataFrames")
    new_cols = [c for c in other.columns if c not in base.columns or c == key]
    return pd.merge(base, other[new_cols], on=key, how=how)

# Create Master table per season
def build(season):
    adv_key      = f"{season}-Player-Stats-Advanced"
    pg_key       = f"{season}-Player-Stats-PG"
    shooting_key = f"{season}-Player-Stats-Shooting"
    totals_key   = f"{season}-Player-Stats-Totals"

    adv      = row_return(clean(advanced[adv_key]))
    pg       = row_return(clean(per_game[pg_key]))
    shoot_df = row_return(clean(shooting[shooting_key]))
    tots     = row_return(clean(totals[totals_key]))

    dfs_to_add = [pg, shoot_df, tots]

    master = reduce(
        lambda left, right: table_merge(left, right, how="left"),
        dfs_to_add,
        adv,
    )

    master["Season"] = season
    return master

# Build master tables for all seasons
seasons = ["2022-23", "2023-24", "2024-25", "2025-26"]
masters_by_season = {season: build(season) for season in seasons}
all_seasons_master = pd.concat(masters_by_season.values(), ignore_index=True)

# Quick View Examples

#print("Shapes by season:")
#for season, df in masters_by_season.items():
#    print(season, df.shape)
#
#print("\nPreview 2025-26 master:")
#display(masters_by_season["2025-26"].head())
#
#print("\nPreview all_seasons_master:")
#display(all_seasons_master.head())


# Save as .csv inside 'Master-Stats' folder.
folder = "Master-Stats"
os.makedirs(folder, exist_ok=True)

for season, df in masters_by_season.items():
    filename = f"NBA-{season}-Master-Stats.csv"
    filepath = os.path.join(folder, filename)
    df.to_csv(filepath, index=False)
    print(f"Saved: {filepath}")

all_seasons_filename = "NBA-All-Seasons-Master-Stats.csv"
all_seasons_filepath = os.path.join(folder, all_seasons_filename)
all_seasons_master.to_csv(all_seasons_filepath, index=False)
print(f"Saved: {all_seasons_filepath}")

Saved: Master-Stats\NBA-2022-23-Master-Stats.csv
Saved: Master-Stats\NBA-2023-24-Master-Stats.csv
Saved: Master-Stats\NBA-2024-25-Master-Stats.csv
Saved: Master-Stats\NBA-2025-26-Master-Stats.csv
Saved: Master-Stats\NBA-All-Seasons-Master-Stats.csv


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import unicodedata

def normalizeName(name):
    nfkd_form = unicodedata.normalize('NFKD', name)
    only_ascii = nfkd_form.encode('ASCII', 'ignore').decode('utf-8')
    return only_ascii.lower()

def KNN(df, name):

    k = 5 # ensures it doesnt find the same person for all the entries (one guy can appear up to 4 times)

    name = normalizeName(name)

    df["PlayerNorm"] = df["Player"].apply(normalizeName)

    if name not in df["PlayerNorm"].values:
        return False
    
    playerIndex = df.index[df["PlayerNorm"] == name][0]

    X = df.drop("Player", axis=1)
    
    # Separate numeric and categorical columns
    numFeatures = X.select_dtypes(include=["int64", "float64"]).columns
    catFeatures = X.select_dtypes(include=["object", "category"]).columns
    
    # Fill missing values
    X[numFeatures] = X[numFeatures].fillna(0)
    X[catFeatures] = X[catFeatures].fillna("missing")
    
    # Convert categorical to string
    for col in catFeatures:
        X[col] = X[col].astype(str)
    
    # Preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numFeatures),
            ("cat", OneHotEncoder(), catFeatures)
        ]
    )
    
    X_processed = preprocessor.fit_transform(X)
    
    # Fit NearestNeighbors
    NN = NearestNeighbors(n_neighbors=k, metric='euclidean')
    NN.fit(X_processed)
    
    distances, indices = NN.kneighbors(X_processed[playerIndex])
    
    # Exclude the player themselves
    neighbors = []
    neighbor_distances = []
    
    for idx, dist in zip(indices[0], distances[0]):
        if df.iloc[idx]["PlayerNorm"] != name:
            neighbors.append(idx)
            neighbor_distances.append(dist)
        if len(neighbors) >= k:
            break
    
    result = df.iloc[neighbors[0]]["Player"]

    print(result)

    return True

KNN(all_seasons_master, "Trey Murphy III") # need to change this to an input



Grayson Allen


True

In [60]:
#\n

from IPython.display import clear_output

clear_output()

menu = True

while menu:
    print("\n0 : Exit")
    print("\n1 : Find most similar player")
    print("\n2 : Visualise position of most shots")
    print("\nPlease select a number :")
    x = input().strip()
    print("\n")


    if x == "0":
        clear_output()
        print("Goodbye !")
        break
    elif x == "1":

        KNNmenu = True

        while KNNmenu:

            Yearmenu = True

            while Yearmenu:

                clear_output()

                print("\nPlease enter the year you wish to search within. (2022, 2023, 2024, 2025) / exit with 0")
                y = input().strip()

                if y == "0":
                    clear_output()
                    print("Returned back to main menu")
                    KNNmenu = False
                    break

                elif y == "2022":
                    yearDF = all_seasons_master[all_seasons_master.Season == "2022-23"]
                    print(yearDF.head(10))
                    break

                elif y == "2023":
                    yearDF = all_seasons_master[all_seasons_master.Season == "2023-24"]
                    print(yearDF.head(10))
                    break

                elif y == "2024":
                    yearDF = all_seasons_master[all_seasons_master.Season == "2024-25"]
                    print(yearDF.head(10))
                    break

                elif y == "2025":
                    yearDF = all_seasons_master[all_seasons_master.Season == "2025-26"]
                    print(yearDF.head(10))
                    break

            if KNNmenu == False:
                break

            Playermenu = True

            while Playermenu:

                print("\nPlease enter the player you wish to search from. (full name) / exit with 0")
                z = input().strip()

                if z == "0":
                    clear_output()
                    print("Returned back to main menu")
                    KNNmenu = False
                    break

                else:
                    found = KNN(yearDF,z)

                if found:
                    KNNmenu = False
                    break
                else:
                    clear_output()
                    print("player not found in database")

            if KNNmenu == False:
                break




    elif x == "2":
        clear_output()
        print("fingle")
    else:
        clear_output()
        print("That is not a valid entry")





player not found in database

Please enter the player you wish to search from. (full name) / exit with 0


IndexError: index (906) out of range