# CS4042 Data Engineering Group Project

In [1]:
import pandas as pd
import numpy as np
import os

### Preprocessing

In [53]:
sources = ['Player-Advanced-Stats', 'Player-Per-Game-Stats', 'Player-Shooting-Stats', 'Player-Totals-Stats']

# One dictionary per source folder. Keys are the file name with its first four
# characters and its 4-character extension removed; values are the loaded DataFrames.
advanced = {}
per_game = {}
shooting = {}
totals = {}

# Route every file by the folder it was found in (same order as `sources`)
# instead of guessing the table type from the last character of the file name.
tables_by_source = dict(zip(sources, (advanced, per_game, shooting, totals)))

for src, tables in tables_by_source.items():

    src_path = os.path.join('Datasets', src)

    # os.listdir returns entries in arbitrary order; sorting keeps the load
    # order (and therefore the dict order) reproducible between runs/machines.
    for file in sorted(os.listdir(src_path)):

        file_path = os.path.join(src_path, file)

        # Skip anything that is not a CSV file (e.g. .DS_Store or a
        # .ipynb_checkpoints directory) rather than handing it to read_csv.
        if not (os.path.isfile(file_path) and file.lower().endswith('.csv')):
            continue

        name = file[4:-4] # Name of file (prefix and ".csv" stripped)
        tables[name] = pd.read_csv(file_path)



In [69]:
# Temporary single-letter handles on one season of each table so the frames can
# be inspected in Data Wrangler. TODO: delete this cell later.
a, s, p, t = (
    advanced['2022-23-Player-Stats-Advanced'],
    shooting['2022-23-Player-Stats-Shooting'],
    per_game['2022-23-Player-Stats-PG'],
    totals['2022-23-Player-Stats-Totals'],
)

### Clean Data - Fix Tables
This section standardises the tables: it ensures there is only one header row, that every column has the correct data type, and that redundant columns are removed.

Where a table has two header rows, merge them into one (this only happens in the shooting tables).

In [54]:
def concatinate_headings(top_head, sub_head):
    """Return ``top_head`` and ``sub_head`` joined by a single underscore."""
    return "_".join((top_head, sub_head))

def clean_heading(heading, type):
    """Normalise a heading string.

    Leading/trailing whitespace is stripped and every remaining space becomes
    an underscore. When ``type`` is ``"top"``, a purely numeric suffix after
    the last "." is removed as well (presumably the suffix pandas appends to
    repeated column names - confirm against the raw CSV headers).

    Parameters
    ----------
    heading : str
        Raw heading text.
    type : str
        ``"top"`` enables the numeric-suffix removal; any other value does not.

    Returns
    -------
    str
        The cleaned heading.
    """
    cleaned = heading.strip().replace(" ", "_")

    # Only top-level headings get the numeric-suffix treatment
    if type != "top":
        return cleaned

    # rpartition splits at the last "."; `dot` is empty when there is none
    stem, dot, suffix = cleaned.rpartition(".")
    if dot and suffix.isdigit():
        return stem

    return cleaned

def logic(df):
    """Write merged "<top>_<sub>" headings into row 0 of ``df`` in place.

    Columns are visited in order. Processing stops at the first column whose
    label contains "Rk" (the table has a single heading row), and columns whose
    label contains "Unnamed" (no top heading) are skipped. For every other
    column, the cleaned column label and the cleaned value in row 0 are joined
    and stored back in row 0.
    """
    for col_name in df.columns:
        label = str(col_name)

        # Only one heading row present - nothing to merge
        if "Rk" in label:
            break

        # No top heading above this column - leave its sub heading as is
        if "Unnamed" in label:
            continue

        top = clean_heading(col_name, "top")
        sub = clean_heading(df[col_name][0], "sub")
        df.loc[0, col_name] = concatinate_headings(top, sub)

# Apply changes: write the merged "<top>_<sub>" headings into row 0 of every shooting table
for df in shooting.values():
    logic(df)


Remove the top headings and replace them with the concatenated headings built above (this only happens in the shooting tables).

In [55]:
def drop_irregular_headings(df):
    """Promote row 0 of ``df`` to the column header, in place.

    Nothing happens unless the first column label is "Unnamed: 0". Otherwise
    row 0 becomes the header, that row is removed from the data, and the index
    is renumbered from 0.
    """
    has_placeholder_header = df.columns[0] == "Unnamed: 0"
    if not has_placeholder_header:
        return

    header_row = df.iloc[0]
    df.columns = header_row
    df.drop(index=0, inplace=True)
    # drop=True discards the old index instead of keeping it as an extra column
    df.reset_index(drop=True, inplace=True)

# Apply changes: promote row 0 to the header of every shooting table that still has placeholder headings
for df in shooting.values():
    drop_irregular_headings(df)

Ensure columns have the correct data types (types are only incorrect in the shooting tables).
By default, every column in those tables is a string object.

In [None]:
def check_type_is_digit(df):
    """Convert every fully numeric column of ``df`` to ``float`` in place.

    A column is converted only when *all* of its values can be parsed as
    floats; missing values are allowed and stay ``NaN``. Columns containing
    any non-numeric value (e.g. names) are left untouched.

    The whole column is tested rather than only its first row, so a numeric
    column whose first cell happens to be empty is still converted, and an
    empty table no longer raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to fix; modified in place.

    Returns
    -------
    None
    """
    for column in df.columns:
        # Boolean columns would cast cleanly to 0.0/1.0, which is not wanted
        if df[column].dtype == bool:
            continue

        try:
            converted = df[column].astype(float)
        except (ValueError, TypeError):
            # At least one value is not a number - keep the column as it is
            continue

        df[column] = converted

# Apply the numeric conversion to every shooting table (each DataFrame is modified in place)
for df in shooting.values():
    check_type_is_digit(df)

Drop the final column unless it is `Awards`, as it doesn't contain relevant data

In [59]:
# Every table family gets the same treatment, so walk through them in one pass.
# NOTE: re-running this cell removes a further column from any table whose
# last column is still not "Awards".
for tables in (advanced, per_game, shooting, totals):
    for df in tables.values():
        last_column = df.columns[-1]
        if last_column != "Awards":
            df.drop(columns=last_column, inplace=True)

Drop the final row if it is the `League Average` row, as it doesn't contain relevant data

In [71]:
# Every table family gets the same treatment, so walk through them in one pass.
for tables in (advanced, per_game, shooting, totals):
    for df in tables.values():
        # The second column of the last row identifies the "League Average" row
        last_row_label = df.iloc[-1, 1]
        if last_row_label == "League Average":
            df.drop(index=df.index[-1], inplace=True)

### Clean Data - Making Data Analysis-Worthy

Remove any player who has played fewer than 5 games.

In [None]:
# Minimum number of games a player must have played to stay in the analysis
MIN_GAMES = 5

# The same filter applies to every table family, so loop over them once.
for tables in (advanced, per_game, shooting, totals):
    for df in tables.values():
        # Coerce to numbers first: 'G' can still hold strings (e.g. in the
        # shooting tables if the type-fixing cell was skipped), and comparing
        # strings with an int raises TypeError. Missing or unparseable values
        # become NaN, compare as False, and are therefore kept.
        games_played = pd.to_numeric(df['G'], errors='coerce')
        df.drop(df[games_played < MIN_GAMES].index, inplace=True)