In [3]:
from BRScraper import nba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [4]:
# Data preprocessing for ML Model

# df = nba.get_stats(season=2023, info='per_game', playoffs=False)
# drop_columns = ['Age','Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
# df_cleaned = df.drop(columns=drop_columns)

In [5]:
# mvp_data = nba.get_award_votings('mvp', 2023)
# nominated_players = mvp_data['Player'].tolist()
# print(nominated_players)

# creating the 'Previously_Nominated' column: 1 if the player appears in the MVP voting list, else 0. It serves as a proxy for player reputation.
# df_cleaned['Previously_Nominated'] = df_cleaned['Player'].apply(lambda x: 1 if x in nominated_players else 0)

['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić', 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant']


In [6]:
# # identifying players who have stats for multiple teams and eliminating duplicates
# multi_team_players = df_cleaned[df_cleaned['Team'] == '2TM']['Player'].unique()

# # keeping only the row where team value is set to 2TM, this row will include all combined stats and average from all teams the player played for
# mask = (df_cleaned['Team'] == '2TM') | (~df_cleaned['Player'].isin(multi_team_players))

# df_cleaned = df_cleaned[mask]

In [7]:
# # creating a True Shooting Percentage (TS%) column
# # the formula is TS% = PTS / (2 * (FGA + 0.44 * FTA))

# if 'PTS' in df_cleaned.columns and 'FGA' in df_cleaned.columns and 'FTA' in df_cleaned.columns:
#     df_cleaned['TS%'] = df_cleaned['PTS'] / (2 * (df_cleaned['FGA'] + 0.44 * df_cleaned['FTA']))
#     df_cleaned['TS%'] = df_cleaned['TS%'].round(2)

In [8]:
# adding another column, EFF, which stands for efficiency. It is a metric used by the NBA to calculate a player's efficiency or impact.

# # calculating missed field goals and missed free throws because the EFF formula requires them.
# df_cleaned['Missed_FG'] = df_cleaned['FGA'] - df_cleaned['FG']
# df_cleaned['Missed_FT'] = df_cleaned['FTA'] - df_cleaned['FT']

# # Calculating EFF
# df_cleaned['EFF'] = (
#     df_cleaned['PTS'] +
#     df_cleaned['TRB'] +
#     df_cleaned['AST'] +
#     df_cleaned['STL'] +
#     df_cleaned['BLK'] -
#     df_cleaned['Missed_FG'] -
#     df_cleaned['Missed_FT'] -
#     df_cleaned['TOV']
#     ) / df_cleaned['G']

# # dropping the temporary columns, no longer needed
# df_cleaned.drop(columns=['Missed_FG', 'Missed_FT'], inplace=True)

# # rounded EFF to 2 decimals
# df_cleaned['EFF'] = df_cleaned['EFF'].round(2)

# output_file = "nba_2023_adjusted_data.csv"
# df_cleaned.to_csv(output_file, index=False)

In [10]:
import pandas as pd
import os

# Function to preprocess a single season
def preprocess_season(file_name, season):
    """Load one season's player-stats CSV and derive the modelling features.

    Steps, in order:
      1. Read the CSV and drop the columns listed in ``drop_columns``
         (those that are absent are ignored).
      2. Add ``Previously_Nominated``: 1 when the player's name appears in
         the MVP voting table returned for ``season``, else 0.
      3. For players that have a combined multi-team row, keep only that
         combined row and discard their per-team rows.
      4. Add ``TS%`` and ``EFF`` when the columns they need are present.

    Parameters
    ----------
    file_name : str or file-like
        Anything ``pandas.read_csv`` accepts. The data must contain
        ``Player`` and ``Team`` columns.
    season : int
        Passed unchanged to ``nba.get_award_votings('mvp', season)``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the derived columns added. The original row
        index is kept (it is not reset after filtering).

    Raises
    ------
    KeyError
        If the CSV has no ``Player`` or ``Team`` column.

    Notes
    -----
    A failure of the MVP lookup is not fatal: a ``RuntimeWarning`` is emitted
    and every player gets ``Previously_Nominated == 0``.
    """
    import warnings  # local import: only needed for the MVP-lookup fallback

    df = pd.read_csv(file_name)

    # Drop unnecessary columns
    drop_columns = ['Age', 'Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
    df_cleaned = df.drop(columns=drop_columns, errors='ignore')

    # Load MVP data for the season (best effort)
    try:
        mvp_data = nba.get_award_votings('mvp', season)
        nominated_players = mvp_data['Player'].tolist()
    except Exception as exc:
        # Keep going, but make the degraded result visible instead of
        # silently producing an all-zero column.
        warnings.warn(
            f"MVP voting data unavailable for {season} ({exc!r}); "
            "'Previously_Nominated' will be 0 for every player.",
            RuntimeWarning,
            stacklevel=2,
        )
        nominated_players = []

    # Add 'Previously_Nominated' column (vectorised membership test)
    df_cleaned['Previously_Nominated'] = (
        df_cleaned['Player'].isin(nominated_players).astype(int)
    )

    # Handle players who played for multiple teams.
    # The combined row is labelled '<n>TM' ('2TM', '3TM', ...); 'TOT' is also
    # accepted as the older label for the same kind of row. Matching only
    # '2TM' would leave duplicate rows for players with three or more teams.
    is_combined_row = df_cleaned['Team'].astype(str).str.match(r'^(?:\d+TM|TOT)$')
    multi_team_players = df_cleaned.loc[is_combined_row, 'Player'].unique()
    mask = is_combined_row | ~df_cleaned['Player'].isin(multi_team_players)
    # .copy() so the column assignments below write to an independent frame
    # rather than a slice (avoids SettingWithCopyWarning).
    df_cleaned = df_cleaned[mask].copy()

    # Calculate TS% = PTS / (2 * (FGA + 0.44 * FTA))
    if {'PTS', 'FGA', 'FTA'}.issubset(df_cleaned.columns):
        df_cleaned['TS%'] = (
            df_cleaned['PTS'] / (2 * (df_cleaned['FGA'] + 0.44 * df_cleaned['FTA']))
        ).round(2)

    # Calculate missed shots for EFF
    if 'FGA' in df_cleaned.columns and 'FG' in df_cleaned.columns:
        df_cleaned['Missed_FG'] = df_cleaned['FGA'] - df_cleaned['FG']
    if 'FTA' in df_cleaned.columns and 'FT' in df_cleaned.columns:
        df_cleaned['Missed_FT'] = df_cleaned['FTA'] - df_cleaned['FT']

    # Calculate EFF
    # NOTE(review): the sum is divided by games played. If the input CSV
    # already holds per-game averages this normalises twice -- confirm the
    # files contain season totals.
    if {'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'G', 'Missed_FG', 'Missed_FT'}.issubset(df_cleaned.columns):
        df_cleaned['EFF'] = (
            (
                df_cleaned['PTS'] +
                df_cleaned['TRB'] +
                df_cleaned['AST'] +
                df_cleaned['STL'] +
                df_cleaned['BLK'] -
                df_cleaned['Missed_FG'] -
                df_cleaned['Missed_FT'] -
                df_cleaned['TOV']
            ) / df_cleaned['G']
        ).round(2)

    # Drop temporary columns (ignored when they were never created)
    return df_cleaned.drop(columns=['Missed_FG', 'Missed_FT'], errors='ignore')

# Loop through all season files: read each raw season CSV that exists,
# run preprocess_season on it and write the result to output_folder.
input_folder = ""  # Root folder
output_folder = "processed_data"

# exist_ok=True replaces the check-then-create pair: no race between the
# existence test and the creation, and a harmless no-op on re-runs.
os.makedirs(output_folder, exist_ok=True)

for year in range(1980, 2024):
    # e.g. year 1999 -> "1999-00"
    season = f"{year}-{str(year+1)[-2:]}"
    # os.path.join keeps the paths valid when input_folder is changed to a
    # directory name without a trailing separator.
    file_name = os.path.join(input_folder, f"nba_player_stats_{season}.csv")
    output_file = os.path.join(output_folder, f"nba_player_stats_{season}_processed.csv")

    if os.path.exists(file_name):
        try:
            # NOTE(review): the season's *start* year is what reaches the
            # MVP-voting lookup inside preprocess_season -- confirm this
            # selects the intended season's voting.
            processed_df = preprocess_season(file_name, year)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            # Best effort: report the failing season and continue with the rest.
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")


Processed 1980-81 successfully!
Processed 1981-82 successfully!
Processed 1982-83 successfully!
Processed 1983-84 successfully!
Processed 1984-85 successfully!
Processed 1985-86 successfully!
Processed 1986-87 successfully!
Processed 1987-88 successfully!
Processed 1988-89 successfully!
Processed 1989-90 successfully!
Processed 1990-91 successfully!
Processed 1991-92 successfully!
Processed 1992-93 successfully!
Processed 1993-94 successfully!
Processed 1994-95 successfully!
Processed 1995-96 successfully!
Processed 1996-97 successfully!
Processed 1997-98 successfully!
Processed 1998-99 successfully!
Processed 1999-00 successfully!
Processed 2000-01 successfully!
Processed 2001-02 successfully!
Processed 2002-03 successfully!
Processed 2003-04 successfully!
Processed 2004-05 successfully!
Processed 2005-06 successfully!
Processed 2006-07 successfully!
Processed 2007-08 successfully!
Processed 2008-09 successfully!
Processed 2009-10 successfully!
Processed 2010-11 successfully!
Processe