In [7]:
from BRScraper import nba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [8]:
# --- Data preprocessing for the ML model ---

# Fetch the regular-season per-game table for season=2023.
df = nba.get_stats(season=2023, info='per_game', playoffs=False)

# Columns that are removed before any features are engineered.
drop_columns = ['Age', 'Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
df_cleaned = df.drop(drop_columns, axis=1)

In [9]:
mvp_data = nba.get_award_votings('mvp', 2023)
nominated_players = mvp_data['Player'].tolist()
print(nominated_players)

# 'Previously_Nominated' is a 1/0 flag: 1 when the player's name appears in the
# MVP voting table fetched above, 0 otherwise. It is intended as a proxy for
# player reputation.
df_cleaned['Previously_Nominated'] = (
    df_cleaned['Player'].isin(nominated_players).astype('int64')
)

['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić', 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant']


In [10]:
# Identify players who have rows for several teams and eliminate the duplicates.
#
# A player who changed teams appears once per team plus once in a combined row
# whose Team value is an aggregate label. The label encodes the number of teams
# ('2TM', '3TM', ...); 'TOT' is also accepted in case the table uses the older
# label (NOTE(review): confirm against the scraped data). Matching only '2TM'
# would leave every per-team row in place for players traded more than once.
is_combined_row = df_cleaned['Team'].astype(str).str.fullmatch(r'\d+TM|TOT')
multi_team_players = df_cleaned.loc[is_combined_row, 'Player'].unique()

# Keep the combined row for multi-team players (it carries the stats across all
# of their teams) and every row of players who appear under a single team.
mask = is_combined_row | ~df_cleaned['Player'].isin(multi_team_players)

# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df_cleaned = df_cleaned[mask].copy()

In [11]:
# True Shooting Percentage (TS%) column:
#     TS% = PTS / (2 * (FGA + 0.44 * FTA))
# The column is only added when all three input columns are present.
if {'PTS', 'FGA', 'FTA'}.issubset(df_cleaned.columns):
    true_shooting_attempts = df_cleaned['FGA'] + 0.44 * df_cleaned['FTA']
    # Stored rounded to 2 decimals.
    df_cleaned['TS%'] = (df_cleaned['PTS'] / (2 * true_shooting_attempts)).round(2)

In [12]:
# Adding an EFF (efficiency) column, a box-score metric used by the NBA to
# summarise a player's overall impact:
#     EFF = (PTS + REB + AST + STL + BLK - missed FG - missed FT - TOV) / GP
# The "/ GP" term applies to season totals. df_cleaned is built from the
# 'per_game' table, so every stat here is already a per-game average and the
# sum below is already per-game EFF; dividing by G again would normalise twice.

# Missed field goals and missed free throws are required by the formula. They
# are kept as local Series so no temporary columns are added to the frame.
missed_fg = df_cleaned['FGA'] - df_cleaned['FG']
missed_ft = df_cleaned['FTA'] - df_cleaned['FT']

# Calculating EFF, rounded to 2 decimals.
df_cleaned['EFF'] = (
    df_cleaned['PTS'] +
    df_cleaned['TRB'] +
    df_cleaned['AST'] +
    df_cleaned['STL'] +
    df_cleaned['BLK'] -
    missed_fg -
    missed_ft -
    df_cleaned['TOV']
).round(2)

# Persist the engineered table (without the index column).
output_file = "nba_2023_adjusted_data.csv"
df_cleaned.to_csv(output_file, index=False)