In [None]:
#import libraries
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats

In [None]:
# Read the CSV data and encode the game result "WL" as 1 (win) / 0 (loss).
#   nba_df         - player/season rows reduced to the stats used in this notebook
#   finding_metric - game-level rows from both game files, used to relate stats to winning
nba_df = pd.read_csv("NBA_Data/NBA Player Stats(1950 - 2022).csv")
nba_df = nba_df[["Season", "Player", "G", "FG", "FG%", "3P%", "2P%", "FT", "FTA", "MP", "PTS", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV"]]
# Use the same stat names as the game files so both tables can be addressed alike.
nba_df = nba_df.rename(columns={"FT": "FTM", "FG": "FGM", "DRB": "DREB"})

# Both game files are reduced to the same columns; keep that list in one place.
game_columns = ["WL", "MIN", "FGM", "FGA", "FTM", "FTA", "OREB", "DREB", "AST", "STL", "BLK", "PTS"]
games_1985_to_2000 = pd.read_csv("NBA_Data/games_data_1985_to_2000.csv")[game_columns]
games_2001_to_today = pd.read_csv("NBA_Data/games_data_2001_to_today.csv")[game_columns]

finding_metric = (
    # ignore_index=True avoids duplicate row labels coming from the two source files.
    pd.concat([games_1985_to_2000, games_2001_to_today], ignore_index=True)
    # map() yields NaN for anything that is neither "W" nor "L"; those rows are
    # removed by the dropna below together with rows missing any stat.
    .assign(WL=lambda games: games["WL"].map({"W": 1, "L": 0}))
    .dropna(axis=0, how="any")
    # Store the outcome as a real integer column instead of an object column
    # holding Python ints, so numeric routines (mean, pearsonr) accept it directly.
    .astype({"WL": int})
)
nba_df

In [None]:
# Display the combined game table (rows with missing values dropped, WL encoded as 1/0)
finding_metric

In [None]:
# Look at how each game stat correlates with winning: Pearson r of every stat
# column against the 1/0 "WL" outcome column.
# The outcome is cast to float first because "WL" may be stored with object dtype
# (it starts out as "W"/"L" strings), which scipy is not guaranteed to accept.
wins = finding_metric["WL"].astype(float)
results = {
    col: stats.pearsonr(finding_metric[col], wins)
    for col in finding_metric.columns
    if col != "WL"
}
results

In [None]:
# Split each stat into bins that are 1 wide, [k, k + 1), and record the share of
# games won inside every bin. The plain pearsonr test above only ever sees two
# outcome values (win / loss); the per-bin win rate shows how winning changes
# across the whole spread of the stat.
# Result: outer_dict[stat][k] = win rate of the games whose stat value is in [k, k + 1).
wins = finding_metric["WL"].astype(float)
outer_dict = {}
for col in finding_metric.columns:
    if col == "WL":
        continue
    # floor() puts every game into the bin starting at its integer part, so the
    # largest observed value gets a bin as well. Grouping by a plain array keeps
    # the rows positionally aligned even if the frame has repeated index labels.
    bin_ids = np.floor(finding_metric[col]).astype(int).to_numpy()
    win_rate_by_bin = wins.groupby(bin_ids).mean()
    # Bins in which every game was won, or every game was lost, are left out.
    informative = win_rate_by_bin[(win_rate_by_bin != 0) & (win_rate_by_bin != 1)]
    outer_dict[col] = {int(bin_id): float(rate) for bin_id, rate in informative.items()}
outer_dict

In [None]:
# Pearson r between the bin start (x) and the win rate inside that bin (y),
# computed separately for every stat collected in outer_dict.
results = {
    stat: stats.pearsonr(list(bins.keys()), list(bins.values()))
    for stat, bins in outer_dict.items()
}
results

In [None]:
# Keep only the stats whose correlation is both statistically significant
# (p-value below the cutoff) and strong enough (|r| above the cutoff).
pvalue_cutoff = 0.05
statistic_cutoff = 0.5
important_results = {}
for col, outcome in results.items():
    significant = outcome.pvalue < pvalue_cutoff
    strong = np.abs(outcome.statistic) > statistic_cutoff
    if significant and strong:
        important_results[col] = outcome
important_results

In [None]:
# Rank every row of nba_df by points, highest first, keeping player and season alongside
pts_per_season = (
    nba_df.loc[:, ["Player", "PTS", "Season"]]
    .sort_values(by="PTS", ascending=False)
)
pts_per_season

In [None]:
# Mean PTS per player over all of that player's rows, ranked from highest to lowest
avg_per_season = (
    nba_df.groupby("Player")[["PTS"]]
    .mean()
    .sort_values(by="PTS", ascending=False)
)
avg_per_season

In [None]:
# For every season from 1985 through 2022, compute the mean and standard deviation
# of each stat across that season's rows.
# Result: data[season][stat] = (mean, std)
data = {}
for season in range(1985, 2023):
    season_rows = nba_df.loc[nba_df["Season"] == season]
    data[season] = {
        stat: (season_rows[stat].mean(), season_rows[stat].std())
        for stat in ("FGM", "FTM", "FTA", "PTS", "DREB", "AST", "STL", "BLK")
    }
data

In [None]:
# Score each player's stats relative to their season and weight them by how
# strongly the stat tracks winning:
#     weighted = ((value - season mean) / season std) * pearson r of that stat
# where r comes from the significant correlations kept in important_results.
stat_columns = ["FGM", "FTM", "FTA", "PTS", "DREB", "AST", "STL", "BLK"]

# Fail with a readable message if a stat has no weight (it did not pass the
# significance filter, or important_results is no longer the dict of results).
missing_weights = [stat for stat in stat_columns if stat not in important_results]
if missing_weights:
    raise KeyError(f"No correlation weight available for: {missing_weights}")
weights = pd.Series({stat: important_results[stat].statistic for stat in stat_columns})

# .copy() so the assignment below writes to an independent frame, not a slice of nba_df.
player_stats_data = nba_df.loc[nba_df["Season"] > 1984, ["Season", "Player"] + stat_columns].copy()

# Per-season mean/std broadcast back to every row; this replaces the row-by-row
# loop and yields float columns regardless of the original column dtypes.
season_groups = player_stats_data.groupby("Season")[stat_columns]
z_scores = (player_stats_data[stat_columns] - season_groups.transform("mean")) / season_groups.transform("std")
player_stats_data[stat_columns] = (z_scores * weights)[stat_columns]
player_stats_data

In [None]:
# Add the weighted stats of every row together into one "Weighted Stat" score,
# then rank the rows by that score, best first.
summed_columns = [
    col for col in player_stats_data.columns
    if col not in ("Season", "Player", "Weighted Stat")
]
# skipna=False: a row with any missing stat gets a missing score rather than a
# partial sum. The result is a numeric column, so the sort below is numeric.
player_stats_data["Weighted Stat"] = player_stats_data[summed_columns].sum(axis=1, skipna=False)
player_stats_data = player_stats_data.sort_values("Weighted Stat", ascending=False)
player_stats_data.head(30)

In [None]:
# Present the kept correlations as a table, one row per stat, with readable row labels.
# Note: this rebinds important_results from a dict to a DataFrame, so cells that
# read important_results[col].statistic must run before this one.
stat_labels = {
    'FGM': 'Field Goals Made',
    'FTM': 'Free Throws Made',
    'FTA': 'Free Throw Attempts',
    'DREB': 'Defensive Rebounds',
    'AST': 'Assists',
    'STL': 'Steals',
    'BLK': 'Blocks',
    'PTS': 'Points',
}
important_results = pd.DataFrame.from_dict(important_results, orient='index').rename(index=stat_labels)
important_results

In [None]:
# For every season: total PTS per player, the season-wide average of those totals,
# and the five highest totals expressed in standard deviations above that average.
#   average_points_per_season - columns: Season, Average_Points
#   top_players_per_season    - columns: Season, Player, PTS, Std_Dev_Above_Avg
unique_seasons = nba_df['Season'].unique()

# Collect the per-season pieces and build each DataFrame once at the end; growing
# a frame with concat inside the loop is quadratic and, starting from an empty
# frame, leaves the columns with object dtype.
season_average_rows = []
top_player_frames = []

for season in unique_seasons:
    season_data = nba_df[nba_df['Season'] == season]

    season_player_points = season_data.groupby(['Season', 'Player'])['PTS'].sum().reset_index()

    average_points = season_player_points['PTS'].mean()
    std_dev_points = season_player_points['PTS'].std()
    season_average_rows.append({'Season': season, 'Average_Points': average_points})

    top_players = season_player_points.nlargest(5, 'PTS').copy()
    top_players['Std_Dev_Above_Avg'] = (top_players['PTS'] - average_points) / std_dev_points
    top_players['Season'] = season
    top_player_frames.append(top_players)

average_points_per_season = pd.DataFrame(season_average_rows, columns=['Season', 'Average_Points'])

top_player_columns = ['Season', 'Player', 'PTS', 'Std_Dev_Above_Avg']
if top_player_frames:
    top_players_per_season = pd.concat(top_player_frames, ignore_index=True)[top_player_columns]
else:
    # pd.concat rejects an empty list; keep the documented columns for an empty input.
    top_players_per_season = pd.DataFrame(columns=top_player_columns)

In [None]:
# Put the seasons in chronological order so the bars read left to right in time
average_points_per_season = average_points_per_season.sort_values(by='Season')

# Bar chart: one bar per season, height = average points per player
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(average_points_per_season['Season'], average_points_per_season['Average_Points'], color='orange')
ax.set_xlabel('Season')
ax.set_ylabel('Average Points per Player')
ax.set_title('Average Points per Player per Season')
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig("Charts/avg_points.png")
plt.show()

In [None]:
# Scatter plot of the top five scorers of every season: x = season, y = points,
# colour = how many standard deviations the player is above that season's average.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    top_players_per_season['Season'],
    top_players_per_season['PTS'],
    c=top_players_per_season['Std_Dev_Above_Avg'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Std Dev Above Avg')
# The x-axis shows seasons, so label it accordingly.
ax.set_xlabel('Season')
ax.set_ylabel('Points')
ax.set_title('Top Players Points and Std Dev Above Avg per Season')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
fig.savefig("Charts/top_players_pts.png")
plt.show()

In [None]:
# Keep only the top-five entries that are more than 5 standard deviations above
# their season's average
is_far_above_avg = top_players_per_season['Std_Dev_Above_Avg'] > 5
top_players_high_std_dev = top_players_per_season[is_far_above_avg]

# Scatter plot of those entries: x = player, y = points
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(top_players_high_std_dev['Player'], top_players_high_std_dev['PTS'], color='red')
ax.set_xlabel('Player')
ax.set_ylabel('Points')
ax.set_title('Players with Std Dev Above 5')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
fig.savefig("Charts/above5.png")
plt.show()