In [253]:
# NBA Contract Value Analysis
## Data Collection, Exploration, and Initial Cleaning

In [254]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install psycopg2-binary
import psycopg2
import unidecode
# Smoke-test the Postgres connection by asking the server for its version.
#
# Connection settings are read from environment variables so that no secret is
# stored in the notebook. Host, port, database and user fall back to the
# project's defaults; the password has no default and is prompted for when
# SUPABASE_DB_PASSWORD is not set.
# NOTE(review): the password that was previously embedded in this cell should
# be treated as compromised and rotated.
import os
from getpass import getpass

db_password = os.environ.get("SUPABASE_DB_PASSWORD") or getpass("Supabase DB password: ")

conn = psycopg2.connect(
    host=os.environ.get("SUPABASE_DB_HOST", "aws-1-us-west-1.pooler.supabase.com"),
    port=os.environ.get("SUPABASE_DB_PORT", "6543"),
    dbname=os.environ.get("SUPABASE_DB_NAME", "postgres"),
    user=os.environ.get("SUPABASE_DB_USER", "postgres.sigmumejgwpiomgkveht"),
    password=db_password,
    sslmode="require",
)
try:
    # The cursor context manager closes the cursor even if the query raises.
    with conn.cursor() as cur:
        cur.execute("SELECT version();")  # ask the DB for its version
        print(cur.fetchone())             # prints something like ('PostgreSQL 15.x ...',)
finally:
    # Always release the connection, including on failure.
    conn.close()


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


('PostgreSQL 17.6 on aarch64-unknown-linux-gnu, compiled by gcc (GCC) 13.2.0, 64-bit',)


In [255]:
# Load the two raw inputs: per-player season stats and player salaries.
stats = pd.read_csv("../data/raw/nba_stats_2025.csv")
contracts = pd.read_csv("../data/raw/nba_salaries_2025.csv")

# Quick look at each frame: its dimensions first, then the first five rows.
for frame in (stats, contracts):
    print(frame.shape)
    print(frame.head())

(737, 32)
    Rk                   Player   Age Team  Pos     G    GS    MP    FG   FGA  \
0  NaN                      NaN   NaN  NaN  NaN   NaN   NaN   NaN   NaN   NaN   
1  1.0  Shai Gilgeous-Alexander  26.0  OKC   PG  76.0  76.0  34.2  11.3  21.8   
2  2.0    Giannis Antetokounmpo  30.0  MIL   PF  67.0  67.0  34.2  11.8  19.7   
3  3.0             Nikola Jokić  29.0  DEN    C  70.0  70.0  36.7  11.2  19.5   
4  4.0              Luka Dončić  25.0  2TM   PG  50.0  50.0  35.4   9.2  20.5   

   ...  DRB   TRB   AST  STL  BLK  TOV   PF   PTS                    Awards  \
0  ...  NaN   NaN   NaN  NaN  NaN  NaN  NaN   NaN                       NaN   
1  ...  4.1   5.0   6.4  1.7  1.0  2.4  2.2  32.7  MVP-1DPOY-10CPOY-8ASNBA1   
2  ...  9.7  11.9   6.5  0.9  1.2  3.1  2.3  30.4         MVP-3DPOY-8ASNBA1   
3  ...  9.9  12.7  10.2  1.8  0.6  3.3  2.3  29.6         MVP-2CPOY-2ASNBA1   
4  ...  7.4   8.2   7.7  1.8  0.4  3.6  2.5  28.2                       NaN   

   Player-additional  
0    

In [256]:
## Cleaning Player Stats Dataset

In [257]:
# Rename the raw column headers to descriptive snake_case names for
# standardized querying.
#
# The rename starts from `stats` (the frame loaded from the raw CSV). It must
# not read from any other variable: this notebook defines no `df`, so relying
# on one only works when it happens to be left over in the kernel.
# Keys that are absent from the frame are ignored by `rename`, so re-running
# this cell is harmless.
stats = stats.rename(columns={
    "Rk": "rank",
    "Player": "player",
    "Age": "age",
    "Team": "team",
    "Pos": "position",
    "G": "games",
    "GS": "games_started",
    "MP": "minutes_per_game",
    "FG": "fg_made",
    "FGA": "fg_attempts",
    "FG%": "fg_pct",
    "3P": "three_p_made",
    "3PA": "three_p_attempts",
    "3P%": "three_p_pct",
    "2P": "two_p_made",
    "2PA": "two_p_attempts",
    "2P%": "two_p_pct",
    "eFG%": "efg_pct",
    "FT": "ft_made",
    "FTA": "ft_attempts",
    "FT%": "ft_pct",
    "ORB": "off_rebounds",
    "DRB": "def_rebounds",
    "TRB": "total_rebounds",
    "AST": "assists",
    "STL": "steals",
    "BLK": "blocks",
    "TOV": "turnovers",
    "PF": "personal_fouls",
    "PTS": "points_per_game",
    "Player-additional": "player_id"   # keep as join key
})
print(stats.columns.tolist())

['rank', 'player', 'age', 'team', 'position', 'games', 'games_started', 'minutes_per_game', 'fg_made', 'fg_attempts', 'fg_pct', 'three_p_made', 'three_p_attempts', 'three_p_pct', 'two_p_made', 'two_p_attempts', 'two_p_pct', 'efg_pct', 'ft_made', 'ft_attempts', 'ft_pct', 'off_rebounds', 'def_rebounds', 'total_rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points_per_game', 'player_id']


In [258]:
# Clean the stats dataset: drop rows whose rank is missing (such as the all-NaN
# first row of the raw file) and store rank as a nullable integer, not a float.
#
# Filtering and casting happen in one chain that returns a new frame. Assigning
# a column onto a boolean-filtered frame is what triggers pandas'
# SettingWithCopyWarning, so that pattern is avoided here.
has_rank = stats["rank"].notna()
stats = stats.loc[has_rank].assign(rank=lambda frame: frame["rank"].astype("Int64"))

In [259]:
# Names of the players who appeared for more than one team this season,
# identified by their combined "2TM" summary row.
is_two_team_total = stats["team"].eq("2TM")
multi_team_players = stats.loc[is_two_team_total, "player"]

In [260]:
# Report how many players have a combined multi-team row.
n_multi_team = len(multi_team_players)
print("Number of multi-team players:", n_multi_team)

Number of multi-team players: 41


In [261]:
# Remove duplicate rows for traded players:
#   - A combined season row is marked by a team code of the form "<N>TM"
#     ("2TM", and likewise "3TM" etc. for players with more than two teams).
#   - For players who have such a combined row, drop the team-specific rows
#     (e.g., DAL, LAL) and keep only the combined season stats.
#   - Single-team players remain unaffected.
# The mask is computed from `stats` itself, so this cell does not depend on how
# the list of multi-team players was built elsewhere.
is_combined_row = stats["team"].str.fullmatch(r"\d+TM", na=False)
traded_players = stats.loc[is_combined_row, "player"]
stats = stats[is_combined_row | ~stats["player"].isin(traded_players)]

In [262]:
# These count-like columns should be nullable integers rather than floats.
# `astype` with a per-column mapping returns a new frame, which avoids
# assigning columns onto a filtered frame (SettingWithCopyWarning).
int_cols = ["rank", "games", "age", "games_started"]
stats = stats.astype({col: "Int64" for col in int_cols})

# Confirm the resulting dtypes.
stats.dtypes[int_cols]

rank             Int64
games            Int64
age              Int64
games_started    Int64
dtype: object

In [263]:
### Dropping the `Awards` column
# Reasons for removing `Awards`:
# - Its strings are inconsistent (e.g., "MVP-10" means 10th in MVP voting, not an MVP award).
# - Many `2TM` summary rows have no awards value, so the data is incomplete or misleading.
# - Awards are subjective recognition and matter less to the core analysis (salary vs. performance).
# Dropping it simplifies the dataset and keeps the focus on objective performance metrics.
# errors="ignore" makes this a no-op when the column is already absent.
stats = stats.drop("Awards", axis="columns", errors="ignore")

In [264]:
# Sanity check: displays False once the `Awards` column has been removed.
"Awards" in set(stats.columns)

False

In [265]:
# Filter out low participation players: keep only rows with at least MIN_GAMES
# games played and at least MIN_MINUTES_PER_GAME minutes per game.
#
# Both conditions are evaluated on `stats` itself. The notebook defines no
# `df`, so a mask built from one would depend on leftover kernel state and on
# index alignment with a different frame.
MIN_GAMES = 41              # presumably half of an 82-game season — confirm
MIN_MINUTES_PER_GAME = 10
played_enough_games = stats["games"] >= MIN_GAMES
played_enough_minutes = stats["minutes_per_game"] >= MIN_MINUTES_PER_GAME
stats = stats[played_enough_games & played_enough_minutes]

In [266]:
# Clean names in the stats dataset: transliterate accented characters to plain
# ASCII (e.g. "Jokić" -> "Jokic") and trim surrounding whitespace.
# `assign` returns a new frame, which avoids writing a column onto the filtered
# frame from the previous step (SettingWithCopyWarning).
stats = stats.assign(
    player=stats["player"].map(unidecode.unidecode).str.strip()
)

In [267]:
# Keep the first row for each (player, team) pair, then renumber the rows
# from 0 so the index is contiguous again.
stats = (
    stats
    .drop_duplicates(subset=["player", "team"])
    .reset_index(drop=True)
)

In [268]:
# Final check: dimensions and first rows of the cleaned stats frame.
final_shape = stats.shape
print("Final dataset shape:", final_shape)
print(stats.head())

Final dataset shape: (327, 31)
   rank                   player  age team position  games  games_started  \
0     1  Shai Gilgeous-Alexander   26  OKC       PG     76             76   
1     2    Giannis Antetokounmpo   30  MIL       PF     67             67   
2     3             Nikola Jokic   29  DEN        C     70             70   
3     4              Luka Doncic   25  2TM       PG     50             50   
4     5          Anthony Edwards   23  MIN       SG     79             79   

   minutes_per_game  fg_made  fg_attempts  ...  off_rebounds  def_rebounds  \
0              34.2     11.3         21.8  ...           0.9           4.1   
1              34.2     11.8         19.7  ...           2.2           9.7   
2              36.7     11.2         19.5  ...           2.9           9.9   
3              35.4      9.2         20.5  ...           0.8           7.4   
4              36.3      9.1         20.4  ...           0.8           4.9   

   total_rebounds  assists  steals  b

In [269]:
# Save the cleaned stats to the separate "clean" data folder, without the
# DataFrame index.
clean_stats_path = "../data/clean/nba_stats_clean.csv"
stats.to_csv(clean_stats_path, index=False)
print("Cleaned dataset saved successfully.")

Cleaned dataset saved successfully.


In [270]:
## Cleaning Player Salaries Dataset

In [271]:
# Clean the salary column: remove "$" and thousands separators, strip
# whitespace, and convert to integer.
#
# Casting to str first keeps the cell re-runnable: on a second run the column
# is already numeric and has no `.str` accessor. The target dtype is spelled
# out as int64 so the width does not depend on the platform's default int.
salary_text = (
    contracts["Salary"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
)
contracts["Salary"] = salary_text.astype("int64")

contracts.head()

Unnamed: 0,Player,Team,Salary
0,Stephen Curry,GSW,55761216
1,Joel Embiid,PHI,51415938
2,Nikola Jokic,DEN,51415938
3,Kevin Durant,PHO,51179021
4,Bradley Beal,PHO,50203930


In [272]:
# Normalise the text columns (trim whitespace; upper-case team codes), then
# keep only the first row for each (Player, Team) pair.
contracts = (
    contracts
    .assign(
        Player=contracts["Player"].str.strip(),
        Team=contracts["Team"].str.strip().str.upper(),
    )
    .drop_duplicates(subset=["Player", "Team"])
)

contracts.head()

Unnamed: 0,Player,Team,Salary
0,Stephen Curry,GSW,55761216
1,Joel Embiid,PHI,51415938
2,Nikola Jokic,DEN,51415938
3,Kevin Durant,PHO,51179021
4,Bradley Beal,PHO,50203930


In [273]:
# Report the number of missing values per column.
missing_per_column = contracts.isna().sum()
print(missing_per_column)

# Lower-case the column names so they are standardized for querying.
contract_column_names = {"Player": "player", "Team": "team", "Salary": "salary"}
contracts = contracts.rename(columns=contract_column_names)

Player    0
Team      0
Salary    0
dtype: int64


In [274]:
# Save the cleaned contracts to the separate "clean" data folder, without the
# DataFrame index.
clean_contracts_path = "../data/clean/nba_contracts_clean.csv"
contracts.to_csv(clean_contracts_path, index=False)
print("Cleaned dataset saved successfully.")

Cleaned dataset saved successfully.
