In [2]:
from BRScraper import nba
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re

In [4]:
# Data preprocessing for ML Model

# df = nba.get_stats(season=2023, info='per_game', playoffs=False)
# drop_columns = ['Age','Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
# df_cleaned = df.drop(columns=drop_columns)

In [5]:
# mvp_data = nba.get_award_votings('mvp', 2023)
# nominated_players = mvp_data['Player'].tolist()
# print(nominated_players)

# creating the 'Previously_Nominated' column, if a player was nominated for MVP mark 1, else mark 0. will help serve as a proxy for player reputation
# df_cleaned['Previously_Nominated'] = df_cleaned['Player'].apply(lambda x: 1 if x in nominated_players else 0)

['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić', 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant']


In [6]:
# # identifying players who have stats for multiple teams and eliminating duplicates
# multi_team_players = df_cleaned[df_cleaned['Team'] == '2TM']['Player'].unique()

# # keeping only the row where team value is set to 2TM, this row will include all combined stats and average from all teams the player played for
# mask = (df_cleaned['Team'] == '2TM') | (~df_cleaned['Player'].isin(multi_team_players))

# df_cleaned = df_cleaned[mask]

In [7]:
# # creating a True Shooting Percentage (TS%) column
# # the formula is TS% = PTS / 2 * (FGA + 0.44 * FTA)

# if 'PTS' in df_cleaned.columns and 'FGA' in df_cleaned.columns and 'FTA' in df_cleaned.columns:
#     df_cleaned['TS%'] = df_cleaned['PTS'] / (2 * (df_cleaned['FGA'] + 0.44 * df_cleaned['FTA']))
#     df_cleaned['TS%'] = df_cleaned['TS%'].round(2)

In [4]:
# adding another column EEF, stands effeciency. It a metric used by the nba to calculate a player's efficiency or impact.

# # calculating missed field goals and missed free throws because the EEF formula requires it.
# df_cleaned['Missed_FG'] = df_cleaned['FGA'] - df_cleaned['FG']
# df_cleaned['Missed_FT'] = df_cleaned['FTA'] - df_cleaned['FT']

# # Calculating EFF
# df_cleaned['EFF'] = (
#     df_cleaned['PTS'] +
#     df_cleaned['TRB'] +
#     df_cleaned['AST'] +
#     df_cleaned['STL'] +
#     df_cleaned['BLK'] -
#     df_cleaned['Missed_FG'] -
#     df_cleaned['Missed_FT'] -
#     df_cleaned['TOV']
#     ) / df_cleaned['G']

# # dropping the temporary columns, no longer needed
# df_cleaned.drop(columns=['Missed_FG', 'Missed_FT'], inplace=True)

# # rounded EFF to 2 decimals
# df_cleaned['EFF'] = df_cleaned['EFF'].round(2)

# output_file = "nba_2023_adjusted_data.csv"
# df_cleaned.to_csv(output_file, index=False)

In [5]:
# this function is used to preprocess a single season
def preprocess_season(file_name, season, mvp_votings):
    df = pd.read_csv(file_name)
    
    # dropping unnecessary columns
    drop_columns = ['Age', 'Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
    df_cleaned = df.drop(columns=drop_columns, errors='ignore')

    # handling players who played for multiple teams
    multi_team_players = df_cleaned[df_cleaned['Team'] == '2TM']['Player'].unique()
    mask = (df_cleaned['Team'] == '2TM') | (~df_cleaned['Player'].isin(multi_team_players))
    df_cleaned = df_cleaned[mask]

    # calculating TS%
    if 'PTS' in df_cleaned.columns and 'FGA' in df_cleaned.columns and 'FTA' in df_cleaned.columns:
        df_cleaned['TS%'] = df_cleaned['PTS'] / (2 * (df_cleaned['FGA'] + 0.44 * df_cleaned['FTA']))
        df_cleaned['TS%'] = df_cleaned['TS%'].round(2)

    # calculating missed shots for EFF
    if 'FGA' in df_cleaned.columns and 'FG' in df_cleaned.columns:
        df_cleaned['Missed_FG'] = df_cleaned['FGA'] - df_cleaned['FG']
    if 'FTA' in df_cleaned.columns and 'FT' in df_cleaned.columns:
        df_cleaned['Missed_FT'] = df_cleaned['FTA'] - df_cleaned['FT']

    # calculating the EFF metric
    if {'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'G', 'Missed_FG', 'Missed_FT'}.issubset(df_cleaned.columns):
        df_cleaned['EFF'] = (
            df_cleaned['PTS'] +
            df_cleaned['TRB'] +
            df_cleaned['AST'] +
            df_cleaned['STL'] +
            df_cleaned['BLK'] -
            df_cleaned['Missed_FG'] -
            df_cleaned['Missed_FT'] -
            df_cleaned['TOV']
        ) / df_cleaned['G']
        df_cleaned['EFF'] = df_cleaned['EFF'].round(2)

    # dropping the temporary columns becuase they're no longer needed
    df_cleaned.drop(columns=['Missed_FG', 'Missed_FT'], inplace=True, errors='ignore')

    # adding the Nominated column
    df_cleaned['Nominated'] = df_cleaned['Player'].apply(lambda player: 1 if player in mvp_votings else 0)

    return df_cleaned

# declaring the folder containing the CSV files and output destination
input_folder = "untouched_seasonal_data"
output_folder = "processed_data"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# processing files for each season
for year in range(1980, 2010):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # fetching MVP voting data for the season
            mvp_data = nba.get_award_votings('mvp', year)
            mvp_votings = set(mvp_data['Player'])  # getting the players who were nominated for MVP
            
            # preprocessing the season data
            processed_df = preprocess_season(file_name, year, mvp_votings)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")

Processed 1980-81 successfully!
Processed 1981-82 successfully!
Processed 1982-83 successfully!
Processed 1983-84 successfully!
Processed 1984-85 successfully!
Processed 1985-86 successfully!
Processed 1986-87 successfully!
Processed 1987-88 successfully!
Processed 1988-89 successfully!
Processed 1989-90 successfully!
Processed 1990-91 successfully!
Processed 1991-92 successfully!
Processed 1992-93 successfully!
Processed 1993-94 successfully!
Processed 1994-95 successfully!
Processed 1995-96 successfully!
Processed 1996-97 successfully!
Processed 1997-98 successfully!
Processed 1998-99 successfully!
Processed 1999-00 successfully!
Processed 2000-01 successfully!
Processed 2001-02 successfully!
Processed 2002-03 successfully!
Processed 2003-04 successfully!
Processed 2004-05 successfully!
Processed 2005-06 successfully!
Processed 2006-07 successfully!
Processed 2007-08 successfully!
Processed 2008-09 successfully!
Processed 2009-10 successfully!


Take a 1 min break before running the next cell to avoid getting locked out!

In [6]:
# declaring the folder containing the CSV files and output destination
input_folder = "untouched_seasonal_data"
output_folder = "processed_data"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# processing files for each season
for year in range(2010, 2025):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # fetching MVP voting data for the season
            mvp_data = nba.get_award_votings('mvp', year)
            mvp_votings = set(mvp_data['Player'])
            
            # preprocessing the season data
            processed_df = preprocess_season(file_name, year, mvp_votings)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")

Processed 2010-11 successfully!
Processed 2011-12 successfully!
Processed 2012-13 successfully!
Processed 2013-14 successfully!
Processed 2014-15 successfully!
Processed 2015-16 successfully!
Processed 2016-17 successfully!
Processed 2017-18 successfully!
Processed 2018-19 successfully!
Processed 2019-20 successfully!
Processed 2020-21 successfully!
Processed 2021-22 successfully!
Processed 2022-23 successfully!
Processed 2023-24 successfully!
Processed 2024-25 successfully!


In [7]:
#Combine all datasets from 1980 to 2015
# Define the folder containing the CSV files
input_folder = "processed_data"
output_folder = "processed_data"

# List to hold dataframes
dataframes = []

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Loop through each year from 1980 to 2015
for year in range(1980, 2016):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # Read the dataset for the year
            df = pd.read_csv(file_name)
        
            # Append the dataframe to the list
            dataframes.append(df)
        
            print(f"Successfully read data for the {year} season")
        except Exception as e:
            print(f"Failed to read data for {year} season. Error: {e}")

# Concatenate all dataframes into one
nba_combined_1980_2015 = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a CSV file
nba_combined_1980_2015.to_csv("nba_combined_1980_2015.csv", index=False)


print("All datasets have been concatenated and saved to 'nba_combined_1980_2015.csv'.")


Successfully read data for the 1980 season
Successfully read data for the 1981 season
Successfully read data for the 1982 season
Successfully read data for the 1983 season
Successfully read data for the 1984 season
Successfully read data for the 1985 season
Successfully read data for the 1986 season
Successfully read data for the 1987 season
Successfully read data for the 1988 season
Successfully read data for the 1989 season
Successfully read data for the 1990 season
Successfully read data for the 1991 season
Successfully read data for the 1992 season
Successfully read data for the 1993 season
Successfully read data for the 1994 season
Successfully read data for the 1995 season
Successfully read data for the 1996 season
Successfully read data for the 1997 season
Successfully read data for the 1998 season
Successfully read data for the 1999 season
Successfully read data for the 2000 season
Successfully read data for the 2001 season
Successfully read data for the 2002 season
Successfull

In [8]:
#Combine all datasets from 2016 to 2024
# Define the folder containing the CSV files
input_folder = "processed_data"
output_folder = "processed_data"

# List to hold dataframes
dataframes = []

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Loop through each year from 2016 to 2024
for year in range(2016, 2025):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # Read the dataset for the year
            df = pd.read_csv(file_name)
        
            # Append the dataframe to the list
            dataframes.append(df)
        
            print(f"Successfully read data for the {year} season")
        except Exception as e:
            print(f"Failed to read data for {year} season. Error: {e}")

# Concatenate all dataframes into one
nba_combined_2016_2024 = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a CSV file
nba_combined_2016_2024.to_csv("nba_combined_2016_2024.csv", index=False)

print("All datasets have been concatenated and saved to 'nba_combined_2016_2024.csv'.")

Successfully read data for the 2016 season
Successfully read data for the 2017 season
Successfully read data for the 2018 season
Successfully read data for the 2019 season
Successfully read data for the 2020 season
Successfully read data for the 2021 season
Successfully read data for the 2022 season
Successfully read data for the 2023 season
Successfully read data for the 2024 season
All datasets have been concatenated and saved to 'nba_combined_2016_2024.csv'.


Adding the Team_Rank column to see how it improves the model

In [43]:
# team name mapping
team_name_mapping = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHO": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHX": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "WAS": "Washington Wizards",
    "SDC": "San Diego Clippers",
    "NJN": "New Jersey Nets",
    "PHO": "Phoenix Suns",
    "WSB": "Washington Bullets",
    "SEA": "Seattle SuperSonics",
    "CHH": "Charlotte Hornets",
    "VAN": "Vancouver Grizzlies",
    "NOH": "New Orleans Hornets"
}

def get_team_rankings(year):
    standings = nba.get_standings(year, info='total')

    # process team names to remove asterisks and determine playoff status
    standings["Made_Playoffs"] = standings["Tm"].apply(lambda x: 1 if "*" in x else 0)
    standings["Tm"] = standings["Tm"].apply(lambda x: x.replace("*", "").strip())

    standings_dict = dict(zip(standings["Tm"], standings["Seed"]))
    playoffs_dict = dict(zip(standings["Tm"], standings["Made_Playoffs"]))
    return standings_dict, playoffs_dict

# function to preprocess a single season
def preprocess_season(file_name, year, mvp_votings):
    df = pd.read_csv(file_name)
    
    # dropping unnecessary columns
    drop_columns = ['Age', 'Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
    df_cleaned = df.drop(columns=drop_columns, errors='ignore')

    # handling players who played for multiple teams
    multi_team_players = df_cleaned[df_cleaned['Team'] == '2TM']['Player'].unique()
    mask = (df_cleaned['Team'] == '2TM') | (~df_cleaned['Player'].isin(multi_team_players))
    df_cleaned = df_cleaned[mask]

    # calculating TS%
    if 'PTS' in df_cleaned.columns and 'FGA' in df_cleaned.columns and 'FTA' in df_cleaned.columns:
        df_cleaned['TS%'] = df_cleaned['PTS'] / (2 * (df_cleaned['FGA'] + 0.44 * df_cleaned['FTA']))
        df_cleaned['TS%'] = df_cleaned['TS%'].round(2)

    # calculating missed shots for EFF
    if 'FGA' in df_cleaned.columns and 'FG' in df_cleaned.columns:
        df_cleaned['Missed_FG'] = df_cleaned['FGA'] - df_cleaned['FG']
    if 'FTA' in df_cleaned.columns and 'FT' in df_cleaned.columns:
        df_cleaned['Missed_FT'] = df_cleaned['FTA'] - df_cleaned['FT']

    # calculating the EFF metric
    if {'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'G', 'Missed_FG', 'Missed_FT'}.issubset(df_cleaned.columns):
        df_cleaned['EFF'] = (
            df_cleaned['PTS'] +
            df_cleaned['TRB'] +
            df_cleaned['AST'] +
            df_cleaned['STL'] +
            df_cleaned['BLK'] -
            df_cleaned['Missed_FG'] -
            df_cleaned['Missed_FT'] -
            df_cleaned['TOV']
        ) / df_cleaned['G']
        df_cleaned['EFF'] = df_cleaned['EFF'].round(2)

    # dropping the temporary columns
    df_cleaned.drop(columns=['Missed_FG', 'Missed_FT'], inplace=True, errors='ignore')

    # adding the Nominated column
    df_cleaned['Nominated'] = df_cleaned['Player'].apply(lambda player: 1 if player in mvp_votings else 0)

    # getting the team standings and then adding the Team_Rank & Made_Playoffs column
    team_standings, playoffs_dict = get_team_rankings(year + 1)
    df_cleaned['Team_Full'] = df_cleaned['Team'].map(team_name_mapping)
    df_cleaned['Team_Rank'] = df_cleaned['Team_Full'].map(team_standings).fillna(-1).astype(int)
    df_cleaned['Made_Playoffs'] = df_cleaned['Team_Full'].map(playoffs_dict).fillna(0).astype(int)
    df_cleaned.drop(columns=['Team_Full'], inplace=True)

    return df_cleaned

input_folder = "untouched_seasonal_data"
output_folder = "processed_data_with_Team_Rank"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# processing files for each season
for year in range(1980, 1994):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # fetching MVP voting data for the season
            mvp_data = nba.get_award_votings('mvp', year)
            mvp_votings = set(mvp_data['Player'])
            
            # preprocessing the season data
            processed_df = preprocess_season(file_name, year, mvp_votings)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")

Processed 1980-81 successfully!
Processed 1981-82 successfully!
Processed 1982-83 successfully!
Processed 1983-84 successfully!
Processed 1984-85 successfully!
Processed 1985-86 successfully!
Processed 1986-87 successfully!
Processed 1987-88 successfully!
Processed 1988-89 successfully!
Processed 1989-90 successfully!
Processed 1990-91 successfully!
Processed 1991-92 successfully!
Processed 1992-93 successfully!
Processed 1993-94 successfully!


Take a 1 min break before running the next cell to avoid getting locked out!

In [44]:
# continuation 1994-2007
input_folder = "untouched_seasonal_data"
output_folder = "processed_data_with_Team_Rank"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# processing files for each season
for year in range(1994, 2008):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # fetching MVP voting data for the season
            mvp_data = nba.get_award_votings('mvp', year)
            mvp_votings = set(mvp_data['Player']) 
            
            # preprocessing the season data
            processed_df = preprocess_season(file_name, year, mvp_votings)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")

Processed 1994-95 successfully!
Processed 1995-96 successfully!
Processed 1996-97 successfully!
Processed 1997-98 successfully!
Processed 1998-99 successfully!
Processed 1999-00 successfully!
Processed 2000-01 successfully!
Processed 2001-02 successfully!
Processed 2002-03 successfully!
Processed 2003-04 successfully!
Processed 2004-05 successfully!
Processed 2005-06 successfully!
Processed 2006-07 successfully!
Processed 2007-08 successfully!


Take a 1 min break before running the next cell to avoid getting locked out!

In [45]:
# continuation 2008-2021
input_folder = "untouched_seasonal_data"
output_folder = "processed_data_with_Team_Rank"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# processing files for each season
for year in range(2008, 2022):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # fetching MVP voting data for the season
            mvp_data = nba.get_award_votings('mvp', year)
            mvp_votings = set(mvp_data['Player']) 
            
            # preprocessing the season data
            processed_df = preprocess_season(file_name, year, mvp_votings)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")

Processed 2008-09 successfully!
Processed 2009-10 successfully!
Processed 2010-11 successfully!
Processed 2011-12 successfully!
Processed 2012-13 successfully!
Processed 2013-14 successfully!
Processed 2014-15 successfully!
Processed 2015-16 successfully!
Processed 2016-17 successfully!
Processed 2017-18 successfully!
Processed 2018-19 successfully!
Processed 2019-20 successfully!
Processed 2020-21 successfully!
Processed 2021-22 successfully!


Take a 1 min break before running the next cell to avoid getting locked out!

In [46]:
input_folder = "untouched_seasonal_data"
output_folder = "processed_data_with_Team_Rank"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# processing files for each season
for year in range(2022, 2024):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"
    
    if os.path.exists(file_name):
        try:
            # fetching MVP voting data for the season
            mvp_data = nba.get_award_votings('mvp', year)
            mvp_votings = set(mvp_data['Player']) 
            
            # preprocessing the season data
            processed_df = preprocess_season(file_name, year, mvp_votings)
            processed_df.to_csv(output_file, index=False)
            print(f"Processed {season} successfully!")
        except Exception as e:
            print(f"Error processing {season}: {e}")
    else:
        print(f"File {file_name} not found. Skipping.")



Processed 2022-23 successfully!
Processed 2023-24 successfully!


In [60]:
# File paths
input_file = "untouched_seasonal_data/nba_player_stats_2024-25.csv"
output_file = "processed_data_with_Team_Rank/nba_player_stats_2024-25_processed.csv"

# Team mapping for abbreviations
team_mapping = {
    'MIL': 'Milwaukee Bucks',
    'BOS': 'Boston Celtics',
    'PHI': 'Philadelphia 76ers',
    'CLE': 'Cleveland Cavaliers',
    'NYK': 'New York Knicks',
    'BRK': 'Brooklyn Nets',
    'ATL': 'Atlanta Hawks',
    'MIA': 'Miami Heat',
    'CHI': 'Chicago Bulls',
    'TOR': 'Toronto Raptors',
    'IND': 'Indiana Pacers',
    'DET': 'Detroit Pistons',
    'ORL': 'Orlando Magic',
    'CHO': 'Charlotte Hornets',
    'WAS': 'Washington Wizards',
    'DEN': 'Denver Nuggets',
    'MIN': 'Minnesota Timberwolves',
    'OKC': 'Oklahoma City Thunder',
    'POR': 'Portland Trail Blazers',
    'UTA': 'Utah Jazz',
    'GSW': 'Golden State Warriors',
    'LAC': 'Los Angeles Clippers',
    'SAC': 'Sacramento Kings',
    'PHO': 'Phoenix Suns',
    'LAL': 'Los Angeles Lakers',
    'DAL': 'Dallas Mavericks',
    'MEM': 'Memphis Grizzlies',
    'NOP': 'New Orleans Pelicans',
    'SAS': 'San Antonio Spurs',
    'HOU': 'Houston Rockets'
}

# Load the dataset
df = pd.read_csv(input_file)

# Initialize the new columns with default values
df['Team_Rank'] = -1
df['Made_Playoffs'] = 0

standings = nba.get_standings(2025, info='total')

# cleaning the team names in standings
standings['Tm'] = standings['Tm'].str.replace(r'\s*\([0-9]+\)', '', regex=True)
standings['Tm'] = standings['Tm'].str.replace("*", "").str.strip()

# creating mapping of team names to rankings
team_rank_map = dict(zip(standings['Tm'], standings['Seed']))

# mapping team abbreviations to full names and then to rankings
df['Team_Full'] = df['Team'].map(team_mapping)
df['Team_Rank'] = df['Team_Full'].map(team_rank_map).fillna(-1).astype(int)

# Drop the temporary full team name column
df.drop(columns=['Team_Full'], inplace=True)

# dropping unnecessary columns
drop_columns = ['Age', 'Pos', 'GS', '3PA', '2PA', 'PF', 'Awards']
df = df.drop(columns=drop_columns, errors='ignore')

multi_team_players = df[df['Team'] == '2TM']['Player'].unique()
mask = (df['Team'] == '2TM') | (~df['Player'].isin(multi_team_players))
df = df[mask]

# calculating TS%
if 'PTS' in df.columns and 'FGA' in df.columns and 'FTA' in df.columns:
    df['TS%'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    df['TS%'] = df['TS%'].round(2)

# calculating missed shots for EFF
if 'FGA' in df.columns and 'FG' in df.columns:
    df['Missed_FG'] = df['FGA'] - df['FG']
if 'FTA' in df.columns and 'FT' in df.columns:
    df['Missed_FT'] = df['FTA'] - df['FT']

# calculating the EFF metric
if {'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'G', 'Missed_FG', 'Missed_FT'}.issubset(df.columns):
    df['EFF'] = (
        df['PTS'] +
        df['TRB'] +
        df['AST'] +
        df['STL'] +
        df['BLK'] -
        df['Missed_FG'] -
        df['Missed_FT'] -
        df['TOV']
    ) / df['G']
    df['EFF'] = df['EFF'].round(2)

# dropping the temporary columns
df.drop(columns=['Missed_FG', 'Missed_FT'], inplace=True, errors='ignore')

# adding the Nominated column
df['Nominated'] = df['Player'].apply(lambda player: 1 if player in mvp_votings else 0)

# Save the updated dataset
df.to_csv(output_file, index=False)
print("Processed 2024-25 season successfully")

Processed 2024-25 season successfully


In [61]:
input_folder = "processed_data_with_Team_Rank"
output_folder = "processed_data_with_Team_Rank"

# Initialize an empty list to store dataframes
all_dataframes = []

# Process each season from 1980 to 2015-16
for year in range(1980, 2017):
    season = f"{year}-{str(year+1)[-2:]}"  # Creates strings like "1980-81"
    file_name = f"{input_folder}/nba_player_stats_{season}_processed.csv"
    
    try:
        # Read the CSV file with no data type inference to preserve original values
        df = pd.read_csv(file_name, dtype=str)
        all_dataframes.append(df)
        print(f"Successfully loaded data for season {season}")
        
    except FileNotFoundError:
        print(f"Warning: Could not find file for season {season}")
    except Exception as e:
        print(f"Error processing season {season}: {str(e)}")

# Combine all dataframes with pure concatenation
if all_dataframes:
    nba_combined_1980_2015_with_Team_Rank = pd.concat(all_dataframes, ignore_index=True)
    
    # Save the combined dataframe to a CSV file without any modifications
    nba_combined_1980_2015_with_Team_Rank.to_csv("nba_combined_1980_2015_with_Team_Rank.csv", index=False)
    print("All datasets have been concatenated and saved to 'nba_combined_1980_2015_with_Team_Rank.csv'.")
    
    # Print basic verification information
    print("\nVerification Information:")
    print(f"Total number of rows: {len(nba_combined_1980_2015_with_Team_Rank)}")
    print(f"Number of unique seasons: {len(nba_combined_1980_2015_with_Team_Rank['Season'].unique())}")
else:
    print("No data was found to combine.")

Successfully loaded data for season 1980-81
Successfully loaded data for season 1981-82
Successfully loaded data for season 1982-83
Successfully loaded data for season 1983-84
Successfully loaded data for season 1984-85
Successfully loaded data for season 1985-86
Successfully loaded data for season 1986-87
Successfully loaded data for season 1987-88
Successfully loaded data for season 1988-89
Successfully loaded data for season 1989-90
Successfully loaded data for season 1990-91
Successfully loaded data for season 1991-92
Successfully loaded data for season 1992-93
Successfully loaded data for season 1993-94
Successfully loaded data for season 1994-95
Successfully loaded data for season 1995-96
Successfully loaded data for season 1996-97
Successfully loaded data for season 1997-98
Successfully loaded data for season 1998-99
Successfully loaded data for season 1999-00
Successfully loaded data for season 2000-01
Successfully loaded data for season 2001-02
Successfully loaded data for sea

In [63]:
input_folder = "processed_data_with_Team_Rank"
output_folder = "processed_data_with_Team_Rank"

# Initialize an empty list to store dataframes
all_dataframes = []

# Process each season from 2016 to 2024
for year in range(2016, 2025):
    season = f"{year}-{str(year+1)[-2:]}"  # Creates strings like "2016-17"
    file_name = f"{input_folder}/nba_player_stats_{season}_processed.csv"
    
    try:
        # Read the CSV file with no data type inference to preserve original values
        df = pd.read_csv(file_name, dtype=str)
        all_dataframes.append(df)
        print(f"Successfully loaded data for season {season}")
        
    except FileNotFoundError:
        print(f"Warning: Could not find file for season {season}")
    except Exception as e:
        print(f"Error processing season {season}: {str(e)}")

# Combine all dataframes with pure concatenation
if all_dataframes:
    nba_combined_2016_2024_with_Team_Rank = pd.concat(all_dataframes, ignore_index=True)
    
    # Save the combined dataframe to a CSV file without any modifications
    nba_combined_2016_2024_with_Team_Rank.to_csv("nba_combined_2016_2024_with_Team_Rank.csv", index=False)
    print("All datasets have been concatenated and saved to 'nba_combined_2016_2024_with_Team_Rank.csv'.")
    
    # Print basic verification information
    print("\nVerification Information:")
    print(f"Total number of rows: {len(nba_combined_2016_2024_with_Team_Rank)}")
    print(f"Number of unique seasons: {len(nba_combined_2016_2024_with_Team_Rank['Season'].unique())}")
else:
    print("No data was found to combine.")

Successfully loaded data for season 2016-17
Successfully loaded data for season 2017-18
Successfully loaded data for season 2018-19
Successfully loaded data for season 2019-20
Successfully loaded data for season 2020-21
Successfully loaded data for season 2021-22
Successfully loaded data for season 2022-23
Successfully loaded data for season 2023-24
Successfully loaded data for season 2024-25
All datasets have been concatenated and saved to 'nba_combined_2016_2024_with_Team_Rank.csv'.

Verification Information:
Total number of rows: 4974
Number of unique seasons: 9


Succefully itegrated Team_Rank and Made_Playoffs column.

Team_Rank holds the players current team ranking.

Made_Playoffs holds a binary value, 1 if the team made the playoffs that year, otherwise 0.