In [11]:
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns

# The data folder 'team_data' is looked up in the parent of the current working directory.
parent_of_cwd = os.getcwd() + os.sep + os.pardir
main_directory = os.path.normpath(parent_of_cwd)
data_directory = os.path.join(main_directory, 'team_data')

# Collect the bare names (not full paths) of regular files in the data folder;
# anything that is not a file (e.g. a sub-directory) is skipped.
file_names = []
for entry_name in os.listdir(data_directory):
    if os.path.isfile(os.path.join(data_directory, entry_name)):
        file_names.append(entry_name)



In [12]:
# Separate the CSV files by table type (substring match on the file name) and
# sort each list so that list position follows file-name order.
off_files = sorted(name for name in file_names if 'Off.csv' in name)
def_files = sorted(name for name in file_names if 'Def.csv' in name)
adv_files = sorted(name for name in file_names if 'Adv.csv' in name)
playoff_files = sorted(name for name in file_names if 'Playoffs.csv' in name)


In [24]:
# Per-season DataFrames, keyed by season offset i (season year = FIRST_SEASON + i).
# team_stats_dfs holds the raw stats, normalized_team_dfs the same stats divided
# by that season's league-average row.
team_stats_dfs = {}
normalized_team_dfs = {}

FIRST_SEASON = 2000
NUM_SEASONS = 25                                  # seasons 2000 through 2024
CURRENT_SEASON = FIRST_SEASON + NUM_SEASONS - 1   # stored separately, no playoff wins attached
CHARLOTTE_RENAME_SEASON = 2014

# NOTE(review): position i in each sorted file list is assumed to be season
# FIRST_SEASON + i -- confirm against the file naming scheme.
for i in range(NUM_SEASONS):
    year = FIRST_SEASON + i

    # The file lists hold bare names taken from os.listdir(data_directory), so
    # every read joins them with data_directory instead of relying on the
    # kernel's working directory.

    # Cleanup defensive stats: drop bookkeeping columns and mark the rest as opponent stats
    def_stats = pd.read_csv(os.path.join(data_directory, def_files[i]))
    def_stats = def_stats.drop(columns=['Rk','Team▲','G','MP'])
    def_stats = def_stats.add_prefix('Opp ')

    # Cleanup advanced stats (column names are on the second line of the file)
    adv_stats = pd.read_csv(os.path.join(data_directory, adv_files[i]), header=1)
    adv_stats = adv_stats.drop(columns=['Rk','Team▲','Unnamed: 17','Unnamed: 22','Unnamed: 27','Arena','Attend.','Attend./G'])
    # The '.1' suffixes are pandas' de-duplicated names; they are relabelled as opponent stats
    adv_stats = adv_stats.rename(columns={'eFG%.1': 'Opp eFG%', 'TOV%.1': 'Opp TOV%', 'FT/FGA.1': 'Opp FT/FGA'})

    # Cleanup main team stats (offensive)
    off_stats = pd.read_csv(os.path.join(data_directory, off_files[i]))
    off_stats = off_stats.drop(columns=['Rk','G','MP'])
    # regex=False: '*' must be removed literally; it is not a valid regular
    # expression and the default of `regex` differs between pandas versions.
    off_stats['Team▲'] = off_stats['Team▲'].str.replace('*', '', regex=False)
    off_stats = off_stats.rename(columns={'Team▲': 'Team'})
    off_stats['Team'] = off_stats['Team'].replace('Seattle SuperSonics', 'Seattle Supersonics')
    # in 2014 Charlotte switched mascots midseason
    if year == CHARLOTTE_RENAME_SEASON:
        off_stats['Team'] = off_stats['Team'].replace('Charlotte Bobcats', 'Charlotte Hornets')

    # Make one big df of every stat. The three tables are combined by row
    # position (def/adv no longer carry a team column).
    # NOTE(review): this assumes all three files list teams in the same order -- confirm.
    team_stats = pd.concat([off_stats, def_stats, adv_stats], axis=1)

    # Add the year to the name
    team_stats['Team'] = team_stats['Team'] + ' ' + str(year)

    # Replace win and loss columns with winning percentage (2020 and 2021 had shortened seasons).
    # Each percentage is inserted where its win column sits, so the column
    # order does not depend on hard-coded positions.
    winrate = team_stats['W']/(team_stats['W'] + team_stats['L'])
    pwinrate = team_stats['PW']/(team_stats['PW'] + team_stats['PL'])
    team_stats.insert(team_stats.columns.get_loc('W'), 'W%', winrate)
    team_stats.insert(team_stats.columns.get_loc('PW'), 'PW%', pwinrate)
    team_stats = team_stats.drop(columns=['W','L','PW','PL'])

    ##################################################
    # Normalize a copy by this season's league average

    # Keep the team names aside (can't divide with strings)
    team_col = team_stats['Team'].copy()
    normalized_team = team_stats.drop(columns=['Team'])

    # Row index for the league average row (last row of the table)
    avg_i = len(normalized_team) - 1

    # Per the original author's note, these stats are already normalized and
    # don't appear in the league average row; a divisor of 1 leaves them unchanged.
    # NOTE(review): 'PW%' is not pinned to 1 here -- if PW/PL are blank in the
    # league average row, the normalized 'PW%' column will be all NaN. Confirm.
    for already_normalized in ['NRtg', 'MOV', 'SOS', 'SRS', 'W%']:
        normalized_team.at[avg_i, already_normalized] = 1

    # Divide every row by the league average row, matching on column names
    divisor_row = normalized_team.iloc[avg_i]
    normalized_team = normalized_team.div(divisor_row, axis=1)

    # Add the team column back
    normalized_team.insert(0, 'Team', team_col)

    # Drop the league average row in both dataframes
    normalized_team = normalized_team.iloc[:-1]
    team_stats = team_stats.iloc[:-1]

    ##################################################

    # Put this year's stats into their own DataFrames
    if year == CURRENT_SEASON:
        team_stats2024 = team_stats.copy()
        normal_stats2024 = normalized_team.copy()

    else:

        # Read in playoff stats (header on the second line; the last row is dropped)
        playoff = pd.read_csv(os.path.join(data_directory, playoff_files[i]), header=1)
        playoff = playoff[['Team', 'W', 'L']].iloc[:-1].copy()
        playoff['Team'] = playoff['Team'] + ' ' + str(year)

        # One playoff-win value per team, matched to the stats by team name.
        # A keyed merge is used instead of sorting both tables and pasting the
        # column on by position: with positional alignment a single name
        # mismatch adds a row and shifts every later team's wins.
        playoff_wins = playoff[['Team', 'W']].drop_duplicates(subset=['Team'])

        unmatched = playoff_wins.loc[~playoff_wins['Team'].isin(team_stats['Team']), 'Team']
        if not unmatched.empty:
            print(f"{year}: playoff teams with no matching team row (wins not attached): {unmatched.tolist()}")

        # Add playoff win column to main df and normal df; teams that missed
        # the playoffs have no match and get 0 wins.
        team_stats = team_stats.merge(playoff_wins, on='Team', how='left')
        team_stats['W'] = team_stats['W'].fillna(0)
        normalized_team = normalized_team.merge(playoff_wins, on='Team', how='left')
        normalized_team['W'] = normalized_team['W'].fillna(0)

        # Store this season's frames under its offset i
        team_stats_dfs[i] = team_stats.copy()
        normalized_team_dfs[i] = normalized_team.copy()
        
# Stack every stored season's raw stats, index by team name, and write to CSV
all_data = pd.concat(team_stats_dfs.values(), ignore_index=True).set_index('Team')
all_data.to_csv('all_team_data.csv')

# Same for the normalized stats (all_data is rebound to the normalized frame)
all_data = pd.concat(normalized_team_dfs.values(), ignore_index=True).set_index('Team')
all_data.to_csv('normal_team_data.csv')

# The 2024 season is written to its own pair of files
team_stats2024 = team_stats2024.set_index('Team')
normal_stats2024 = normal_stats2024.set_index('Team')
team_stats2024.to_csv('2024_team_data.csv')
normal_stats2024.to_csv('2024_normal_data.csv')
