Note: Basketball Reference rate-limits clients to 20 requests per minute. Exceeding that limit causes all further requests to be blocked for 1 hour.

In [1]:
import pandas as pd
import regex as re
import time
from difflib import get_close_matches

def Get_AllNBA_Voting(year):
    '''
    Gets all players who received All-NBA votes for the given year.

    Downloads the Basketball-Reference awards page for `year` and returns a
    dataframe with the columns 'index', '# Tm', 'Player' and 'Pts Won'.
    Rows missing any of the three data columns are dropped; 'index' holds
    the row's position in the table as it was scraped.

    Note: sleeps for 5 seconds before the request to stay under the site's
    limit of 20 requests per minute.
    '''
    # Buffer so as not to go over the 20 requests per minute threshold
    time.sleep(5)

    # Get the info from the correct URL
    URL = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    tbls = pd.read_html(URL)

    # The All NBA info happens to be in the third table
    # NOTE(review): positional lookup - confirm it still holds if the page
    # layout changes.
    All_NBA_df = tbls[2]

    # Flatten a multi-level header down to its last level. Labels that are
    # already plain strings are kept whole (indexing a string with [-1]
    # would otherwise reduce it to its final character).
    All_NBA_df.columns = [
        col[-1] if isinstance(col, tuple) else col
        for col in All_NBA_df.columns
    ]

    # Return the desired information, dropping rows missing data
    return All_NBA_df[['# Tm', 'Player', 'Pts Won']].dropna().reset_index()

In [2]:
def Get_NBA_Player_Stats(year):
    '''
    Gets all NBA players' stats for the given year.

    Downloads the Basketball-Reference season totals page for `year` and
    returns one row per player, with the 'Rk' column removed, a 'Year'
    column added and asterisks stripped from the player names.

    Note: some players play for multiple teams, and have multiple rows.
    For those players, only their totals ('TOT') row which includes all team
    values is kept.

    Note: sleeps for 5 seconds before the request to stay under the site's
    limit of 20 requests per minute.
    '''
    # Buffer so as not to go over the 20 requests per minute threshold
    time.sleep(5)

    # Get the info from the correct URL
    URL = f"https://www.basketball-reference.com/leagues/NBA_{year}_totals.html"
    tbls = pd.read_html(URL)

    # The data is in the first table
    df = tbls[0]

    # Set of players who have a 'TOT' row, i.e. who played for multiple teams
    repeats = set(df.loc[df['Tm'] == 'TOT', 'Player'])

    # Per-team rows of multi-team players duplicate what 'TOT' already sums
    is_partial_row = df['Player'].isin(repeats) & (df['Tm'] != 'TOT')

    # Basketball-Reference repeats the header inside the table body; those
    # rows have the literal text "Player" in the Player column
    is_header_row = df['Player'] == "Player"

    # Keep everything else and renumber the rows from zero
    df = df[~(is_partial_row | is_header_row)].reset_index(drop=True)

    # Drop unnecessary columns
    df = df.drop(columns=["Rk"])

    # Add a column with the year
    df['Year'] = year

    # Remove the asterisks that some player names carry (literal replace,
    # so no regex escaping is involved)
    df['Player'] = df['Player'].str.replace("*", "", regex=False)

    return df

In [3]:
# Full franchise name -> team abbreviation. Built once at import time
# instead of on every call.
# 'Charlotte Bobcats' intentionally shares 'CHO' with the Hornets;
# Get_Team_Record folds 'CHA' into 'CHO' to match.
_TEAM_ABBREVS = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Chicago Bulls': 'CHI',
    'Charlotte Hornets': 'CHO',
    'Charlotte Bobcats': 'CHO',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Jersey Nets': 'NJN',
    'New Orleans Hornets': 'NOH',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Seattle SuperSonics': 'SEA',
    'Toronto Raptors': 'TOR',
    'Multiple Teams': 'TOT',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}


def Get_Abbrev(team):
    '''
    Returns the abbreviation for a team name, or None if nothing matches.

    The name is matched fuzzily (difflib similarity of at least 0.75), so
    names with extra decoration around them still resolve while unrelated
    text does not. Anything that is not a string (None, NaN, numbers)
    returns None.
    '''
    # difflib can only compare sequences; a NaN or None cell would raise
    if not isinstance(team, str):
        return None

    # An exact name is always its own best match, so skip the fuzzy search
    exact = _TEAM_ABBREVS.get(team)
    if exact is not None:
        return exact

    # Candidates come back best-first; an empty list means no name was
    # similar enough
    matches = get_close_matches(team, _TEAM_ABBREVS, cutoff=0.75)
    return _TEAM_ABBREVS[matches[0]] if matches else None

    
def Get_Team_Records(year):
    '''
    Gets the win/loss record of every team for the given year.

    Downloads the Basketball-Reference standings page for `year` and
    returns a dataframe with the columns 'Team' (abbreviation), 'W', 'L'
    and 'W/L', built from the first four columns of the first two tables
    on the page.

    Rows whose team name Get_Abbrev cannot resolve, and rows with any
    missing value in those four columns, are dropped.

    Note: sleeps for 5 seconds before the request to stay under the site's
    limit of 20 requests per minute.
    '''
    # Buffer so as not to go over the 20 requests per minute threshold
    time.sleep(5)
    URL = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
    tbls = pd.read_html(URL)

    # NOTE(review): the first two tables are presumably the two conference
    # standings - confirm against the page layout.
    conference_frames = []
    for tbl in (tbls[0], tbls[1]):
        # Copy so the scraped table itself is never modified
        df = tbl.iloc[:, 0:4].copy()
        df.columns = ["Team", "W", "L", "W/L"]

        # Unresolvable names become None and are removed by dropna below
        df['Team'] = df['Team'].apply(Get_Abbrev)
        conference_frames.append(df.dropna())

    # One concat over the collected frames; starting from an empty
    # DataFrame and growing it relies on deprecated concat behaviour
    return pd.concat(conference_frames, ignore_index=True)

In [4]:
# Download the standings for every season from 2004 through 2024, keyed by
# season year. Each call sleeps 5 seconds, so this cell takes a while.
NBA_Records = {}
for year in range(2004, 2025):
    NBA_Records[year] = Get_Team_Records(year)

In [5]:
# All-NBA voting results for the 2004-2023 seasons, keyed by season year
# (there is no voting for 2024 yet)
AllNBA_Voting = {}
for year in range(2004, 2024):
    AllNBA_Voting[year] = Get_AllNBA_Voting(year)

# Player season totals for the 2004-2024 seasons, keyed by season year
NBA_Player_Stats = {}
for year in range(2004, 2025):
    NBA_Player_Stats[year] = Get_NBA_Player_Stats(year)

In [6]:
def Get_Awards_Info(player, awards_df):
    '''
    Returns a tuple (All_NBA, team, points) describing a player's All-NBA
    voting result.

    player:    player name to look up in the 'Player' column
    awards_df: voting table with 'Player', '# Tm' and 'Pts Won' columns

    All_NBA is True when the player's '# Tm' value is one of the first,
    second or third team labels; team is then '1', '2' or '3'. For any
    other '# Tm' value, All_NBA is False and team is that value unchanged.
    A player who is not in the table gets (False, "NO", 0).

    points is the player's 'Pts Won' as an int. If the player appears more
    than once, the first row is used.
    '''
    # Select the player's rows once and reuse them for both lookups
    player_rows = awards_df.loc[awards_df['Player'] == player]
    if player_rows.empty:
        return (False, "NO", 0)

    # Read the scalar explicitly; int() on a whole Series is deprecated
    # and fails outright when more than one row matches
    pts = int(player_rows['Pts Won'].iloc[0])
    tm = player_rows['# Tm'].iloc[0]

    # Both label spellings are accepted; the leading digit is the team
    if tm in ('1T', '2T', '3T', '1st', '2nd', '3rd'):
        return (True, tm[0], pts)

    return (False, tm, pts)

In [7]:
def Get_Team_Record(team, records_df):
    '''
    Returns the values of the second through fourth columns of the
    records_df row whose 'Team' equals the given abbreviation, as a tuple.

    "CHA" is looked up as "CHO". The multi-team marker "TOT" and any
    abbreviation missing from records_df give (None, None, None).
    '''
    # The records table files this franchise under "CHO"
    abbrev = "CHO" if team == "CHA" else team

    # "TOT" is not a real team, so it is rejected before the table is read
    if abbrev == "TOT" or abbrev not in set(records_df['Team']):
        return (None, None, None)

    team_rows = records_df[records_df['Team'] == abbrev]
    record_values = team_rows.iloc[:, 1:4].values.flatten()
    return tuple(record_values)

In [8]:
# Columns that hold each player's All-NBA result
award_columns = ["All_NBA", "All_NBA_Team", "Voting_Points"]

# Fill them in for every season that has voting results (2004 through 2023)
for year in range(2004, 2024):
    season_stats = NBA_Player_Stats[year]
    season_votes = AllNBA_Voting[year]
    season_stats[award_columns] = [
        Get_Awards_Info(name, season_votes) for name in season_stats['Player']
    ]

# No NBA Awards for 2024 yet, so all three columns get the placeholder 'NO'
NBA_Player_Stats[2024][award_columns] = ['NO', 'NO', 'NO']

In [9]:
# View the 2004 data, now including the three All-NBA columns
# (as the last expression in the cell, the dataframe is rendered below)
NBA_Player_Stats[2004]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,Year,All_NBA,All_NBA_Team,Voting_Points
0,Shareef Abdur-Rahim,PF,27,TOT,85,56,2684,501,1054,.475,...,174,68,37,184,222,1384,2004,False,NO,0
1,Malik Allen,PF,25,MIA,45,6,616,83,198,.419,...,16,12,28,27,81,191,2004,False,NO,0
2,Ray Allen,SG,28,SEA,56,56,2152,447,1017,.440,...,268,71,11,156,132,1287,2004,False,ORV,6
3,Rafer Alston,PG,27,MIA,82,28,2581,287,764,.376,...,372,114,18,128,212,838,2004,False,NO,0
4,Chris Andersen,PF,25,DEN,71,0,1029,90,203,.443,...,35,34,114,48,119,243,2004,False,NO,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,Loren Woods,C,25,MIA,38,2,506,44,96,.458,...,10,11,19,26,52,121,2004,False,NO,0
438,Qyntel Woods,SF,22,POR,62,8,673,88,237,.371,...,46,20,14,52,88,224,2004,False,NO,0
439,Metta World Peace,SF,24,IND,73,71,2714,468,1112,.421,...,272,152,50,202,194,1333,2004,True,3,90
440,Lorenzen Wright,C,28,MEM,65,46,1674,257,586,.439,...,71,45,58,77,192,610,2004,False,NO,0


In [10]:
# Attach each player's team record (wins, losses, win ratio) to their row
# for every season from 2004 through 2024
record_columns = ["W", "L", "W/L"]
for year in range(2004, 2025):
    season_stats = NBA_Player_Stats[year]
    season_records = NBA_Records[year]
    season_stats[record_columns] = [
        Get_Team_Record(tm, season_records) for tm in season_stats['Tm']
    ]

In [11]:
# Stack every season (2004 through 2024) into one dataframe.
# A single concat over the whole list avoids re-copying the accumulated
# rows on every pass, and avoids concatenating onto an empty DataFrame,
# which pandas has deprecated.
All_Stats_df = pd.concat(
    [NBA_Player_Stats[year] for year in range(2004, 2025)],
    axis=0,
)

In [12]:
# Give the combined rows one running index in place of the repeated
# per-season labels; the old labels are kept as a leading 'index' column.
All_Stats_df = All_Stats_df.reset_index(drop=False)

In [13]:
def Get_AllNBA_Position(pos):
    '''
    Maps a position string to "Backcourt" or "Frontcourt".

    pos is one position or several joined by '-' (e.g. "SG-SF"). It is
    "Backcourt" when any part is "PG" or "SG", otherwise "Frontcourt".
    A value that is already "Backcourt" or "Frontcourt" is returned
    unchanged.
    '''
    # Without this guard, applying the function twice (e.g. re-running the
    # notebook cell) would turn every "Backcourt" into "Frontcourt"
    if pos in ("Backcourt", "Frontcourt"):
        return pos

    positions = set(pos.split('-'))
    if positions & {"PG", "SG"}:
        return "Backcourt"
    return "Frontcourt"
    

# Replace every player's position with its Backcourt/Frontcourt group
All_Stats_df['Pos'] = [
    Get_AllNBA_Position(position) for position in All_Stats_df['Pos']
]

In [14]:
# Remove the 'index' helper column that the earlier reset_index call put
# in first position. It is dropped by name rather than by position so that
# re-running this cell cannot delete 'Player' or later columns.
All_Stats_df.drop(columns="index", errors="ignore", inplace=True)

In [15]:
# Replacement values for missing entries: an even 0.5 win ratio where there
# is no team record, and 0 for every shooting percentage
shooting_pct_columns = ["FT%", "FG%", "eFG%", "2P%", "3P%"]
values = {"W/L": 0.5}
for pct_column in shooting_pct_columns:
    values[pct_column] = 0

All_Stats_df = All_Stats_df.fillna(value=values)

In [16]:
# List the column labels of the combined dataframe
All_Stats_df.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'All_NBA', 'All_NBA_Team', 'Voting_Points', 'W', 'L', 'W/L'],
      dtype='object')

In [18]:
# Save the combined table. The row index is written too (the to_csv
# default), so the file's first column is unnamed.
All_Stats_df.to_csv("Data/NBA_Player_Stats.csv", index=True)