In [124]:
# libraries to use
import requests
import pandas as pd
import json
import io
import itertools
import datetime
import numpy as np
import math
import os

In [2]:
# get game logs for PlayerID from NBA.com (MeasureType=Base,Advanced,Misc,Scoring,Usage)
def get_game_logs_df(PlayerID, MeasureType, Season, PerMode):
    glogs_url = 'https://stats.nba.com/stats/playergamelogs?DateFrom=&DateTo=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType='+MeasureType+'&Month=0&OppTeamID=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode='+PerMode+'&Period=0&PlayerID='+str(PlayerID)+'&PlusMinus=N&Rank=N&Season='+Season+'&SeasonSegment=&SeasonType=Regular%20Season&ShotClockRange=&TeamID=&VsConference=&VsDivision='
    glogs_response = requests.get(url=glogs_url, headers=headers).json()
    glogs_headers = glogs_response['resultSets'][0]['headers']
    glogs_data = glogs_response['resultSets'][0]['rowSet']
    glogs_df = pd.DataFrame(data=glogs_data, columns=glogs_headers)
    return glogs_df

In [3]:
def get_games_past_week(this_game, plyr_box_stats_df):
    game_date = this_game['GAME_DATE_OBJ']
    # for a given game_date find the week period before
    time_delta_yesterday = datetime.timedelta(days = 1)
    time_delta_week_ago = datetime.timedelta(days = 7)
    date_yesterday = game_date - time_delta_yesterday
    date_week_ago = game_date - time_delta_week_ago    
    # find games that lie between date_yesterday and date_week_ago
    game_dates_past_week = plyr_box_stats_df.loc[(plyr_box_stats_df['GAME_DATE_OBJ'] <= date_yesterday) & (plyr_box_stats_df['GAME_DATE_OBJ'] >= date_week_ago)]
    return game_dates_past_week

In [4]:
# to-do: support for multiple teams for the case player is traded mid-season
# (exmpale: James Harden 2021-22 traded BRK -> PHI)

def calc_injury_stats(plyr_box_stats_df, team, last_injury_date, num_career_injuries):
    player_games = plyr_box_stats_df.loc[plyr_box_stats_df['TEAM_ABBREVIATION']==team]
    # reverse player_games since team schedule first -> last
    player_games.sort_values(by='GAME_DATE_OBJ', ascending=True, inplace = True)
    player_dates = player_games['GAME_DATE_OBJ']
    # to-do: use for later (multi-team support)
    # max_date = max(player_dates)
    # min_date = min(player_dates)
    # global: team_dates_dict
    team_schedule = team_dates_dict.get(team)
    team_dates = team_schedule['dateGame_OBJ']
    # to-do: use for later (multi-team support)
    # team_max_date = max(team_dates)
    # team_min_date = min(team_dates)
    # get list of games in team schedule player wasn't in
    missed_dates = np.setdiff1d(team_dates,player_dates, assume_unique=False)
    played_schedule = []
    for date in team_dates:
        if date in missed_dates:
            played_schedule.append(False)
        else:
            played_schedule.append(True)
    
    # create temp table of team schedule with player in game
    temp_df = pd.DataFrame({'GAME_DATE_OBJ':team_dates,
                           'PLAYED_GAME':played_schedule,})
    temp_df.reset_index(drop=True, inplace=True)

    # create column flagging player out next game
    out_next_game = []
    for index, current_date in temp_df.iterrows():
        # if at last player's game of regular season just put true (data doesn't extend to playoffs)
        if index == len(temp_df) - 1:
            out_next_game.append(0)
        else:
            next_date = temp_df.iloc[index + 1]
            if next_date['PLAYED_GAME']:
                out_next_game.append(0)
            else:
                out_next_game.append(1)
    temp_df.loc[:,'OUT_NEXT_GAME']  = out_next_game

    # get OUT_NEXT_GAME subset based on player game logs
    player_injury_data = temp_df.loc[temp_df['GAME_DATE_OBJ'].isin(player_dates)]

    # calculate days last injury and rolling career injuries during season
    days_last_injury = []
    date_last_injury = last_injury_date
    career_injuries = []
    career_injuries_count = num_career_injuries
    for idx, row in player_injury_data.iterrows():
        if row['OUT_NEXT_GAME'] == 1:
            career_injuries_count = career_injuries_count + 1
            date_last_injury = row['GAME_DATE_OBJ']
    
        time_delta = row['GAME_DATE_OBJ'] - date_last_injury
        days_last_injury.append(time_delta.days)
        career_injuries.append(career_injuries_count)

    player_injury_data.loc[:,'CAREER_INJURIES'] = career_injuries
    player_injury_data.loc[:,'DAYS_LAST_INJURY'] = days_last_injury
        
    # reverse the injury list  because player game logs/plyr_box_stats_df last -> first
    player_injury_data = player_injury_data.iloc[::-1]
    player_injury_data.reset_index(drop=True, inplace=True)
    
    plyr_box_stats_df.loc[:,'OUT_NEXT_GAME'] = player_injury_data['OUT_NEXT_GAME']
    plyr_box_stats_df.loc[:,'CAREER_INJURIES'] = player_injury_data['CAREER_INJURIES']
    plyr_box_stats_df.loc[:,'DAYS_LAST_INJURY'] = player_injury_data['DAYS_LAST_INJURY']
    
    return plyr_box_stats_df


In [36]:
def get_full_game_logs_df(PlayerID, Season, PerMode):
    # get player game logs for PlayerID from NBA.com for all measure types (MeasureType=Base,Advanced,Misc,Scoring,Usage)
    # PlayerID = 1629029 # Luka Doncic
    # Season='2021-22'
    # PerMode='Totals'
    plyr_base_df = get_game_logs_df(PlayerID,'Base',Season, PerMode)
    plyr_advanced_df = get_game_logs_df(PlayerID,'Advanced',Season, PerMode)
    plyr_misc_df = get_game_logs_df(PlayerID,'Misc',Season, PerMode)
    plyr_scor_df = get_game_logs_df(PlayerID,'Scoring',Season, PerMode)
    plyr_usage_df = get_game_logs_df(PlayerID,'Usage',Season, PerMode)

    # get subset of attributes from each game log measure
    plyr_base_df_subset = plyr_base_df[plyr_base_headers_subset]
    plyr_advanced_df_subset = plyr_advanced_df[plyr_advanced_headers_subset]
    plyr_misc_df_subset = plyr_misc_df[plyr_misc_headers_subset]
    plyr_scor_df_subset = plyr_scor_df[plyr_scor_headers_subset]

    # put all game log subsets for player into one dataframe
    plyr_box_stats_df = pd.concat([plyr_base_df_subset, 
                                   plyr_advanced_df_subset, 
                                   plyr_misc_df_subset, 
                                   plyr_scor_df_subset], axis=1)

    num_box_stats = plyr_box_stats_df.shape[0]

    # Add Bio data to game log dataframe: Age, Height (in.), Weight (lbs.)
    # global: plyrs_bio_df
    player_bio = plyrs_bio_df.loc[plyrs_bio_df['PLAYER_ID'] == PlayerID]

    # cannot find player birthday info on NBA.com
    # NOTE: NBA.com provides appropriate age in bio given Season
    player_age = [int(player_bio.iloc[0]['AGE'])] * num_box_stats
    plyr_box_stats_df['AGE'] = player_age

    player_height = [player_bio.iloc[0]['PLAYER_HEIGHT_INCHES']] * num_box_stats
    plyr_box_stats_df['PLAYER_HEIGHT_INCHES'] = player_height

    player_weight = [player_bio.iloc[0]['PLAYER_WEIGHT']] * num_box_stats
    plyr_box_stats_df['PLAYER_WEIGHT'] = player_weight

    # calculate field goals made mid-range and in the paint
    fg_attr = ['FGM','FGA','FG3M','FG3A','FTM','FTA','PTS','PTS_PAINT','PCT_PTS_2PT_MR']

    fgm_paint_list = []
    fgm_mr_list = []
    for x in range(0, num_box_stats):
        game_stats = plyr_box_stats_df.iloc[x][fg_attr]
        fgm_paint = game_stats['PTS_PAINT']/2
        fgm_2p = game_stats['FGM'] - game_stats['FG3M']
        fgm_mr = fgm_2p - fgm_paint
        fgm_paint_list.append(int(fgm_paint))
        fgm_mr_list.append(int(fgm_mr))

    # note: cannot find 2PT Mid-range FG attempts in player box scores at NBA.com;
    # will use 2PT mid-range and in-the-paint made FG for now
    plyr_box_stats_df['FGM_PAINT'] = fgm_paint_list
    plyr_box_stats_df['FGM_2PT_MR'] = fgm_mr_list

    game_dates = plyr_box_stats_df['GAME_DATE'].values.tolist()
    game_dates_obj = [datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S') for date in game_dates]
    # add converted python GAME_DATE_OBJ datetime64 object to plyr_box_stats_df
    plyr_box_stats_df.loc[:,'GAME_DATE_OBJ']  = game_dates_obj

    # Compile box stats containing stat totals from the past week (7 days before) of game_date for the following: 
    # Total minutes played - pw_min
    # Total games played - pw_games
    # 2-point field goal attempts - pw_fg2a
    # 3-point field goal attempts - pw_gf3a
    # Blocks - pw_blk
    # Rebounds - pw_reb (need offensive/defensive rebounds…?)
    # Personal fouls - pw_pf
    # Assists - pw_ast
    # Other - ???

    pw_mins = []
    pw_gamess = []
    pw_fg3as = []
    pw_fg2as = []
    pw_orebs = []
    pw_drebs = []
    pw_asts = []
    pw_tovs = []
    pw_stls = []
    pw_blks = []
    pw_posss = []
    pw_pfs = []
    pw_pfds = []
    pw_fgm_paints = []
    pw_fgm_2pt_mrs = []
    pw_pie_avg = []
    pw_usg_pct_avg = []

    # iterate through all games in player box stats and calculate totals/avgs for games the week before
    for index, this_game in plyr_box_stats_df.iterrows():
        games_week_before_df = get_games_past_week(this_game, plyr_box_stats_df)
        pw_min = games_week_before_df['MIN'].sum()
        pw_games = games_week_before_df.shape[0]
        pw_fg3a = games_week_before_df['FG3A'].sum()
        pw_fg2a = games_week_before_df['FGA'].sum() - pw_fg3a
        pw_oreb = games_week_before_df['OREB'].sum()
        pw_dreb = games_week_before_df['DREB'].sum()
        pw_ast = games_week_before_df['AST'].sum()
        pw_tov = games_week_before_df['TOV'].sum()
        pw_stl = games_week_before_df['STL'].sum()
        pw_blk = games_week_before_df['BLK'].sum()
        pw_poss = games_week_before_df['POSS'].sum()
        pw_pf = games_week_before_df['PF'].sum()
        pw_pfd = games_week_before_df['PFD'].sum()
        pw_fgm_paint = games_week_before_df['PTS_PAINT'].sum()/2
        pw_fgm_2p = games_week_before_df['FGM'].sum() - games_week_before_df['FG3M'].sum()
        pw_fgm_2pt_mr = pw_fgm_2p - pw_fgm_paint
        pw_pie = games_week_before_df['PIE'].mean()
        pw_usg_pct = games_week_before_df['USG_PCT'].mean()

        pw_mins.append(pw_min)
        pw_gamess.append(pw_games)
        pw_fg3as.append(pw_fg3a)
        pw_fg2as.append(pw_fg2a)
        pw_orebs.append(pw_oreb)
        pw_drebs.append(pw_dreb)
        pw_asts.append(pw_ast)
        pw_tovs.append(pw_tov)
        pw_stls.append(pw_stl)
        pw_blks.append(pw_blk)
        pw_posss.append(pw_poss)
        pw_pfs.append(pw_pf)
        pw_pfds.append(pw_pfd)
        pw_fgm_paints.append(pw_fgm_paint)
        pw_fgm_2pt_mrs.append(pw_fgm_2pt_mr)
        pw_pie_avg.append(pw_pie)
        pw_usg_pct_avg.append(pw_usg_pct)

    plyr_box_stats_df.loc[:,'PW_MIN'] = pw_mins
    plyr_box_stats_df.loc[:,'PW_GAMES'] = pw_gamess
    plyr_box_stats_df.loc[:,'PW_FG3A'] = pw_fg3as
    plyr_box_stats_df.loc[:,'PW_FG2A'] = pw_fg2as
    plyr_box_stats_df.loc[:,'PW_OREB'] = pw_orebs
    plyr_box_stats_df.loc[:,'PW_DREB'] = pw_drebs
    plyr_box_stats_df.loc[:,'PW_ASTS'] = pw_asts
    plyr_box_stats_df.loc[:,'PW_TOV'] = pw_tovs
    plyr_box_stats_df.loc[:,'PW_STL'] = pw_stls
    plyr_box_stats_df.loc[:,'PW_BLK'] = pw_blks
    plyr_box_stats_df.loc[:,'PW_POS'] = pw_posss
    plyr_box_stats_df.loc[:,'PW_PF'] = pw_pfs
    plyr_box_stats_df.loc[:,'PW_PFD'] = pw_pfds
    plyr_box_stats_df.loc[:,'PW_FGM_PAINT'] = pw_fgm_paints
    plyr_box_stats_df.loc[:,'PW_FGM_2PT_MR'] = pw_fgm_2pt_mrs
    plyr_box_stats_df.loc[:,'PW_PIE_AVG'] = pw_pie_avg
    plyr_box_stats_df.loc[:,'PW_USG_PCT_AVG'] = pw_usg_pct_avg

    # fill na with 0 on past week stats
    #plyr_box_stats_df.loc[:,'PW_PFD'] = plyr_box_stats_df['PW_PFD'].fillna(0)
    plyr_box_stats_df.loc[:,'PW_PIE_AVG'] = plyr_box_stats_df['PW_PIE_AVG'].fillna(0)
    plyr_box_stats_df.loc[:,'PW_USG_PCT_AVG'] = plyr_box_stats_df['PW_USG_PCT_AVG'].fillna(0)

    # get player injury data
    # golbal: il_data, mg_data
    player_il = il_data.loc[il_data['PLAYER_ID'] == PlayerID]
    player_mg = mg_data.loc[mg_data['PLAYER_ID'] == PlayerID]

    # get first date game played for season (at end of list becasue last -> first)
    first_game_season = plyr_box_stats_df.iloc[len(plyr_box_stats_df) - 1]
    first_game_season_date = first_game_season['GAME_DATE_OBJ']

    # get missed game history before season
    player_mg_before_season = player_mg.loc[player_mg['Date_OBJ'] < first_game_season_date]
    player_mg_before_season = player_mg_before_season.loc[pd.isnull(player_mg_before_season['Acquired'])]

    calculate_injuries_flag = True
    
    # find date of last missed game before season
    if player_mg_before_season.empty:
        calculate_injuries_flag = False
        last_mg_date = np.nan
    else: 
        last_mg = player_mg_before_season.iloc[len(player_mg_before_season) - 1]
        last_mg_date = last_mg['Date_OBJ']
    # what if have no prior injuries...?
    
    # get injured list history before season
    player_il_before_season = player_il.loc[player_il['Date_OBJ'] < first_game_season_date]
    player_il_before_season = player_il_before_season.loc[pd.isnull(player_il_before_season['Acquired'])]

    # find date of last missed game before season
    if player_il_before_season.empty:
        calculate_injuries_flag = False
        last_il_date = np.nan
    else:
        last_il = player_il_before_season.iloc[len(player_il_before_season) - 1]
        last_il_date = last_il['Date_OBJ']
        
    # what if have no prior missed games...?

    # get last injury or missed date before season
    # will use as inital 'anchor date' to calculate days from last injury
    if calculate_injuries_flag:
        last_injury_date = max(last_il_date, last_mg_date)
    else: 
        last_injury_date = np.nan

    # get number of career injuries before season
    # will use as inital count to calculate rolling injuries during season
    num_career_injuries = len(player_mg_before_season) + len(player_il_before_season)

    # get list of teams player was on during season (if > 1 -> was traded mid-season)
    player_teams = plyr_box_stats_df['TEAM_ABBREVIATION'].unique()
    if len(player_teams) != 1:
        calculate_injuries_flag = False

    # injury stats can only support a player on the same team thoughout the season (i.e. not traded mid-season)
    if calculate_injuries_flag:
        plyr_box_stats_df = calc_injury_stats(plyr_box_stats_df, player_teams[0], last_injury_date, num_career_injuries)
    else:
        print('Skipping player ' + plyr_box_stats_df['PLAYER_NAME'][0] + ' injury data who has no prior injuries or traded mid-season (to-do)')

    # save game log stats for player to .csv file
    player_name = plyr_box_stats_df['PLAYER_NAME'][0]
    player_name = player_name.replace(" ", "_")
    if calculate_injuries_flag:
        file_name = 'data/2021_22_NBA_REG_GAME_LOGS_' + player_name + '_' + str(plyr_box_stats_df['PLAYER_ID'][0]) + '.csv'
    else:
        file_name = 'data-no-injury/2021_22_NBA_REG_GAME_LOGS_' + player_name + '_' + str(plyr_box_stats_df['PLAYER_ID'][0]) + '.csv'
    plyr_box_stats_df.to_csv(file_name)
    print('Write to file: ' + file_name)





In [6]:
# use these headers for every request to NBA.com
headers  = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://stats.nba.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [7]:
Season='2021-22'
PerMode='Totals'

# make a list of NBA measure types to loop through for a player
MeasureTypes = [
    'Base',
    'Advanced',
    'Misc',
    'Scoring',
    'Usage'
]

# make a list of NBA seasons to loop through later
season_list = [
    '1996-97',
    '1997-98',
    '1998-99',
    '1999-00',
    '2000-01',
    '2001-02',
    '2002-03',
    '2003-04',
    '2004-05',
    '2005-06',
    '2006-07',
    '2007-08',
    '2008-09',
    '2009-10',
    '2010-11',
    '2011-12',
    '2012-13',
    '2013-14',
    '2014-15',
    '2015-16',
    '2016-17',
    '2017-18',
    '2018-19',
    '2019-20',
    '2020-21',
    '2021-22',
    '2022-23',
]

team_abbrev = ['BOS', 'GSW', 'DET', 'WAS', 'ATL', 'NOP', 'CHI', 'TOR', 'MEM',
       'MIN', 'CHA', 'UTA', 'PHX', 'POR', 'MIL', 'LAC', 'SAS', 'BKN',
       'NYK', 'DEN', 'IND', 'MIA', 'CLE', 'DAL', 'PHI', 'HOU', 'OKC',
       'ORL', 'SAC', 'LAL']

In [8]:
# get general (Measure=Base) player total stats (PerMode=Totals) for 2021-22 season (Season=2021-22) - example URL request
# get PLAYER_ID, PLAYER_NAME, GP, MIN
players_stats_url = 'https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode='+PerMode+'&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season='+Season+'&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight='
players_stats_response = requests.get(url=players_stats_url, headers=headers).json()
players_stats_headers = players_stats_response['resultSets'][0]['headers']
players_stats_data = players_stats_response['resultSets'][0]['rowSet']
players_stats_df = pd.DataFrame(data=players_stats_data, columns=players_stats_headers)

In [9]:
# get subset of players_stats_df containing only PLAYER_ID, PLAYER_NAME, MIN
player_ids_2021_22 = players_stats_df[['PLAYER_ID', 'PLAYER_NAME','GP','MIN']]

# using stat Totals from NBA.com calculate MPG
mpg_list = []
for x in range(0, player_ids_2021_22.shape[0]):
    mpg = player_ids_2021_22.iloc[x]['MIN']/player_ids_2021_22.iloc[x]['GP']
    mpg_list.append(round(mpg,1))

player_ids_2021_22.loc[:,'MPG'] = mpg_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_ids_2021_22.loc[:,'MPG'] = mpg_list


In [11]:
# limit study to only players with 30MPG in the 2021-22 NBA Regular Season
high_mpg_players_2021_22_df = player_ids_2021_22.loc[(player_ids_2021_22['MIN'] >= 500) & (player_ids_2021_22['MPG'] >= 30)]
# save this data to a file for reference
high_mpg_players_2021_22_df.to_csv('players_mpg_2021_22_30mpg.csv', index=False)

In [12]:
# get 2021-22 NBA Season schedule based on nbastatR::seasons_schedule()
schedule_df = pd.read_csv("schedule_2021_22.csv")

# convert dateGame string into datetime64 object, add it to schedule dataframe
sch_game_dates = schedule_df['dateGame'].values.tolist()
sch_game_dates_obj = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in sch_game_dates]
schedule_df.loc[:,'dateGame_OBJ']  = sch_game_dates_obj

# create a dictionary of team to list of games played 
team_dates_dict = {}
for team in team_abbrev:
    team_games = schedule_df.loc[(schedule_df['slugTeamWinner'] == team) | (schedule_df['slugTeamLoser'] == team)]
    team_dates_dict[team]=team_games


In [13]:
# load player dictionary based on data from nbastatR::players_careers()
player_dict = pd.read_csv('df_nba_player_dict.csv')

In [14]:
# load all injured list data since 2010 from Pro Sports Transactions
il_data = pd.read_csv('prosportstransactions_scrape_inactivelist_2010_2023.csv')
# convert Date string into datetime64 object, add it to il dataframe
il_dates = il_data['Date'].values.tolist()
il_dates_obj = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in il_dates]
il_data.loc[:,'Date_OBJ']  = il_dates_obj

# if possible map NBA Player IDs to entires in injured list
il_id = []

num_cannot_find = 0

for index, row in il_data.iterrows():
    namePlayer = row['Acquired']
    if pd.isnull(row['Acquired']):
        namePlayer = row['Relinquished']
    player = player_dict.loc[player_dict['namePlayer'] == namePlayer]
    if (player.empty):
        il_id.append(np.nan)
        # print('cannot find: ' + str(namePlayer))
        num_cannot_find = num_cannot_find + 1
    else:
        il_id.append(int(round(player['idPlayer'].values[0],0)))

print('Injured list cannot match total: ' + str(num_cannot_find))

il_data.loc[:,'PLAYER_ID'] = il_id

Injured list cannot match total: 2066


In [15]:
# load all missed games data since 2010 from Pro Sports Transactions
mg_data = pd.read_csv('prosportstransactions_scrape_missedgames_2010_2023.csv')
# convert Date string into datetime64 object, add it to mg dataframe
mg_dates = mg_data['Date'].values.tolist()
mg_dates_obj = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in mg_dates]
mg_data.loc[:,'Date_OBJ']  = mg_dates_obj

# if possible map NBA Player IDs to entires in missed games list

mg_id = []

num_cannot_find = 0

for index, row in mg_data.iterrows():
    namePlayer = row['Acquired']
    if pd.isnull(row['Acquired']):
        namePlayer = row['Relinquished']
    player = player_dict.loc[player_dict['namePlayer'] == namePlayer]
    if (player.empty):
        mg_id.append(np.nan)
        # print('cannot find: ' + str(namePlayer))
        num_cannot_find = num_cannot_find + 1
    else:
        mg_id.append(round(player['idPlayer'].values[0],0))
        
print('Missed games cannot match total: ' + str(num_cannot_find))

mg_data.loc[:,'PLAYER_ID'] = mg_id

Missed games cannot match total: 1467


In [16]:
# example NBA.com 2021-21 (Season=2021-22) players bio (https://stats.nba.com/stats/leaguedashplayerbiostats):
# use for AGE, PLAYER_HEIGHT_INCHES, PLAYER_WEIGHT
plyrs_bio_url = 'https://stats.nba.com/stats/leaguedashplayerbiostats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&Season='+Season+'&SeasonSegment=&SeasonType=Regular%20Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight='
plyrs_bio_response = requests.get(url=plyrs_bio_url, headers=headers).json()
plyrs_bio_headers = plyrs_bio_response['resultSets'][0]['headers']
plyrs_bio_data = plyrs_bio_response['resultSets'][0]['rowSet']
plyrs_bio_df = pd.DataFrame(data=plyrs_bio_data, columns=plyrs_bio_headers)

In [17]:
# example NBA.com 2021-21 (Season=2021-22) player index (https://stats.nba.com/stats/playergamelogs/playerindex):
# do we need anything...? Just get from bio?
plyrs_idx_url = 'https://stats.nba.com/stats/playerindex?College=&Country=&DraftPick=&DraftRound=&DraftYear=&Height=&Historical=1&LeagueID=00&Season='+Season+'&SeasonType=Regular%20Season&TeamID=0&Weight='
plyrs_idx_response = requests.get(url=plyrs_idx_url, headers=headers).json()
plyrs_idx_headers = plyrs_idx_response['resultSets'][0]['headers']
plyrs_idx_data = plyrs_idx_response['resultSets'][0]['rowSet']
plyrs_idx_df = pd.DataFrame(data=plyrs_idx_data, columns=plyrs_idx_headers)
plyrs_idx_headers

['PERSON_ID',
 'PLAYER_LAST_NAME',
 'PLAYER_FIRST_NAME',
 'PLAYER_SLUG',
 'TEAM_ID',
 'TEAM_SLUG',
 'IS_DEFUNCT',
 'TEAM_CITY',
 'TEAM_NAME',
 'TEAM_ABBREVIATION',
 'JERSEY_NUMBER',
 'POSITION',
 'HEIGHT',
 'WEIGHT',
 'COLLEGE',
 'COUNTRY',
 'DRAFT_YEAR',
 'DRAFT_ROUND',
 'DRAFT_NUMBER',
 'ROSTER_STATUS',
 'PTS',
 'REB',
 'AST',
 'STATS_TIMEFRAME',
 'FROM_YEAR',
 'TO_YEAR']

In [18]:
# selection subset of attributes from each game log measure

plyr_base_headers_subset = ['SEASON_YEAR',
 'PLAYER_ID',
 'PLAYER_NAME',
 'TEAM_ID',
 'TEAM_ABBREVIATION',
 'TEAM_NAME',
 'GAME_ID',
 'GAME_DATE',
 'MATCHUP',
 'WL',
 'MIN',
 'FGM',
 'FGA',
 'FG3M',
 'FG3A',
 'FTM',
 'FTA',
 'OREB',
 'DREB',
 'AST',
 'TOV',
 'STL',
 'BLK',
 'PF',
 'PFD',
 'PTS']

plyr_advanced_headers_subset = ['USG_PCT',
 'PIE',
 'POSS']

plyr_misc_headers_subset= [ 'PTS_PAINT']

plyr_scor_headers_subset = ['PCT_PTS_2PT_MR']

In [122]:
# Test with misc PlayerIDs
# PlayerID = 2544 # leBron James
# PlayerID = 202695 # kawhi leonard - lots of time off - no bio?! out all of 2021-22 season, no bio for season - DON'T USE!
# PlayerID = 201935 # James Harden - had 2021-22 trade BRK -> PHI
# PlayerID = 1629029 # Luka Doncic

# PlayerID = 1630162 # Anthony Edwards - no MG prior season

PlayerID = 1628389 # Bam Adebayo

get_full_game_logs_df(203897, Season, PerMode)

Write to file: data/2021_22_NBA_REG_GAME_LOGS_Zach_LaVine_203897.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_injury_data.loc[:,'CAREER_INJURIES'] = career_injuries
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_injury_data.loc[:,'DAYS_LAST_INJURY'] = days_last_injury


In [24]:
# loop through PLAYER_ID in high_mpg_players_2021_22_df to generate .csv game logs
PlayerIDs = high_mpg_players_2021_22_df['PLAYER_ID'].values.tolist()

In [None]:
for PlayerID in PlayerIDs:
    get_full_game_logs_df(PlayerID, Season, PerMode)

In [111]:
player_il = il_data.loc[il_data['PLAYER_ID'] == 203552]
player_mg = mg_data.loc[mg_data['PLAYER_ID'] == 203552]    

In [112]:
player_il

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,Date_OBJ,PLAYER_ID
6834,11/9/2015,Kings,,Seth Curry,placed on IL with sprained right ankle,2015-11-09,203552.0
6883,11/13/2015,Kings,Seth Curry,,activated from IL,2015-11-13,203552.0
9762,3/27/2017,Mavericks,,Seth Curry,placed on IL with left shoulder injury,2017-03-27,203552.0
9791,3/29/2017,Mavericks,Seth Curry,,activated from IL,2017-03-29,203552.0
9829,4/2/2017,Mavericks,,Seth Curry,placed on IL with sore left shoulder (out for ...,2017-04-02,203552.0
10168,10/18/2017,Mavericks,,Seth Curry,placed on IL with stress reaction in tibia in ...,2017-10-18,203552.0
12739,11/16/2018,Blazers,,Seth Curry,placed on IL with right knee injury,2018-11-16,203552.0
12789,11/23/2018,Blazers,Seth Curry,,activated from IL,2018-11-23,203552.0
14394,11/20/2019,Mavericks,,Seth Curry,placed on IL with illness,2019-11-20,203552.0
14427,11/24/2019,Mavericks,Seth Curry,,activated from IL,2019-11-24,203552.0


In [113]:
player_mg

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,Date_OBJ,PLAYER_ID
6834,11/7/2015,Kings,,Seth Curry,sprained right ankle (DTD),2015-11-07,203552.0
7580,2/29/2016,Kings,,Seth Curry,sprained right foot (DTD),2016-02-29,203552.0
8381,12/1/2016,Mavericks,,Seth Curry,bruised/strained/sprained right knee (DTD),2016-12-01,203552.0
8433,12/9/2016,Mavericks,Seth Curry,,returned to lineup,2016-12-09,203552.0
9232,3/31/2017,Mavericks,,Seth Curry,sore left shoulder (out for season),2017-03-31,203552.0
9455,10/7/2017,Mavericks,,Seth Curry,stress reaction in tibia in left leg (out for ...,2017-10-07,203552.0
10089,2/8/2018,Mavericks,,Seth Curry,surgery on left leg to repair stress fracture ...,2018-02-08,203552.0
10674,11/15/2018,Blazers,,Seth Curry,right knee injury (DTD),2018-11-15,203552.0
11439,4/5/2019,Blazers,,Seth Curry,sore left tibia (DTD),2019-04-05,203552.0
11451,4/7/2019,Blazers,Seth Curry,,returned to lineup,2019-04-07,203552.0


In [34]:
max(np.nan,np.nan)

nan

In [129]:
dir_list = os.listdir("data")

In [140]:
master_df = pd.DataFrame()

for file_name in dir_list:
    this_player_df = pd.read_csv('data/'+file_name)
    master_df = pd.concat([master_df, this_player_df], axis=0)

In [143]:
master_df.sort_values(by=['GAME_DATE_OBJ','TEAM_ABBREVIATION'], ascending=False, inplace = True)
master_df.reset_index(drop=True, inplace=True)

In [146]:
master_df

Unnamed: 0.1,Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,...,PW_POS,PW_PF,PW_PFD,PW_FGM_PAINT,PW_FGM_2PT_MR,PW_PIE_AVG,PW_USG_PCT_AVG,OUT_NEXT_GAME,CAREER_INJURIES,DAYS_LAST_INJURY
0,0,2021-22,203497,Rudy Gobert,1610612762,UTA,Utah Jazz,22101230,2022-04-10T00:00:00,UTA @ POR,...,198,11,21,19.0,0.0,0.169333,0.200000,0,24,16
1,0,2021-22,1628384,O.G. Anunoby,1610612761,TOR,Toronto Raptors,22101226,2022-04-10T00:00:00,TOR @ NYK,...,0,0,0,0.0,0.0,0.000000,0.000000,0,22,9
2,0,2021-22,1627749,Dejounte Murray,1610612759,SAS,San Antonio Spurs,22101219,2022-04-10T00:00:00,SAS @ DAL,...,0,0,0,0.0,0.0,0.000000,0.000000,0,24,11
3,0,2021-22,203084,Harrison Barnes,1610612758,SAC,Sacramento Kings,22101229,2022-04-10T00:00:00,SAC @ PHX,...,188,2,9,10.0,0.0,0.090000,0.165333,0,15,137
4,0,2021-22,202699,Tobias Harris,1610612755,PHI,Philadelphia 76ers,22101228,2022-04-10T00:00:00,PHI vs. DET,...,303,12,8,10.0,2.0,0.060750,0.163750,0,37,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4283,77,2021-22,201566,Russell Westbrook,1610612747,LAL,Los Angeles Lakers,22100002,2021-10-19T00:00:00,LAL vs. GSW,...,0,0,0,0.0,0.0,0.000000,0.000000,0,35,281
4284,72,2021-22,203952,Andrew Wiggins,1610612744,GSW,Golden State Warriors,22100002,2021-10-19T00:00:00,GSW @ LAL,...,0,0,0,0.0,0.0,0.000000,0.000000,0,13,158
4285,75,2021-22,1629673,Jordan Poole,1610612744,GSW,Golden State Warriors,22100002,2021-10-19T00:00:00,GSW @ LAL,...,0,0,0,0.0,0.0,0.000000,0.000000,0,2,598
4286,63,2021-22,201939,Stephen Curry,1610612744,GSW,Golden State Warriors,22100002,2021-10-19T00:00:00,GSW @ LAL,...,0,0,0,0.0,0.0,0.000000,0.000000,0,58,158


In [147]:
master_df.to_csv('data/2021_22_NBA_REG_GAME_LOGS_COMBINED.csv')