In [None]:
# Plan: get the list of players on the floor at the start of each quarter for each team (home 1-5 and away 1-5).
# Then loop through each row of the dataframe and update the list of players at each substitution event (home and away).
#     Also calculate the margin, score, and seconds for each row.

In [None]:
import requests
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
# Let pandas display up to 500 columns, so the wide frames built below are shown in full.
pd.set_option('display.max_columns',500)

In [None]:
# play_by_play_url = 'https://stats.nba.com/stats/playbyplayv2/?gameId=0041700404&&startPeriod=0&endPeriod=14' #
# games_url = 'https://stats.nba.com/stats/leaguegamelog?\
# Counter=1000&DateFrom=&DateTo=&Direction=DESC&LeagueID=00\
# &PlayerOrTeam=T&Season=2021-22&SeasonType=Playoffs&Sorter=DATE' #
# players_game_url = 'https://stats.nba.com/stats/boxscoreadvancedv2/?gameId=0041700404&startPeriod=0&endPeriod=14&startRange=7205&endRange=14395&rangeType=2'

In [None]:
# HTTP headers sent with every request that extract_data() makes to the NBA JSON data API.
# The values imitate a desktop Chrome session on stats.nba.com.
# NOTE(review): presumably the API does not answer clients that omit these headers — confirm before trimming any.
header_data  = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://stats.nba.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [None]:
# code for getting starting quarter lineups from here: https://github.com/rd11490/NBA-Play-By-Play-Example
# endpoints
def play_by_play_url(game_id):
    """Build the ``playbyplayv2`` request URL for one game.

    Args:
        game_id: Game identifier; it is interpolated into the ``gameId``
            query parameter as-is.

    Returns:
        The URL as a string, with ``startPeriod=0`` and ``endPeriod=14``.
    """
    template = "https://stats.nba.com/stats/playbyplayv2/?gameId={0}&startPeriod=0&endPeriod=14"
    return template.format(game_id)

def advanced_boxscore_url(game_id, start, end):
    """Build the ``boxscoreadvancedv2`` request URL for a time range of one game.

    Args:
        game_id: Game identifier, interpolated into ``gameId``.
        start: Value for the ``startRange`` query parameter.
        end: Value for the ``endRange`` query parameter.

    Returns:
        The URL as a string; ``rangeType`` is always 2.
    """
    template = "https://stats.nba.com/stats/boxscoreadvancedv2/?gameId={0}&startPeriod=0&endPeriod=14&startRange={1}&endRange={2}&rangeType=2"
    url = template.format(game_id, start, end)
    return url

def games_url(season,season_type):
    """Build the ``leaguegamelog`` request URL listing team game logs.

    Args:
        season: Value for the ``Season`` query parameter (e.g. ``'2021-22'``).
        season_type: Value for the ``SeasonType`` query parameter. It is
            inserted verbatim, so it must already be URL-safe.

    Returns:
        The URL as a string.
    """
    template = 'https://stats.nba.com/stats/leaguegamelog?Counter=1000&DateFrom=&DateTo=&Direction=DESC&LeagueID=00&PlayerOrTeam=T&Season={}&SeasonType={}&Sorter=DATE'
    return template.format(season, season_type)

def extract_data(url, timeout=30):
    """Fetch ``url`` and return its first result set as a DataFrame.

    The response is expected to be JSON shaped like
    ``{"resultSets": [{"headers": [...], "rowSet": [[...], ...]}, ...]}``;
    only ``resultSets[0]`` is used.

    Args:
        url: Full request URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A DataFrame with one row per ``rowSet`` entry and ``headers`` as the
        column names. An empty ``rowSet`` yields an empty frame that still
        carries the columns.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, headers=header_data, timeout=timeout)
    # Fail on the HTTP status itself rather than on a confusing JSON/KeyError later.
    response.raise_for_status()
    result_set = response.json()['resultSets'][0]
    # Passing the columns to the constructor also works when there are no rows.
    return pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])

def calculate_time_at_period(period):
    """Return the game clock at which ``period`` begins, in tenths of a second.

    Periods up to 5 are offset by 720-second periods; each period beyond 5
    adds a further 300 seconds on top of four 720-second periods.
    """
    period_seconds = 720
    extra_period_seconds = 5 * 60
    if period > 5:
        return (period_seconds * 4 + (period - 5) * extra_period_seconds) * 10
    return (period_seconds * (period - 1)) * 10
    
def split_subs(df, tag):
    """Extract one side of the substitution events as a tidy frame.

    Args:
        df: Frame with ``PERIOD``, ``EVENTNUM`` and a column named ``tag``.
        tag: Name of the player-id column to extract; the callers in this
            notebook pass ``'IN'`` or ``'OUT'``.

    Returns:
        A new frame with columns ``PLAYER_ID``, ``PERIOD``, ``EVENTNUM`` and
        ``SUB``, where ``SUB`` holds ``tag`` on every row. ``df`` is not
        modified.
    """
    # Work on an explicit copy: adding a column to a column-subset slice
    # triggers pandas' SettingWithCopyWarning.
    subs = df[[tag, 'PERIOD', 'EVENTNUM']].copy()
    subs['SUB'] = tag
    subs.columns = ['PLAYER_ID', 'PERIOD', 'EVENTNUM', 'SUB']
    return subs

def get_all_games(season,season_type):
    """List every game of a season once, with its home and away teams.

    The game log is fetched through ``extract_data(games_url(...))``. Only
    rows whose ``MATCHUP`` contains the literal text ``'vs.'`` are kept, and
    the text before / after ``' vs. '`` becomes ``HOME`` / ``AWAY``.

    Args:
        season: Season string passed to ``games_url`` (e.g. ``'2021-22'``).
        season_type: Season type passed to ``games_url``.

    Returns:
        A DataFrame with columns ``GAME_ID``, ``MATCHUP``, ``HOME``, ``AWAY``.
    """
    game_log = extract_data(games_url(season,season_type))
    # regex=False: match the literal "vs." instead of "vs" + any character.
    is_vs_row = game_log['MATCHUP'].str.contains('vs.', regex=False, na=False)
    # Copy the selection so the new columns are not written onto a slice of game_log.
    games = game_log.loc[is_vs_row, ['GAME_ID','MATCHUP']].copy()
    teams = games['MATCHUP'].str.split(' vs. ')
    games['HOME'] = teams.str.get(0)
    games['AWAY'] = teams.str.get(1)
    return games


def _swap_player(lineup, player_out, player_in):
    """Return the comma-joined ``lineup`` with ``player_out`` replaced by ``player_in``.

    Ids are compared as whole comma-separated tokens, so an id that is merely
    a substring of another id is left alone. A missing lineup (anything that
    is not a string, e.g. NaN) is returned unchanged.
    """
    if not isinstance(lineup, str):
        return lineup
    player_out, player_in = str(player_out), str(player_in)
    return ','.join(player_in if player == player_out else player for player in lineup.split(','))


def _starting_lineups(game_id, frame):
    """Return one row per (TEAM_ABBREVIATION, PERIOD) with the comma-joined ids of that period's starters."""
    # EVENTMSGTYPE 8 rows are the substitutions: PLAYER1_ID leaves, PLAYER2_ID enters.
    substitutions = frame.loc[
        frame['EVENTMSGTYPE'] == 8, ['PERIOD', 'EVENTNUM', 'PLAYER1_ID', 'PLAYER2_ID']
    ].rename(columns={'PLAYER1_ID': 'OUT', 'PLAYER2_ID': 'IN'})

    full_subs = pd.concat(
        [split_subs(substitutions, 'OUT'), split_subs(substitutions, 'IN')],
        axis=0,
        ignore_index=True,
    )[['PLAYER_ID', 'PERIOD', 'EVENTNUM', 'SUB']]

    # A player whose first substitution event in a period is an 'IN' did not start that period.
    first_event_of_period = full_subs.loc[full_subs.groupby(by=['PERIOD', 'PLAYER_ID'])['EVENTNUM'].idxmin()]
    first_subbed_in = first_event_of_period.loc[
        first_event_of_period['SUB'] == 'IN', ['PLAYER_ID', 'PERIOD', 'SUB']
    ]

    starters_by_period = []
    # Iterate over every period present in the play-by-play, not only the
    # periods that contain a sub-in; otherwise a period without one would get
    # no starting lineup and silently inherit the previous period's.
    for period in frame['PERIOD'].drop_duplicates().tolist():
        # Query a window just inside the period (offsets are in tenths of a second).
        low = calculate_time_at_period(period) + 5
        high = calculate_time_at_period(period + 1) - 5
        boxscore_players = extract_data(advanced_boxscore_url(game_id, low, high))[
            ['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION']
        ].assign(PERIOD=period)

        subbed_in_this_period = first_subbed_in[first_subbed_in['PERIOD'] == period]
        joined_players = boxscore_players.merge(subbed_in_this_period, on=['PLAYER_ID', 'PERIOD'], how='left')
        # Players in the box score with no matching sub-in row are the starters.
        starters_by_period.append(
            joined_players.loc[
                joined_players['SUB'].isnull(),
                ['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'PERIOD'],
            ]
        )

    starters = pd.concat(starters_by_period)
    return (
        starters.groupby(['TEAM_ABBREVIATION', 'PERIOD'])['PLAYER_ID']
        .apply(lambda ids: ','.join(map(str, ids)))
        .reset_index()
    )


def _track_lineups(frame):
    """Walk the events in row order and return (home, away) lists holding the lineup after each event."""
    home_lineups, away_lineups = [], []
    home_now = away_now = np.nan
    events = zip(
        frame['QUARTER_START'], frame['HOME_SUB'], frame['AWAY_SUB'],
        frame['PLAYER1_ID'], frame['PLAYER2_ID'],
        frame['HOME_PLAYER_IDS'], frame['AWAY_PLAYER_IDS'],
    )
    for position, (period_start, home_sub, away_sub, player_out, player_in, home_start, away_start) in enumerate(events):
        if position == 0 or period_start:
            # Seed from the merged starting lineup on period-start rows; a first
            # row that is not a period start has no known lineup yet.
            home_now = home_start if period_start else np.nan
            away_now = away_start if period_start else np.nan
        else:
            if away_sub:
                away_now = _swap_player(away_now, player_out, player_in)
            if home_sub:
                home_now = _swap_player(home_now, player_out, player_in)
        home_lineups.append(home_now)
        away_lineups.append(away_now)
    return home_lineups, away_lineups


def get_game_data(game_id,games):
    """Build the play-by-play frame of one game, annotated with the players on the floor.

    The play-by-play is downloaded, the starting five of each team is derived
    for every period from ranged box scores, and substitutions are then
    applied row by row. Rows for period starts, substitutions, any non-empty
    ``NEUTRALDESCRIPTION``, jump balls and timeouts are dropped before the
    timing, possession and score columns are computed.

    Args:
        game_id: Game identifier; must appear in ``games['GAME_ID']``.
        games: Frame with ``GAME_ID``, ``HOME`` and ``AWAY`` columns, as
            returned by ``get_all_games``.

    Returns:
        The remaining play-by-play rows (original row numbers kept as index)
        with these columns appended, in this order: ``HOME``, ``AWAY``,
        ``HOME_PLAYER_IDS``, ``AWAY_PLAYER_IDS`` (comma-joined ids),
        ``HOME_SUB``, ``AWAY_SUB``, ``QUARTER_START``, ``PLAYER_CHANGE``,
        ``NEXT_HOME_PLAY``, ``NEXT_AWAY_PLAY``, ``TIMER`` (period clock in
        minutes), ``SECONDS_PASSED``, ``HOME_POSSESSION_FLAG``,
        ``AWAY_POSSESSION_FLAG``, ``AWAY_SCORE``, ``HOME_SCORE``,
        ``HOME_SCORE_DIFFERENCE``, ``AWAY_SCORE_DIFFERENCE``, ``HOME_VS_AWAY``.

    Raises:
        IndexError: If ``game_id`` is not present in ``games``.
    """
    frame = extract_data(play_by_play_url(game_id)).fillna('')
    starting_lineups = _starting_lineups(game_id, frame)

    matchup = games.loc[games['GAME_ID'] == game_id]
    if matchup.empty:
        raise IndexError('game id ' + str(game_id) + ' not found in games')
    frame['HOME'] = matchup['HOME'].values[0]
    frame['AWAY'] = matchup['AWAY'].values[0]

    # Attach each side's starting lineup for the row's period (HOME first so
    # the column order stays HOME_PLAYER_IDS, AWAY_PLAYER_IDS).
    for side in ('HOME', 'AWAY'):
        side_lineups = starting_lineups.rename(
            columns={'TEAM_ABBREVIATION': side, 'PLAYER_ID': side + '_PLAYER_IDS'}
        )
        frame = frame.merge(side_lineups, how='left', on=[side, 'PERIOD'])

    frame['HOME_SUB'] = frame['HOMEDESCRIPTION'].str.contains('SUB')
    frame['AWAY_SUB'] = frame['VISITORDESCRIPTION'].str.contains('SUB')
    frame['QUARTER_START'] = frame['NEUTRALDESCRIPTION'].str.contains('Start')
    frame['PLAYER_CHANGE'] = frame['QUARTER_START'] | frame['AWAY_SUB'] | frame['HOME_SUB']

    home_lineups, away_lineups = _track_lineups(frame)
    # Assign the filled result back; calling ffill(inplace=True) on a selected
    # column is not guaranteed to update the frame.
    frame['HOME_PLAYER_IDS'] = pd.Series(home_lineups, index=frame.index, dtype=object).ffill()
    frame['AWAY_PLAYER_IDS'] = pd.Series(away_lineups, index=frame.index, dtype=object).ffill()

    keep = (
        ~frame['PLAYER_CHANGE']
        & (frame['NEUTRALDESCRIPTION'] == '')
        & ~frame['HOMEDESCRIPTION'].str.contains('Jump Ball')
        & ~frame['VISITORDESCRIPTION'].str.contains('Jump Ball')
        & ~frame['HOMEDESCRIPTION'].str.contains('Timeout')
        & ~frame['VISITORDESCRIPTION'].str.contains('Timeout')
    )
    # Copy so the columns added below are not written onto a slice.
    frame = frame[keep].copy()

    frame['NEXT_HOME_PLAY'] = frame['HOMEDESCRIPTION'].shift(-1)
    frame['NEXT_AWAY_PLAY'] = frame['VISITORDESCRIPTION'].shift(-1)
    clock = frame['PCTIMESTRING'].str.split(':')
    frame['TIMER'] = clock.str.get(0).astype(int) + clock.str.get(1).astype(int) / 60
    previous_timer = frame['TIMER'].shift(1)
    # When the clock did not go down versus the previous kept row (or there is
    # no previous row), measure from the next whole minute above it instead.
    frame['SECONDS_PASSED'] = np.where(
        previous_timer >= frame['TIMER'],
        previous_timer * 60 - frame['TIMER'] * 60,
        (np.ceil(frame['TIMER']) - frame['TIMER']) * 60,
    )
    # Flag a row when its team has a description here but none on the next row.
    frame['HOME_POSSESSION_FLAG'] = np.where((frame['NEXT_HOME_PLAY'] == '') & (frame['HOMEDESCRIPTION'] != ''), 1, 0)
    frame['AWAY_POSSESSION_FLAG'] = np.where((frame['NEXT_AWAY_PLAY'] == '') & (frame['VISITORDESCRIPTION'] != ''), 1, 0)

    # SCORE is split on ' - '; the first part is taken as the away score and
    # the second as the home score. Rows without a score carry the last known
    # one, or 0 before any score appears.
    score_parts = frame['SCORE'].str.split(' - ')
    away_score = score_parts.str.get(0)
    frame['AWAY_SCORE'] = away_score.mask(away_score == '').ffill().fillna(0)
    frame['HOME_SCORE'] = score_parts.str.get(1).ffill().fillna(0)

    home_points = frame['HOME_SCORE'].astype(int)
    away_points = frame['AWAY_SCORE'].astype(int)
    frame['HOME_SCORE_DIFFERENCE'] = (home_points - home_points.shift(1)).fillna(0)
    frame['AWAY_SCORE_DIFFERENCE'] = (away_points - away_points.shift(1)).fillna(0)
    frame['HOME_VS_AWAY'] = frame['HOME_SCORE_DIFFERENCE'] - frame['AWAY_SCORE_DIFFERENCE']
    return frame

In [None]:
# Season to download. Both values are inserted verbatim into the leaguegamelog
# URL by games_url(), which is why the season type is written already URL-encoded ('+').
season = '2021-22'
season_type = 'Regular+Season'
# One row per game with GAME_ID, MATCHUP, HOME and AWAY (performs a network request).
games = get_all_games(season,season_type)
games

In [None]:
# Build the annotated play-by-play for the first game in the list as a spot check
# (get_game_data performs several network requests).
frame = get_game_data(game_id=games['GAME_ID'].values[0],games=games)
frame

In [None]:
# # Run the dataframe all together
# for i,game in enumerate(games['GAME_ID'].values[0:10]):
#     if i == 0:
#         results = get_game_data(game_id=game,games=games)
#     else:
#         results = pd.concat([results,(get_game_data(game_id=game,games=games))],axis=1)
# results

In [None]:
# Append the annotated play-by-play of the first 10 listed games to play_by_play.csv.
# NOTE: rows are appended without a header, so re-running this cell adds the
# same games to the file again.
for i,game in enumerate(games['GAME_ID'].values[0:10]):
    try:
        results = get_game_data(game_id=game,games=games)
    except Exception as exc:
        # Report the failing game and skip it, so the previous game's frame
        # is not written to the file a second time.
        print('Problem with run ' + str(i) + ', game id ' + str(game) + ': ' + repr(exc))
        continue
    results.to_csv('play_by_play.csv', mode='a', header=False)
    if i%10 == 0:
        print('Run ' + str(i) + ' completed. ' + str(len(games['GAME_ID'])))