In [10]:
import requests
import pandas as pd
import tqdm
import bs4
import json
import matplotlib.pyplot as plt
from adjustText import adjust_text
from pathlib import Path
import time

In [11]:
from nba_api.stats.endpoints.playbyplayv2 import PlayByPlayV2

In [12]:
from nba_api.stats.endpoints.playbyplay import PlayByPlay
from nba_api.stats.endpoints.leaguegamefinder import LeagueGameFinder
from nba_api.stats.static.players import find_players_by_full_name
from nba_api.stats.static.teams import find_teams_by_full_name
from nba_api.stats.endpoints.playergamelogs import PlayerGameLogs
from nba_api.stats.endpoints.boxscoretraditionalv2 import BoxScoreTraditionalV2


In [13]:
path = Path('pbp_data/2022-23.csv')
data_frame = None
if path.exists():
    data_frame = pd.read_csv(str(path))
    data_frame.loc[:, 'GAME_ID'] = [f"00{item}" for item in data_frame.GAME_ID.astype(str)]

## Downloading cur Season

In [14]:
season = '2022-23'

print(f"Processing {season}")
pbp_dfs = []
game_finder = LeagueGameFinder(season_nullable=season, league_id_nullable='00', season_type_nullable='Regular Season')
df = game_finder.get_data_frames()[0]
game_ids = df.GAME_ID.unique()
for game_id in tqdm.tqdm(game_ids):
    if data_frame is not None and game_id in data_frame.GAME_ID.tolist():
        continue
    while True:
        try:
            pbp_df = PlayByPlayV2(game_id=game_id).get_data_frames()[0]
            break
        except Exception as e:
            print(f"Error occured: {e}, Game ID: {game_id}, retrying")
            time.sleep(1)
    pbp_dfs.append(pbp_df)
    time.sleep(0.5)
full_df = pd.concat([data_frame] + pbp_dfs)

full_df = pd.concat([data_frame] + pbp_dfs)

full_df.loc[:, "PCTIMESECONDS"] = [720 - (int(item.split(":")[0]) * 60 + int(item.split(":")[1])) for item in full_df.PCTIMESTRING.tolist()]

full_df = full_df.fillna({
    "HOMEDESCRIPTION": "",
    "NEUTRALDESCRIPTION": "",
    "VISITORDESCRIPTION": ""
})

Processing 2022-23


 10%|▉         | 118/1230 [03:49<20:55,  1.13s/it] 

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022201108, retrying


 13%|█▎        | 154/1230 [05:36<31:42,  1.77s/it]  

Error occured: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), Game ID: 0022201073, retrying


 13%|█▎        | 156/1230 [06:01<1:51:32,  6.23s/it]

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022201072, retrying


 16%|█▌        | 191/1230 [07:53<31:33,  1.82s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30), Game ID: 0022201038, retrying


 18%|█▊        | 218/1230 [09:12<25:25,  1.51s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022201009, retrying


 20%|██        | 248/1230 [10:38<28:19,  1.73s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022200983, retrying


 29%|██▊       | 351/1230 [14:23<23:28,  1.60s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30), Game ID: 0022200877, retrying


 33%|███▎      | 405/1230 [16:57<20:22,  1.48s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30), Game ID: 0022200829, retrying


 46%|████▌     | 565/1230 [21:45<24:02,  2.17s/it]  

Error occured: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), Game ID: 0022200668, retrying


 62%|██████▏   | 759/1230 [28:09<22:28,  2.86s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022200470, retrying


 66%|██████▌   | 807/1230 [30:08<41:49,  5.93s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022200419, retrying


 69%|██████▊   | 844/1230 [31:49<33:50,  5.26s/it]  

Error occured: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), Game ID: 0022200383, retrying


 72%|███████▏  | 884/1230 [33:02<06:51,  1.19s/it]  

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30), Game ID: 0022200350, retrying


 81%|████████  | 995/1230 [38:14<30:40,  7.83s/it]  

Error occured: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), Game ID: 0022200238, retrying


 87%|████████▋ | 1076/1230 [41:40<04:44,  1.85s/it]

Error occured: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), Game ID: 0022200157, retrying


 94%|█████████▍| 1156/1230 [44:31<01:54,  1.55s/it]

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30), Game ID: 0022200071, retrying


 99%|█████████▉| 1223/1230 [47:41<00:09,  1.30s/it]

Error occured: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out., Game ID: 0022200006, retrying


100%|██████████| 1230/1230 [48:26<00:00,  2.36s/it]


In [15]:
full_df.to_csv(f'pbp_data/{season}.csv', index=False)

In [16]:
full_df = pd.concat([data_frame] + pbp_dfs)

In [17]:
full_df.loc[:, "PCTIMESECONDS"] = [720 - (int(item.split(":")[0]) * 60 + int(item.split(":")[1])) for item in full_df.PCTIMESTRING.tolist()]

full_df = full_df.fillna({
    "HOMEDESCRIPTION": "",
    "NEUTRALDESCRIPTION": "",
    "VISITORDESCRIPTION": ""
})

In [18]:
full_df.to_csv(f'pbp_data/{season}.csv', index=False)

In [19]:
full_df = pd.read_csv(f'pbp_data/{season}.csv')

full_df = full_df.fillna({
    "HOMEDESCRIPTION": "",
    "NEUTRALDESCRIPTION": "",
    "VISITORDESCRIPTION": ""
})
full_df.loc[:, 'GAME_ID'] = [f"00{item}" for item in full_df.GAME_ID.astype(str)]

In [20]:
len(full_df.GAME_ID.unique())

1230

In [21]:
full_df

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,PLAYER2_TEAM_ABBREVIATION,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,VIDEO_AVAILABLE_FLAG,PCTIMESECONDS
0,0022201221,2,12,0,1,1:11 PM,12:00,,Start of 1st Period (1:11 PM EST),,...,,0,0,,,,,,0,0
1,0022201221,4,10,0,1,1:11 PM,12:00,Jump Ball Poeltl vs. Leonard: Tip to Crowder,,,...,MIL,5,203109,Jae Crowder,1.610613e+09,Milwaukee,Bucks,MIL,1,0
2,0022201221,7,1,73,1,1:12 PM,11:42,,,Beauchamp 1' Driving Reverse Layup (2 PTS),...,,0,0,,,,,,1,18
3,0022201221,8,1,75,1,1:12 PM,11:16,Achiuwa 6' Driving Finger Roll Layup (2 PTS),,,...,,0,0,,,,,,1,44
4,0022201221,9,5,12,1,1:12 PM,11:14,,,Portis Inbound Turnover (P1.T1),...,,1,0,,,,,,0,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574413,0022200001,637,5,2,4,10:07 PM,0:32,Brown Lost Ball Turnover (P4.T11),,Harris STEAL (3 STL),...,PHI,0,0,,,,,,1,688
574414,0022200001,639,1,50,4,10:07 PM,0:29,,,Harris 1' Running Dunk (18 PTS),...,,0,0,,,,,,1,691
574415,0022200001,640,2,1,4,10:07 PM,0:06,MISS Horford 25' 3PT Jump Shot,,,...,,0,0,,,,,,1,714
574416,0022200001,641,4,0,4,10:07 PM,0:04,,,Harden REBOUND (Off:0 Def:8),...,,0,0,,,,,,1,716


## Finding when players are On/Off

In [13]:
full_df = pd.read_csv(f'pbp_data/{season}.csv')

full_df = full_df.fillna({
    "HOMEDESCRIPTION": "",
    "NEUTRALDESCRIPTION": "",
    "VISITORDESCRIPTION": ""
})
full_df.loc[:, 'GAME_ID'] = [f"00{item}" for item in full_df.GAME_ID.astype(str)]

In [14]:
def find_starters_in_period(found_players, period_pbp):
    #period_start = single_game.loc[single_game.PERIOD == 2] -> period_pbp
    num_subs = len(found_players) - 5
    sub_count = 0
    subbed_in = []
    for i in range(len(period_pbp)):
        item = period_pbp.iloc[i]
        if item.EVENTMSGTYPE == 8:
            player_in = item.PLAYER2_ID
            subbed_in.append(player_in)
            sub_count += 1
        if sub_count == num_subs:
            break

    return found_players.loc[~found_players.PLAYER_ID.isin(subbed_in)].PLAYER_ID.tolist()


def do_work(team_1_players, team_2_players, team_1_id, team_2_id, single_game, item):
    #item = single_game.iloc[index]
    if (item.EVENTMSGTYPE == 12):
        period = item.PERIOD
        if period > 4:
            start_period = ((4 * 12) + (period-5) * 5) * 600 + 0
            end_period = ((4 * 12) + (period-5) * 5) * 600 + 480
        else:
            start_period = ((period-1) * 12 ) * 600 + 15
            end_period = ((period-1) * 12 ) * 600 + 480

        # Fetch players at the start of quarter
        box_score = BoxScoreTraditionalV2(game_id=item.GAME_ID, start_range=str(start_period), end_range=str(end_period), range_type='2')
        players_on = box_score.get_data_frames()[0]
        players_on.loc[:, "SECONDS"] = [(int(item.split(":")[0]) * 60 + int(item.split(":")[1])) for item in players_on.MIN.tolist()]

        # Split by team
        team_1_on = players_on.loc[players_on.TEAM_ID == team_1_id]
        team_2_on = players_on.loc[players_on.TEAM_ID == team_2_id]
        period_pbp = single_game.loc[single_game.PERIOD == item.PERIOD]

        # Check if more than 5 players are found
        if len(team_1_on) > 5:
            team_1_players.append(find_starters_in_period(team_1_on, period_pbp))
        else:
            team_1_players.append(team_1_on.PLAYER_ID.tolist())
        if len(team_1_on) > 5:
            team_2_players.append(find_starters_in_period(team_2_on, period_pbp))
        else:
            team_2_players.append(team_2_on.PLAYER_ID.tolist())
        time.sleep(0.25)
        return
    cur_team_1 = team_1_players[-1][:]
    cur_team_2 = team_2_players[-1][:]
    if (item.EVENTMSGTYPE == 8):
        player_in = item.PLAYER2_ID
        player_out = item.PLAYER1_ID
        is_team_1 = item.PLAYER1_TEAM_ID == team_1_id
        if is_team_1:
            if player_out in cur_team_1:
                cur_team_1[cur_team_1.index(player_out)] = player_in
        else:
            if player_out in cur_team_2:
                cur_team_2[cur_team_2.index(player_out)] = player_in
    team_1_players.append(cur_team_1)
    team_2_players.append(cur_team_2)


In [15]:
cur_pbp_players = pd.read_csv(f'pbp_data/{season}.csv')
cur_pbp_players.loc[:, 'GAME_ID'] = [f"00{item}" for item in cur_pbp_players.GAME_ID.astype(str)]

In [16]:
game_finder = LeagueGameFinder(season_nullable='2023-24', league_id_nullable='00', season_type_nullable='Regular Season')
game_finder_df = game_finder.get_data_frames()[0]
modified_dfs = []
#start = time.time()
count = 0
for index, game_id in enumerate(full_df.GAME_ID.unique()):
    print(f"{index}/{len(full_df.GAME_ID.unique())}: {game_id}")
    #if game_id in cur_pbp_players.GAME_ID.tolist():
    #        continue
    #if game_id in formatted_df.GAME_ID.unique():
    #continue
    single_game = full_df.loc[full_df.GAME_ID == game_id]
    team_1_players = []

    team_2_players = []

    teams_playing = game_finder_df.loc[game_finder_df.GAME_ID == single_game.GAME_ID.iloc[0]]

    team_1_id = teams_playing.iloc[0].TEAM_ID
    team_2_id = teams_playing.iloc[1].TEAM_ID

    #for index in range(len(single_game)):
    """
    for index in range(len(single_game)):
        item = single_game.iloc[index]
        do_work(team_1_players=team_1_players, team_2_players=team_2_players, team_1_id=team_1_id, team_2_id=team_2_id, single_game=single_game, item=item)
    """
    
    #"""
    for index, item in single_game.iterrows():
        do_work(team_1_players=team_1_players, team_2_players=team_2_players, team_1_id=team_1_id, team_2_id=team_2_id, single_game=single_game, item=item)
    #"""
    
    #single_game.apply(lambda item: do_work(team_1_players=team_1_players, team_2_players=team_2_players, team_1_id=team_1_id, team_2_id=team_2_id, single_game=single_game, item=item), axis=1)
    
    for i in range(len(team_1_players[0])):
        #team_1_dict[f'TEAM_1_PLAYER_{i+1}'] = [item[i] for item in team_1_players]
        single_game.loc[:, f'TEAM_1_PLAYER_{i+1}'] = [item[i] for item in team_1_players]
        
    for i in range(len(team_1_players[0])):
        #team_2_dict[f'TEAM_2_PLAYER_{i+1}'] = [item[i] for item in team_2_players]
        single_game.loc[:, f'TEAM_2_PLAYER_{i+1}'] = [item[i] for item in team_2_players]
        
    modified_dfs.append(single_game)
    count += 1
#end = time.time()
#print(f"End-Start: {end-start}")

0/306: 0022301208


ValueError: invalid literal for int() with base 10: '0.000000'