In [34]:
from itertools import chain
from pathlib import Path
from time import sleep
from datetime import datetime

In [35]:
import requests
from tqdm import tqdm
# Per tqdm's documentation, a monitor_interval of 0 turns off its background
# monitor thread. NOTE(review): presumably set to silence monitor-thread
# warnings in the notebook — confirm it is still needed.
tqdm.monitor_interval = 0

In [36]:
import numpy as np
import pandas as pd
# Raise pandas' display limits so frames with up to 999 rows / 999 columns are
# rendered without truncation.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [37]:
import csv
from collections import Counter, namedtuple
from pathlib import Path

In [38]:
# Output locations, resolved against the directory one level above the
# notebook's current working directory.
PARENT_DIR = Path.cwd().parent
OUTPUTS_DIR = PARENT_DIR.joinpath('output')
IMAGES_DIR = PARENT_DIR.joinpath('images')

# Only the output directory is created here; IMAGES_DIR is just a path.
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)


In [39]:
# Scraped-input and prepared-output directories, both under <project>/data,
# where <project> is the parent of the current working directory.
PROJECT_DIR = Path.cwd().parent
DATA_DIR = PROJECT_DIR.joinpath('data', 'scraped', 'teamgamelogs')
OUTPUT_DIR = PROJECT_DIR.joinpath('data', 'prepared')

# Create both trees up front; exist_ok makes re-running the cell harmless.
DATA_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [154]:
# Fetch the full play-by-play feed for a single game (id 0021900100) from the
# 2019 path of the data.nba.com mobile feed.
url = 'https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2019/scores/pbp/0021900100_full_pbp.json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.42 Safari/537.36',
    'x-nba-stats-origin': 'stats',
}

# requests never times out by default, so a stalled connection would hang the
# kernel; use the same 15 s budget as the scoreboard request further down.
r = requests.get(url, headers=headers, timeout=15)
# Surface HTTP errors here rather than as a confusing JSON decode failure later.
r.raise_for_status()

In [41]:
# Decode the response body as JSON and display the first 'pla' entry of the
# first 'pd' entry. NOTE(review): 'pd' presumably means periods and 'pla'
# plays — confirm against the feed documentation.
# `textjson` is read as a global by the scraping helpers defined below.
textjson = r.json()
textjson['g']['pd'][0]['pla'][0]

{'evt': 2,
 'cl': '12:00',
 'de': 'Start Period',
 'locX': 0,
 'locY': -80,
 'opt1': 0,
 'opt2': 0,
 'mtype': 0,
 'etype': 12,
 'opid': '',
 'tid': 0,
 'pid': 0,
 'hs': 0,
 'vs': 0,
 'epid': '',
 'oftid': 0,
 'ord': 20000}

In [42]:
# Number of entries under 'pd', then number of 'pla' entries in the first one.
print(len(textjson['g']['pd']),len(textjson['g']['pd'][0]['pla']))

5 129


In [43]:
# Record type for one play-by-play entry, keeping only three of its fields:
# 'evt', 'cl' and 'de'. NOTE(review): judging by the sample entry shown above
# these look like an event number, a clock string and a text description —
# confirm.
DtailedinfoNBA = namedtuple('DtailedinfoNBA', ['evt', 'cl', 'de'])

In [44]:
def get_nba_detailed_info(i, j):
    """Build a record from one entry of the module-level ``textjson`` payload.

    Args:
        i: index into ``textjson['g']['pd']``.
        j: index into the ``'pla'`` list of that entry.

    Returns:
        A ``DtailedinfoNBA`` holding the entry's ``evt``, ``cl`` and ``de``
        values.
    """
    # Resolve the nested entry once, then pick out the three kept fields.
    play = textjson['g']['pd'][i]['pla'][j]
    return DtailedinfoNBA(evt=play['evt'], cl=play['cl'], de=play['de'])

In [163]:
def scrap_nba_detailed_info(data=None):
    """Flatten a play-by-play payload into a list of per-entry mappings.

    Args:
        data: decoded play-by-play JSON to flatten. When omitted, the
            module-level ``textjson`` is used, which is what the original
            zero-argument call did.

    Returns:
        A list with one mapping per ``['g']['pd'][i]['pla'][j]`` entry, in
        feed order, each produced by ``DtailedinfoNBA(...)._asdict()`` (keys
        ``evt``, ``cl``, ``de``). Returns ``None`` if the payload does not
        have the expected structure; no partial list is returned.
    """
    payload = textjson if data is None else data
    try:
        return [
            DtailedinfoNBA(evt=play['evt'], cl=play['cl'], de=play['de'])._asdict()
            for period in payload['g']['pd']
            for play in period['pla']
        ]
    except (KeyError, IndexError, TypeError):
        # Only structural lookup failures mean "payload unusable". A bare
        # `except:` here would also swallow KeyboardInterrupt and real bugs.
        return None

In [161]:
def _flatten_pbp(payload):
    """Return one ``DtailedinfoNBA._asdict()`` mapping per entry, or ``None`` if malformed."""
    try:
        return [
            DtailedinfoNBA(evt=play['evt'], cl=play['cl'], de=play['de'])._asdict()
            for period in payload['g']['pd']
            for play in period['pla']
        ]
    except (KeyError, IndexError, TypeError):
        return None


def scrap_multi_web(game_numbers=range(0, 10), timeout=15):
    """Download and flatten the play-by-play feed for several games.

    Each payload is flattened directly from the response that was just
    downloaded. The previous version stored the response in a *local*
    ``textjson`` and then called ``scrap_nba_detailed_info()``, which reads
    the *module-level* ``textjson``, so every game ended up with the same
    stale data.

    Args:
        game_numbers: iterable of non-negative game numbers. Each is
            zero-padded to four digits and appended to the ``002190`` prefix
            to form the game id. Defaults to 0..9, as before.
        timeout: seconds passed to ``requests.get`` so one stalled
            connection cannot hang the whole crawl.

    Returns:
        A list aligned with ``game_numbers``. Each item is the list of entry
        mappings for that game, or ``None`` when the response body was not
        JSON or did not have the expected ``['g']['pd'][i]['pla'][j]`` shape.

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeouts.
    """
    url_template = 'https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2019/scores/pbp/002190{}_full_pbp.json'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.42 Safari/537.36',
        'x-nba-stats-origin': 'stats',
    }

    scrap_data_list = []
    for number in game_numbers:
        # Same result as the old if/elif padding chain: '0000', '0001', ...
        gameid = str(number).zfill(4)
        r = requests.get(url_template.format(gameid), headers=headers, timeout=timeout)
        try:
            payload = r.json()
        except ValueError:
            # Non-JSON body: record a placeholder so positions stay aligned
            # with game_numbers instead of aborting the crawl.
            scrap_data_list.append(None)
            continue
        scrap_data_list.append(_flatten_pbp(payload))

    return scrap_data_list

In [164]:
# Scrape games 0..9 at cell level. `textjson` is rebound as a *global* on each
# pass, which is what scrap_nba_detailed_info() reads.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.42 Safari/537.36',
    'x-nba-stats-origin': 'stats',
}
scrap_data_list = []

for i in range(10):
    # Four-character, zero-padded game suffix: '0000' ... '0009'.
    gameid = str(i).zfill(4)
    url = 'https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2019/scores/pbp/002190{}_full_pbp.json'.format(gameid)
    r = requests.get(url, headers=headers)
    textjson = r.json()
    scrap_data = scrap_nba_detailed_info()
    scrap_data_list.append(scrap_data)

In [174]:
# Inspect the fifth collected entry (index 4). If scrap_data_list comes from
# the loop cell above, that is the result for gameid '0004'.
scrap_data_list[4]

[OrderedDict([('evt', 2), ('cl', '12:00'), ('de', 'Start Period')]),
 OrderedDict([('evt', 4),
              ('cl', '11:58'),
              ('de',
               'Jump Ball Drummond vs Turner (Mark Morris gains possession)')]),
 OrderedDict([('evt', 7),
              ('cl', '11:44'),
              ('de', '[DET] Brown Driving Layup Shot: Missed')]),
 OrderedDict([('evt', 8),
              ('cl', '11:41'),
              ('de', '[IND] Lamb Rebound (Off:0 Def:1)')]),
 OrderedDict([('evt', 9),
              ('cl', '11:36'),
              ('de', '[IND] Lamb 3pt Shot: Missed')]),
 OrderedDict([('evt', 10),
              ('cl', '11:33'),
              ('de', '[DET] Jackson Rebound (Off:0 Def:1)')]),
 OrderedDict([('evt', 11),
              ('cl', '11:19'),
              ('de', '[DET] Mark Morris Turnaround Jump Shot: Missed')]),
 OrderedDict([('evt', 12),
              ('cl', '11:17'),
              ('de', '[IND] Sabonis Rebound (Off:0 Def:1)')]),
 OrderedDict([('evt', 13),
              ('cl'

In [146]:
# Rebuild scrap_data_list through the function and show how many items it holds.
# NOTE(review): scrap_multi_web() as defined in this notebook appends one item
# per game over range(0, 10), so the count recorded below looks like it came
# from an earlier version of the function — re-run to confirm.
scrap_data_list = scrap_multi_web()
len(scrap_data_list)

5490

In [149]:
# Display the collected data.
# NOTE(review): this renders every collected item; a slice such as
# scrap_data_list[:10] would keep the saved notebook output readable.
scrap_data_list

[OrderedDict([('evt', 2), ('cl', '12:00'), ('de', 'Start Period')]),
 OrderedDict([('evt', 4),
              ('cl', '11:56'),
              ('de', '[ATL] Dedmon Violation:Jump Ball (K Mauer)')]),
 OrderedDict([('evt', 5),
              ('cl', '11:48'),
              ('de', '[NYK] Harkless Driving Layup Shot: Missed')]),
 OrderedDict([('evt', 6),
              ('cl', '11:45'),
              ('de', '[ATL] Dedmon Rebound (Off:0 Def:1)')]),
 OrderedDict([('evt', 7),
              ('cl', '11:39'),
              ('de', '[ATL] Dedmon 3pt Shot: Missed')]),
 OrderedDict([('evt', 8), ('cl', '11:36'), ('de', '[ATL] Team Rebound')]),
 OrderedDict([('evt', 9), ('cl', '11:36'), ('de', 'Stoppage: Out-of-Bounds')]),
 OrderedDict([('evt', 10),
              ('cl', '11:30'),
              ('de', '[ATL] Young Driving Floating Bank Jump Shot: Missed')]),
 OrderedDict([('evt', 11),
              ('cl', '11:26'),
              ('de', '[NYK] Harkless Rebound (Off:0 Def:1)')]),
 OrderedDict([('evt', 12),
    

In [114]:
# Sanity check of the zero-padding: print every number below 1500 as a
# four-character game suffix ('0000', '0001', ..., '1499').
for i in range(1500):
    gameid = str(i).zfill(4)
    print(gameid)
    

0000
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199


In [86]:
# Flatten whatever payload is currently bound to the global `textjson`
# (scrap_nba_detailed_info reads it implicitly; it returns None on failure).
scraped_data = scrap_nba_detailed_info()



# NOTE(review): when this runs as a single cell, these two bare expressions are
# evaluated but not displayed (only a cell's last expression is shown), and
# len() raises TypeError if scraped_data is None.
len(scraped_data)
type(scraped_data)

# Wrap the events in a dict under a hard-coded key.
# NOTE(review): '20110304' looks like a YYYYMMDD date, while the play-by-play
# URLs in this notebook point at the 2019 feed — confirm the intended key.
scraped_data_1 = {'20110304':scraped_data}
scraped_data_1['20110304']
# Prints the key followed by the entire event list on one line.
for i,j in scraped_data_1.items():
    print(i, j)

20110304 [OrderedDict([('evt', 2), ('cl', '12:00'), ('de', 'Start Period')]), OrderedDict([('evt', 4), ('cl', '11:56'), ('de', '[ATL] Dedmon Violation:Jump Ball (K Mauer)')]), OrderedDict([('evt', 5), ('cl', '11:48'), ('de', '[NYK] Harkless Driving Layup Shot: Missed')]), OrderedDict([('evt', 6), ('cl', '11:45'), ('de', '[ATL] Dedmon Rebound (Off:0 Def:1)')]), OrderedDict([('evt', 7), ('cl', '11:39'), ('de', '[ATL] Dedmon 3pt Shot: Missed')]), OrderedDict([('evt', 8), ('cl', '11:36'), ('de', '[ATL] Team Rebound')]), OrderedDict([('evt', 9), ('cl', '11:36'), ('de', 'Stoppage: Out-of-Bounds')]), OrderedDict([('evt', 10), ('cl', '11:30'), ('de', '[ATL] Young Driving Floating Bank Jump Shot: Missed')]), OrderedDict([('evt', 11), ('cl', '11:26'), ('de', '[NYK] Harkless Rebound (Off:0 Def:1)')]), OrderedDict([('evt', 12), ('cl', '11:21'), ('de', '[NYK] Barrett 3pt Shot: Missed')]), OrderedDict([('evt', 13), ('cl', '11:19'), ('de', '[ATL] Dedmon Rebound (Off:0 Def:2)')]), OrderedDict([('evt',

In [77]:
# Tabulate the event mappings: one row per event, one column per key
# ('evt', 'cl', 'de' in the output below).
df = pd.DataFrame(scraped_data)
df

Unnamed: 0,evt,cl,de
0,2,12:00,Start Period
1,4,11:56,[ATL] Dedmon Violation:Jump Ball (K Mauer)
2,5,11:48,[NYK] Harkless Driving Layup Shot: Missed
3,6,11:45,[ATL] Dedmon Rebound (Off:0 Def:1)
4,7,11:39,[ATL] Dedmon 3pt Shot: Missed
5,8,11:36,[ATL] Team Rebound
6,9,11:36,Stoppage: Out-of-Bounds
7,10,11:30,[ATL] Young Driving Floating Bank Jump Shot: M...
8,11,11:26,[NYK] Harkless Rebound (Off:0 Def:1)
9,12,11:21,[NYK] Barrett 3pt Shot: Missed


In [87]:
# CSVFILE = OUTPUTS_DIR.joinpath('nba_detailed_info.csv')
# df.to_csv(CSVFILE, index=False, quoting=csv.QUOTE_ALL)

In [160]:
# Show the game suffix left in the global `gameid` by whichever loop cell ran
# last (relies on leftover kernel state).
print(gameid)

0009


In [None]:
# Box score summary endpoint, kept for reference (not requested in this notebook):
# https://stats.nba.com/stats/boxscoresummaryv2?GameID=0021001227

In [110]:
# To get the date, team names and game id:
# query the scoreboardV2 endpoint for one day (gameDate=04/13/2011, URL-encoded
# in the query string below).

url_2 = 'https://stats.nba.com/stats/scoreboardV2?DayOffset=0&LeagueID=00&gameDate=04%2F13%2F2011'
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.42 Safari/537.36',
    'x-nba-stats-origin': 'stats',
}

# timeout=15 makes requests give up after 15 seconds without a server response
# instead of blocking the kernel indefinitely.
r2 = requests.get(url_2, headers=headers, timeout=15)

In [107]:
# Display the decoded scoreboard payload (a 'resultSets' list whose items carry
# 'headers' and 'rowSet', per the output below).
r2.json()

{'resource': 'scoreboardV2',
 'parameters': {'GameDate': '04/13/2011', 'LeagueID': '00', 'DayOffset': '0'},
 'resultSets': [{'name': 'GameHeader',
   'headers': ['GAME_DATE_EST',
    'GAME_SEQUENCE',
    'GAME_ID',
    'GAME_STATUS_ID',
    'GAME_STATUS_TEXT',
    'GAMECODE',
    'HOME_TEAM_ID',
    'VISITOR_TEAM_ID',
    'SEASON',
    'LIVE_PERIOD',
    'LIVE_PC_TIME',
    'NATL_TV_BROADCASTER_ABBREVIATION',
    'HOME_TV_BROADCASTER_ABBREVIATION',
    'AWAY_TV_BROADCASTER_ABBREVIATION',
    'LIVE_PERIOD_TIME_BCAST',
    'ARENA_NAME',
    'WH_STATUS'],
   'rowSet': [['2011-04-13T00:00:00',
     1,
     '0021001216',
     3,
     'Final',
     '20110413/NYKBOS',
     1610612738,
     1610612752,
     '2010',
     4,
     ' ',
     None,
     'NBCSB',
     'MSG',
     'Q4   - ',
     'TD Garden',
     1],
    ['2011-04-13T00:00:00',
     2,
     '0021001217',
     3,
     'Final',
     '20110413/ATLCHA',
     1610612766,
     1610612737,
     '2010',
     4,
     ' ',
     None,
     Non