In [1]:
# Importing necessary packages
import pandas as pd
import requests
pd.set_option('display.max_columns', None)
import time
import numpy as np

In [2]:
test_url = 'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals&Scope=S&Season=2022-23&SeasonType=Regular%20Season&StatCategory=PTS'

In [3]:
r = requests.get(url=test_url).json()
# JSON File of the dataset from NBA.com
r

In [5]:
# Column Headers
col_headers = r['resultSet']['headers']

In [6]:
# Displays the first player in the dataset
r['resultSet']['rowSet'][0]

# Displays the players in a dataframe
pd.DataFrame(r['resultSet']['rowSet'], columns = col_headers)

# Practicing prepping the Data by adding more identifying info
temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns = col_headers)
temp_df2 = pd.DataFrame({'Year':['2022-23' for i in range(len(temp_df1))],
                        'Season_Type':['Regular%20Season' for i in range(len(temp_df1))]})

# Adds exisiting data to new data frame with Year and Season Type
temp_df3 = pd.concat([temp_df2, temp_df1], axis = 1)
temp_df3

[1628369,
 1,
 'Jayson Tatum',
 1610612738,
 'BOS',
 74,
 2732,
 727,
 1559,
 0.466,
 240,
 686,
 0.35,
 531,
 622,
 0.854,
 78,
 571,
 649,
 342,
 78,
 51,
 213,
 160,
 2225,
 2209,
 1.61,
 0.37]

In [9]:
del temp_df1, temp_df2, temp_df3

In [11]:
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Host':'stats.nba.com',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
    'Sec-Ch-Ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
}

In [14]:
# Creating a for loop that allows you to scrape data from multiple seasons
df_cols = ['Year', 'Season_Type'] + col_headers

years = ["2018-19","2019-20","2020-21","2021-22","2022-23"]
season_types = ["Regular%20Season", "Playoffs"]


df = pd.DataFrame(columns = df_cols)

begin_loop = time.time()

for y in years:
    for s in season_types:
        # api_url is a dynamic version of the url we utilized in the beginning 
        api_url = "https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals&Scope=S&Season="+y+"&SeasonType="+s+"&StatCategory=PTS"
        r = requests.get(url=api_url, headers = headers).json()
        temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns = col_headers)
        temp_df2 = pd.DataFrame({'Year':[y for i in range(len(temp_df1))], 'Season_Type':[s for i in range(len(temp_df1))]})
        temp_df3 = pd.concat([temp_df2, temp_df1], axis = 1) 
        df = pd.concat([df, temp_df3], axis = 0)
        print(f'Finished scraping data for the {y} {s}.')
        lag = np.random.uniform(low = 20, high = 40)
        print(f'...waiting {round(lag,1)} seconds')
        time.sleep(lag)
        
print(f'Process completed! Total run time: {round((time.time() - begin_loop)/60,2)}')
df.to_excel('nba_player_data.xlsx', index = False)

Finished scraping data for the 2018-19 Regular%20Season.
...waiting 23.5 seconds
Finished scraping data for the 2018-19 Playoffs.
...waiting 39.8 seconds
Finished scraping data for the 2019-20 Regular%20Season.
...waiting 32.0 seconds
Finished scraping data for the 2019-20 Playoffs.
...waiting 29.2 seconds
Finished scraping data for the 2020-21 Regular%20Season.
...waiting 38.4 seconds
Finished scraping data for the 2020-21 Playoffs.
...waiting 30.8 seconds
Finished scraping data for the 2021-22 Regular%20Season.
...waiting 29.5 seconds
Finished scraping data for the 2021-22 Playoffs.
...waiting 21.8 seconds
Finished scraping data for the 2022-23 Regular%20Season.
...waiting 24.9 seconds
Finished scraping data for the 2022-23 Playoffs.
...waiting 39.8 seconds
Process completed! Total run time: 5.25
