## Scraping bio data (height, weight, nationality) from NBA Stats  

In [11]:
import pandas as pd
import requests
pd.set_option('display.max_columns', None) # so we can see all columns in a wide DataFrame
import time
import numpy as np

In [12]:
# Endpoint for the 2023-24 regular-season player bio stats. The query string is
# split across adjacent literals purely for readability; Python joins them into
# one string.
raw_api_url = (
    'https://stats.nba.com/stats/leaguedashplayerbiostats'
    '?College=&Conference=&Country=&DateFrom=&DateTo=&Division='
    '&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&ISTRound='
    '&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0'
    '&Outcome=&PORound=0&PerMode=Totals&Period=0&PlayerExperience='
    '&PlayerPosition=&Season=2023-24&SeasonSegment='
    '&SeasonType=Regular Season&ShotClockRange=&StarterBench='
    '&TeamID=0&VsConference=&VsDivision=&Weight='
)

In [13]:
# Request headers sent with every stats.nba.com call in this notebook.
headers = {
    # Content negotiation
    # NOTE(review): 'br' and 'zstd' are advertised here; confirm the HTTP client
    # in this environment can decode them if the server ever answers with either.
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",

    # Connection / origin information
    "Connection": "keep-alive",
    "Host": "stats.nba.com",
    "Origin": "https://www.nba.com",
    "Referer": "https://www.nba.com/",

    # Fetch metadata
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-site",

    # Client identification
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
}

In [14]:
# Fetch the bio-stats payload and decode the JSON body into a Python dict.
# The timeout keeps the cell from blocking indefinitely when the server never
# answers, and raise_for_status() surfaces an HTTP error before JSON decoding
# is attempted on an error page.
response = requests.get(url=raw_api_url, headers=headers, timeout=30)
response.raise_for_status()
r = response.json()
r

{'resource': 'leaguedashplayerbiostats',
 'parameters': {'PerMode': 'Totals',
  'LeagueID': '00',
  'Season': '2023-24',
  'SeasonType': 'Regular Season',
  'PORound': 0,
  'Outcome': None,
  'Location': None,
  'Month': 0,
  'SeasonSegment': None,
  'DateFrom': None,
  'DateTo': None,
  'OpponentTeamID': 0,
  'VsConference': None,
  'VsDivision': None,
  'TeamID': 0,
  'Conference': None,
  'Division': None,
  'GameSegment': None,
  'Period': 0,
  'ShotClockRange': None,
  'LastNGames': 0,
  'GameScope': None,
  'PlayerExperience': None,
  'PlayerPosition': None,
  'StarterBench': None,
  'DraftYear': None,
  'DraftPick': None,
  'College': None,
  'Country': None,
  'Height': None,
  'Weight': None,
  'ISTRound': None},
 'resultSets': [{'name': 'LeagueDashPlayerBioStats',
   'headers': ['PLAYER_ID',
    'PLAYER_NAME',
    'TEAM_ID',
    'TEAM_ABBREVIATION',
    'AGE',
    'PLAYER_HEIGHT',
    'PLAYER_HEIGHT_INCHES',
    'PLAYER_WEIGHT',
    'COLLEGE',
    'COUNTRY',
    'DRAFT_YEAR',

In [15]:
# The column names are stored under the 'headers' key of the first result set.
df_cols_short = r["resultSets"][0]["headers"]
df_cols_short

['PLAYER_ID',
 'PLAYER_NAME',
 'TEAM_ID',
 'TEAM_ABBREVIATION',
 'AGE',
 'PLAYER_HEIGHT',
 'PLAYER_HEIGHT_INCHES',
 'PLAYER_WEIGHT',
 'COLLEGE',
 'COUNTRY',
 'DRAFT_YEAR',
 'DRAFT_ROUND',
 'DRAFT_NUMBER',
 'GP',
 'PTS',
 'REB',
 'AST',
 'NET_RATING',
 'OREB_PCT',
 'DREB_PCT',
 'USG_PCT',
 'TS_PCT',
 'AST_PCT']

In [16]:
# The data rows (one list per row) are stored under the 'rowSet' key of the
# first result set. `col` still holds every row; only the first few are shown
# so the cell output stays readable.
col = r['resultSets'][0]['rowSet']
col[:5]

[[1630639,
  'A.J. Lawson',
  1610612742,
  'DAL',
  23.0,
  '6-6',
  78,
  '179',
  'South Carolina',
  'Canada',
  'Undrafted',
  'Undrafted',
  'Undrafted',
  36,
  120,
  36,
  14,
  4.5,
  0.041,
  0.086,
  0.177,
  0.559,
  0.076],
 [1631260,
  'AJ Green',
  1610612749,
  'MIL',
  24.0,
  '6-4',
  76,
  '190',
  'Northern Iowa',
  'USA',
  'Undrafted',
  'Undrafted',
  'Undrafted',
  50,
  221,
  56,
  27,
  2.6,
  0.017,
  0.092,
  0.157,
  0.621,
  0.075],
 [1631100,
  'AJ Griffin',
  1610612737,
  'ATL',
  20.0,
  '6-6',
  78,
  '220',
  'Duke',
  'USA',
  '2022',
  '1',
  '16',
  18,
  37,
  14,
  4,
  -9.8,
  0.014,
  0.101,
  0.156,
  0.403,
  0.042],
 [203932,
  'Aaron Gordon',
  1610612743,
  'DEN',
  28.0,
  '6-8',
  80,
  '235',
  'Arizona',
  'USA',
  '2014',
  '1',
  '4',
  68,
  939,
  444,
  230,
  8.2,
  0.079,
  0.131,
  0.175,
  0.601,
  0.141],
 [1628988,
  'Aaron Holiday',
  1610612745,
  'HOU',
  27.0,
  '6-0',
  72,
  '185',
  'UCLA',
  'USA',
  '2018',
  '1'

In [17]:
# Target column layout: two bookkeeping columns followed by the API's own headers.
df_cols = ['Year', 'Season_type', *df_cols_short]

# Season labels in the 'YYYY-YY' form used by the API, 1996-97 through 2021-22.
years = [f'{start}-{str(start + 1)[-2:]}' for start in range(1996, 2022)]

# Season types, already URL-encoded because they are pasted straight into the query string.
season_types = ['Regular%20Season', 'Playoffs']

# Empty frame that carries only the target column layout; shown to check the header row.
df = pd.DataFrame(data=None, columns=df_cols)
df

Unnamed: 0,Year,Season_type,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,PLAYER_HEIGHT,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,COLLEGE,COUNTRY,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GP,PTS,REB,AST,NET_RATING,OREB_PCT,DREB_PCT,USG_PCT,TS_PCT,AST_PCT


In [18]:
# Download the bio-stats table for every (season, season type) pair and stack
# the results into a single DataFrame `df`.
#
# - Each request has a timeout so one unanswered call cannot hang the whole loop.
# - raise_for_status() stops the run on an HTTP error instead of failing later
#   while decoding or indexing an error payload.
# - The per-request frames are collected in a list and concatenated once, which
#   avoids re-copying all previously collected rows on every iteration.
#
# NOTE(review): the season type is stored in a column named 'Season' with the
# URL-encoded token (e.g. 'Regular%20Season'), whereas `df_cols` names that
# column 'Season_type' — confirm which spelling downstream code expects.
# NOTE(review): no pause is taken between requests; add a delay here if the
# server starts rejecting or stalling calls.
REQUEST_TIMEOUT = 30  # seconds allowed per request

frames = []
for y in years:
    for s in season_types:
        api_url = (
            "https://stats.nba.com/stats/leaguedashplayerbiostats"
            "?College=&Conference=&Country=&DateFrom=&DateTo=&Division="
            "&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&ISTRound="
            "&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0"
            "&Outcome=&PORound=0&PerMode=Totals&Period=0&PlayerExperience="
            f"&PlayerPosition=&Season={y}&SeasonSegment="
            f"&SeasonType={s}&ShotClockRange=&StarterBench="
            "&TeamID=0&VsConference=&VsDivision=&Weight="
        )
        response = requests.get(url=api_url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        result_set = response.json()['resultSets'][0]

        temp_df1 = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])
        temp_df1['Year'] = y
        temp_df1['Season'] = s
        frames.append(temp_df1)

# ignore_index gives the combined frame unique row labels instead of repeating
# 0..n-1 once per request; the guard keeps an empty `years` list from raising.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

KeyboardInterrupt: 

In [None]:
# Display the frame currently bound to `df`.
df

In [None]:
# Keep only the rows labelled with the 1996-97 season, then show the row(s)
# whose PLAYER_ID is 893.
df_1996 = df.loc[df['Year'].eq('1996-97')]
df_1996.loc[df_1996['PLAYER_ID'].eq(893)]


In [None]:
# Select the COUNTRY column as a one-column DataFrame.
# The original `df[df['COUNTRY']]` passed the column's *values* as column
# labels, which raises KeyError unless columns with those names exist.
# NOTE(review): if a per-country tally was intended instead, use
# df['COUNTRY'].value_counts() — confirm the intent.
df_cnt = df[['COUNTRY']]
df_cnt

In [None]:
# Script to scrape data on European players using Beautiful Soup
#
# For each season, the per-player averages table is downloaded, every table row
# is turned into a list of cell texts prefixed with the season year, and the
# combined rows are written to 'euroleague_stats.csv'.
#
# NOTE(review): this cell rebinds `years`, `data`, `columns` and `df`, which the
# NBA cells earlier in the notebook also use — re-run those cells before going
# back to them.

import requests
from bs4 import BeautifulSoup
import pandas as pd

years = list(range(2000, 2025))

url_start = 'https://basketball.realgm.com/international/league/1/Euroleague/stats/{}/Averages/Qualified/All/points/All/desc/1/Regular_Season'

# Column layout of the output: the season year added below, then the table's columns.
columns = ['Year', '#', 'Player', 'Team', 'GP', 'MPG', 'PPG', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%',
           'FTM', 'FTA', 'FT%', 'ORB', 'DRB', 'RPG', 'APG', 'SPG', 'BPG', 'TOV', 'PF']

data = []  # List to store parsed data (one list per table row, all seasons)

for year in years:
    url = url_start.format(year)
    # The timeout keeps one unanswered request from hanging the whole loop.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Report and skip the season rather than silently parsing an error page.
        print(f'Skipping {year}: HTTP {response.status_code} for {url}')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    for row in soup.select('.tablesaw tbody tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if not cells:
            # A row without <td> cells would otherwise become a record holding only the year.
            continue
        data.append([year] + cells)  # Add the year to the player data

# Creating DataFrame from the collected data
df = pd.DataFrame(data, columns=columns)

# Saving DataFrame to a CSV file
df.to_csv('euroleague_stats.csv', index=False)

# Display DataFrame
df
