In [1]:
# pip install unidecode

In [2]:

from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import pickle
import requests
from unidecode import unidecode

In [3]:
# player name -> set of years in which that player was named an all-star,
# whether as an original selection or as a replacement
all_star_appearances = defaultdict(set)

# first-column labels in the roster tables that do not denote a player
ignore_fields = {'Team Totals', 'Reserves'}

# years to scrape; consumed through range(START_YEAR, END_YEAR), so END_YEAR itself is excluded
START_YEAR = 1970
END_YEAR = 2022

# Two names are special-cased by first name before falling back to unidecode:
# unidecode doesn't catch the accented c in Peja's last name (Stojakovic), and
# any instance of Metta World Peace is overwritten with Ron Artest.
def fix_name(full_name):
    """Return the canonical ASCII spelling of a scraped player name.

    Args:
        full_name: player name as scraped, with the first name first and
            parts separated by single spaces.

    Returns:
        'Peja Stojakovic' when the first name is 'Peja', 'Ron Artest' when the
        first name is 'Metta', otherwise ``unidecode(full_name)``.
    """
    overrides = {
        'Peja': 'Peja Stojakovic',
        'Metta': 'Ron Artest',
    }
    leading_token = full_name.split(' ')[0]
    if leading_token in overrides:
        return overrides[leading_token]
    return unidecode(full_name)

# Scrape one basketball-reference all-star page per year and record every player
# found in the two roster tables plus everyone linked in the "Did not play" note.
# NOTE(review): the requests are issued back-to-back with no delay between them;
# confirm that is acceptable under the site's rate limits.
for year in range(START_YEAR, END_YEAR):

    # no ASG played in 1999 because of the lockout
    if year == 1999:
        continue

    print('Scraping ASG {} data...'.format(year))

    # all the all-stars for this year; a set, so a player who shows up both in a
    # roster table and in the injury note is only counted once
    all_stars = set()

    response = requests.get('https://www.basketball-reference.com/allstar/NBA_{}.html'.format(year))
    # fail with a clear HTTP error instead of trying to parse an error page
    response.raise_for_status()
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')

    # this part was annoying - back when ASG was always East vs. West, the tables
    # were encoded with id="East"/id="West" so they could be extracted more easily/reliably
    # but now, you have games like Giannis vs. LeBron and the table id's are different, so
    # they are extracted by index, which is unreliable in the event that the site's design
    # changes in the future

    # gets rosters for team 1 and team 2 (second and third <table> on the page)
    s1, s2 = soup.find_all('table')[1:3]

    df1 = pd.read_html(str(s1))[0]
    df2 = pd.read_html(str(s2))[0]

    # get the all-stars from teams 1 and 2
    for df in [df1, df2]:
        for i, row in df.iterrows():
            # .iloc makes the positional lookup explicit; plain row[0] depends on a
            # positional fallback that pandas has deprecated for label-indexed Series
            first_cell = row.iloc[0]
            if pd.notnull(first_cell) and first_cell not in ignore_fields:
                all_stars.add(fix_name(first_cell))

    # gets all li elements in the page
    s3 = soup.find_all('li')

    # finds the first li element that mentions 'Did not play' - players who were
    # selected but couldn't participate due to injury, and their respective replacements
    for s in s3:
        if 'Did not play' in str(s):
            # every link inside that element is treated as a player name
            for name in s.find_all('a'):
                all_stars.add(fix_name(name.get_text()))
            break

    # update the appearances dictionary
    for player in all_stars:
        all_star_appearances[player].add(year)

# (player, sorted list of years) pairs, most appearances first
sorted_all_star_appearances = sorted(
    [(player, sorted(appearances)) for player, appearances in all_star_appearances.items()],
    key=lambda x: -len(x[1]),
)

print('\nAll all-star appearances since {} (sorted by number of appearances):\n'.format(START_YEAR))

for player, appearances in sorted_all_star_appearances:
    print('{}: {}'.format(player, appearances))

# export the dictionary to local disk for future recall in statsnba_fullscrape.py;
# the context manager closes the file even if pickling fails
with open('all_star_appearances.pickle', 'wb') as out:
    pickle.dump(all_star_appearances, out)

Scraping ASG 1970 data...
Scraping ASG 1971 data...
Scraping ASG 1972 data...
Scraping ASG 1973 data...
Scraping ASG 1974 data...
Scraping ASG 1975 data...
Scraping ASG 1976 data...
Scraping ASG 1977 data...
Scraping ASG 1978 data...
Scraping ASG 1979 data...
Scraping ASG 1980 data...
Scraping ASG 1981 data...
Scraping ASG 1982 data...
Scraping ASG 1983 data...
Scraping ASG 1984 data...
Scraping ASG 1985 data...
Scraping ASG 1986 data...
Scraping ASG 1987 data...
Scraping ASG 1988 data...
Scraping ASG 1989 data...
Scraping ASG 1990 data...
Scraping ASG 1991 data...
Scraping ASG 1992 data...
Scraping ASG 1993 data...
Scraping ASG 1994 data...
Scraping ASG 1995 data...
Scraping ASG 1996 data...
Scraping ASG 1997 data...
Scraping ASG 1998 data...
Scraping ASG 2000 data...
Scraping ASG 2001 data...
Scraping ASG 2002 data...
Scraping ASG 2003 data...
Scraping ASG 2004 data...
Scraping ASG 2005 data...
Scraping ASG 2006 data...
Scraping ASG 2007 data...
Scraping ASG 2008 data...
Scraping ASG

In [4]:
# player name -> set of years for which the voting-page scrape in this cell
# records that player as an all-star starter
all_star_starters = defaultdict(set)

# first-column labels in the scraped tables that do not denote a player
ignore_fields_starters = {'Team Totals', 'Reserves'}

# years to scrape; consumed through range(), so END_YEAR_STARTER itself is excluded
START_YEAR_STARTER = 1996
END_YEAR_STARTER = 2022

# Scrape the basketball-reference all-star voting pages and record, per year, the
# players taken as starters. Names are normalised with fix_name (defined in the
# roster-scrape cell), which handles Peja Stojakovic and Metta World Peace.

# for 2017 onwards there is one voting page per position group and conference;
# this many players are taken from the top of each such table
STARTERS_PER_POSITION = {"frontcourt": 3, "backcourt": 2}

for year in range(START_YEAR_STARTER, END_YEAR_STARTER):

    # no ASG played in 1999 because of the lockout
    if year == 1999:
        continue

    print('Scraping ASG {} data...'.format(year))

    # will store all the starters found for this year
    all_stars = set()

    # tables are picked by index rather than by id, which is unreliable in the
    # event that the site's design changes in the future

    if year < 2017:
        response = requests.get('https://www.basketball-reference.com/allstar/NBA_{}_voting.html'.format(year))
        # fail with a clear HTTP error instead of trying to parse an error page
        response.raise_for_status()
        html = response.content
        soup = BeautifulSoup(html, 'html.parser')
        # the first two tables on the single voting page
        s1, s2 = soup.find_all('table')[0:2]

        df1 = pd.read_html(str(s1))[0]
        df2 = pd.read_html(str(s2))[0]

        # every qualifying row of both tables is recorded
        for df in [df1, df2]:
            for i, row in df.iterrows():
                # first column decides whether the row counts, second column holds the
                # player name; .iloc makes these positional lookups explicit
                if pd.notnull(row.iloc[0]) and row.iloc[0] not in ignore_fields_starters:
                    all_stars.add(fix_name(row.iloc[1]))
    else:
        for pos, num_player in STARTERS_PER_POSITION.items():
            for conf in ["eastern", "western"]:
                response = requests.get('https://www.basketball-reference.com/allstar/NBA_{}_voting-{}-{}-conference.html'.format(year, pos, conf))
                response.raise_for_status()
                html = response.content
                soup = BeautifulSoup(html, 'html.parser')
                s = soup.find_all('table')[0]
                df = pd.read_html(str(s))[0]
                # keep only the first num_player qualifying rows of the table
                num_added = 0
                for i, row in df.iterrows():
                    if pd.notnull(row.iloc[0]) and row.iloc[0] not in ignore_fields_starters:
                        all_stars.add(fix_name(row.iloc[1]))
                        num_added += 1
                    if num_added == num_player:
                        break

    # update the starters dictionary
    for player in all_stars:
        all_star_starters[player].add(year)

# (player, sorted list of years) pairs, most starts first
sorted_all_star_starters = sorted(
    [(player, sorted(appearances)) for player, appearances in all_star_starters.items()],
    key=lambda x: -len(x[1]),
)

# this cell reports starters from START_YEAR_STARTER on, so the heading says so
print('\nAll all-star starters since {} (sorted by number of starts):\n'.format(START_YEAR_STARTER))

for player, appearances in sorted_all_star_starters:
    print('{}: {}'.format(player, appearances))

# export the dictionary to local disk for future recall in statsnba_fullscrape.py;
# the context manager closes the file even if pickling fails
with open('all_star_starters.pickle', 'wb') as out:
    pickle.dump(all_star_starters, out)

Scraping ASG 1996 data...
Scraping ASG 1997 data...
Scraping ASG 1998 data...
Scraping ASG 2000 data...
Scraping ASG 2001 data...
Scraping ASG 2002 data...
Scraping ASG 2003 data...
Scraping ASG 2004 data...
Scraping ASG 2005 data...
Scraping ASG 2006 data...
Scraping ASG 2007 data...
Scraping ASG 2008 data...
Scraping ASG 2009 data...
Scraping ASG 2010 data...
Scraping ASG 2011 data...
Scraping ASG 2012 data...
Scraping ASG 2013 data...
Scraping ASG 2014 data...
Scraping ASG 2015 data...
Scraping ASG 2016 data...
Scraping ASG 2017 data...
Scraping ASG 2018 data...
Scraping ASG 2019 data...
Scraping ASG 2020 data...
Scraping ASG 2021 data...

All all-star appearances since 1970 (sorted by number of appearances):

Kobe Bryant: [1998, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]
LeBron James: [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
Kevin Garnett: [1998, 2000, 2002, 2003

In [5]:
# pip install selenium

In [6]:
import random
from bs4 import BeautifulSoup
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select

import pickle
import time

In [7]:
# first and one-past-last season start year to scrape (consumed through range())
START_YEAR, END_YEAR = 1996, 2021

# list of DataFrames for historical data, one for each year
df_train_master = []

# lets us look up team record and rank by (year, team)
# we need this to augment our player dataset but we need to construct it first
team_rank_historical_lookup = {}

# load the all-star history dictionaries that we generated in bballref_ASG_scrape.py
# NOTE: unpickling can execute arbitrary code - only load pickle files that this
# project produced itself
# the context managers make sure both file handles are closed after loading
with open('all_star_appearances.pickle', 'rb') as f:
	all_star_appearances = pickle.load(f)
with open('all_star_starters.pickle', 'rb') as f:
	all_star_starters = pickle.load(f)

# full team name -> short form (prefix); names marked "deprecated name" are kept
# so that older seasons still resolve
team_prefix = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Charlotte Hornets Old': 'CHH',  # deprecated name
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',  # deprecated name
    'LA Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Jersey Nets': 'NJN',  # deprecated name
    'New York Knicks': 'NYK',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Seattle SuperSonics': 'SEA',  # deprecated name
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Vancouver Grizzlies': 'VAN',  # deprecated name
    'Washington Bullets': 'WAS',  # deprecated name
    'Washington Wizards': 'WAS',
    'Memphis Grizzlies': 'MEM',
    'New Orleans Hornets': 'NOH',  # deprecated name
    'Charlotte Bobcats': 'CHA',  # deprecated name
    'New Orleans/Oklahoma City Hornets': 'NOK',  # deprecated name
    'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BKN',
    'Charlotte Hornets New': 'CHA',
    'New Orleans Pelicans': 'NOP',
}

# 'Charlotte Hornets' is ambiguous: the short form was CHH in the pre-Bobcats era
# but is CHA now, so the name is split into an "Old" and a "New" variant by year
def adjust_hornets(row):
	"""Return the team name for ``row``, disambiguating 'Charlotte Hornets'.

	Args:
		row: mapping-like object with 'TEAM' and 'Year' entries.

	Returns:
		'Charlotte Hornets Old' for Charlotte Hornets rows with Year <= 2001,
		'Charlotte Hornets New' for later Charlotte Hornets rows, and the
		unchanged team name for every other team.
	"""
	if row['TEAM'] != 'Charlotte Hornets':
		return row['TEAM']
	if row['Year'] <= 2001:
		return 'Charlotte Hornets Old'
	return 'Charlotte Hornets New'

# builds the nested year -> team prefix -> (rank, games played) lookup, one row at a time
def fill_team_rank_historical_lookup(row):
	"""Record one team's conference rank and games played in the global lookup.

	Args:
		row: mapping-like object with 'Year', 'TEAM', 'Conference Rank' and
			'GP' entries; 'TEAM' must be a key of ``team_prefix``.

	Side effects:
		Sets ``team_rank_historical_lookup[year][prefix] = (rank, gp)``,
		creating the inner dictionary for the year on first use.

	Raises:
		KeyError: if the team name is not present in ``team_prefix``.
	"""
	season = row['Year']
	team_name = row['TEAM']
	conference_rank = row['Conference Rank']
	games_played = row['GP']
	short_form = team_prefix[team_name]
	# setdefault creates the per-year dictionary the first time a year is seen
	team_rank_historical_lookup.setdefault(season, {})[short_form] = (conference_rank, games_played)

# we also need average league pace by year, so we can normalize all statistics to be pace-adjusted
response = requests.get('https://www.basketball-reference.com/leagues/NBA_stats_per_game.html')
# fail with a clear HTTP error instead of trying to parse an error page
response.raise_for_status()
html = response.content
s_pace = BeautifulSoup(html, 'html.parser')

pace_table = s_pace.find('table')
df_pace = pd.read_html(str(pace_table))[0]
# the parsed table has multi-level column labels; drop the top level so that
# columns can be addressed as 'Season' and 'Pace'
df_pace.columns = df_pace.columns.droplevel()

# maps year (first four characters of the season label) to average league pace
pace_lookup = {}

for i, row in df_pace.iterrows():
	# skip rows without a season and header rows repeated inside the table body
	if pd.isnull(row['Season']) or row['Season'] == 'Season':
		continue
	year = int(row['Season'][:4])
	# the repeated header rows leave the column holding text, so convert
	# explicitly to keep the lookup (and the 'Avg. Pace' feature) numeric
	pace_lookup[year] = float(row['Pace'])
	# stop once START_YEAR has been recorded
	# NOTE(review): this relies on the table listing the newest season first - confirm
	if year == START_YEAR:
		break

# flags whether a player was selected for the ASG held in the row's year; the
# 1999 lockout adds a wrinkle, which is why this is not a one-line lambda
def was_AS_last_year(row):
	"""Return 1 if the player has an all-star appearance in the reference year, else 0.

	Args:
		row: mapping-like object with 'Year' and 'PLAYER' entries.

	Returns:
		1 when the reference year is in ``all_star_appearances[row['PLAYER']]``,
		otherwise 0. The reference year is ``row['Year']``, except that a Year
		of 1999 is checked against 1998 instead.
	"""
	reference_year = 1998 if row['Year'] == 1999 else row['Year']
	return int(reference_year in all_star_appearances[row['PLAYER']])

# initialize the chromedriver
# NOTE(review): the positional driver path here and find_element_by_xpath below are
# Selenium 3-style calls - confirm the installed Selenium version still supports them
d = webdriver.Chrome('./chromedriver_win32/chromedriver')

# crude time delay to wait before attempting to scrape tabular data after XML document has loaded
TIME_DELAY_TEAMS = 3
TIME_DELAY_PLAYERS = 10

# absolute XPath of the rows-per-page dropdown on the player stats pages
ROWS_PER_PAGE_XPATH = '/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select'


def _load_player_table(url):
	"""Open a player stats page in the shared driver and return its first <table>.

	Waits TIME_DELAY_PLAYERS seconds after navigating, then switches the
	rows-per-page dropdown to 'All' (by default, only 50 players are displayed
	per page) before parsing the page source.

	Args:
		url: address of the player stats page to load.

	Returns:
		The first <table> element found by BeautifulSoup, or None if the page
		contains no table.
	"""
	d.get(url)
	time.sleep(TIME_DELAY_PLAYERS)
	select = Select(d.find_element_by_xpath(ROWS_PER_PAGE_XPATH))
	select.select_by_visible_text('All')
	return BeautifulSoup(d.page_source, 'html.parser').find('table')


# try/finally guarantees the browser is shut down even when a season fails to scrape
try:
	for year in range(START_YEAR, END_YEAR):

		if year == 1998: # lockout
			continue

		start_date = (10, 1, year) # month, day, year (not padded)
		end_date = (1, 21, year+1) # month, day, year (not padded)

		season_label = str(year) + '-' + str(year+1)[2:]
		print('Scraping stats.nba.com for {} season...'.format(season_label))

		# contains the majority of our desired statistics (PTS, REB, AST, etc.)
		url_players_traditional = 'https://www.nba.com/stats/players/traditional/?Season={}&SeasonType=Regular%20Season&sort=PTS&dir=-1&DateFrom={}%2F{}%2F{}&DateTo={}%2F{}%2F{}'.format(season_label, *start_date, *end_date)

		# contains advanced statistics (TS%, USG%, PIE)
		url_players_advanced = 'https://www.nba.com/stats/players/advanced/?Season={}&SeasonType=Regular%20Season&sort=PTS&dir=-1&DateFrom={}%2F{}%2F{}&DateTo={}%2F{}%2F{}'.format(season_label, *start_date, *end_date)

		# contains DEFWS (defensive win-shares)
		url_players_defense = 'https://www.nba.com/stats/players/defense/?Season={}&SeasonType=Regular%20Season&sort=DEF_WS&dir=-1&DateFrom={}%2F{}%2F{}&DateTo={}%2F{}%2F{}'.format(season_label, *start_date, *end_date)

		# contains team rankings by conference at any instance of time
		url_teams = 'https://www.nba.com/stats/teams/traditional/?sort=W_PCT&dir=-1&Season={}&SeasonType=Regular%20Season&Conference={}&DateFrom={}%2F{}%2F{}&DateTo={}%2F{}%2F{}'

		# team standings must be collected first: the player rows below look up
		# their team's rank and games played in team_rank_historical_lookup
		for conf in ['East', 'West']:
			d.get(url_teams.format(season_label, conf, *start_date, *end_date))

			# crude time delay to ensure element is loaded, definitely a more elegant way to do this
			time.sleep(TIME_DELAY_TEAMS)

			s_teams = BeautifulSoup(d.page_source, 'html.parser').find('table')
			df_teams = pd.read_html(str(s_teams))[0]
			df_teams['Year'] = year
			df_teams['Conference'] = conf
			# the unnamed leading column is used as the team's rank within the conference
			df_teams = df_teams.rename(columns={'Unnamed: 0' : 'Conference Rank'})

			df_teams['TEAM'] = df_teams[['TEAM','Year']].apply(adjust_hornets, axis=1)
			df_teams[['TEAM', 'Year', 'Conference Rank', 'GP']].apply(fill_team_rank_historical_lookup, axis=1)

		# the three player pages are fetched in the same order as before
		s_traditional = _load_player_table(url_players_traditional)
		s_advanced = _load_player_table(url_players_advanced)
		s_defense = _load_player_table(url_players_defense)

		df_traditional = pd.read_html(str(s_traditional))[0].dropna(subset=['PLAYER'])

		df_advanced = pd.read_html(str(s_advanced))[0].dropna(subset=['PLAYER'])

		# the defense table labels its name column 'Player', so align it before merging
		df_defense = pd.read_html(str(s_defense))[0].rename(columns={'Player' : 'PLAYER'}).dropna(subset=['PLAYER'])

		# NOTE(review): the merges key on player name only; two players sharing a
		# name in one season would produce duplicated rows - confirm
		df = df_traditional.merge(df_advanced[['PLAYER','TS%', 'USG%', 'PIE']], on='PLAYER')
		df = df.merge(df_defense[['PLAYER', 'DEFWS']], on='PLAYER')

		# stitching it all together
		df['Year'] = year
		df['Avg. Pace'] = df['Year'].map(lambda x : pace_lookup[x])
		df['Team Conference Rank'] = df[['TEAM', 'Year']].apply(lambda row : team_rank_historical_lookup[row['Year']][row['TEAM']][0], axis=1)
		df['Team GP'] = df[['TEAM', 'Year']].apply(lambda row : team_rank_historical_lookup[row['Year']][row['TEAM']][1], axis=1)
		df['PLAYER'] = df['PLAYER'].map(lambda x : 'Ron Artest' if x == 'Metta World Peace' else x)
		# counts appearances in years up to and including this row's Year
		df['Prior ASG Appearances'] = df[['PLAYER', 'Year']].apply(lambda row : sum(y<=row['Year'] for y in all_star_appearances[row['PLAYER']]), axis=1)
		df['AS Last Year?'] = df[['PLAYER', 'Year']].apply(was_AS_last_year, axis=1)
		# label for Year+1: 2 = starter, 1 = all-star but not starter, 0 = not selected
		df['Selected?'] = df[['PLAYER', 'Year']].apply(lambda row : 2 if row['Year']+1 in all_star_starters[row['PLAYER']] else (1 if row['Year']+1 in all_star_appearances[row['PLAYER']] else 0), axis=1)

		# desired raw features, before any feature engineering/transformation
		df = df[['Year', 'Avg. Pace', 'PLAYER', 'TEAM', 'Team Conference Rank', 'GP', 'Team GP', 'W',
				'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'TS%', '3PM', 'DEFWS', 'USG%', 'PIE', 'Prior ASG Appearances', 'AS Last Year?', 'Selected?']]

		df_train_master.append(df)
finally:
	d.quit()

pd.concat(df_train_master).to_csv('ASG_data.csv', index=False)


Scraping stats.nba.com for 1996-97 season...
Scraping stats.nba.com for 1997-98 season...
Scraping stats.nba.com for 1999-00 season...
Scraping stats.nba.com for 2000-01 season...
Scraping stats.nba.com for 2001-02 season...
Scraping stats.nba.com for 2002-03 season...
Scraping stats.nba.com for 2003-04 season...
Scraping stats.nba.com for 2004-05 season...
Scraping stats.nba.com for 2005-06 season...
Scraping stats.nba.com for 2006-07 season...
Scraping stats.nba.com for 2007-08 season...
Scraping stats.nba.com for 2008-09 season...
Scraping stats.nba.com for 2009-10 season...
Scraping stats.nba.com for 2010-11 season...
Scraping stats.nba.com for 2011-12 season...
Scraping stats.nba.com for 2012-13 season...
Scraping stats.nba.com for 2013-14 season...
Scraping stats.nba.com for 2014-15 season...
Scraping stats.nba.com for 2015-16 season...
Scraping stats.nba.com for 2016-17 season...
Scraping stats.nba.com for 2017-18 season...
Scraping stats.nba.com for 2018-19 season...
Scraping s