In [3]:
import asyncio
import os
import string
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [4]:
# Directory layout for everything the scrapers cache on disk.
DATA_DIR = "data"
PLAYER_NAMES_DIR = os.path.join(DATA_DIR, 'player_names')
PLAYER_STATS_DIR = os.path.join(DATA_DIR, 'player_stats')
PLAYER_STATS_2_DIR = os.path.join(DATA_DIR, 'player_stats_2')
NBA_TEAM_STATS = os.path.join(DATA_DIR, 'nba_team_stats')

# Actually create the directories: the scrapers open files inside them for
# writing, and open() does not create missing parent directories.
for _cache_dir in (PLAYER_NAMES_DIR, PLAYER_STATS_DIR, PLAYER_STATS_2_DIR, NBA_TEAM_STATS):
    os.makedirs(_cache_dir, exist_ok=True)

In [5]:
async def get_html(url, selector, sleep=5, retries=3):
    """Load ``url`` in Firefox via Playwright and return the inner HTML of ``selector``.

    Args:
        url: Page to load.
        selector: Selector of the element whose inner HTML is wanted.
        sleep: Base delay in seconds. Attempt ``i`` (1-based) waits ``sleep * i``
            before loading, so the site is not hit with rapid-fire requests.
        retries: Maximum number of attempts.

    Returns:
        The inner HTML string, or ``None`` if every attempt timed out (or
        ``retries`` is 0).
    """
    html = None
    for attempt in range(1, retries + 1):
        # Back off a little longer on every attempt. asyncio.sleep yields to the
        # event loop; time.sleep would block it for the whole delay.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.firefox.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    # Keep only the markup inside the requested element.
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser even when loading the page fails.
                    await browser.close()
        except PlaywrightTimeout:
            # A timeout is retried; any other error propagates to the caller.
            print(f"Timeout error on {url}")
            continue
        else:
            # Success: no further attempts needed.
            break
    return html

In [6]:
async def scrape_names():
    """Download the per-letter player index pages and cache them on disk.

    Reads the alphabet navigation on the Basketball-Reference players page,
    follows every link in it and stores the ``#players`` markup of each page in
    ``PLAYER_NAMES_DIR`` as ``last_name_<letter>``. Pages that are already on
    disk are not downloaded again.
    """
    index_url = "https://www.basketball-reference.com/players/"
    alphabet_html = await get_html(index_url, "#div_alphabet")
    if alphabet_html is None:
        # get_html returns None when every attempt timed out.
        print(f"Could not load {index_url}; no player index pages were scraped")
        return

    os.makedirs(PLAYER_NAMES_DIR, exist_ok=True)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(alphabet_html, "html.parser")
    letter_pages = [
        f"https://www.basketball-reference.com{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
    ]

    for url in letter_pages:
        # The URL ends with ".../<letter>/", so the letter is the second-to-last piece.
        save_path = os.path.join(PLAYER_NAMES_DIR, "last_name_" + url.split("/")[-2])
        if os.path.exists(save_path):
            continue
        html = await get_html(url, "#players")
        if html is None:
            # Do not create the file: an empty file would make the exists()
            # check above skip this letter on every later run.
            print(f"Skipping {url}: page could not be loaded")
            continue
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)

In [13]:
def scrape_table(letter, min_start_year=2005, min_career_years=5):
    """Select players from one cached index page and return their profile URLs.

    A player is kept when their first year (``year_min``) is ``min_start_year``
    or later and ``year_max - year_min`` is at least ``min_career_years``.

    Args:
        letter: Index into the module-level ``NAMES_FILES`` list (despite the
            name, it is a position, not a character). ``NAMES_FILES`` must be
            defined before this function is called.
        min_start_year: Earliest accepted first year.
        min_career_years: Minimum difference between last and first year.

    Returns:
        list[str]: Absolute Basketball-Reference URLs of the selected players.

    Raises:
        ValueError: If the cached page contains no ``<tbody>``.
    """
    players_path = os.path.join(PLAYER_NAMES_DIR, NAMES_FILES[letter])
    with open(players_path, 'r', encoding="utf-8") as f:
        html = f.read()

    # Name the parser explicitly so results do not depend on installed parsers.
    table_body = BeautifulSoup(html, "html.parser").find("tbody")
    if table_body is None:
        raise ValueError(f"No <tbody> found in {players_path}; the cached page looks incomplete")

    linksForTables = []
    for row in table_body.find_all("tr"):
        first_cell = row.find(attrs={'data-stat': 'year_min'})
        last_cell = row.find(attrs={'data-stat': 'year_max'})
        if first_cell is None or last_cell is None:
            continue
        try:
            start_year = int(first_cell.text)
            end_year = int(last_cell.text)
        except ValueError:
            # Header rows repeated inside the table hold labels such as "From"
            # instead of a year.
            continue
        if start_year >= min_start_year and end_year - start_year >= min_career_years:
            name_cell = row.find(attrs={'data-stat': 'player'})
            anchor = name_cell.find("a") if name_cell is not None else None
            if anchor is None or not anchor.get("href"):
                continue
            linksForTables.append("https://www.basketball-reference.com" + anchor["href"])
    return linksForTables

In [8]:
def modified_scrape_table(letter, debut_year=2004, min_career_years=5):
    """Return profile URLs of players on one cached index page who debuted in ``debut_year``.

    A player is kept when their first year (``year_min``) equals ``debut_year``
    and ``year_max - year_min`` is at least ``min_career_years``.

    Args:
        letter: Index into the module-level ``NAMES_FILES`` list (despite the
            name, it is a position, not a character). ``NAMES_FILES`` must be
            defined before this function is called.
        debut_year: Required first year.
        min_career_years: Minimum difference between last and first year.

    Returns:
        list[str]: Absolute Basketball-Reference URLs of the selected players.

    Raises:
        ValueError: If the cached page contains no ``<tbody>``.
    """
    players_path = os.path.join(PLAYER_NAMES_DIR, NAMES_FILES[letter])
    with open(players_path, 'r', encoding="utf-8") as f:
        html = f.read()

    # Name the parser explicitly so results do not depend on installed parsers.
    table_body = BeautifulSoup(html, "html.parser").find("tbody")
    if table_body is None:
        raise ValueError(f"No <tbody> found in {players_path}; the cached page looks incomplete")

    linksForTables = []
    for row in table_body.find_all("tr"):
        first_cell = row.find(attrs={'data-stat': 'year_min'})
        last_cell = row.find(attrs={'data-stat': 'year_max'})
        if first_cell is None or last_cell is None:
            continue
        try:
            start_year = int(first_cell.text)
            end_year = int(last_cell.text)
        except ValueError:
            # Header rows repeated inside the table hold labels such as "From"
            # instead of a year.
            continue
        if start_year == debut_year and end_year - start_year >= min_career_years:
            name_cell = row.find(attrs={'data-stat': 'player'})
            anchor = name_cell.find("a") if name_cell is not None else None
            if anchor is None or not anchor.get("href"):
                continue
            linksForTables.append("https://www.basketball-reference.com" + anchor["href"])
    return linksForTables

In [9]:
async def scrape_career_stats(player_url):
    """Download one player's page, cache it, and return the team links it lists.

    The ``#wrap`` markup of the page is written to ``PLAYER_STATS_2_DIR`` under
    the last path segment of ``player_url`` unless that file already exists.

    Args:
        player_url: Absolute URL of the player's page.

    Returns:
        list[str] | None: The ``href`` of every team link found in the
        ``team_id`` cells of the ``div_per_game`` element (empty when that
        element is missing), or ``None`` when the page could not be downloaded.
    """
    html = await get_html(player_url, "#wrap")
    if html is None:
        return None

    # Cache the raw page first so a parsing problem below cannot lose the download.
    os.makedirs(PLAYER_STATS_2_DIR, exist_ok=True)
    save_path = os.path.join(PLAYER_STATS_2_DIR, player_url.split('/')[-1])
    if not os.path.exists(save_path):
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)

    # Name the parser explicitly so results do not depend on installed parsers.
    soup = BeautifulSoup(html, "html.parser")
    per_game = soup.find(id="div_per_game")
    if per_game is None:
        print(f"No per-game table found on {player_url}")
        return []

    playerTeams = []
    for team_cell in per_game.find_all(attrs={"data-stat": 'team_id'}):
        link = team_cell.find("a")
        # Some cells carry no link; only linked cells are collected.
        if link and link.get("href"):
            playerTeams.append(link['href'])
    print(playerTeams)
    return playerTeams

In [10]:
async def scrape_nba_team_stats(team_url):
    """Download one team-season page into ``NBA_TEAM_STATS`` unless it is cached.

    Args:
        team_url: URL ending in ``.../<TEAM>/<YEAR>.html``.

    Returns:
        bool: ``True`` when the file was already on disk, so nothing was
        fetched. ``False`` when a download was attempted, whether or not it
        succeeded; callers re-run the function to confirm the file now exists.
    """
    # The file name is the last two URL segments, e.g. "ATL" + "2004.html".
    # Negative indices also work for relative URLs such as "/teams/ATL/2004.html".
    team_code, season_page = team_url.split("/")[-2:]
    name = team_code + season_page
    print("Current Team: ", name)
    save_path = os.path.join(NBA_TEAM_STATS, name)

    if os.path.exists(save_path):
        return True

    # Not cached yet: fetch the page and store it when the download worked.
    html = await get_html(team_url, "#wrap")
    if html is not None:
        os.makedirs(NBA_TEAM_STATS, exist_ok=True)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)
    return False

In [16]:
def get_NBA_team_link():
    """Build the Basketball-Reference season-page URL of every NBA team, 2004-2024.

    The year in a URL is the year in which that season ends. Franchises that
    were founded, renamed or relocated inside the window are listed with the
    abbreviation that was valid for each span, so no URL is generated for a
    season the abbreviation did not exist in.

    Returns:
        list[str]: One URL per team and season, without duplicates.
    """
    first_year, last_year = 2004, 2024
    url_template = "https://www.basketball-reference.com/teams/{team}/{year}.html"

    # Franchises whose abbreviation is the same for the whole window.
    stable_teams = [
        "ATL",  # Atlanta Hawks
        "BOS",  # Boston Celtics
        "CHI",  # Chicago Bulls
        "CLE",  # Cleveland Cavaliers
        "DAL",  # Dallas Mavericks
        "DEN",  # Denver Nuggets
        "DET",  # Detroit Pistons
        "GSW",  # Golden State Warriors
        "HOU",  # Houston Rockets
        "IND",  # Indiana Pacers
        "LAC",  # LA Clippers
        "LAL",  # Los Angeles Lakers
        "MEM",  # Memphis Grizzlies
        "MIA",  # Miami Heat
        "MIL",  # Milwaukee Bucks
        "MIN",  # Minnesota Timberwolves
        "NYK",  # New York Knicks
        "ORL",  # Orlando Magic
        "PHI",  # Philadelphia 76ers
        "PHO",  # Phoenix Suns
        "POR",  # Portland Trail Blazers
        "SAC",  # Sacramento Kings
        "SAS",  # San Antonio Spurs
        "TOR",  # Toronto Raptors
        "UTA",  # Utah Jazz
        "WAS",  # Washington Wizards
    ]

    # (abbreviation, first year, last year), both inclusive.
    franchise_eras = [
        # Charlotte: the Bobcats' first season ended in 2005; Hornets from 2015.
        ("CHA", 2005, 2014),
        ("CHO", 2015, last_year),
        # New Orleans: Hornets, two New Orleans/Oklahoma City seasons, Hornets
        # again, then Pelicans from 2014.
        ("NOH", 2004, 2005),
        ("NOK", 2006, 2007),
        ("NOH", 2008, 2013),
        ("NOP", 2014, last_year),
        # Seattle SuperSonics became the Oklahoma City Thunder from 2009.
        ("SEA", 2004, 2008),
        ("OKC", 2009, last_year),
        # New Jersey Nets became the Brooklyn Nets from 2013.
        ("NJN", 2004, 2012),
        ("BRK", 2013, last_year),
    ]

    teamLinks = [
        url_template.format(team=team, year=year)
        for team in stable_teams
        for year in range(first_year, last_year + 1)
    ]
    for team, start, end in franchise_eras:
        teamLinks.extend(
            url_template.format(team=team, year=year) for year in range(start, end + 1)
        )
    return teamLinks

In [34]:
# Download and cache the player index page of every last-name letter
# (letters whose file already exists on disk are skipped).
await scrape_names()

List of all the NBA and ABA Players | Basketball-Reference.com


In [14]:
# Cached per-letter index pages written by scrape_names(). Sorted so the order
# is reproducible, and filtered by the "last_name_" prefix so stray entries in
# the directory (e.g. .ipynb_checkpoints) are ignored.
NAMES_FILES = sorted(
    name for name in os.listdir(PLAYER_NAMES_DIR) if name.startswith("last_name_")
)
playerLinks = []    # players selected by scrape_table
players_2004 = []   # players selected by modified_scrape_table
# Visit every cached page. os.listdir order is arbitrary, so stopping one short
# of the end would drop an unpredictable letter.
for i in range(len(NAMES_FILES)):
    playerLinks.extend(scrape_table(i))
    players_2004.extend(modified_scrape_table(i))
print(len(playerLinks))


553


In [None]:
# Download and cache the page of every player collected in players_2004
# (the links returned by modified_scrape_table).
for player in players_2004:
    await scrape_career_stats(player)

In [None]:
# Download and cache the page of every player in playerLinks (the links
# returned by scrape_table).
# Disabled: collecting the team pages each player appeared on.
#allTeamLinks = []
for player in playerLinks:
    playerTeamLinks = await scrape_career_stats(player)
    #if playerTeamLinks:
    #   for team in playerTeamLinks:
    #      [allTeamLinks.append(team) for team in playerTeamLinks if team not in allTeamLinks and '.html' in team]


In [17]:
#Get the link of all the nba teams
allNBATeamsLinks = get_NBA_team_link()
#boolean flag to ensure we get the data for all teams (sometimes random failure occurs)
completedTeams = True
#conditional loop
while completedTeams:
    completedTeams = True
    #for each link scrape the data
    for link in allNBATeamsLinks:
        #if an element was already completed prevCompleted is set to True, else, it is False
        prevCompleted = await scrape_nba_team_stats(link)
        #if prevCompleted is ever flase, then set completedTeams to False, so it goes through the loop again
        if not prevCompleted:
            completedTeams = False


Current Team:  ATL2004.html
Current Team:  ATL2005.html
Current Team:  ATL2006.html
Current Team:  ATL2007.html
Current Team:  ATL2008.html
Current Team:  ATL2009.html
Current Team:  ATL2010.html
Current Team:  ATL2011.html
Current Team:  ATL2012.html
Current Team:  ATL2013.html
Current Team:  ATL2014.html
Current Team:  ATL2015.html
Current Team:  ATL2016.html
Current Team:  ATL2017.html
Current Team:  ATL2018.html
Current Team:  ATL2019.html
Current Team:  ATL2020.html
Current Team:  ATL2021.html
Current Team:  ATL2022.html
Current Team:  ATL2023.html
Current Team:  ATL2024.html
Current Team:  BOS2004.html
Current Team:  BOS2005.html
Current Team:  BOS2006.html
Current Team:  BOS2007.html
Current Team:  BOS2008.html
Current Team:  BOS2009.html
Current Team:  BOS2010.html
Current Team:  BOS2011.html
Current Team:  BOS2012.html
Current Team:  BOS2013.html
Current Team:  BOS2014.html
Current Team:  BOS2015.html
Current Team:  BOS2016.html
Current Team:  BOS2017.html
Current Team:  BOS20

In [22]:
# Count the entries currently stored in the player stats directory.
# NOTE(review): the variable name is misspelled ("PALYER"); left unchanged in
# case other cells refer to it.
PALYER_STATS_FILE = os.listdir(PLAYER_STATS_2_DIR)
len(PALYER_STATS_FILE)

575