In [2]:
import requests
import time
# from nba_api.stats.static import players, teams
# from nba_api.stats.endpoints import CommonTeamRoster, playercareerstats, leagueseasonmatchups, leaguegamefinder, boxscoresummaryv2, boxscoreplayertrackv2, boxscoretraditionalv2, cumestatsteam, leaguestandings, cumestatsteamgames, leaguegamelog
import pandas as pd
# import numpy as np
# import cupy as np 
import json
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_score
import requests
import bs4 as bs
import re
from datetime import datetime
from urllib.error import HTTPError
import os

In [3]:
def retry(func, retries=3):
    '''
    Decorator that retries func when a requests call inside it fails.
    Use decorator @retry when making requests to the API.

    func: the function to wrap.
    retries: number of failed attempts (rate-limit waits excluded) before giving up.

    What the returned wrapper does:
      - HTTP 429: prints the error, sleeps one hour and tries again. Not counted as an attempt.
      - Any other requests error: prints it, sleeps 30 seconds and counts one attempt.
      - Once `retries` attempts have failed it returns None; nothing is raised.
    Exceptions that are not requests.exceptions.RequestException propagate immediately.
    '''
    import functools  # local import so this cell does not depend on the import cell changing

    @functools.wraps(func)  # keep the wrapped function's __name__ and docstring
    def retry_wrapper(*args, **kwargs):
        attempts = 0
        while attempts < retries:
            try:
                return func(*args, **kwargs)
            except requests.exceptions.HTTPError as e:
                # The status code lives on the response (the request object has none).
                # getattr also covers an HTTPError that was raised without a response.
                status_code = getattr(e.response, 'status_code', None)
                if status_code == 429:
                    # sports reference websites, of which basketball_reference is a subsidiary, does not like bot traffic.
                    # Limits to 20 requests/min, and a 1 hour ban if violated.
                    print(e, 'why did i request so much. :(')
                    time.sleep(3600)
                    continue
                # Every other HTTP error is treated like a generic request failure.
                print(e)
                time.sleep(30)
                attempts += 1
            except requests.exceptions.RequestException as e:
                print(e)
                time.sleep(30)
                attempts += 1
        # All attempts failed.
        return None

    return retry_wrapper

In [4]:
@retry #scrape_game(link)
def scrape_game(link: str):
    '''
    Returns a pandas dataframe containing the box score of both teams, one row per player.
    game_link: https://www.basketball-reference.com{link}

    link: path of the box score page. Takes the form /boxscores/YYYYMMDD0(3-letter home abbrev).html;
          the game date and the location (home abbreviation) are parsed out of it.

    Columns: whichever of the basic box score stats the page has for a row, plus
    player, link, game_date, team and location. Empty values are filled with 0.
    A non-2xx response raises requests.exceptions.HTTPError inside the function,
    which is the error the @retry decorator reacts to.
    '''
    response = requests.get(f'https://www.basketball-reference.com{link}', timeout=30)
    # Without this a rate-limited (429) or error page would be parsed like a box score
    # and silently produce an empty dataframe.
    response.raise_for_status()

    soup = bs.BeautifulSoup(response.text, 'html.parser')

    # Same for every row of the game, so parse the link once.
    game_id = link.split('/')[-1].split('.')[0]
    game_date = datetime.strptime(game_id[:-4], '%Y%m%d').date()
    location = game_id[-3:]

    # Two tables holding stats. One for each team. Want stats for each player for each game.
    basic_stats = ['mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus', 'reason']
    ints = ['fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']
    doubles = ['fg_pct', 'fg3_pct', 'ft_pct']
    tables = soup.find_all('table', {'class': 'sortable stats_table', 'id': re.compile(r'box-\w{3}-game-basic')})
    all_player_stats = []

    for table in tables:
        # Table id looks like box-XXX-game-basic, XXX being the team abbreviation.
        team_abbr = table['id'].split('-')[1]
        table_body = table.find('tbody')
        if table_body is None:
            continue
        for row in table_body.find_all('tr', {'class': None}):
            player_stats = {}
            # Stats which are available in the table
            for stat in basic_stats:
                cell = row.find('td', {'data-stat': stat})
                if cell is not None:
                    player_stats[stat] = cell.get_text()

            # Convert mp from 'MM:SS' to float minutes
            if player_stats.get('mp'):
                minutes, _, seconds = player_stats['mp'].partition(':')
                player_stats['mp'] = int(minutes) + int(seconds or 0) / 60

            # Player name sits in a link inside the row header; 'why' marks rows without one.
            name_cell = row.find('th', {'data-stat': 'player'})
            name_link = name_cell.find('a') if name_cell is not None else None
            player_stats['player'] = name_link.get_text() if name_link is not None else 'why'
            player_stats['link'] = link
            player_stats['game_date'] = game_date
            player_stats['team'] = team_abbr
            player_stats['location'] = location

            # Convert empty fields and Nan to None
            for stat in player_stats:
                if player_stats[stat] == '' or player_stats[stat] == 'NaN':
                    player_stats[stat] = None

            # convert to ints/doubles if appropriate
            for stat in ints:
                if player_stats.get(stat) is not None:
                    player_stats[stat] = int(player_stats[stat])
            for stat in doubles:
                if player_stats.get(stat) is not None:
                    player_stats[stat] = float(player_stats[stat])

            all_player_stats.append(player_stats)

    return pd.DataFrame(all_player_stats).fillna(0)

In [5]:
@retry # find_months_with_games
def find_months_with_games(season):
    '''
    Make sure we only request for months that exist in a season.

    season: the value put into the NBA_{season}_games.html schedule url.
    Returns the month labels found in the page's month filter as a list of strings,
    lowercased and with spaces replaced by '-'.
    A non-2xx response raises requests.exceptions.HTTPError inside the function,
    which is the error the @retry decorator reacts to.
    Raises ValueError when the page has no month filter at all.
    '''
    response = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html', timeout=30)
    # Fail loudly on rate-limit/error pages instead of parsing them.
    response.raise_for_status()
    soup = bs.BeautifulSoup(response.text, 'html.parser')
    month_filter = soup.find('div', {'class': 'filter'})
    if month_filter is None:
        raise ValueError(f'No month filter found on the schedule page for season {season}')

    months = []
    for month in month_filter.find_all('div'):
        month_link = month.find('a')
        if month_link is None:
            # Entry without a link: nothing to request for it.
            continue
        # Need to lowercase them for some reason
        months.append(month_link.text.replace(' ', '-').lower())
    return months

In [6]:
@retry # find_games_in_month(season, month)
def find_games_in_month(season, month):
    '''
    Returns the links to all games in that season and month

    season: the value put into the schedule url (an int, season-1 is printed on a 404).
    month: the month part of the schedule url.
    Returns a list of box score hrefs. The list is empty when the page answers 404
    or holds no games table. Games without a box score link are left out.
    Any other HTTP error is re-raised inside the function for @retry to handle.
    '''
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html'
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            print(f'No games were played in {month} during the {season-1}-{season} season')
            print(url)
            # Empty list so callers can still iterate over the result.
            return []
        raise

    soup = bs.BeautifulSoup(response.text, 'html.parser')
    table_holding_games = soup.find('tbody') # Only one tbody tag on the page.
    if table_holding_games is None:
        print(f'No games table found on {url}')
        return []

    box_score_cells = table_holding_games.find_all('td', {'data-stat': 'box_score_text'})
    # Some games will be scheduled but not yet played; their cell has no link.
    box_score_links = (cell.find('a') for cell in box_score_cells)
    return [anchor.get('href') for anchor in box_score_links if anchor is not None]

In [7]:
@retry # scrape_games_in_month(season, month)
def scrape_games_in_month(season, month):
    '''
    Scrapes all games in a given season and month

    Returns a single dataframe with the rows of every scraped game (index reset).
    Returns an empty dataframe when there were no games or none could be scraped.
    Sleeps 3 seconds after each game to stay under the request limit.
    '''
    # find_games_in_month can come back as None (e.g. when @retry gave up on it).
    game_links = find_games_in_month(season, month) or []
    all_game_dataframes = []
    for link in game_links:
        game_stats = scrape_game(link)
        if game_stats is None:
            # scrape_game gave up on this game; say so instead of dropping it silently.
            print(f'Could not scrape {link}, skipping it.')
        else:
            all_game_dataframes.append(game_stats)
        time.sleep(3) # 20 requests/min timer.

    if not all_game_dataframes:
        # pd.concat raises ValueError when it has nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(all_game_dataframes, axis=0).reset_index(drop=True)

In [8]:
def scrape_season(season, data_dir='data'):
    '''
    Scrapes a seasons worth of data from basketball_reference.
    Saves into csv files, one per month: '{data_dir}/{year}/{year}-{month}.csv'
    Months whose csv file already exists are skipped, so an interrupted run can be resumed.
    Does not work for season 2019-2020 (bubble season). Do that one manually.

    season: the season as used in the basketball_reference urls; the season spans years (season-1) to season.
    data_dir: directory the per-year folders are created in. Created if missing.
    Returns nothing.
    '''
    # find_months_with_games can come back as None when @retry gave up on it.
    months = find_months_with_games(season) or []
    for month in months:
        if month in ['september', 'october', 'november', 'december']:
            # Actually the prior year because season are years (season-1) to season.
            year = season - 1
        else:
            year = season

        year_dir = f'{data_dir}/{year}'
        csv_path = f'{year_dir}/{year}-{month}.csv'
        if os.path.exists(csv_path):
            print(f'{year}-{month} already exists. Current time: ' + datetime.now().strftime("%H:%M:%S"))
            continue

        print(f'Scraping games in {year}-{month}. Current time: ' + datetime.now().strftime("%H:%M:%S"))
        games = scrape_games_in_month(season, month) # func works off season not year.
        if games is None or games.empty:
            # Do not write an empty file: it would mark the month as done for later runs.
            print(f'No games scraped for {year}-{month}, nothing saved.')
            continue
        # makedirs also creates data_dir itself on the first run.
        os.makedirs(year_dir, exist_ok=True)
        games.to_csv(csv_path, index=False)

# scrape_season(2020)
    

In [9]:
# Scrape every season from 2023 down to 2001, newest first.
for year in reversed(range(2001, 2024)):
    scrape_season(year)

2022-october already exists. Current time: 12:44:37
2022-november already exists. Current time: 12:44:37
2022-december already exists. Current time: 12:44:37
2023-january already exists. Current time: 12:44:37
2023-february already exists. Current time: 12:44:37
2023-march already exists. Current time: 12:44:37
2023-april already exists. Current time: 12:44:37
2023-may already exists. Current time: 12:44:37
2023-june already exists. Current time: 12:44:37
2021-october already exists. Current time: 12:44:37
2021-november already exists. Current time: 12:44:37
2021-december already exists. Current time: 12:44:37
2022-january already exists. Current time: 12:44:37
2022-february already exists. Current time: 12:44:37
2022-march already exists. Current time: 12:44:37
2022-april already exists. Current time: 12:44:37
2022-may already exists. Current time: 12:44:37
2022-june already exists. Current time: 12:44:37
2020-december already exists. Current time: 12:44:37
2021-january already exists