In [44]:
import requests
import time
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import CommonTeamRoster, playercareerstats, leagueseasonmatchups, leaguegamefinder, boxscoresummaryv2, boxscoreplayertrackv2, boxscoretraditionalv2, cumestatsteam, leaguestandings, cumestatsteamgames, leaguegamelog
import pandas as pd
import numpy as np
# import cupy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import requests
import bs4 as bs
import re
from datetime import datetime
from urllib.error import HTTPError

In [45]:
def retry(func, retries=3):
    # Use decorator @retry when making requests to the API. 
    def retry_wrapper(*args, **kwargs):
        attempts = 0
        while attempts < retries:
            try:
                return func(*args, **kwargs)
            except requests.exceptions.HTTPError as e:
                if e.request.status_code == 429:
                    # sports reference websites, of which basketball_reference is a subsidiary, does not like bot traffic.
                    # Limits to 20 requests/min, and a 1 hour ban if violated. 
                    print(e, 'why did i request so much. :(')
                    time.sleep(3600)
            except requests.exceptions.RequestException as e:
                print(e)
                time.sleep(30)
                attempts += 1

    return retry_wrapper

In [46]:
@retry
def scrape_game(link: str):
    '''
    Returns a pandas dataframe containing the box score of both teams. 
    game_link: https://www.basketball-reference.com{link}
    '''
    response = requests.get(f'https://www.basketball-reference.com{link}')
    
    soup = bs.BeautifulSoup(response.text, 'html.parser')

    # Two tables holding stats. One for each team. Want stats for each player for each game.
    basic_stats = ['mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus', 'reason']
    tables = soup.find_all('table', {'class': 'sortable stats_table', 'id': re.compile('box-\w{3}-game-basic')})
    all_player_stats = []
    
    for table in tables:
        team_abbr = table['id'].split('-')[1]
        table_body = table.find('tbody')
        table_rows = table_body.find_all('tr', {'class': None})
        for row in table_rows:
            player_stats = {}
            # Stats which are available in the table
            for stat in basic_stats:    
                potential_stat = row.find('td', {'data-stat': stat})
                if potential_stat != None:
                    player_stats[stat] = potential_stat.get_text()

            # Convert mp to float
            if 'mp' in player_stats:
                minutes = player_stats['mp'].split(":")
                minutes = int(minutes[0]) + int(minutes[1])/60
                player_stats['mp'] = minutes

            # Not so easy stats :(         
            potential_player_name = row.find('th', {'data-stat': 'player'}).find('a')   
            if potential_player_name != None:
                player_stats['player'] = potential_player_name.get_text()
            else:
                player_stats['player'] = 'why'
            player_stats['link'] = link
            game_date = link.split('/')[-1].split('.')[0][:-4] #link takes form /boxscores/YYYYMMDD0(3-digit home abbrev).html
            player_stats['game_date'] = datetime.strptime(game_date, '%Y%m%d').date()
            player_stats['team'] = team_abbr
            player_stats['location'] = link.split('/')[-1].split('.')[0][-3:]

            # Convert empty fields and Nan to None
            for stat in player_stats:
                if player_stats[stat] == '' or player_stats[stat] == 'NaN':
                    player_stats[stat] = None

            # convert to ints/doubles if appropriate
            ints = ['fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']
            doubles = ['fg_pct', 'fg3_pct', 'ft_pct']
            for stat in ints:
                if stat in player_stats and player_stats[stat] != None:
                    player_stats[stat] = int(player_stats[stat])
            for stat in doubles:
                if stat in player_stats and player_stats[stat] != None:
                    player_stats[stat] = float(player_stats[stat])

            all_player_stats.append(player_stats)

    cur_game = pd.DataFrame(all_player_stats)
    cur_game.fillna(0, inplace=True)
    return cur_game

game = scrape_game('/boxscores/202210180BOS.html')       
game.head()



Unnamed: 0,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,...,tov,pf,pts,plus_minus,player,link,game_date,team,location,reason
0,38.183333,8.0,16.0,0.5,2.0,5.0,0.4,3.0,3.0,1.0,...,1.0,5.0,21.0,-6.0,Tyrese Maxey,/boxscores/202210180BOS.html,2022-10-18,PHI,BOS,0
1,37.283333,9.0,14.0,0.643,5.0,9.0,0.556,12.0,12.0,1.0,...,3.0,3.0,35.0,1.0,James Harden,/boxscores/202210180BOS.html,2022-10-18,PHI,BOS,0
2,37.266667,9.0,18.0,0.5,1.0,6.0,0.167,7.0,9.0,0.778,...,6.0,4.0,26.0,-13.0,Joel Embiid,/boxscores/202210180BOS.html,2022-10-18,PHI,BOS,0
3,34.233333,7.0,14.0,0.5,3.0,6.0,0.5,1.0,2.0,0.5,...,0.0,3.0,18.0,-1.0,Tobias Harris,/boxscores/202210180BOS.html,2022-10-18,PHI,BOS,0
4,33.016667,3.0,5.0,0.6,0.0,2.0,0.0,0.0,0.0,0.0,...,2.0,2.0,6.0,-6.0,P.J. Tucker,/boxscores/202210180BOS.html,2022-10-18,PHI,BOS,0


In [47]:
def add_aggregate_stats(game):
    pass
    '''
    Adds aggregate stats for each team. Returns a copy of the dataframe with those aggregate stats. 
    '''
    # Want to use sum(), but this removes some 
    agg_dict = game.columns

    agg = game.groupby('team')
    agg = agg.sum()

    return agg
    # teams = []
    # for team_abbr in game['team'].unique():
    #     team_stats = game[game['team'] == team_abbr].copy()
    #     team_stats.loc[f'{team_abbr} Total'] = team_stats.sum(numeric_only=True, axis=0)

    #     teams.append(team_stats)

    # stats = pd.concat(teams, axis = 0).reset_index()
    # for team_abbr in game['team'].unique():
    #     stats.loc[f'{team_abbr} Total']['player'] = 


    # return pd.concat(teams, axis=0).reset_index()

game = scrape_game('/boxscores/202210180BOS.html')  
game = add_aggregate_stats(game)
game

Unnamed: 0_level_0,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
BOS,240.0,46.0,82.0,4.363,12.0,35.0,2.269,22.0,28.0,4.82,6.0,30.0,36.0,24.0,8.0,3.0,10.0,24.0,126.0,45.0
PHI,240.0,40.0,80.0,3.826,13.0,34.0,2.456,24.0,28.0,3.778,4.0,27.0,31.0,16.0,8.0,3.0,14.0,25.0,117.0,-45.0


In [48]:
@retry # find_months_with_games
def find_months_with_games(season):
    months = []
    response = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html')
    soup = bs.BeautifulSoup(response.text, 'html.parser')
    filter = soup.find('div', {'class': 'filter'})
    for month in filter.find_all('div'):
        months.append(month.find('a').text)

    # Need to lowercase them for some reason
    months = [mo.lower() for mo in months]
    return months

In [49]:
@retry # find_games_in_month(season, month)
def find_games_in_month(season, month):
    '''
    Returns the links to all games in that season and month
    '''
    try:
        response = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html')
        response.raise_for_status()
        soup = bs.BeautifulSoup(response.text, 'html.parser')
        table_holding_games = soup.find('tbody') # Only one tbody tag on the page.
        links = table_holding_games.find_all('td', {'data-stat': 'box_score_text'})
        links = [link.find('a').get('href') for link in links]
        return(links)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            print(f'No games were played in {month} during the {season}-{season+1} season')
        else:
            raise
    except Exception as e:
        print(e.response.status_code)
        raise

In [None]:
@retry # scrape_games_in_month(season, month)
def scrape_games_in_month(season, month)
    '''
    Scrapes all games in a given season and month
    '''
    game_links = find_games_in_month(season, month)
    for link in game_links:
        game_stats = scrape_game(link)
        


In [15]:
def scrape_season(season):
    '''
    Scrapes a seasons worth of data from basketball_reference.
    Returns a pandas dataframe 
    '''
    season_games = pd.DataFrame()
    for month in ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august']:
        # Find the links to all the games
        try:
            response = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html')
            response.raise_for_status()
            soup = bs.BeautifulSoup(response.text, 'html.parser')
            table_holding_games = soup.find('tbody') # Only one tbody tag on the page.
            links = table_holding_games.find_all('td', {'data-stat': 'box_score_text'})
            links = [link.find('a').get('href') for link in links]
            
            # Yoink all games and concat to a dataframe. 
            for link in links: # Just add stats for each player, for each game.
                print(link)
                cur_game = pd.DataFrame()    
                response = requests.get(f'https://www.basketball-reference.com{link}')
                response.raise_for_status()
                
                soup = bs.BeautifulSoup(response.text, 'html.parser')

                # Two tables holding stats. One for each team. Want stats for each player for each game.
                date = link.split('/')[-1].split('.')[0][:-3] #link takes form /boxscores/YYYYMMDD(3-digit home abbrev).html
                tracked_stats = ['player', 'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']
                tables = soup.find_all('table', {'class': 'sortable stats_table'})
                
                for table in tables:
                    team_name = table.find('caption').get_text()
                    team_name = team_name.split(' Basic')[0]
                    table_body = table.find('tbody')
                    table_rows = table_body.find_all('tr')
                    for row in table_rows:
                        player_name = row.find('th', {'data-stat': 'player'}).find('a').get_text()
                        player_stats = {}
                        for stat in tracked_stats:
                            potential_stat = row.find('th', {'data-stat', stat})
                            if potential_stat != None:
                                player_stats[stat] = potential_stat.get_text()
                        player_stats['player'] = player_name
                        player_stats['link'] = link
                
                player_stats = pd.Series(player_stats)
                cur_game.concat(player_stats)


        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                print(f'No games found for {season} {month}')
            break
        except requests.exceptions.RequestException as e:
            print(f'Error with {season} {month}')
            break



        

scrape_game('/boxscores/194910290TRI.html')

ValueError: invalid literal for int() with base 10: ''