In [16]:
import requests
import time
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import CommonTeamRoster, playercareerstats, leagueseasonmatchups, leaguegamefinder, boxscoresummaryv2, boxscoreplayertrackv2, boxscoretraditionalv2, cumestatsteam, leaguestandings, cumestatsteamgames, leaguegamelog
import pandas as pd
import numpy as np
# import cupy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import requests
import bs4 as bs
import re

In [2]:
def retry(func, retries=3):
    # Use decorator @retry when making requests to the API. 
    def retry_wrapper(*args, **kwargs):
        attempts = 0
        while attempts < retries:
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                print(e)
                time.sleep(30)
                attempts += 1

    return retry_wrapper

In [23]:
def scrape_game(link: str):
    '''
    Returns a pandas dataframe containing the box score of both teams. 
    game_link: https://www.basketball-reference.com{link}
    '''
    response = requests.get(f'https://www.basketball-reference.com{link}')
    try:
        response.raise_for_status()
    except:
        print(str('hello'))
        print(f'Issue in requesting {link}')
        return None
    
    soup = bs.BeautifulSoup(response.text, 'html.parser')

    # Two tables holding stats. One for each team. Want stats for each player for each game.
    tracked_stats = ['mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus', 'reason']
    tables = soup.find_all('table', {'class': 'sortable stats_table', 'id': re.compile('box-\w{3}-game-basic')})
    all_player_stats = []
    
    for table in tables:
        
        team_abbr = table['id'].split('-')[1]
        table_body = table.find('tbody')
        table_rows = table_body.find_all('tr', {'class': None})
        for row in table_rows:
            player_stats = {}
            # Stats which are available in the table
            for stat in tracked_stats:    
                potential_stat = row.find('td', {'data-stat': stat})
                if potential_stat != None:
                    player_stats[stat] = potential_stat.get_text()

            # Not so easy :(         
            potential_player_name = row.find('th', {'data-stat': 'player'}).find('a')   
            if potential_player_name != None:
                player_stats['player'] = potential_player_name.get_text()
            else:
                player_stats['player'] = 'why'
            player_stats['link'] = link
            player_stats['game_date'] = link.split('/')[-1].split('.')[0][:-3] #link takes form /boxscores/YYYYMMDD(3-digit home abbrev).html
            player_stats['team'] = team_abbr
            # Location of game

            all_player_stats.append(player_stats)

    cur_game = pd.DataFrame(all_player_stats)
    return cur_game

game = scrape_game('/boxscores/202210180BOS.html')       
game



Unnamed: 0,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,...,blk,tov,pf,pts,plus_minus,player,link,game_date,team,reason
0,38:11,8.0,16.0,0.5,2.0,5.0,0.4,3.0,3.0,1.0,...,0.0,1.0,5.0,21.0,-6.0,Tyrese Maxey,/boxscores/202210180BOS.html,202210180,PHI,
1,37:17,9.0,14.0,0.643,5.0,9.0,0.556,12.0,12.0,1.0,...,0.0,3.0,3.0,35.0,1.0,James Harden,/boxscores/202210180BOS.html,202210180,PHI,
2,37:16,9.0,18.0,0.5,1.0,6.0,0.167,7.0,9.0,0.778,...,1.0,6.0,4.0,26.0,-13.0,Joel Embiid,/boxscores/202210180BOS.html,202210180,PHI,
3,34:14,7.0,14.0,0.5,3.0,6.0,0.5,1.0,2.0,0.5,...,0.0,0.0,3.0,18.0,-1.0,Tobias Harris,/boxscores/202210180BOS.html,202210180,PHI,
4,33:01,3.0,5.0,0.6,0.0,2.0,0.0,0.0,0.0,,...,1.0,2.0,2.0,6.0,-6.0,P.J. Tucker,/boxscores/202210180BOS.html,202210180,PHI,
5,20:33,2.0,4.0,0.5,1.0,2.0,0.5,0.0,0.0,,...,0.0,0.0,2.0,5.0,-13.0,De'Anthony Melton,/boxscores/202210180BOS.html,202210180,PHI,
6,16:12,0.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,0.5,...,0.0,1.0,2.0,1.0,-5.0,Danuel House Jr.,/boxscores/202210180BOS.html,202210180,PHI,
7,12:09,1.0,4.0,0.25,1.0,3.0,0.333,0.0,0.0,,...,0.0,0.0,1.0,3.0,-5.0,Georges Niang,/boxscores/202210180BOS.html,202210180,PHI,
8,10:44,1.0,3.0,0.333,0.0,0.0,,0.0,0.0,,...,1.0,1.0,3.0,2.0,4.0,Montrezl Harrell,/boxscores/202210180BOS.html,202210180,PHI,
9,0:23,0.0,0.0,,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,-1.0,Matisse Thybulle,/boxscores/202210180BOS.html,202210180,PHI,


In [15]:
def scraper_bball_reference(cur_season=2022):
    '''
    Scrapes data from basketball-reference.com. Saves all games in a csv. Checks to make sure that this game is not already in ths csv. 
    '''
    all_games = pd.DataFrame()
    for season in range(1950, cur_season):
        season_games = pd.DataFrame()
        for month in ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august']:
            try:
                response = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html')
                response.raise_for_status()
                soup = bs.BeautifulSoup(response.text, 'html.parser')
                table_holding_games = soup.find('tbody') # Only one tbody tag on the page.
                links = table_holding_games.find_all('td', {'data-stat': 'box_score_text'})
                links = [link.find('a').get('href') for link in links]
                
                # Yoink all games and concat to a dataframe. 
                for link in links: # Just add stats for each player, for each game.
                    print(link)
                    cur_game = pd.DataFrame()    
                    response = requests.get(f'https://www.basketball-reference.com{link}')
                    response.raise_for_status()
                    
                    soup = bs.BeautifulSoup(response.text, 'html.parser')

                    # Two tables holding stats. One for each team. Want stats for each player for each game.
                    date = link.split('/')[-1].split('.')[0][:-3] #link takes form /boxscores/YYYYMMDD(3-digit home abbrev).html
                    tracked_stats = ['player', 'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']
                    tables = soup.find_all('table', {'class': 'sortable stats_table'})
                    
                    for table in tables:
                        team_name = table.find('caption').get_text()
                        team_name = team_name.split(' Basic')[0]
                        table_body = table.find('tbody')
                        table_rows = table_body.find_all('tr')
                        for row in table_rows:
                            player_name = row.find('th', {'data-stat': 'player'}).find('a').get_text()
                            player_stats = {}
                            for stat in tracked_stats:
                                potential_stat = row.find('th', {'data-stat', stat})
                                if potential_stat != None:
                                    player_stats[stat] = potential_stat.get_text()
                            player_stats['player'] = player_name
                            player_stats['link'] = link
                    
                    player_stats = pd.Series(player_stats)
                    cur_game.concat(player_stats)


            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    print(f'No games found for {season} {month}')
                break
            except requests.exceptions.RequestException as e:
                print(f'Error with {season} {month}')
                break



        

scrape_game('/boxscores/194910290TRI.html')       
scraper_bball_reference(1951)

/boxscores/194910290TRI.html
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
h

AttributeError: 'DataFrame' object has no attribute 'concat'

In [None]:
# Save all game stats locally? Want to use stats multiple times, and don't want to make multiple requests.
def save_all_games():
    # Check to see if file/game exists. Want to be able to save extra columnns for predictions. 
    pd.read_csv('data/all_games.csv')

In [3]:
@retry
def career_game_stats(player_id):
    '''
    Returns a df with all games stats for a player.
    '''
    player_games = leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation='P', player_id_nullable = player_id).get_data_frames()[0]
    # Transform matchup to just the opponent teams id
    def convert_matchup(matchup):
        '''
        input: matchup string
        output: opponent team id
        '''
        matchup_abbrev = matchup.strip()[-3:]
        matchup_team = teams.find_team_by_abbreviation(matchup_abbrev)
        if matchup_team != None:
            return str(matchup_team['id'])
        else:
            return None
    player_games['MATCHUP'] = player_games['MATCHUP'].apply(lambda x: convert_matchup(x))
    
    # Create a df with all games played for each season. To be used for further filtering.
    games_by_season = {}
    for unique_season in player_games['SEASON_ID'].unique():
        season = unique_season[-4:]+ '-' + str(int(unique_season[-2:])+1) # convert season id to year format
        all_season_games = leaguegamelog.LeagueGameLog(season = season, season_type_all_star='Regular Season', player_or_team_abbreviation = 'T').get_data_frames()[0]
        # Create a column with Unix time for each game. Makes for easier filtering and comparisons later.
        all_season_games['GAME_UNIX_TIME'] = all_season_games['GAME_DATE'].apply(lambda x: time.mktime(time.strptime(x, '%Y-%m-%d')))
        games_by_season[season] = all_season_games

    # Iterate over each game and do transformations and calculations for each game. Want factors such as matchup w/l prior to game.
    for row in player_games.itertuples():
        # Calculate win percentage of opponent team
        opponent = all_games[all_games['MATCHUP'] == row.MATCHUP]
        opponent = opponent[opponent['GAME_DATE'] < row.GAME_DATE]

    player_games['matchup_wl'] = player_games['WL'].apply(lambda x: team_wl(player_games['MATCHUP'][x]))

    return player_games

games = career_game_stats(2544)
games

here


KeyboardInterrupt: 

In [7]:
# boxscoreplayertrackv2.BoxScorePlayerTrackV2(game_id='0021900001').get_data_frames()[0]
# boxscoretraditionalv2.BoxScoreTraditionalV2(game_id='0021900001').get_data_frames()[0]
# cumestatsteam.CumeStatsTeam(game_id = '0021900001', league_id = '', season = '2019-20', team_id='1610612747').get_data_frames()[0]
# cumestatsteamgames.CumeStatsTeamGames(league_id = '00', season = '2019-20', team_id='1610612747', season_type_all_star=('Regular Season')).get_data_frames()[0]
# all_games = leaguegamelog.LeagueGameLog(season = '2019-20', season_type_all_star='Regular Season', player_or_team_abbreviation = 'T', date_to_nullable = '8/5/2020').get_data_frames()[0]
# all_games = all_games[all_games['TEAM_ID'] == 1610612747]
# all_games
    # player_games = leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation='P', player_id_nullable = player_id).get_data_frames()[0]
@retry
def foo():
    leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation='P')

foo()

HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
