In [40]:
import os
import re
import time
import glob

import numpy as np
import pandas as pd
import random, requests
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [41]:
# Root folder for everything the notebook reads and writes: <project>/data.
# The notebook may be started from <project>/src, so any path component named
# exactly 'src' is dropped from the working directory first. Working on whole
# components (split on os.sep) keeps this correct on Windows, where the
# separator is a backslash, and avoids touching folders whose name merely
# starts with 'src'.
DATA_DIR = os.path.join(
    os.sep.join(part for part in os.getcwd().split(os.sep) if part != 'src'),
    'data',
)

# WebScraping Player Game Log Stats

In [42]:
# Scraping window per season label: [first_date, last_date], both 'YYYYMMDD'.
# The driver cell expands each pair into a daily range with pd.date_range.
# NOTE(review): the 2022-23 window ends on 20230206 — presumably the day of the
# last scrape rather than the end of that season; extend it to fetch later games.
SEASON_DATES = {
    season_label: [first_date, last_date]
    for season_label, first_date, last_date in (
        ('2020-21', '20201222', '20210516'),
        ('2021-22', '20211017', '20220410'),
        ('2022-23', '20221018', '20230206'),
    )
}

In [43]:
class DataScraper():
    """Scrape historical player box scores from Basketball-Reference.com.

    For every game listed on a requested date, one CSV per team is written to
    ``DATA_DIR/Boxscores/<season>/<YYYYMMDD>-<TEAM>.csv``. The season folder
    must already exist; this class does not create it.
    """

    URL_PARENT = "https://www.basketball-reference.com"
    URL_BOXSCORES = URL_PARENT + "/boxscores/?month={month}&day={day}&year={year}"

    # Seconds to pause after each game page has been processed.
    REQUEST_DELAY = 10

    # Rows whose first cell holds one of these notes are dropped entirely.
    SKIP_REASONS = ['Did Not Play', 'Not With Team']

    # Columns added by the scraper in front of the scraped stat cells.
    GAME_COLUMNS = ['Name', 'Date', 'Team', 'Home', 'W', 'W_PTS', 'L', 'L_PTS']

    # Labels for the cells of the basic table, in cell order. FTA belongs
    # between FT and FT_perc; listing it elsewhere shifts every later label
    # onto the wrong data.
    # NOTE(review): this order assumes the site's current basic/advanced
    # layouts — re-check if Basketball-Reference adds or reorders columns.
    BASIC_COLUMNS = ['MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc',
                     'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST',
                     'STL', 'BLK', 'TOV', 'PF', 'PTS', '+-']

    # Labels for the cells of the advanced table after its first cell,
    # which is skipped when rows are assembled.
    ADVANCED_COLUMNS = ['TS_perc', 'eFG_perc', '3PAr', 'FTr', 'ORB_perc',
                        'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc',
                        'BLK_perc', 'TOV_perc', 'USG_perc', 'ORtg', 'DRtg',
                        'BPM']

    COLUMN_HEADERS = GAME_COLUMNS + BASIC_COLUMNS + ADVANCED_COLUMNS

    def _fetch_soup(self, url):
        """Download ``url`` and parse it with lxml, closing the response afterwards."""
        with urlopen(url) as response:
            return BeautifulSoup(response, 'lxml')

    @staticmethod
    def _minutes_to_decimal(clock):
        """Convert an ``'MM:SS'`` string to minutes rounded to 2 decimals.

        Any text without a colon (e.g. a non-participation note) yields 0.00.
        """
        if ':' not in clock:
            return 0.00
        parts = clock.split(':')
        return round(int(parts[0]) + int(parts[1]) / 60, 2)

    def _team_frame(self, soup_game, team_code, game_data):
        """Build the per-player DataFrame for one team of one game.

        Parameters
        ----------
        soup_game : BeautifulSoup
            Parsed box score page of the game.
        team_code : str
            Three-letter team code taken from the team link.
        game_data : list
            The seven game-level values that follow the player name in each row
            (date, team, home flag, winner, winner points, loser, loser points).
        """
        basic_id = 'box-{team}-game-basic'.format(team=team_code.upper())
        advanced_id = 'box-{team}-game-advanced'.format(team=team_code.upper())

        # class_=None keeps only rows without a class attribute, which leaves
        # out header-style separator rows inside the table body.
        rows_basic = soup_game.find('table', id=basic_id).find('tbody').find_all('tr', class_=None)
        rows_advanced = soup_game.find('table', id=advanced_id).find('tbody').find_all('tr', class_=None)

        player_data = []
        for i, row in enumerate(rows_basic):
            if row.find('td').get_text() in self.SKIP_REASONS:
                continue
            # The advanced table is read by position, so it is assumed to list
            # players in the same order as the basic table.
            player_data.append(
                [row.find('a').get_text()] + game_data
                + [td.get_text() for td in row.find_all('td')]
                + [td.get_text() for td in rows_advanced[i].find_all('td')[1:]]
            )

        df = pd.DataFrame(player_data, columns=self.COLUMN_HEADERS)
        # Must stay before the numeric conversion: it zero-fills cells that are
        # missing altogether (presumably rows with fewer cells than headers),
        # whereas empty strings become NaN in the next step and stay NaN.
        df = df.fillna(0)
        df.loc[:, 'FG':'+-'] = df.loc[:, 'FG':'+-'].apply(pd.to_numeric)
        df['MP'] = [self._minutes_to_decimal(t) for t in df['MP']]
        return df

    # Scraping Historical Game Data from Basketball-Reference.com
    def get_boxscores(self, season, date_list):
        """Scrape every game played on each date and write one CSV per team.

        Parameters
        ----------
        season : str
            Season label; used in the progress message and as the output
            sub-folder name.
        date_list : list of str
            Dates to scrape, formatted ``'YYYYMMDD'``.

        Returns
        -------
        None
        """
        print("Scraping boxscores from the {} regular season".format(season))

        for date in tqdm(date_list):
            # BeautifulSoup object for a list of boxscores on a given day
            url_summaries = self.URL_BOXSCORES.format(month=date[4:6], day=date[6:8], year=date[0:4])
            soup_summaries = self._fetch_soup(url_summaries)
            games = soup_summaries.find_all('div', class_='game_summary expanded nohover')

            for game in games:
                # Characters 7-9 of a team link are the three-letter code,
                # presumably from hrefs shaped like '/teams/XXX/...'. The second
                # link of the second table is taken to be the home team.
                host = game.find_all('table')[1].find_all('a')[1]['href'][7:10]

                winner = game.find('tr', class_='winner').find_all('td')
                loser = game.find('tr', class_='loser').find_all('td')
                summary = {
                    'winner': [winner[0].find('a')['href'][7:10], int(winner[1].get_text())],
                    'loser': [loser[0].find('a')['href'][7:10], int(loser[1].get_text())],
                }

                url_game = self.URL_PARENT + game.find('a', text='Box Score')['href']
                soup_game = self._fetch_soup(url_game)

                for team in ('winner', 'loser'):
                    team_code = summary[team][0]
                    home = 1 if team_code == host else 0
                    game_data = [date, team_code, home, summary['winner'][0],
                                 summary['winner'][1], summary['loser'][0], summary['loser'][1]]

                    df = self._team_frame(soup_game, team_code, game_data)
                    df.to_csv(os.path.join(DATA_DIR, 'Boxscores', season, date + '-' + team_code + '.csv'),
                              index=False)

                # Pause between games to stay polite to the site.
                time.sleep(self.REQUEST_DELAY)
        return None
  

In [44]:
scraper = DataScraper()

# To scrape only some seasons, comment out the other entries in SEASON_DATES.
for data_type in ['Boxscores']:
    for season in SEASON_DATES.keys():
        season_dir = os.path.join(DATA_DIR, data_type, season)
        # On the first run this creates the folder; nothing is found in it
        # below, so the whole season gets scraped.
        os.makedirs(season_dir, exist_ok=True)

        start_date = SEASON_DATES[season][0]
        end_date = SEASON_DATES[season][1]
        # Every calendar day from start_date to end_date, as 'YYYYMMDD'.
        date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date)]

        if data_type == 'Boxscores':
            # Box scores are stored as {date}-{team}.csv (i.e. 20131029-CHI.csv)
            file_pattern = '{}*.csv'
            scrape = scraper.get_boxscores
        else:
            # Salaries are stored as salary_{date}.csv (i.e. salary_20131029.csv)
            file_pattern = 'salary_{}.csv'
            # NOTE(review): DataScraper as defined in this notebook has no
            # get_fantasy_salary method, so this branch raises AttributeError
            # if another data type is ever added to the list above.
            scrape = scraper.get_fantasy_salary

        # Resume from the latest date that already has output. That date itself
        # is scraped again, since an earlier run may have stopped part-way
        # through it.
        last_scraped = next(
            (date for date in reversed(date_list)
             if glob.glob(os.path.join(glob.escape(season_dir), file_pattern.format(date)))),
            None,
        )
        if last_scraped is not None:
            date_list = date_list[date_list.index(last_scraped):]

        scrape(season, date_list)

Scraping boxscores from the 2020-21 regular season


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Scraping boxscores from the 2021-22 regular season


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Scraping boxscores from the 2022-23 regular season


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




# WebScraping Team Stats

In [39]:
# League summary pages to scrape, one per season.
season_list = ['NBA_2021.html', 'NBA_2022.html', 'NBA_2023.html']


def get_table(Season, url='https://www.basketball-reference.com/leagues/'):
    """Download one league page and return its 'advanced-team' table.

    Parameters
    ----------
    Season : str
        Page name appended to ``url``, e.g. ``'NBA_2021.html'``.
    url : str
        Base URL of the league pages; a trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        The table with id ``advanced-team`` as parsed by ``pd.read_html``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no table with id ``advanced-team``.
    """
    from io import StringIO  # local import: read_html expects a file-like object

    # Strip the trailing slash so the joined URL has no '//' before the page name.
    Season_url = f"{url.rstrip('/')}/{Season}"
    page = requests.get(Season_url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    table = soup.find("table", attrs={"id": "advanced-team"})
    if table is None:
        raise ValueError(f"No table with id 'advanced-team' found at {Season_url}")
    return pd.read_html(StringIO(str(table)))[0]


# Write next to the other scraped data instead of a machine-specific path.
team_stats_dir = os.path.join(DATA_DIR, 'Team Stats')
os.makedirs(team_stats_dir, exist_ok=True)

for season in season_list:
    data = get_table(season)
    data.to_csv(os.path.join(team_stats_dir, f'Team_Stats_{season}.csv'), index=False)