This utility notebook updates the games.csv file with the latest data from NBA.com. Run it before backfilling the data to Hopsworks.ai for the first time.

In [17]:
import os
from datetime import datetime, timedelta
from pathlib import Path  # for Windows/Linux compatibility

import pandas as pd
from pytz import timezone

# Change the working directory to the project root when running from the
# notebooks folder, to make it easier to import modules and to access sibling
# folders. The move is skipped when the `src` package is already visible from
# the current directory, so re-running this cell does not climb another level
# and break the relative `data` paths used below.
if not Path("src").is_dir():
    os.chdir("..")

from src.utils.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,
)

# Root folder of the data files, relative to the project root
DATAPATH = Path(r'data')

**Determine How Current the Existing Data Is**

In [2]:
games_old = pd.read_csv(DATAPATH / "raw" / "games.csv")

# Find the last date and season in the current dataset
last_date = games_old["GAME_DATE_EST"].max()
# Coerce to a plain int so range() below works even if pandas read the column as float
last_season = int(games_old["SEASON"].max())

# remove the time from the date (in case the value carries a time component)
last_date = last_date.split(" ")[0]

# Determine the date of the next day to begin scraping from
start_date = datetime.strptime(last_date, "%Y-%m-%d") + timedelta(days=1)

# Determine what season we are in currently.
# nba.com uses US Eastern time; "America/New_York" follows daylight saving,
# whereas the fixed-offset "EST" zone does not.
today = datetime.now(timezone("America/New_York"))
SEASON_START_MONTH = 10  # a season is labelled by the year in which it starts (October)
current_season = today.year if today.month >= SEASON_START_MONTH else today.year - 1

# determine which seasons we need to scrape to catch up the data
seasons = list(range(last_season, current_season + 1))


print("Last date in dataset: ", last_date)
print("Last season in dataset: ", last_season)
print("Current season: ", current_season)
print("Seasons to scrape: ", seasons)
print("Start date: ", start_date)

# If the last date in the dataset is today, then we don't need to scrape any new data.
# Compare calendar dates on the same Eastern clock as `today` instead of mixing
# a naive timestamp with the machine's local time.
if start_date.date() > today.date():
    print("No new data to scrape")
    # exit() inside a Jupyter kernel requests a kernel shutdown; raising
    # SystemExit stops the run while leaving the kernel usable.
    raise SystemExit

Last date in dataset:  2022-12-22
Last season in dataset:  2022
Current season:  2022
Seasons to scrape:  [2022]
Start date:  2022-12-23 00:00:00


**Activate Webdriver**

In [3]:
# The website generates its data dynamically, so a Selenium webdriver
# is started to load the pages before they are scraped.
driver = activate_web_driver("chromium")

**Scrape New Completed Games and Format Them**

In [4]:
def update_games(driver, season, start_date, end_date)-> pd.DataFrame:
    """Scrape and format the games of one season across every season type.

    Each season type ("Regular+Season", "PlayIn", "Playoffs") is scraped
    separately with ``scrape_to_dataframe``; non-empty results are passed
    through ``convert_columns`` and ``combine_home_visitor`` and stacked.

    Args:
        driver: web driver handed through to ``scrape_to_dataframe``.
        season: season identifier, forwarded as ``Season``.
        start_date: start of the date window, forwarded as ``DateFrom``.
        end_date: end of the date window, forwarded as ``DateTo``.

    Returns:
        The formatted rows of all season types that returned data, in
        scraping order, or an empty DataFrame when nothing was scraped.
    """
    season_types = ("Regular+Season", "PlayIn", "Playoffs")

    # Collect the frames and concatenate once: growing a frame from an empty
    # pd.DataFrame() relies on concat's deprecated handling of empty entries.
    scraped = []

    for season_type in season_types:
        df = scrape_to_dataframe(api_key="", driver=driver, Season=season, DateFrom=start_date, DateTo=end_date, season_type=season_type)

        if df.empty:
            continue

        df = convert_columns(df)
        df = combine_home_visitor(df)
        scraped.append(df)

    if not scraped:
        return pd.DataFrame()

    return pd.concat(scraped, axis=0)
    

In [6]:
# Display the list of seasons that will be scraped
seasons

[2022]

In [5]:
# Scrape every season that needs catching up and stack the results.
season_frames = []

# Work on a local cursor instead of overwriting `start_date`, so that
# re-running this cell starts again from the same date.
season_start = start_date

for season in seasons:
    end_date = datetime.strptime(f"{season+1}-08-01", "%Y-%m-%d") # use August 1st to get all games from the current season
    print(f"Scraping season {season} from {season_start} to {end_date}")
    df_season = update_games(driver, str(season), str(season_start), str(end_date))
    season_frames.append(df_season)
    season_start = datetime.strptime(f"{season+1}-10-01", "%Y-%m-%d") # if more than 1 season, reset start date to beginning of next season

# Concatenate once, skipping seasons that returned nothing
non_empty_frames = [frame for frame in season_frames if not frame.empty]
new_games = pd.concat(non_empty_frames, axis=0) if non_empty_frames else pd.DataFrame()

new_games

Scraping season 2022 from 2022-12-23 00:00:00 to 2023-08-01 00:00:00
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Regular+Season&Season=2022&DateFrom=2022-12-23 00:00:00&DateTo=2023-08-01 00:00:00
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=PlayIn&Season=2022&DateFrom=2022-12-23 00:00:00&DateTo=2023-08-01 00:00:00
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Playoffs&Season=2022&DateFrom=2022-12-23 00:00:00&DateTo=2023-08-01 00:00:00


Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2023-04-09,0,109,42.3,31.1,61.9,51,28,1610612764,22201222,114,48.9,28.6,71.4,51,24,1610612745,2022
1,2023-04-09,0,117,42.6,32.1,50.0,46,28,1610612742,22201224,138,49.0,39.5,70.0,69,31,1610612759,2022
2,2023-04-09,1,123,54.2,40.9,75.0,44,30,1610612748,22201219,110,42.2,36.2,70.8,43,29,1610612753,2022
3,2023-04-09,1,121,50.5,33.3,87.5,45,29,1610612761,22201221,105,46.3,35.3,81.0,38,28,1610612749,2022
4,2023-04-09,1,120,47.2,46.3,78.6,47,33,1610612738,22201216,114,45.4,28.2,78.9,50,27,1610612737,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,2023-04-16,0,117,49.5,24.4,72.7,38,29,1610612749,42200101,130,59.5,60.0,71.4,36,32,1610612748,2022
80,2023-04-15,0,97,43.4,32.3,71.4,38,20,1610612739,42200131,101,42.0,27.6,86.4,51,18,1610612752,2022
81,2023-04-15,1,112,47.7,39.4,83.3,58,25,1610612738,42200111,99,38.8,17.2,81.8,45,23,1610612737,2022
82,2023-04-15,1,121,47.2,48.8,100.0,38,32,1610612755,42200121,101,55.7,44.8,66.7,35,23,1610612751,2022


**Close Webdriver**

In [7]:
# quit() ends the whole WebDriver session and its browser/driver processes;
# close() would only close the current window and leave the session running.
# (assumes `driver` is a Selenium WebDriver, as the activation cell describes)
driver.quit()

**Append to games.csv**

In [18]:
# Stack the newly scraped games under the existing ones.
# NOTE(review): in the output below the appended rows show GAME_DATE_EST with
# a time component and percentage columns on a 0-100 scale, unlike the
# existing rows - confirm that downstream processing normalises both.
games = pd.concat([games_old, new_games], axis=0)

# Make sure the target folder exists before writing (e.g. on a fresh checkout)
interim_dir = DATAPATH / "interim"
interim_dir.mkdir(parents=True, exist_ok=True)
games.to_csv(interim_dir / "games.csv", index=False)

games


Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1.610613e+09,126.0,0.484,0.926,...,25.0,46.0,1.610613e+09,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1.610613e+09,120.0,0.488,0.952,...,16.0,40.0,1.610613e+09,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1.610613e+09,114.0,0.482,0.786,...,22.0,37.0,1.610613e+09,106.0,0.470,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1.610613e+09,113.0,0.441,0.909,...,27.0,49.0,1.610613e+09,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1.610613e+09,108.0,0.429,1.000,...,22.0,47.0,1.610613e+09,110.0,0.500,0.773,0.292,20.0,47.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,2023-04-16 00:00:00,42200101,,1610612749,1610612748,2022,,117.0,49.500,72.700,...,29.0,38.0,,130.0,59.500,71.400,60.000,32.0,36.0,0
80,2023-04-15 00:00:00,42200131,,1610612739,1610612752,2022,,97.0,43.400,71.400,...,20.0,38.0,,101.0,42.000,86.400,27.600,18.0,51.0,0
81,2023-04-15 00:00:00,42200111,,1610612738,1610612737,2022,,112.0,47.700,83.300,...,25.0,58.0,,99.0,38.800,81.800,17.200,23.0,45.0,1
82,2023-04-15 00:00:00,42200121,,1610612755,1610612751,2022,,121.0,47.200,100.000,...,32.0,38.0,,101.0,55.700,66.700,44.800,23.0,35.0,1
