In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta

# Month slugs, in the order they are fetched; each one is appended to the
# schedule URL used by the scraping loop.
months = [
    "october",
    "november",
    "december",
    "january",
    "february",
    "march",
    "april",
]

# Full team name as it appears on the scraped page -> three-letter code
# used in the Home_Team / Away_Team columns of the schedule frame.
team_dict = {
    "Los Angeles Lakers": "LAL",
    "Phoenix Suns": "PHO",
    "Houston Rockets": "HOU",
    "Boston Celtics": "BOS",
    "Washington Wizards": "WAS",
    "Atlanta Hawks": "ATL",
    "Detroit Pistons": "DET",
    "Minnesota Timberwolves": "MIN",
    "Cleveland Cavaliers": "CLE",
    "New Orleans Pelicans": "NOP",
    "Oklahoma City Thunder": "OKC",
    "Sacramento Kings": "SAC",
    "Dallas Mavericks": "DAL",
    "Portland Trail Blazers": "POR",
    "Philadelphia 76ers": "PHI",
    "Denver Nuggets": "DEN",
    "New York Knicks": "NYK",
    "Miami Heat": "MIA",
    "Toronto Raptors": "TOR",
    "Brooklyn Nets": "BRK",
    "Los Angeles Clippers": "LAC",
    "Orlando Magic": "ORL",
    "Golden State Warriors": "GSW",
    "Chicago Bulls": "CHI",
    "Memphis Grizzlies": "MEM",
    "Indiana Pacers": "IND",
    "Utah Jazz": "UTA",
    "San Antonio Spurs": "SAS",
    "Milwaukee Bucks": "MIL",
    "Charlotte Hornets": "CHO",
}

# Column names of the schedule table.
# NOTE(review): not referenced by the code visible here - presumably used
# when the schedule file is first created; confirm before removing.
column_head = ["Date", "Time", "Home_Team", "Home_Pts", "Away_Team", "Away_Points"]

# Load the stored schedule and discard the file's first column.  The save
# step at the end of this script writes the frame with its index, so that
# first column is the previously saved index rather than game data.
df = pd.read_csv("./data/schedule.csv")
df = df.drop(columns=df.columns[[0]])

# Refresh the scores stored in `df` from the monthly schedule pages on
# basketball-reference, then write the frame back to disk.  Any condition
# that stops the scrape early (rate limit, network error, bad status) breaks
# out of the loop instead of raising, so scores collected so far are saved.
for month in months:
    print(month)
    # Pause before every request so the site does not rate-limit us.
    time.sleep(5)

    # GET data for the month
    URL = (
        "https://www.basketball-reference.com/leagues/NBA_2024_games-" + month + ".html"
    )
    try:
        # A timeout keeps a stalled connection from hanging the run forever.
        page = requests.get(URL, timeout=30)
    except requests.RequestException as err:
        print(f"Request for {month} failed: {err}")
        break

    # The server asks us to back off: report when to retry and stop.
    delay = page.headers.get("Retry-After")
    if delay is not None:
        print("Try again at : ")
        if delay.strip().isdigit():
            print(datetime.now() + timedelta(seconds=int(delay)))
        else:
            # Retry-After may also be an HTTP-date; show it unchanged.
            print(delay)
        break

    if not page.ok:
        print(f"Request for {month} failed with HTTP status {page.status_code}")
        break

    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find("table")
    table_body = table.find("tbody") if table is not None else None
    if table_body is None:
        print(f"No schedule table found for {month}")
        continue

    for row in table_body.find_all("tr"):
        # Game rows carry the date as a link inside the row's <th>; heading
        # or separator rows inside <tbody> do not, so skip them.
        header_cell = row.find("th")
        date_link = header_cell.find("a") if header_cell is not None else None
        if date_link is None:
            continue

        # Keep the date plus the first five <td> cells:
        # time, team, points, team, points.
        cells = [date_link] + row.find_all("td")
        cols = [ele.text.strip() for ele in cells][:6]
        if len(cols) < 6:
            continue

        # Games that have not been played yet have empty score cells; leave
        # whatever the frame already holds for them.
        if not (cols[3].isdigit() and cols[5].isdigit()):
            continue

        # The frame stores dates as mm/dd/yyyy.
        formatted_date = datetime.strptime(cols[0], '%a, %b %d, %Y').strftime('%m/%d/%Y')

        # Shorten team names to their three-letter codes.  An unknown name
        # raises KeyError on purpose: it means team_dict needs updating.
        # NOTE(review): the scraped table appears to list the visiting team
        # before the home team, yet the first team is matched against
        # Home_Team here - confirm schedule.csv follows the same convention.
        first_team = team_dict[cols[2]]
        second_team = team_dict[cols[4]]

        # Locate the game once and update both score columns.  Scores are
        # stored as integers rather than the raw cell text so the score
        # columns keep a numeric dtype.
        game = (
            (df['Date'] == formatted_date)
            & (df['Home_Team'] == first_team)
            & (df['Away_Team'] == second_team)
        )
        df.loc[game, "Home_Pts"] = int(cols[3])
        df.loc[game, "Away_Points"] = int(cols[5])

# save to .csv
# The index is written as the first column on purpose: the loading code
# drops the first column of this file when it is read back.
df.to_csv("./data/schedule.csv")

october
november
december
january
february
march
april
