In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import io

In [2]:
# Directory holding the saved box score `.html` files that the cells below list and parse.
scores_dir = "data/Scores"

In [3]:
# List every entry in the scores directory; os.listdir returns bare names, not full paths.
box_scores = os.listdir(scores_dir)

In [23]:
# Show the raw directory listing (file names only).
box_scores

['202001160NOP.html',
 '202008220POR.html',
 '201904070MEM.html',
 '201604190SAS.html',
 '201604060LAL.html',
 '202305060MIA.html',
 '202001170DAL.html',
 '201904090UTA.html',
 '201904070TOR.html',
 '202304260MIL.html',
 '202106100MIL.html',
 '201802140CHI.html',
 '202304090NYK.html',
 '201904040SAC.html',
 '202110200POR.html',
 '202301040PHI.html',
 '202001060SAC.html',
 '201904070BOS.html',
 '201802140BRK.html',
 '201802240PHI.html',
 '202001040BRK.html',
 '202008010IND.html',
 '201802060ORL.html',
 '202205070MIL.html',
 '202106100UTA.html',
 '201902050POR.html',
 '202305110PHI.html',
 '201802140BOS.html',
 '202009130DEN.html',
 '202001040CHI.html',
 '201904060CHI.html',
 '202110260OKC.html',
 '201902270CHO.html',
 '202003020CLE.html',
 '201604130MIN.html',
 '202110270OKC.html',
 '202304290DEN.html',
 '202405210BOS.html',
 '202110280PHI.html',
 '202008080DAL.html',
 '202001220DET.html',
 '202001220MIA.html',
 '201904200DET.html',
 '201902020MIN.html',
 '201802140MEM.html',
 '20160413

In [9]:
# Build the file paths from a fresh directory listing instead of from the current value of
# `box_scores`. Deriving them from `box_scores` itself made this cell non-idempotent:
# running it a second time joined the directory onto paths that already contained it.
# Only `.html` entries are kept.
box_scores = [os.path.join(scores_dir, f) for f in os.listdir(scores_dir) if f.endswith(".html")]

In [10]:
# Show the resulting list of `.html` file paths.
box_scores

['data/Scores/202001160NOP.html',
 'data/Scores/202008220POR.html',
 'data/Scores/201904070MEM.html',
 'data/Scores/201604190SAS.html',
 'data/Scores/201604060LAL.html',
 'data/Scores/202305060MIA.html',
 'data/Scores/202001170DAL.html',
 'data/Scores/201904090UTA.html',
 'data/Scores/202403140MIL.html',
 'data/Scores/201904070TOR.html',
 'data/Scores/202403170WAS.html',
 'data/Scores/202304260MIL.html',
 'data/Scores/202106100MIL.html',
 'data/Scores/201802140CHI.html',
 'data/Scores/202304090NYK.html',
 'data/Scores/201904040SAC.html',
 'data/Scores/202110200POR.html',
 'data/Scores/202301040PHI.html',
 'data/Scores/202001060SAC.html',
 'data/Scores/201904070BOS.html',
 'data/Scores/201802140BRK.html',
 'data/Scores/201802240PHI.html',
 'data/Scores/202001040BRK.html',
 'data/Scores/202008010IND.html',
 'data/Scores/201802060ORL.html',
 'data/Scores/202205070MIL.html',
 'data/Scores/202106100UTA.html',
 'data/Scores/201902050POR.html',
 'data/Scores/202403180SAC.html',
 'data/Scores/

In [4]:
# Create a function that loads a saved box score page so its stats tables can be read.

# 1. The tables read from each page are the line score, the basic box score and the
#    advanced box score.

def parse_html(box_score, parser="html.parser"):
    """Load a saved box score page and strip the extra header rows from its tables.

    Parameters
    ----------
    box_score : str
        Path of the HTML file to read.
    parser : str, optional
        Name of the parser handed to BeautifulSoup. It is pinned (standard-library
        ``html.parser`` by default) so the result does not depend on which optional
        parsers happen to be installed; pass another name to override.

    Returns
    -------
    BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead`` row removed.
    """
    # Read with an explicit encoding so the result does not depend on the machine's locale.
    with open(box_score, encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, parser)
    # Remove the "Basic Box Score Stats" over_header row and the repeated header
    # ("Reserves") row so each table is left with a single header line.
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [5]:
# Function to read the line score
def read_line_score(soup):
    """Return the team and final-total columns of the page's line score table.

    Parameters
    ----------
    soup : object
        Parsed page; it is converted to markup with ``str(soup)``.

    Returns
    -------
    pandas.DataFrame
        The table with id ``line_score`` reduced to two columns: ``team`` (the table's
        first column) and ``total`` (its last column).

    Raises
    ------
    ValueError
        Propagated from ``pandas.read_html`` when no table with that id is found.
    """
    # Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated
    # (pandas >= 2.1) in favour of a file-like object.
    line_score = pd.read_html(io.StringIO(str(soup)), attrs={"id": "line_score"})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    # Some games go to overtime, so the number of period columns varies; the per-period
    # scores are not needed, so keep only the team and the final total.
    line_score = line_score[["team", "total"]]
    return line_score

In [6]:
# Function to read a stats table

def read_stats(soup, team, stat):
    """Read one team's box score table and coerce every column to numeric.

    Parameters
    ----------
    soup : object
        Parsed page; it is converted to markup with ``str(soup)``.
    team : str
        Team code used in the table id.
    stat : str
        Table variant used in the table id (this notebook passes ``"basic"`` and
        ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        The table with id ``box-{team}-game-{stat}``, indexed by its first column,
        with every value that cannot be parsed as a number replaced by NaN.

    Raises
    ------
    ValueError
        Propagated from ``pandas.read_html`` when no table with that id is found.
    """
    # Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated
    # (pandas >= 2.1) in favour of a file-like object.
    df = pd.read_html(io.StringIO(str(soup)), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    # Some cells hold text rather than numbers; the model needs numeric columns, so
    # anything non-numeric is coerced to NaN.
    df = df.apply(pd.to_numeric, errors="coerce")
    return df
    

In [7]:
# Function to read season information

def read_season_info(soup):
    """Return the season label taken from the page's bottom navigation.

    Looks up the first element matching ``#bottom_nav_container``, collects the
    ``href`` of every link inside it, and returns the part of the second link's
    file name that comes before the first underscore.
    """
    nav_container = soup.select("#bottom_nav_container")[0]

    links = []
    for anchor in nav_container.find_all("a"):
        links.append(anchor["href"])

    # Only the second link is used; its file name starts with the season label.
    season_file = os.path.basename(links[1])
    return season_file.partition("_")[0]

In [None]:
# Build one two-row frame per game (one row per team, each row also carrying the
# opponent's values in `*_opp` columns) and collect the frames in `games`.

# Column list fixed from the first summary built; every later summary is reindexed to it.
base_cols = None
games = []

for box_score in box_scores:
    try:
        soup = parse_html(box_score)
        line_score = read_line_score(soup)
        teams = list(line_score["team"])

        summaries = []
        for team in teams:
            basic = read_stats(soup, team, "basic")
            advanced = read_stats(soup, team, "advanced")

            # Last row of each table, basic followed by advanced
            # (presumably the team totals row -- confirm against the page layout).
            totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
            totals.index = totals.index.str.lower()

            # Column-wise maximum over every row except the last, suffixed with "_max".
            maxes = pd.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
            maxes.index = maxes.index.str.lower() + "_max"

            summary = pd.concat([totals, maxes])

            if base_cols is None:
                # First summary seen: remember its labels (de-duplicated) and drop any
                # label containing "bpm" (presumably because that stat is not available
                # for every game -- confirm).
                base_cols = list(summary.index.drop_duplicates(keep="first"))
                base_cols = [b for b in base_cols if "bpm" not in b]
            
            # Select the same labels for every team and game so all rows line up.
            summary = summary[base_cols]

            summaries.append(summary)
        # One row per team, one column per stat.
        summary = pd.concat(summaries, axis=1).T

        game = pd.concat([summary, line_score], axis=1)
        # NOTE(review): assumes the line score lists the away team first and the home
        # team second -- confirm.
        game["home"] = [0, 1]

        # Reverse the two rows so each team lines up with its opponent. reset_index()
        # is called without drop=True, so the old row labels become an "index" column,
        # which the rename below turns into "index_opp".
        game_opp = game.iloc[::-1].reset_index()
        game_opp.columns += "_opp"

        full_game = pd.concat([game, game_opp], axis=1)
        full_game["season"] = read_season_info(soup)
        # The first eight characters of the file name are parsed as a YYYYMMDD date.
        full_game['date'] = os.path.basename(box_score)[:8]
        full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")
        full_game["won"] = full_game["total"] > full_game["total_opp"]
        games.append(full_game)

    except ValueError as e:
        # Skip files where a table lookup reported "No tables found"; any other
        # ValueError is re-raised unchanged.
        if "No tables found" in str(e):
            print(f"Warning: No line score table found in the HTML content for {os.path.basename(box_score)}")
            continue
        else:
            raise e

    # Progress report each time the number of parsed games reaches a multiple of 100
    # (skipped files never get here because of the `continue` above).
    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

In [15]:
# Stack the per-game frames into a single frame, renumbering the rows from 0.
games_df = pd.concat(games, ignore_index=True)

In [17]:
# Display the combined frame.
games_df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,265.0,265.0,46.0,100.0,0.460,15.0,39.0,0.385,25.0,32.0,...,20.7,39.9,160.0,125.0,NOP,138,1,2020,2020-01-16,False
1,265.0,265.0,51.0,93.0,0.548,11.0,26.0,0.423,25.0,35.0,...,25.0,39.5,197.0,132.0,UTA,132,0,2020,2020-01-16,True
2,240.0,240.0,39.0,78.0,0.500,10.0,30.0,0.333,28.0,43.0,...,100.0,35.6,154.0,120.0,POR,108,1,2020,2020-08-22,True
3,240.0,240.0,39.0,95.0,0.411,12.0,35.0,0.343,18.0,19.0,...,34.0,40.9,203.0,114.0,LAL,116,0,2020,2020-08-22,False
4,265.0,265.0,45.0,98.0,0.459,12.0,41.0,0.293,27.0,34.0,...,15.5,25.7,166.0,129.0,MEM,127,1,2019,2019-04-07,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3537,240.0,240.0,45.0,84.0,0.536,16.0,33.0,0.485,15.0,17.0,...,30.0,33.8,146.0,133.0,MIL,99,0,2023,2023-04-22,True
3538,240.0,240.0,34.0,82.0,0.415,9.0,33.0,0.273,10.0,16.0,...,26.8,33.4,164.0,99.0,DEN,103,1,2019,2019-02-11,False
3539,240.0,240.0,38.0,89.0,0.427,16.0,37.0,0.432,11.0,13.0,...,41.5,25.7,226.0,117.0,MIA,87,0,2019,2019-02-11,True
3540,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True
