In [39]:
import os 
import pandas as pd
from bs4 import BeautifulSoup
import tqdm


SCORES_DIR = "C:/Users/athen/Desktop/PersonalProjects/Learning/nba_project/data/data_scraping/data/scores"
PLAYERS_DIR = "nba_project/data/data_scraping/data/players"


boxscores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, b) for b in boxscores if b.endswith(".html")]


box_scores = box_scores[int(8*len(box_scores)/10):]



def parse_html(box_score):
    with open(box_score, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr thead")]

    return soup

def read_stats(soup,team, stat):

    df = pd.read_html(str(soup), attrs={"id":f"box-{team}-game-{stat}"},index_col=0)[0]
    df = df.replace("Did Not Play", float("NaN"))
    df = df.replace("Did Not Dress", float("NaN"))


    
    return df

def read_score(soup):
    score = pd.read_html(str(soup), attrs={"id":"line_score"})[0]
    cols = list(score.columns)
    cols[0] = "Team"
    cols[-1] = "Total"
    score.columns = cols

    score = score[['Team','Total']]

    return score

def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [l["href"] for l in nav.find_all("a")]
    season_num = os.path.basename(hrefs[1]).split("_")[0]

    return season_num


def get_player_data(box_scores=box_scores):

    performances = []
    base_cols = None

    for box_score in tqdm.tqdm(box_scores):
        soup = parse_html(box_score)
        score = read_score(soup)

        teams = list(score["Team"])

        for team in teams:
            basic = read_stats(soup, team, "basic")
            advanced = read_stats(soup, team, "advanced")

            player_names = basic.index.tolist()
            #get rid of Reserves and Team Totals from basic and advanced stats
            player_names = [p for p in player_names if p not in ["Reserves","Team Totals"]]

            for player in player_names:
                basic_stats = basic.loc[player]
                advanced_stats = advanced.loc[player]

                player_stats_one_game = pd.concat([basic_stats, advanced_stats])

                player_stats_one_game["player"] = player
                player_stats_one_game["season"] = read_season_info(soup)
                player_stats_one_game["date"] = os.path.basename(box_score)[:8]
                player_stats_one_game["date"] = pd.to_datetime(player_stats_one_game["date"], format="%Y%m%d")

                if team == teams[0]:
                    player_stats_one_game["home"] = 0
                else:
                    player_stats_one_game["home"] = 1

                if base_cols is None:
                    base_cols = list(player_stats_one_game.index.drop_duplicates(keep='first'))

                player_stats_one_game = player_stats_one_game[base_cols]


                performances.append(player_stats_one_game)
    
    all_performances = pd.concat(performances, axis=1).T

    return all_performances




In [40]:
#suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


all_performances = get_player_data(box_scores=box_scores)



100%|██████████| 3187/3187 [1:48:01<00:00,  2.03s/it]


In [41]:
directory = "C:/Users/athen/Desktop/PersonalProjects/Learning/nba_project/data/data_scraping/data/players/"
file_path = os.path.join(directory, "players5.csv")

# Ensure the directory exists
os.makedirs(directory, exist_ok=True)

# Save the DataFrame to a CSV file
all_performances.to_csv(file_path, index=False)



### Data Cleaning and Saving

In [42]:
file_path = "C:/Users/athen/Desktop/PersonalProjects/Learning/nba_project/data/data_scraping/data/players"

player_files = os.listdir(file_path)

#join all the csv files
players_file = pd.concat([pd.read_csv(os.path.join(file_path, f)) for f in player_files])

players_file = players_file.drop("MP.1", axis=1)

players_file = players_file.replace("Did Not Dress", float("NaN"))
players_file = players_file.replace("Player Suspended", float("NaN"))   
players_file = players_file.replace("Not With Team", float("NaN"))
players_file = players_file.replace("Player Not With Team", float("NaN"))
players_file = players_file.replace("Did Not Play", float("NaN"))
players_file = players_file.replace("Not With Team", float("NaN"))
players_file = players_file.replace(float("NaN"), 0)

players_file.to_csv("C:/Users/athen/Desktop/PersonalProjects/Learning/nba_project/data/data_scraping/data/players.csv", index=False)


### Data Checks

In [44]:
#load the data from the csv
players = pd.read_csv("C:/Users/athen/Desktop/PersonalProjects/Learning/nba_project/data/data_scraping/data/players.csv")

#print the number of times the column 'season' is equal to 2000
season_vals = players["season"].value_counts()
print(f'2000 count',season_vals[2000])
print(f'2001 count',season_vals[2001])
print(f'2002 count',season_vals[2002])
print(f'2003 count',season_vals[2003])
print(f'2004 count',season_vals[2004])
print(f'2005 count',season_vals[2005])
print(f'2006 count',season_vals[2006])
print(f'2007 count',season_vals[2007])
print(f'2008 count',season_vals[2008])
print(f'2009 count',season_vals[2009])
print(f'2010 count',season_vals[2010])
print(f'2011 count',season_vals[2011])
print(f'2012 count',season_vals[2012])
print(f'2013 count',season_vals[2013])
print(f'2014 count',season_vals[2014])
print(f'2015 count',season_vals[2015])
print(f'2016 count',season_vals[2016])
print(f'2017 count',season_vals[2017])
print(f'2018 count',season_vals[2018])
print(f'2019 count',season_vals[2019])
print(f'2020 count',season_vals[2020])
print(f'2021 count',season_vals[2021])
print(f'2022 count',season_vals[2022])
print(f'2023 count',season_vals[2023])
print(f'2024 count',season_vals[2024])



2000 count 30196
2001 count 30137
2002 count 30135
2003 count 30720
2004 count 30326
2005 count 31472
2006 count 31646
2007 count 31386
2008 count 31572
2009 count 26620
2010 count 31481
2011 count 31419
2012 count 27638
2013 count 33542
2014 count 33683
2015 count 33522
2016 count 33672
2017 count 33598
2018 count 33086
2019 count 30269


KeyError: 2020