In [293]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO  


In [315]:
# Folder that holds the saved player-stats pages.
PLAYER_STATS_2_DIR = "data/player_stats_2"

# Full path of every entry in that folder whose name ends with ".html"
# (kept in whatever order os.listdir returns them).
player_stats = [
    os.path.join(PLAYER_STATS_2_DIR, file_name)
    for file_name in os.listdir(PLAYER_STATS_2_DIR)
    if file_name.endswith(".html")
]
print(player_stats[2])
len(player_stats)

data/player_stats_2/oubreke01.html


574

In [295]:
def read_file(player):
    """Parse a saved HTML file into a BeautifulSoup tree.

    Args:
        player: Path of the HTML file to read. The file is opened in text
            mode with the platform's default encoding.

    Returns:
        The BeautifulSoup object built from the file's full contents with
        the 'html.parser' backend.
    """
    with open(player) as html_file:
        markup = html_file.read()
    return BeautifulSoup(markup, 'html.parser')

In [296]:
def make_df(stats):
    """Convert an HTML table (or anything whose str() is HTML) to a DataFrame.

    Args:
        stats: Object whose string form contains at least one HTML table.

    Returns:
        The first DataFrame that pandas.read_html extracts from the markup.
    """
    html_buffer = StringIO(str(stats))
    tables = pd.read_html(html_buffer)
    return tables[0]

In [328]:
def decompose_table(table):
    """Strip a stats table down to the rows that live inside its <tbody>.

    The <colgroup>, <thead>, <caption> and <tfoot> sections are removed when
    present, then every remaining <tr> that is not one of the <tbody> rows is
    removed as well. The table is modified in place.

    Args:
        table: A parsed table element exposing find / find_all / decompose
            (e.g. a BeautifulSoup tag), or None.

    Returns:
        The same table object after cleaning, or None when `table` is None.
    """
    if table is None:
        return None

    # Any of these sections may be missing; only remove the ones that exist
    # instead of failing on a None lookup.
    for section_name in ('colgroup', 'thead', 'caption', 'tfoot'):
        section = table.find(section_name)
        if section is not None:
            section.decompose()

    body = table.find('tbody')
    body_rows = body.find_all('tr') if body is not None else []
    # Compare rows by identity: cheaper than a list membership test and it
    # cannot confuse a stray row with a look-alike body row.
    # `body_rows` stays referenced, so the ids remain unique while we loop.
    kept_row_ids = {id(row) for row in body_rows}

    for row in table.find_all('tr'):
        if id(row) not in kept_row_ids:
            row.decompose()
    return table

In [338]:
def determine_teams_and_year(player_soup):
    """List the team-season page file names for a player, one per season.

    Walks the rows of the table with id "per_minute". Each row's team link
    (an href of the form "/teams/<TEAM>/<YEAR>.html") is turned into
    "<TEAM><YEAR>.html". When a season has a "TOT" (combined) row, the
    individual team rows of that season collapse into a single entry holding
    the last team listed for it.

    Args:
        player_soup: Parsed player page exposing find(id=...).

    Returns:
        A list of file names in table order. The list is empty when the
        "per_minute" table or its <tbody> is missing.
    """
    teamArray = []

    table = player_soup.find(id="per_minute")
    if table is None:
        return teamArray
    table_body = table.find('tbody')
    if table_body is None:
        return teamArray

    # Ending year of the most recent season that had a "TOT" row;
    # "NaN" never matches a file name, so nothing collapses before one is seen.
    year = "NaN"
    for row in table_body.find_all('tr'):
        team = row.find(attrs={'data-stat': 'team_id'})
        if not team:
            # Row without a team cell: nothing to record.
            continue

        if team.text == "TOT":
            season = row.find(attrs={'data-stat': 'season'}).text
            # "2015-16" -> "2016". Computed arithmetically so that a season
            # crossing a century ("1999-00") gives "2000" rather than "1900".
            year = str(int(season.split("-")[0]) + 1)
            continue

        # NOTE(review): assumes every team cell contains a link — confirm.
        team_link = team.find("a")['href']
        split_team_id = team_link.split("/")
        season_page = split_team_id[3]
        file = split_team_id[2] + season_page

        # Second or later team of a traded season: overwrite the entry that
        # the previous team of that same season left behind.
        continues_traded_season = (
            year in season_page
            and len(teamArray) >= 1
            and season_page in teamArray[-1]
        )
        if continues_traded_season:
            teamArray[-1] = file
        else:
            teamArray.append(file)
    return teamArray


In [303]:
def getValues(str):
    """Extract two numbers from a multi-line "value (rank)" text block.

    The text is stripped, every ":" is removed and the result is split on
    newlines. The first and the third line are each cut just before their
    first "(" and converted to float.

    Args:
        str: Text with at least three lines once stripped, whose first and
            third lines contain a "(".

    Returns:
        A (first_value, third_line_value) tuple of floats.

    Raises:
        ValueError: If a needed line has no "(" or is not a valid number.
        IndexError: If the text has fewer than three lines.
    """
    pieces = str.strip().replace(":", "").split("\n")

    first = pieces[0]
    first = first[:first.index("(")]

    second = pieces[2]
    second = second[:second.index("(")]

    return float(first), float(second)

In [304]:
def removeHeaders(soup):
    """Delete every <strong> element found under `soup`, in place.

    Args:
        soup: Parsed element exposing find_all.

    Returns:
        The same `soup` object, now without its <strong> elements.
    """
    strong_tags = soup.find_all("strong")
    for tag in strong_tags:
        tag.decompose()
    return soup

In [305]:
def get_team_info(team_file, teamInfoDict):
    """Read one team-season page and append its summary figures to a dict.

    Three consecutive <p> elements of the page's summary block are parsed
    with getValues (after their <strong> labels are removed); each yields a
    pair of floats that is appended to the matching lists of `teamInfoDict`.

    Args:
        team_file: Path of the saved team page. When the path contains
            '2024' the three paragraphs start at index 5, otherwise at 3.
        teamInfoDict: Dict with list values under the keys 'teamPPG',
            'oppPPG', 'SRS', 'pace', 'teamOFRtg' and 'teamDFRtg'. Mutated
            in place; nothing is appended unless all three paragraphs parse.

    Returns:
        None.
    """
    team_page = read_file(team_file)

    # Index of the first of the three paragraphs of interest.
    first_index = 5 if '2024' in team_file else 3

    meta = team_page.find(id="meta")
    summary = meta.find(attrs={"data-template": "Partials/Teams/Summary"})
    paragraphs = summary.find_all("p")

    # One key pair per paragraph, in page order.
    # NOTE(review): getValues reads lines 0 and 2 of each paragraph's text;
    # confirm against a saved team page that line 2 of the ratings paragraph
    # really is the defensive rating stored under 'teamDFRtg'.
    key_pairs = (
        ('teamPPG', 'oppPPG'),
        ('SRS', 'pace'),
        ('teamOFRtg', 'teamDFRtg'),
    )

    parsed = {}
    for offset, (first_key, second_key) in enumerate(key_pairs):
        paragraph = removeHeaders(paragraphs[first_index + offset])
        parsed[first_key], parsed[second_key] = getValues(paragraph.text)

    for key, value in parsed.items():
        teamInfoDict[key].append(value)

In [327]:
def removeMultipleTeams(table):
    """Keep one row per season, preferring the "TOT" row of traded seasons.

    Rows are visited in order. A row is removed when it lacks a season cell
    or a team cell, or when its season equals the season of the most recent
    "TOT" row (i.e. it is one of the per-team rows of a traded season).
    The table is modified in place.

    Args:
        table: Parsed table element exposing find_all; its rows must expose
            find(attrs=...) and decompose().

    Returns:
        The same table object after the rows were removed.
    """
    # Season text of the latest "TOT" row seen so far (None until one shows up).
    multipleYear = None
    for row in table.find_all("tr"):
        season_cell = row.find(attrs={"data-stat": "season"})
        team_cell = row.find(attrs={"data-stat": "team_id"})

        # Rows without a team cell used to crash on `.text`; drop them, which
        # matches determine_teams_and_year skipping rows that have no team.
        if not season_cell or not team_cell:
            row.decompose()
            continue

        curYear = season_cell.text
        if team_cell.text == "TOT":
            multipleYear = curYear
            continue
        if curYear == multipleYear:
            row.decompose()
    return table

In [307]:
def parse_per_game_html(player_stats):
    """Extract and clean the "per_game" table of a parsed player page.

    Args:
        player_stats: Parsed player page exposing find(id=...).

    Returns:
        The cleaned table (header/footer sections removed, one row per
        season), or None when the page has no element with id "per_game".
    """
    # Use the page that was passed in; the previous version silently read a
    # notebook-level `soup` variable instead of its own argument.
    per_game_table = player_stats.find(id="per_game")
    if per_game_table is None:
        # Callers test the result for truthiness, so report "no table"
        # instead of crashing further down.
        return None

    per_game_table_clean = decompose_table(per_game_table)
    per_game_table_clean = removeMultipleTeams(per_game_table_clean)

    return per_game_table_clean

In [308]:
def parse_per_36_html(player_page):
    """Extract and clean the "per_minute" table of a parsed player page.

    Args:
        player_page: Parsed player page exposing find(id=...).

    Returns:
        The cleaned table (header/footer sections removed, one row per
        season), or None when the page has no element with id "per_minute".
    """
    per_36_table = player_page.find(id="per_minute")
    if per_36_table is None:
        # Without this guard the None would reach removeMultipleTeams and
        # fail there.
        return None

    per_36_table_clean = decompose_table(per_36_table)
    per_36_table_clean = removeMultipleTeams(per_36_table_clean)

    return per_36_table_clean

In [309]:
def parse_per_100_html(player_page):
    """Extract and clean the "per_poss" table of a parsed player page.

    Args:
        player_page: Parsed player page exposing find(id=...).

    Returns:
        The cleaned table (header/footer sections removed, one row per
        season), or None when the page has no element with id "per_poss".
    """
    per_100_table = player_page.find(id="per_poss")
    if per_100_table is None:
        # Without this guard the None would reach removeMultipleTeams and
        # fail there.
        return None

    per_100_table_clean = decompose_table(per_100_table)
    per_100_table_clean = removeMultipleTeams(per_100_table_clean)

    return per_100_table_clean

In [310]:
def parse_advanced_stats_html(player_stats):
    """Extract and clean the "advanced" table of a parsed player page.

    Args:
        player_stats: Parsed player page exposing find(id=...).

    Returns:
        The cleaned table (header/footer sections removed, one row per
        season), or None when the page has no element with id "advanced".
    """
    # Use the page that was passed in; the previous version silently read a
    # notebook-level `soup` variable instead of its own argument.
    advanced_table = player_stats.find(id="advanced")
    if advanced_table is None:
        # Report "no table" instead of crashing inside removeMultipleTeams.
        return None

    advanced_table_clean = decompose_table(advanced_table)
    advanced_table_clean = removeMultipleTeams(advanced_table_clean)

    return advanced_table_clean

In [311]:
def make_df_per_game_stats(soup, id):
    """Build the per-game DataFrame for one player.

    Args:
        soup: Cleaned per-game table, passed to make_df; it must yield
            exactly 30 columns.
        id: Value written to every row of the added 'ID' column.

    Returns:
        DataFrame with missing values replaced by 0, the 30 per-game column
        labels below, and a trailing 'ID' column.
    """
    identity_columns = ["Season", "AGE", "TM", "LG", "POS", "G", "GS", "MP"]
    stat_columns = [
        "FG", "FGA", "FG%",
        "3P", "3PA", "3P%",
        "2P", "2PA", "2P%", "eFG%",
        "FT", "FTA", "FT%",
        "ORB", "DRB", "TRB",
        "AST", "STL", "BLK", "TOV", "PF", "PTS",
    ]

    per_game_df = make_df(soup).fillna(0)
    per_game_df.columns = identity_columns + stat_columns
    per_game_df['ID'] = id
    return per_game_df

In [312]:
def make_df_per_36_stats(soup, id):
    """Build the per-36-minutes DataFrame for one player.

    Args:
        soup: Cleaned per-36 table, passed to make_df; it must yield exactly
            29 columns.
        id: Value written to every row of the added 'ID' column.

    Returns:
        DataFrame with missing values replaced by 0, eight identifying
        columns followed by the "_36"-suffixed stat columns, and a trailing
        'ID' column.
    """
    identity_columns = ["Season", "AGE", "TM", "LG", "POS", "G", "GS", "MP_total"]
    stat_names = [
        "FG", "FGA", "FG%",
        "3P", "3PA", "3P%",
        "2P", "2PA", "2P%",
        "FT", "FTA", "FT%",
        "ORB", "DRB", "TRB",
        "AST", "STL", "BLK", "TOV", "PF", "PTS",
    ]
    # Every stat label carries the "_36" suffix, e.g. "FG%_36".
    stat_columns = [name + "_36" for name in stat_names]

    per_36_df = make_df(soup).fillna(0)
    per_36_df.columns = identity_columns + stat_columns
    per_36_df['ID'] = id
    return per_36_df

In [313]:
def make_df_per_100_stats(soup, id):
    """Build the per-100-possessions DataFrame for one player.

    Args:
        soup: Cleaned per-100 table, passed to make_df; it must yield
            exactly 32 columns.
        id: Value written to every row of the added 'ID' column.

    Returns:
        DataFrame with missing values replaced by 0, eight identifying
        columns, the "_100"-suffixed stat columns, 'ORtg', 'DRtg' and a
        trailing 'ID' column. The column labelled "holder" is dropped.
    """
    identity_columns = ["Season", "AGE", "TM", "LG", "POS", "G", "GS", "MP_total"]
    stat_names = [
        "FG", "FGA", "FG%",
        "3P", "3PA", "3P%",
        "2P", "2PA", "2P%",
        "FT", "FTA", "FT%",
        "ORB", "DRB", "TRB",
        "AST", "STL", "BLK", "TOV", "PF", "PTS",
    ]
    # Every stat label carries the "_100" suffix, e.g. "FG%_100".
    stat_columns = [name + "_100" for name in stat_names]
    trailing_columns = ["holder", "ORtg", "DRtg"]

    per_100_df = make_df(soup).fillna(0)
    per_100_df.columns = identity_columns + stat_columns + trailing_columns
    per_100_df = per_100_df.drop(columns=["holder"])

    per_100_df['ID'] = id
    return per_100_df

In [314]:
def make_df_advanced_stats(soup, id):
    """Build the advanced-stats DataFrame for one player.

    Args:
        soup: Cleaned advanced table, passed to make_df; it must yield
            exactly 29 columns.
        id: Value written to every row of the added 'ID' column.

    Returns:
        DataFrame with missing values replaced by 0, the advanced column
        labels below minus the two "holder" columns, and a trailing 'ID'
        column.
    """
    # NOTE(review): "DRB%%" and "STL%%" carry a doubled percent sign; they are
    # kept as-is because downstream consumers may rely on these exact labels.
    columnNames = [
        "Season", "AGE", "TM", "LG", "POS", "G", "MP_total",
        "PER", "TS%", "3PAr", "FTr",
        "ORB%", "DRB%%", "TRB%", "AST%", "STL%%", "BLK%", "TOV%", "USG%",
        "holder1",
        "OWS", "DWS", "WS", "WS/48",
        "holder2",
        "OBPM", "DBPM", "BPM", "VORP",
    ]

    advanced_stats_df = make_df(soup).fillna(0)
    advanced_stats_df.columns = columnNames
    # The "holder" labels mark the blank columns that basketball-reference
    # adds; drop both in a single call.
    advanced_stats_df = advanced_stats_df.drop(columns=["holder1", "holder2"])

    advanced_stats_df['ID'] = id
    return advanced_stats_df

In [344]:
# Accumulator for the per-player frames. It is created here only when it does
# not exist yet, so the cell runs on a fresh kernel while frames collected
# earlier in a session are kept.
if "all_DFs_array" not in globals():
    all_DFs_array = []

# NOTE(review): processing starts at index 15, so this cell skips the first 15
# files of `player_stats` — presumably they were handled in an earlier run;
# confirm, or start from 0 for a full rebuild.
for i in range(15, len(player_stats)):
    file = player_stats[i]
    print(file)
    soup = read_file(file)

    # One list per team-level figure; one entry per season after the loop below.
    teamInfo = {'teamPPG': [], 'oppPPG': [], 'SRS': [], 'pace': [], 'teamOFRtg': [], 'teamDFRtg': []}
    teamList = determine_teams_and_year(soup)

    for team in teamList:
        team_file = f'data/nba_team_stats/{team}'
        get_team_info(team_file, teamInfo)

    team_stats_df = pd.DataFrame.from_dict(teamInfo)

    per_game = parse_per_game_html(soup)
    per_36 = parse_per_36_html(soup)
    per_100 = parse_per_100_html(soup)
    advanced_stats = parse_advanced_stats_html(soup)

    # Player id = file name without its extension. str.strip('.html') removes
    # any of the characters ".", "h", "t", "m", "l" from BOTH ends, which
    # mangles ids such as "lavinza01" or "thomais02"; splitext removes only
    # the suffix.
    ID = os.path.splitext(os.path.basename(file))[0]

    if per_game:
        per_game_DF = make_df_per_game_stats(per_game, ID)
        per_36_DF = make_df_per_36_stats(per_36, ID)
        per_100_DF = make_df_per_100_stats(per_100, ID)
        advanced_stats_DF = make_df_advanced_stats(advanced_stats, ID)

        #get all the stats into 1 data frame
        allStats = pd.concat([per_game_DF, per_36_DF, per_100_DF, advanced_stats_DF, team_stats_df], axis=1, join="inner")

        #remove any duplicate columns (ie. POS, LG, TM, etc.)
        allStats = allStats.loc[:,~allStats.columns.duplicated()]

        # Append only when a frame was built for THIS player; appending
        # outside the `if` re-added the previous player's frame whenever the
        # per-game table was missing.
        all_DFs_array.append(allStats)

data/player_stats_2/netora01.html
data/player_stats_2/jacksju01.html
data/player_stats_2/ridnolu01.html
data/player_stats_2/afflaar01.html
data/player_stats_2/richajo01.html
data/player_stats_2/thomais02.html
data/player_stats_2/childjo01.html
data/player_stats_2/johnsli01.html
data/player_stats_2/crowdja01.html
data/player_stats_2/harklma01.html
data/player_stats_2/fryech01.html
data/player_stats_2/smithjo03.html
data/player_stats_2/garcifr01.html
data/player_stats_2/pargoje01.html
data/player_stats_2/miltosh01.html
data/player_stats_2/johnsst04.html
data/player_stats_2/petrojo01.html
data/player_stats_2/stephla01.html
data/player_stats_2/mbahalu01.html
data/player_stats_2/mayooj01.html
data/player_stats_2/casspom01.html
data/player_stats_2/jokicni01.html
data/player_stats_2/banksma01.html
data/player_stats_2/mykhasv01.html
data/player_stats_2/mcgeeja01.html
data/player_stats_2/lavinza01.html
data/player_stats_2/harrish01.html
data/player_stats_2/ajincal01.html
data/player_stats_2/bro

In [343]:
# Sanity check: number of per-player frames collected in all_DFs_array so far.
print(len(all_DFs_array))

17


In [335]:
# Single-player run of the same pipeline as the batch loop, handy for
# inspecting one page.
# NOTE(review): this cell appends to all_DFs_array too, so running it in
# addition to the batch loop adds this player's frame a second time if the
# loop also processed the file — confirm that is intended.
file = "data/player_stats_2/whiteha01.html"
print(file)
soup = read_file(file)
teamInfo = {'teamPPG': [], 'oppPPG': [], 'SRS': [], 'pace': [], 'teamOFRtg': [], 'teamDFRtg': []}
teamList = determine_teams_and_year(soup)

for team in teamList:
    team_file = f'data/nba_team_stats/{team}'
    get_team_info(team_file, teamInfo)

team_stats_df = pd.DataFrame.from_dict(teamInfo)

per_game = parse_per_game_html(soup)
per_36 = parse_per_36_html(soup)
per_100 = parse_per_100_html(soup)
advanced_stats = parse_advanced_stats_html(soup)

# Player id = file name without its extension. str.strip('.html') strips a
# set of characters from both ends rather than the suffix, so it is replaced
# by splitext.
ID = os.path.splitext(os.path.basename(file))[0]

if per_game:
    per_game_DF = make_df_per_game_stats(per_game, ID)
    per_36_DF = make_df_per_36_stats(per_36, ID)
    per_100_DF = make_df_per_100_stats(per_100, ID)
    advanced_stats_DF = make_df_advanced_stats(advanced_stats, ID)

    #get all the stats into 1 data frame
    allStats = pd.concat([per_game_DF, per_36_DF, per_100_DF, advanced_stats_DF, team_stats_df], axis=1, join="inner")

    #remove any duplicate columns (ie. POS, LG, TM, etc.)
    allStats = allStats.loc[:,~allStats.columns.duplicated()]

    # Append only when a frame was actually built for this player.
    all_DFs_array.append(allStats)
allStats

data/player_stats_2/whiteha01.html
<tbody> <tr class="full_table" data-row="0" id="per_minute.2011"><th class="left" data-stat="season" scope="row"><a href="/players/w/whiteha01/gamelog/2011">2010-11</a></th><td class="center" data-stat="age">21</td><td class="left" data-stat="team_id"><a href="/teams/SAC/2011.html">SAC</a></td><td class="left" data-stat="lg_id"><a href="/leagues/NBA_2011.html">NBA</a></td><td class="center" data-stat="pos">C</td><td class="right" data-stat="g">1</td><td class="right iz" data-stat="gs">0</td><td class="right" data-stat="mp">2</td><td class="right iz" data-stat="fg_per_mp">0.0</td><td class="right iz" data-stat="fga_per_mp">0.0</td><td class="right iz" data-stat="fg_pct"></td><td class="right iz" data-stat="fg3_per_mp">0.0</td><td class="right iz" data-stat="fg3a_per_mp">0.0</td><td class="right iz" data-stat="fg3_pct"></td><td class="right iz" data-stat="fg2_per_mp">0.0</td><td class="right iz" data-stat="fg2a_per_mp">0.0</td><td class="right iz" data-

Unnamed: 0,Season,AGE,TM,LG,POS,G,GS,MP,FG,FGA,...,OBPM,DBPM,BPM,VORP,teamPPG,oppPPG,SRS,pace,teamOFRtg,teamDFRtg
0,2010-11,21,SAC,NBA,C,1,0,2.0,0.0,0.0,...,-27.9,0.0,-27.9,0.0,99.4,104.7,-4.8,95.2,103.5,-5.6
1,2011-12,22,SAC,NBA,C,18,0,6.1,0.7,1.5,...,-2.0,0.4,-1.6,0.0,98.8,104.4,-4.95,94.7,103.8,-6.0
2,2014-15,25,MIA,NBA,C,48,32,23.8,5.1,8.1,...,2.5,1.6,4.1,1.8,94.7,97.3,-2.92,90.9,103.9,-2.8
3,2015-16,26,MIA,NBA,C,73,43,29.1,5.7,9.3,...,1.7,2.4,4.1,3.3,100.0,98.4,1.5,93.6,106.1,1.7
4,2016-17,27,MIA,NBA,C,77,77,32.6,7.0,12.6,...,0.9,0.5,1.5,2.2,103.2,102.1,0.77,95.2,107.8,1.1
5,2017-18,28,MIA,NBA,C,54,54,25.3,5.8,10.7,...,1.5,1.1,2.6,1.6,103.4,102.9,0.15,95.6,106.8,0.5
6,2018-19,29,MIA,NBA,C,72,53,23.3,5.4,9.4,...,1.1,1.2,2.3,1.8,105.7,105.9,-0.45,98.2,107.3,-0.2
7,2019-20,30,POR,NBA,C,67,61,30.0,6.5,10.5,...,2.5,0.7,3.2,2.6,115.0,116.1,-0.61,100.7,113.7,-1.1
8,2020-21,31,SAC,NBA,C,36,4,15.2,3.5,6.2,...,-1.8,-0.4,-2.2,0.0,113.7,117.4,-3.45,100.0,113.6,-3.7
9,2021-22,32,UTA,NBA,C,65,8,17.9,3.3,5.1,...,0.6,1.4,2.1,1.2,113.6,107.6,5.67,97.1,116.7,6.2


In [345]:
# Stack every per-player frame into one table; ignore_index=True discards the
# per-player row labels and renumbers the rows from 0.
all_players_stats_df = pd.concat(all_DFs_array, ignore_index = True)

0       pietrmi01
1       pietrmi01
2       pietrmi01
3       pietrmi01
4       pietrmi01
          ...    
5284    youngtr01
5285    youngtr01
5286    youngtr01
5287    youngtr01
5288    youngtr01
Name: ID, Length: 5289, dtype: object

In [348]:
# Write the combined table to all_players_stats.csv in the working directory
# (default to_csv options, so the row index is written as the first column).
all_players_stats_df.to_csv("all_players_stats.csv")