In [1]:
import pandas as pd
from datetime import datetime  
from dateutil import parser
import requests

In [2]:
# Full franchise name -> abbreviation. The abbreviation is interpolated into
# the box score URLs built later in this notebook.
basketball_reference_abbreviations = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Charlotte Hornets': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Lakers': 'LAL',
    'Los Angeles Clippers': 'LAC',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}

# Lower-case month names.
# NOTE(review): nothing in the cells visible here reads MONTHS — confirm
# whether it is still needed.
MONTHS = ["october", "november", "december"]


In [3]:
# pd.read_html returns one DataFrame per <table> found on the page; the
# schedule is taken from the first of them.
schedule_page_tables = pd.read_html(
    "https://www.basketball-reference.com/leagues/NBA_2022_games.html#schedule"
)
schedule = schedule_page_tables[0]

In [4]:
def find_abbreviation(team: str) -> str:
    """Return the abbreviation stored for a full team name.

    The lookup is a plain dictionary access on
    ``basketball_reference_abbreviations``, so an unknown ``team`` raises
    ``KeyError``.
    """
    abbreviation = basketball_reference_abbreviations[team]
    return abbreviation


def convert_date(date):
    """Turn a date string into compact ``YYYYMMDD`` text.

    ``date`` is handed to ``parser.parse`` (dateutil) and the resulting
    datetime is formatted with ``"%Y%m%d"``.
    """
    parsed = parser.parse(date)
    return parsed.strftime("%Y%m%d")
         
# Add two derived columns to the schedule:
#   AbbrHomeTeam - abbreviation looked up from the "Home/Neutral" team name
#   DateStr      - the "Date" text reformatted as YYYYMMDD
schedule["AbbrHomeTeam"] = schedule["Home/Neutral"].apply(find_abbreviation)
schedule["DateStr"] = schedule["Date"].apply(convert_date)

# Preview the first rows with the new columns.
schedule.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes,AbbrHomeTeam,DateStr
0,"Tue, Oct 19, 2021",7:30p,Brooklyn Nets,104,Milwaukee Bucks,127,Box Score,,17341.0,,MIL,20211019
1,"Tue, Oct 19, 2021",10:00p,Golden State Warriors,121,Los Angeles Lakers,114,Box Score,,18997.0,,LAL,20211019
2,"Wed, Oct 20, 2021",7:00p,Indiana Pacers,122,Charlotte Hornets,123,Box Score,,15521.0,,CHO,20211020
3,"Wed, Oct 20, 2021",7:00p,Chicago Bulls,94,Detroit Pistons,88,Box Score,,20088.0,,DET,20211020
4,"Wed, Oct 20, 2021",7:30p,Boston Celtics,134,New York Knicks,138,Box Score,2OT,19812.0,,NYK,20211020


In [5]:
def gen_url(row) -> str:
    """Build the box score page URL for one schedule row.

    ``row`` must support ``row["AbbrHomeTeam"]`` and ``row["Date"]``. The date
    text is parsed with ``parser.parse`` (dateutil), formatted as ``YYYYMMDD``
    and combined with the home team abbreviation into the URL.
    """
    home_team = row["AbbrHomeTeam"]
    game_day = parser.parse(row["Date"]).strftime("%Y%m%d")
    return f"https://www.basketball-reference.com/boxscores/{game_day}0{home_team}.html"

# Row-wise apply (axis=1): gen_url receives each schedule row in turn.
schedule["url"] = schedule.apply(gen_url, axis=1)


In [17]:
# Download the first table of every box score page and stack the results.
# Each page's frame is tagged with the game's "Date" text from the schedule.
#
# The frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration, which makes the loop quadratic in the number of games.
game_frames = []
for url, game_day in zip(schedule["url"], schedule["Date"]):
    data = pd.read_html(url)[0]  # only table 0 of each page is kept
    data["GameDay"] = game_day
    game_frames.append(data)

# pd.concat raises ValueError on an empty list, so keep the original
# "empty frame" result when the schedule has no rows.
all_stats = pd.concat(game_frames) if game_frames else pd.DataFrame()

all_stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,GameDay
Unnamed: 0_level_1,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Unnamed: 21_level_1
0,Joe Harris,30:46,3,9,.333,3,5,.600,0,0,...,2,2,2,0,0,2,1,9,0,"Tue, Oct 19, 2021"
1,James Harden,30:38,6,16,.375,4,8,.500,4,4,...,5,8,8,1,2,4,3,20,-20,"Tue, Oct 19, 2021"
2,Kevin Durant,30:15,13,25,.520,3,7,.429,3,6,...,11,11,4,0,2,1,2,32,-20,"Tue, Oct 19, 2021"
3,Nicolas Claxton,24:10,6,9,.667,0,0,,0,3,...,7,7,0,0,0,0,3,12,-15,"Tue, Oct 19, 2021"
4,Blake Griffin,22:59,2,5,.400,0,1,.000,2,2,...,4,5,0,1,0,1,1,6,-6,"Tue, Oct 19, 2021"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,D.J. Augustin,10:32,0,1,.000,0,1,.000,0,0,...,0,0,1,0,0,0,0,0,-9,"Sun, Oct 31, 2021"
11,Usman Garuba,1:25,0,0,,0,0,,0,0,...,0,1,0,0,0,0,0,0,+4,"Sun, Oct 31, 2021"
12,Josh Christopher,1:25,0,0,,0,0,,0,0,...,0,0,0,0,0,0,0,0,+4,"Sun, Oct 31, 2021"
13,Armoni Brooks,1:25,0,0,,0,0,,0,0,...,0,0,0,0,0,0,0,0,+4,"Sun, Oct 31, 2021"


In [1]:
import pandas as pd

In [59]:
# Reload the stats from a CSV in the working directory.
# NOTE(review): no cell visible here writes all_stats.csv — presumably it was
# saved by a cell that is no longer in the notebook; confirm before relying
# on Restart & Run All.
all_stats = pd.read_csv(filepath_or_buffer="all_stats.csv")

In [60]:
# Peek at the first two rows of the reloaded frame.
all_stats.head(n=2)

Unnamed: 0.1,Unnamed: 0,Player,MP,FG,FGA,FG%,3P,3PA,3P%,FT,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,GameDay
0,0,Joe Harris,30:46,3.0,9.0,0.333,3.0,5.0,0.6,0.0,...,2.0,2.0,2.0,0.0,0.0,2.0,1.0,9.0,0.0,"Tue, Oct 19, 2021"
1,1,James Harden,30:38,6.0,16.0,0.375,4.0,8.0,0.5,4.0,...,5.0,8.0,8.0,1.0,2.0,4.0,3.0,20.0,-20.0,"Tue, Oct 19, 2021"


In [26]:
# Flat column labels, applied positionally: the list must have exactly one
# entry per column of all_stats, otherwise pandas raises ValueError.
# The leading column is labelled with the literal string "nan".
cols = [
    "nan", "Player", "MP",
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "FT", "FTA", "FT%",
    "ORB", "DRB", "TRB",
    "AST", "STL", "BLK", "TOV", "PF",
    "PTS", "+/-", "GameDay",
]
all_stats.columns = cols

In [39]:
# Keep only rows whose "Player" value is not one of these labels
# (presumably section-header / totals rows carried over from the scraped
# tables — confirm against the raw data).
non_player_labels = ["Starters", "Reserves", "Team Totals"]
is_player_row = ~all_stats["Player"].isin(non_player_labels)
all_stats = all_stats.loc[is_player_row]

In [44]:
# Peek at the first two rows after filtering.
all_stats.head(n=2)

Unnamed: 0,nan,Player,MP,FG,FGA,FG%,3P,3PA,3P%,FT,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,GameDay
1,0.0,Joe Harris,30:46,3,9,0.333,3,5,0.6,0,...,2,2,2,0,0,2,1,9,0,"Tue, Oct 19, 2021"
2,1.0,James Harden,30:38,6,16,0.375,4,8,0.5,4,...,5,8,8,1,2,4,3,20,-20,"Tue, Oct 19, 2021"


In [56]:
# Columns to convert to numbers; values that cannot be parsed become NaN
# (errors="coerce"). "MP" is not in this list.
numeric_cols = [
    "FG", "FGA", "FG%", "PTS", "+/-",
    "3P", "3PA", "3P%",
    "FT", "FTA", "FT%",
    "ORB", "DRB", "TRB",
    "AST", "STL", "BLK", "TOV", "PF",
]

# Work on a copy so all_stats itself is left untouched.
cleaned = all_stats.copy()
for stat_column in numeric_cols:
    cleaned[stat_column] = pd.to_numeric(cleaned[stat_column], errors="coerce")
cleaned

Unnamed: 0,nan,Player,MP,FG,FGA,FG%,3P,3PA,3P%,FT,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,GameDay
1,0.0,Joe Harris,,3.0,9.0,0.333,3.0,5.0,0.600,0.0,...,2.0,2.0,2.0,0.0,0.0,2.0,1.0,9.0,0.0,"Tue, Oct 19, 2021"
2,1.0,James Harden,,6.0,16.0,0.375,4.0,8.0,0.500,4.0,...,5.0,8.0,8.0,1.0,2.0,4.0,3.0,20.0,-20.0,"Tue, Oct 19, 2021"
3,2.0,Kevin Durant,,13.0,25.0,0.520,3.0,7.0,0.429,3.0,...,11.0,11.0,4.0,0.0,2.0,1.0,2.0,32.0,-20.0,"Tue, Oct 19, 2021"
4,3.0,Nicolas Claxton,,6.0,9.0,0.667,0.0,0.0,,0.0,...,7.0,7.0,0.0,0.0,0.0,0.0,3.0,12.0,-15.0,"Tue, Oct 19, 2021"
5,4.0,Blake Griffin,,2.0,5.0,0.400,0.0,1.0,0.000,2.0,...,4.0,5.0,0.0,1.0,0.0,1.0,1.0,6.0,-6.0,"Tue, Oct 19, 2021"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,9.0,David Nwaba,,0.0,2.0,0.000,0.0,0.0,,0.0,...,4.0,4.0,0.0,0.0,0.0,1.0,1.0,0.0,-9.0,"Sun, Oct 31, 2021"
1446,10.0,D.J. Augustin,,0.0,1.0,0.000,0.0,1.0,0.000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-9.0,"Sun, Oct 31, 2021"
1447,11.0,Usman Garuba,,0.0,0.0,,0.0,0.0,,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,"Sun, Oct 31, 2021"
1448,12.0,Josh Christopher,,0.0,0.0,,0.0,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,"Sun, Oct 31, 2021"


In [64]:
# Exploration: first two rows of the first table on a single box score page.
(
    pd.read_html("https://www.basketball-reference.com/boxscores/202110190MIL.html")[0]
    .head(n=2)
)

Unnamed: 0_level_0,Unnamed: 0_level_0,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats
Unnamed: 0_level_1,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,Joe Harris,30:46,3,9,0.333,3,5,0.6,0,0,...,0,2,2,2,0,0,2,1,9,0
1,James Harden,30:38,6,16,0.375,4,8,0.5,4,4,...,3,5,8,8,1,2,4,3,20,-20


In [70]:
# Keep every table parsed from the page (pd.read_html returns a list of
# DataFrames) so individual tables can be inspected by index.
html = pd.read_html(
    io="https://www.basketball-reference.com/boxscores/202110190MIL.html"
)

In [90]:
# First two rows of the second table parsed from the page.
html[1].head(n=2)

Unnamed: 0_level_0,Unnamed: 0_level_0,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats
Unnamed: 0_level_1,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,Joe Harris,7:26,0,1,0.0,0,0,,0,0,...,0,1,1,1,0,0,0,0,0,-3
1,James Harden,9:51,2,4,0.5,0,2,0.0,2,2,...,0,1,1,3,0,1,0,1,6,-7


In [139]:
# Re-point `html` at a different game's page (this replaces the tables loaded
# from the earlier page).
html = pd.read_html(
    io="https://www.basketball-reference.com/boxscores/202111100LAL.html"
)

# Last "MP" value of the table at index 9, addressed through the two-level
# column key. NOTE(review): 9 is a hand-picked position — confirm which table
# it refers to.
html[9]['Basic Box Score Stats', 'MP'].iloc[-1]

'265'

In [108]:
# Print the row count of every parsed table that has a
# "Basic Box Score Stats" column group.
# NOTE(review): `tables` is created but never filled in this cell — confirm
# whether the matching tables were meant to be collected into it.
tables = []
for table in html:
    has_basic_stats = "Basic Box Score Stats" in table.columns
    if has_basic_stats:
        print(len(table))

16
16
16
16
16
16
16
13
13
13
13
13
13
13
