## Start

In [47]:
import time
import warnings
from pathlib import Path

import pandas as pd
from nba_api.stats.endpoints import leaguegamelog, teamgamelog, boxscoreadvancedv3
from nba_api.stats.static import teams

# NOTE(review): this silences every warning for the rest of the notebook,
# including any pandas warnings that later cells may trigger (e.g. about
# chained assignment or dtype changes). Consider narrowing the filter to
# specific warning categories so real problems stay visible.
warnings.filterwarnings("ignore")

In [48]:
# Season string passed to the nba_api game-log endpoints below.
season = "2024-25"
# Multiplier applied to (1 - expected win probability) in each Elo update.
elo_k = 20
# Number of games averaged into each rolling PAST_* feature.
window_size = 5
# Passed as `min_periods` to the rolling windows: a window with fewer than
# this many observations yields NaN instead of a partial average.
min_window = 5

## Teams and League Info

In [49]:
teams_df = pd.DataFrame(teams.get_teams())

# Single lookup table that resolves a full name, an abbreviation, or an id
# (identity mapping) to the numeric team id. Keys are inserted column by
# column: all full names, then all abbreviations, then all ids.
team_to_id = {
    lookup_key: team_id
    for key_column in ("full_name", "abbreviation", "id")
    for lookup_key, team_id in zip(
        teams_df[key_column].tolist(), teams_df["id"].tolist()
    )
}
teams_df.head(5)

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [50]:
# Team id -> fake game id. Both teams of a placeholder matchup point at the
# same fake game id.
fake_games = {
    team_to_id[team_abbr]: fake_game_id
    for fake_game_id, matchup in (("ABC", ("ATL", "NYK")), ("DEF", ("HOU", "GSW")))
    for team_abbr in matchup
}

In [51]:
# Get league game log for season
game_log = leaguegamelog.LeagueGameLog(season=season).get_data_frames()
assert len(game_log) == 1

# Keep only the home team's row of each game (away rows contain "@" in
# MATCHUP) so every game appears exactly once.
# `.copy()` detaches the result from the raw log, so the column assignments
# below write to an independent frame rather than to a slice.
# `reset_index(drop=True)` replaces the sparse labels inherited from the
# unfiltered log with 0..n-1. Without it, adding a row under the label
# `len(games_df)` could silently overwrite a real game whose original label
# happens to equal that number.
is_home_row = ~game_log[0]["MATCHUP"].str.contains("@")
games_df = game_log[0][is_home_row].copy().reset_index(drop=True)
print(games_df.shape, len(games_df.GAME_ID.unique()))

# add opponent team_id and team_abbreviation columns
# (home rows have a MATCHUP of the form "<team> vs. <opponent>")
games_df["OPP_TEAM_ABBREVIATION"] = games_df["MATCHUP"].str.split(" vs. ").str[1]
games_df["OPP_TEAM_ID"] = games_df["OPP_TEAM_ABBREVIATION"].map(team_to_id)

# adjust WL to 1 for win and 0 for loss; the 1/0 keys keep already-converted
# values intact. Replacing the column (rather than writing through
# `.loc[:, "WL"]`) avoids forcing integers into the existing string column.
games_df["WL"] = games_df["WL"].map({"W": 1, "L": 0, 1: 1, 0: 0})

# get rid of useless columns
games_df = games_df[
    [
        "GAME_ID",
        "TEAM_ID",
        "TEAM_ABBREVIATION",
        "OPP_TEAM_ID",
        "OPP_TEAM_ABBREVIATION",
        "PLUS_MINUS",
        "WL",
    ]
]

# Placeholder ("fake") games with no result yet: PLUS_MINUS and WL are -1.
# The game ids must match the ones registered in `fake_games`.
# NOTE(review): presumably these are upcoming games to predict — confirm.
placeholder_games = pd.DataFrame(
    [
        ["ABC", team_to_id["NYK"], "NYK", team_to_id["ATL"], "ATL", -1, -1],
        ["DEF", team_to_id["HOU"], "HOU", team_to_id["GSW"], "GSW", -1, -1],
    ],
    columns=games_df.columns,
)
# Append in one step with fresh labels; this can only add rows, never
# overwrite existing ones.
games_df = pd.concat([games_df, placeholder_games], ignore_index=True)

games_df.tail(10)

(360, 29) 360


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL
702,0022400358,1610612747,LAL,1610612757,POR,9,1
708,0022400353,1610612764,WAS,1610612763,MEM,-28,0
710,0022400349,1610612754,IND,1610612766,CHA,-4,0
712,0022400347,1610612741,CHI,1610612755,PHI,-8,0
714,0022400354,1610612759,SAS,1610612740,NOP,5,1
717,0022400359,1610612761,TOR,1610612752,NYK,-5,0
718,0022401201,1610612749,MIL,1610612753,ORL,5,1
719,0022401203,1610612760,OKC,1610612742,DAL,14,1
360,ABC,1610612752,NYK,1610612737,ATL,-1,-1
361,DEF,1610612745,HOU,1610612744,GSW,-1,-1


## Team Game Stats

### Processing

In [62]:
def process_team_df(
    team_df, window_size, mp, debug=False, placeholder_date="2024-12-11"
):
    """Turn one team's raw game log into pre-game features.

    Every feature on a row is shifted by one game, so it is computed only from
    the rows that precede it.

    Parameters
    ----------
    team_df : pandas.DataFrame
        One team's game log. Must contain Team_ID, Game_ID, GAME_DATE, MATCHUP,
        WL, W, L, W_PCT, PTS, FGM, FGA, FG3M, FG3_PCT, FTA, REB, AST, STL,
        BLK, TOV and PF. The input frame is not modified.
        NOTE(review): the row order is reversed before any processing,
        presumably because the API returns newest-first — confirm.
    window_size : int
        Number of games in each rolling average.
    mp : int
        `min_periods` for the rolling windows.
    debug : bool, default False
        When True, print a subset of raw and derived columns for inspection.
    placeholder_date : str, default "2024-12-11"
        GAME_DATE written on the placeholder row that is appended when the
        team has an entry in the module-level `fake_games` mapping.

    Returns
    -------
    pandas.DataFrame
        Game_ID, GAME_DATE and the derived P_* / PAST_* feature columns, one
        row per game (plus the placeholder row, if any).
    """
    # Work on an explicit copy so nothing below writes into a slice of the
    # caller's frame.
    team_df = team_df.iloc[::-1].copy()

    team_id = team_df["Team_ID"].iloc[0]
    if team_id in fake_games:
        # Build the placeholder row by column *name*: identifying columns get
        # real values, every other column gets -1. This stays correct even if
        # the endpoint changes its column order.
        placeholder = {
            "Team_ID": team_id,
            "Game_ID": fake_games[team_id],
            "GAME_DATE": placeholder_date,
            "MATCHUP": "ABC vs. DEF",
        }
        # max + 1 is guaranteed to be an unused label (and equals len(team_df)
        # for the usual 0..n-1 index), so this always appends.
        new_label = team_df.index.max() + 1
        team_df.loc[new_label] = [
            placeholder.get(column, -1) for column in team_df.columns
        ]

    # True shooting % and effective field-goal % for each game.
    team_df["TS"] = team_df["PTS"] / (2 * (team_df["FGA"] + 0.44 * team_df["FTA"]))
    team_df["EFG"] = (team_df["FGM"] + 0.5 * team_df["FG3M"]) / team_df["FGA"]
    # Any value outside W/L/1/0 (e.g. the placeholder's -1) becomes NaN.
    team_df["WL"] = team_df["WL"].map({"W": 1, "L": 0, 1: 1, 0: 0})

    # Season win percentage as it stood before the game.
    team_df["P_W_PCT"] = team_df["W_PCT"].shift(1)

    # Rolling mean over the previous games; shift(1) excludes the current one.
    rolling_sources = (
        "WL",
        "PTS",
        "REB",
        "AST",
        "STL",
        "BLK",
        "TOV",
        "PF",
        "FG3_PCT",
        "TS",
        "EFG",
    )
    for source in rolling_sources:
        team_df[f"PAST_{source}"] = (
            team_df[source].rolling(window=window_size, min_periods=mp).mean().shift(1)
        )

    # Wins minus losses before the game.
    team_df["P_W-L"] = (team_df["W"] - team_df["L"]).shift(1)

    if debug:
        debug_df = team_df[
            [
                "Game_ID",
                "W_PCT",
                "P_W_PCT",
                "WL",
                "PAST_WL",
                "PTS",
                "PAST_PTS",
                "W",
                "L",
                "P_W-L",
            ]
        ]

        print(debug_df)

    return team_df[
        [
            "Game_ID",
            "GAME_DATE",
            "P_W_PCT",
            "PAST_WL",
            "PAST_PTS",
            "P_W-L",
            "PAST_REB",
            "PAST_AST",
            "PAST_STL",
            "PAST_BLK",
            "PAST_TOV",
            "PAST_PF",
            "PAST_FG3_PCT",
            "PAST_TS",
            "PAST_EFG",
        ]
    ]

In [65]:
# # Testing process_team_df function
# test_df = teamgamelog.TeamGameLog(
#     team_id=team_to_id["ATL"], season=season
# ).get_data_frames()[0]
# test_df = process_team_df(test_df, window_size=window_size, mp=min_window, debug=False)
# test_df.tail(3)

### Combining

In [66]:
# Fetch and process each team's game log, keyed by team abbreviation.
team_dfs = {}
for team_id, full_name, abbreviation in zip(
    teams_df["id"], teams_df["full_name"], teams_df["abbreviation"]
):
    print(f"Getting game log for {full_name}")
    team_log = teamgamelog.TeamGameLog(team_id=team_id, season=season).get_data_frames()
    assert len(team_log) == 1
    team_dfs[abbreviation] = process_team_df(
        team_log[0], window_size=window_size, mp=min_window
    )
    # Short pause between consecutive API requests.
    time.sleep(0.1)

Getting game log for Atlanta Hawks
Getting game log for Boston Celtics
Getting game log for Cleveland Cavaliers
Getting game log for New Orleans Pelicans
Getting game log for Chicago Bulls
Getting game log for Dallas Mavericks
Getting game log for Denver Nuggets
Getting game log for Golden State Warriors
Getting game log for Houston Rockets
Getting game log for Los Angeles Clippers
Getting game log for Los Angeles Lakers
Getting game log for Miami Heat
Getting game log for Milwaukee Bucks
Getting game log for Minnesota Timberwolves
Getting game log for Brooklyn Nets
Getting game log for New York Knicks
Getting game log for Orlando Magic
Getting game log for Indiana Pacers
Getting game log for Philadelphia 76ers
Getting game log for Phoenix Suns
Getting game log for Portland Trail Blazers
Getting game log for Sacramento Kings
Getting game log for San Antonio Spurs
Getting game log for Oklahoma City Thunder
Getting game log for Toronto Raptors
Getting game log for Utah Jazz
Getting game 

In [67]:
# Stack every team's feature table into one frame, tagging each row with the
# abbreviation of the team it belongs to.
comp_team_dfs = pd.concat(
    [
        team_features.assign(TEAM_ABBREVIATION=team_abbr)
        for team_abbr, team_features in team_dfs.items()
    ]
)

In [68]:
# Attach the pre-game features twice: first for the listed team, then for its
# opponent (the opponent's columns get an "_OPP" suffix). Left joins keep every
# row of games_df even when no feature row matches. The duplicated join-key
# columns produced by the merges are dropped at the end.
full_df = (
    games_df.merge(
        comp_team_dfs,
        how="left",
        left_on=["GAME_ID", "TEAM_ABBREVIATION"],
        right_on=["Game_ID", "TEAM_ABBREVIATION"],
    )
    .merge(
        comp_team_dfs,
        how="left",
        left_on=["GAME_ID", "OPP_TEAM_ABBREVIATION"],
        right_on=["Game_ID", "TEAM_ABBREVIATION"],
        suffixes=("", "_OPP"),
    )
    .drop(columns=["Game_ID", "TEAM_ABBREVIATION_OPP", "Game_ID_OPP"])
)

In [70]:
# Spot-check the last two rows of the merged frame.
full_df.tail(2)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL,GAME_DATE,P_W_PCT,PAST_WL,...,P_W-L_OPP,PAST_REB_OPP,PAST_AST_OPP,PAST_STL_OPP,PAST_BLK_OPP,PAST_TOV_OPP,PAST_PF_OPP,PAST_FG3_PCT_OPP,PAST_TS_OPP,PAST_EFG_OPP
360,ABC,1610612752,NYK,1610612737,ATL,-1,-1,2024-12-11,0.625,0.8,...,1.0,49.4,30.4,11.6,4.2,15.6,20.2,0.2994,0.564431,0.526914
361,DEF,1610612745,HOU,1610612744,GSW,-1,-1,2024-12-11,0.667,0.6,...,5.0,46.2,26.4,8.4,4.2,13.2,17.8,0.3318,0.532516,0.49985


## Advanced Stats

### Combining

In [71]:
# Failed attempt at grabbing all offensive and defensive ratings + other advanced stats for each game (takes too long)
# boxscore_dfs = []
# tot = len(full_df)
# for i, game_row in full_df.iterrows():
#     print("Getting boxscore for", game_row["GAME_ID"], f"{i+1}/{tot}")
#     print(game_row["GAME_ID"])
#     boxscore_df = boxscoreadvancedv3.BoxScoreAdvancedV3(
#         game_id=game_row["GAME_ID"]
#     ).get_data_frames()[1]
#     boxscore_dfs.append(boxscore_df)
#     time.sleep(0.1)

In [72]:
# asdflasdkjf = boxscoreadvancedv3.BoxScoreAdvancedV3(
#     game_id="0021900002"
# ).get_data_frames()
# print(asdflasdkjf[1].columns)
# asdflasdkjf[1].head(10)

### ELO

In [73]:
# Every team starts at the same baseline rating.
team_elo = {team: 1500 for team in teams_df["abbreviation"]}

# create 2 new columns for elo ratings
# Initialised as floats: the ratings written into these columns later are
# fractional, and writing floats into an int64 column relies on an implicit
# upcast that pandas has deprecated.
full_df["TEAM_ELO"] = 1500.0
full_df["OPP_TEAM_ELO"] = 1500.0


def prob_win(a, b):
    """Return the Elo-expected probability that team `a` beats team `b`.

    Both arguments are keys of the module-level `team_elo` mapping. Equal
    ratings give 0.5; a 400-point advantage for `a` gives 10/11.
    """
    rating_gap = team_elo[b] - team_elo[a]
    return 1 / (1 + 10 ** (rating_gap / 400))


# def update_amount(winner, loser):
#     return elo_k * (1 - prob_win(winner, loser))

In [74]:
# loop through full_df and update elo ratings
# Each row first records both teams' ratings as they stood *before* the game,
# then the winner gains and the loser loses the same amount.
# NOTE(review): this assumes full_df rows are in chronological order — confirm.
# NOTE: the loop mutates `team_elo`, so re-run the cell that initialises it
# before re-running this one; otherwise the updates stack on top of the
# previous pass.
pre_game_team_elo = []
pre_game_opp_elo = []
for team_abbr, opp_abbr, outcome in zip(
    full_df["TEAM_ABBREVIATION"], full_df["OPP_TEAM_ABBREVIATION"], full_df["WL"]
):
    pre_game_team_elo.append(team_elo[team_abbr])
    pre_game_opp_elo.append(team_elo[opp_abbr])

    if outcome == 1:
        winner, loser = team_abbr, opp_abbr
    elif outcome == 0:
        winner, loser = opp_abbr, team_abbr
    else:
        # No result yet (placeholder rows carry WL == -1, missing results are
        # NaN): there is no winner or loser, so leave the ratings untouched
        # instead of running an update with the same team on both sides.
        continue

    d_update = elo_k * (1 - prob_win(winner, loser))
    team_elo[winner] += d_update
    team_elo[loser] -= d_update

# Assign whole columns at once: positional, float-typed, and free of the
# per-cell writes that would otherwise push floats into an integer column.
full_df["TEAM_ELO"] = pre_game_team_elo
full_df["OPP_TEAM_ELO"] = pre_game_opp_elo

In [75]:
# Rating edge of the listed team over its opponent, and the win probability
# the Elo logistic curve assigns to that edge (a 400-point edge is 10:1 odds).
elo_gap = full_df["TEAM_ELO"].sub(full_df["OPP_TEAM_ELO"])
full_df["ELO_DIFF"] = elo_gap
full_df["ELO_WIN_PROB"] = 1 / (1 + 10 ** (-elo_gap / 400))

## Final Processing

In [76]:
# List the merged frame's columns before choosing which ones to export.
full_df.columns

Index(['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'OPP_TEAM_ID',
       'OPP_TEAM_ABBREVIATION', 'PLUS_MINUS', 'WL', 'GAME_DATE', 'P_W_PCT',
       'PAST_WL', 'PAST_PTS', 'P_W-L', 'PAST_REB', 'PAST_AST', 'PAST_STL',
       'PAST_BLK', 'PAST_TOV', 'PAST_PF', 'PAST_FG3_PCT', 'PAST_TS',
       'PAST_EFG', 'GAME_DATE_OPP', 'P_W_PCT_OPP', 'PAST_WL_OPP',
       'PAST_PTS_OPP', 'P_W-L_OPP', 'PAST_REB_OPP', 'PAST_AST_OPP',
       'PAST_STL_OPP', 'PAST_BLK_OPP', 'PAST_TOV_OPP', 'PAST_PF_OPP',
       'PAST_FG3_PCT_OPP', 'PAST_TS_OPP', 'PAST_EFG_OPP', 'TEAM_ELO',
       'OPP_TEAM_ELO', 'ELO_DIFF', 'ELO_WIN_PROB'],
      dtype='object')

In [77]:
# Shape of the merged frame and the total number of missing *cells* in it
# (summed over all columns, so this is not a row count).
print(full_df.shape, full_df.isna().sum().sum())

# Per-team feature columns; the opponent's copies carry an "_OPP" suffix.
feature_columns = [
    "P_W_PCT",
    "PAST_WL",
    "PAST_PTS",
    "P_W-L",
    "PAST_REB",
    "PAST_AST",
    "PAST_STL",
    "PAST_BLK",
    "PAST_TOV",
    "PAST_PF",
    "PAST_FG3_PCT",
    "PAST_TS",
    "PAST_EFG",
]
export_columns = (
    [
        "GAME_ID",
        "TEAM_ABBREVIATION",
        "OPP_TEAM_ABBREVIATION",
        "PLUS_MINUS",
        "WL",
        "GAME_DATE",
    ]
    + feature_columns
    + [f"{column}_OPP" for column in feature_columns]
    + ["TEAM_ELO", "OPP_TEAM_ELO", "ELO_WIN_PROB"]
)

# Select and drop incomplete rows in one expression. This returns a new frame
# instead of calling dropna(inplace=True) on a slice of full_df, so the cell
# is safe to re-run and full_df is left untouched.
df = full_df[export_columns].dropna()
df.tail(10)

(362, 39) 1688


Unnamed: 0,GAME_ID,TEAM_ABBREVIATION,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL,GAME_DATE,P_W_PCT,PAST_WL,PAST_PTS,P_W-L,...,PAST_STL_OPP,PAST_BLK_OPP,PAST_TOV_OPP,PAST_PF_OPP,PAST_FG3_PCT_OPP,PAST_TS_OPP,PAST_EFG_OPP,TEAM_ELO,OPP_TEAM_ELO,ELO_WIN_PROB
352,0022400358,LAL,POR,9,1,"DEC 08, 2024",0.522,0.2,100.6,1.0,...,6.6,3.8,14.2,19.4,0.3956,0.599462,0.579711,1493.572925,1450.4767,0.561704
353,0022400353,WAS,MEM,-28,0,"DEC 08, 2024",0.143,0.2,104.0,-15.0,...,11.0,5.2,15.8,23.8,0.3526,0.602294,0.56639,1396.253525,1565.370728,0.27418
354,0022400349,IND,CHA,-4,0,"DEC 08, 2024",0.417,0.2,112.0,-4.0,...,7.0,4.8,12.4,21.6,0.3486,0.536782,0.502143,1460.572563,1410.524297,0.571531
355,0022400347,CHI,PHI,-8,0,"DEC 08, 2024",0.417,0.4,127.6,-4.0,...,10.0,3.8,13.8,21.4,0.351,0.570792,0.542082,1476.296466,1443.820789,0.546601
356,0022400354,SAS,NOP,5,1,"DEC 08, 2024",0.478,0.2,111.6,-1.0,...,13.2,6.2,15.0,19.6,0.2908,0.531379,0.496941,1488.545563,1398.596887,0.62663
357,0022400359,TOR,NYK,-5,0,"DEC 09, 2024",0.292,0.4,112.4,-10.0,...,8.2,5.4,15.4,16.6,0.4234,0.619834,0.587403,1435.214323,1532.855925,0.363069
358,0022401201,MIL,ORL,5,1,"DEC 10, 2024",0.522,0.6,115.8,1.0,...,10.2,6.8,16.4,21.6,0.2746,0.558985,0.516568,1517.970033,1560.95963,0.438447
359,0022401203,OKC,DAL,14,1,"DEC 10, 2024",0.783,0.8,119.6,13.0,...,10.2,6.8,17.2,18.2,0.439,0.631926,0.59631,1592.323942,1566.825262,0.53663
360,ABC,NYK,ATL,-1,-1,2024-12-11,0.625,0.8,117.6,6.0,...,11.6,4.2,15.6,20.2,0.2994,0.564431,0.526914,1540.117297,1518.428821,0.531172
361,DEF,HOU,GSW,-1,-1,2024-12-11,0.667,0.6,112.4,8.0,...,8.4,4.2,13.2,17.8,0.3318,0.532516,0.49985,1558.467095,1531.683321,0.538469


In [78]:
# Write the final feature table. The output folder is created first because
# `to_csv` raises an OSError when the parent directory does not exist.
output_path = Path("data") / f"nba_{season}_data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)