In [3]:
from nba_api.stats.endpoints import leaguegamelog, teamgamelog
from nba_api.stats.static import teams
import pandas as pd
import time
import warnings

warnings.filterwarnings("ignore")

In [4]:
season = "2019-20"

In [5]:
teams_df = pd.DataFrame(teams.get_teams())
team_to_id = {team["full_name"]: team["id"] for team in teams_df.to_dict("records")}
team_to_id.update(
    {team["abbreviation"]: team["id"] for team in teams_df.to_dict("records")}
)
team_to_id.update({team["id"]: team["id"] for team in teams_df.to_dict("records")})
teams_df.head(5)

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [6]:
# Get league game log for 2023 - 2024 season
game_log = leaguegamelog.LeagueGameLog(season=season).get_data_frames()
assert len(game_log) == 1
games_df = game_log[0]

# filter games_df to only include home games (prevent duplicating for both teams)
games_df = games_df[~games_df.MATCHUP.str.contains("@")]
print(games_df.shape, len(games_df.GAME_ID.unique()))

# add opponent team_id and team_abbreviation columns
games_df.loc[:, "OPP_TEAM_ABBREVIATION"] = games_df["MATCHUP"].str.split(" vs. ").str[1]
games_df.loc[:, "OPP_TEAM_ID"] = games_df["OPP_TEAM_ABBREVIATION"].map(team_to_id)

# adjust WL to 1 for win and 0 for loss
games_df.loc[:, "WL"] = games_df["WL"].map({"W": 1, "L": 0, 1: 1, 0: 0})

# get rid of useless columns
games_df = games_df[
    [
        "GAME_ID",
        "GAME_DATE",
        "TEAM_ID",
        "TEAM_ABBREVIATION",
        "OPP_TEAM_ID",
        "OPP_TEAM_ABBREVIATION",
        "PLUS_MINUS",
        "WL",
    ]
]

games_df.head(10)

(1059, 29) 1059


Unnamed: 0,GAME_ID,GAME_DATE,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL
1,21900002,2019-10-22,1610612746,LAC,1610612747,LAL,10,1
3,21900001,2019-10-22,1610612761,TOR,1610612740,NOP,8,1
4,21900005,2019-10-23,1610612753,ORL,1610612739,CLE,9,1
7,21900006,2019-10-23,1610612751,BKN,1610612750,MIN,-1,0
9,21900008,2019-10-23,1610612755,PHI,1610612738,BOS,14,1
13,21900007,2019-10-23,1610612748,MIA,1610612763,MEM,19,1
14,21900012,2019-10-23,1610612756,PHX,1610612758,SAC,29,1
15,21900013,2019-10-23,1610612757,POR,1610612743,DEN,-8,0
19,21900003,2019-10-23,1610612766,CHA,1610612741,CHI,1,1
22,21900010,2019-10-23,1610612759,SAS,1610612752,NYK,9,1


In [None]:
def process_team_df(team_df, window_size=5, debug=False):
    team_df = team_df.iloc[::-1]
    team_df["TS"] = team_df["PTS"] / (2 * (team_df["FGA"] + 0.44 * team_df["FTA"]))
    team_df["EFG"] = (team_df["FGM"] + 0.5 * team_df["FG3M"]) / team_df["FGA"]
    team_df["WL"] = team_df["WL"].map({"W": 1, "L": 0, 1: 1, 0: 0})
    team_df["P_W_PCT"] = team_df["W_PCT"].shift(1)
    team_df["PAST_WL"] = (
        team_df["WL"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_PTS"] = (
        team_df["PTS"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_REB"] = (
        team_df["REB"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_AST"] = (
        team_df["AST"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_STL"] = (
        team_df["STL"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_BLK"] = (
        team_df["BLK"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_TOV"] = (
        team_df["TOV"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_PF"] = (
        team_df["PF"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_FG3_PCT"] = (
        team_df["FG3_PCT"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_TS"] = (
        team_df["TS"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["PAST_EFG"] = (
        team_df["EFG"].rolling(window=window_size, min_periods=1).mean().shift(1)
    )
    team_df["P_W-L"] = (team_df["W"] - team_df["L"]).shift(1)
    if debug:
        debug_df = team_df[
            [
                "Game_ID",
                "W_PCT",
                "P_W_PCT",
                "WL",
                "PAST_WL",
                "PTS",
                "PAST_PTS",
                "W",
                "L",
                "P_W-L",
            ]
        ]
        print(debug_df)
    team_df = team_df[
        [
            "Game_ID",
            "GAME_DATE",
            "P_W_PCT",
            "PAST_WL",
            "PAST_PTS",
            "P_W-L",
            "PAST_REB",
            "PAST_AST",
            "PAST_STL",
            "PAST_BLK",
            "PAST_TOV",
            "PAST_PF",
            "PAST_FG3_PCT",
            "PAST_TS",
            "PAST_EFG",
        ]
    ]
    return team_df

In [19]:
# # Testing process_team_df function
# test_df = teamgamelog.TeamGameLog(
#     team_id=team_to_id["Los Angeles Lakers"], season=season
# ).get_data_frames()[0]
# print(test_df.columns)
# test_df = process_team_df(test_df, debug=True)
# test_df.head(10)

In [7]:
team_dfs = {}
for i, team in enumerate(teams_df["id"]):
    print(f"Getting game log for {teams_df['full_name'][i]}")
    team_log = teamgamelog.TeamGameLog(team_id=team, season=season).get_data_frames()
    assert len(team_log) == 1
    team_dfs[teams_df["abbreviation"][i]] = process_team_df(team_log[0])
    time.sleep(0.5)

Getting game log for Atlanta Hawks
Getting game log for Boston Celtics
Getting game log for Cleveland Cavaliers
Getting game log for New Orleans Pelicans
Getting game log for Chicago Bulls
Getting game log for Dallas Mavericks
Getting game log for Denver Nuggets
Getting game log for Golden State Warriors
Getting game log for Houston Rockets
Getting game log for Los Angeles Clippers
Getting game log for Los Angeles Lakers
Getting game log for Miami Heat
Getting game log for Milwaukee Bucks
Getting game log for Minnesota Timberwolves
Getting game log for Brooklyn Nets
Getting game log for New York Knicks
Getting game log for Orlando Magic
Getting game log for Indiana Pacers
Getting game log for Philadelphia 76ers
Getting game log for Phoenix Suns
Getting game log for Portland Trail Blazers
Getting game log for Sacramento Kings
Getting game log for San Antonio Spurs
Getting game log for Oklahoma City Thunder
Getting game log for Toronto Raptors
Getting game log for Utah Jazz
Getting game 

In [8]:
comp_team_dfs = pd.concat(
    [df.assign(TEAM_ABBREVIATION=abbr) for abbr, df in team_dfs.items()]
)

In [9]:
full_df = games_df.merge(
    comp_team_dfs,
    left_on=["GAME_ID", "TEAM_ABBREVIATION"],
    right_on=["Game_ID", "TEAM_ABBREVIATION"],
    how="left",
)
full_df = full_df.merge(
    comp_team_dfs,
    left_on=["GAME_ID", "OPP_TEAM_ABBREVIATION"],
    right_on=["Game_ID", "TEAM_ABBREVIATION"],
    how="left",
    suffixes=("", "_OPP"),
)
full_df = full_df.drop(columns=["Game_ID", "TEAM_ABBREVIATION_OPP", "Game_ID_OPP"])

In [14]:
full_df.head(5)

Unnamed: 0,GAME_ID,GAME_DATE,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL,P_W_PCT,PAST_5,PAST_5_PTS,P_W-L,P_W_PCT_OPP,PAST_5_OPP,PAST_5_PTS_OPP,P_W-L_OPP
0,21900002,2019-10-22,1610612746,LAC,1610612747,LAL,10,1,,,,,,,,
1,21900001,2019-10-22,1610612761,TOR,1610612740,NOP,8,1,,,,,,,,
2,21900005,2019-10-23,1610612753,ORL,1610612739,CLE,9,1,,,,,,,,
3,21900006,2019-10-23,1610612751,BKN,1610612750,MIN,-1,0,,,,,,,,
4,21900008,2019-10-23,1610612755,PHI,1610612738,BOS,14,1,,,,,,,,


In [13]:
# count rows with na values
print(full_df.shape, full_df.isna().sum().sum())
df = full_df.dropna()
df.head(10)

(1059, 16) 120


Unnamed: 0,GAME_ID,GAME_DATE,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL,P_W_PCT,PAST_5,PAST_5_PTS,P_W-L,P_W_PCT_OPP,PAST_5_OPP,PAST_5_PTS_OPP,P_W-L_OPP
16,21900017,2019-10-25,1610612738,BOS,1610612761,TOR,6,1,0.0,0.0,93.0,-1.0,1.0,1.0,130.0,1.0
17,21900018,2019-10-25,1610612766,CHA,1610612750,MIN,-22,0,1.0,1.0,126.0,1.0,1.0,1.0,127.0,1.0
18,21900022,2019-10-25,1610612760,OKC,1610612764,WAS,-12,0,0.0,0.0,95.0,-1.0,0.0,0.0,100.0,-1.0
19,21900025,2019-10-25,1610612747,LAL,1610612762,UTA,9,1,0.0,0.0,102.0,-1.0,1.0,1.0,100.0,1.0
20,21900020,2019-10-25,1610612763,MEM,1610612741,CHI,-8,0,0.0,0.0,101.0,-1.0,0.0,0.0,125.0,-1.0
21,21900024,2019-10-25,1610612758,SAC,1610612757,POR,-10,0,0.0,0.0,95.0,-1.0,0.0,0.0,100.0,-1.0
22,21900021,2019-10-25,1610612740,NOP,1610612742,DAL,-7,0,0.0,0.0,122.0,-1.0,1.0,1.0,108.0,1.0
23,21900023,2019-10-25,1610612743,DEN,1610612756,PHX,1,1,1.0,1.0,108.0,1.0,1.0,1.0,124.0,1.0
24,21900019,2019-10-25,1610612751,BKN,1610612752,NYK,4,1,0.0,0.0,126.0,-1.0,0.0,0.0,111.0,-1.0
25,21900035,2019-10-26,1610612756,PHX,1610612746,LAC,8,1,0.5,0.5,115.5,0.0,1.0,1.0,126.5,2.0


In [11]:
# df.to_csv(f"nba_{season}_data.csv", index=False)