## Start

In [94]:
from nba_api.stats.endpoints import leaguegamelog, teamgamelog, boxscoreadvancedv3
from nba_api.stats.static import teams
import pandas as pd
import time
import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# including any pandas warnings raised by the cells below. Consider narrowing it
# to a specific category once the notebook runs cleanly.
warnings.filterwarnings("ignore")

In [95]:
# Season string passed as the `season` argument to the league and team game-log requests.
season = "2019-20"
# K-factor of the ELO update: the winner gains elo_k * (1 - expected win probability).
elo_k = 20
# Number of previous games averaged by the rolling PAST_* features.
window_size = 5

## Teams and League Info

In [96]:
# Static team metadata from nba_api, one row per team.
teams_df = pd.DataFrame(teams.get_teams())

# Lookup that resolves a full name, an abbreviation, or an id itself to the team id.
# Keys are inserted in that order: all full names, then abbreviations, then ids.
team_records = teams_df.to_dict("records")
team_to_id = {}
for key_field in ("full_name", "abbreviation", "id"):
    for record in team_records:
        team_to_id[record[key_field]] = record["id"]
teams_df.head(5)

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [97]:
# Get league game log for season
game_log = leaguegamelog.LeagueGameLog(season=season).get_data_frames()
assert len(game_log) == 1
games_df = game_log[0]

# Keep only the rows whose MATCHUP has no "@" (the home side, "HOME vs. AWAY"),
# so each game appears once. The explicit .copy() makes the column assignments
# below write to an independent frame instead of a slice of the raw log.
games_df = games_df[~games_df["MATCHUP"].str.contains("@", regex=False)].copy()
print(games_df.shape, len(games_df.GAME_ID.unique()))

# add opponent team_id and team_abbreviation columns
games_df["OPP_TEAM_ABBREVIATION"] = games_df["MATCHUP"].str.split(" vs. ").str[1]
games_df["OPP_TEAM_ID"] = games_df["OPP_TEAM_ABBREVIATION"].map(team_to_id)

# adjust WL to 1 for win and 0 for loss (already-numeric values pass through,
# so re-running the cell is harmless)
games_df["WL"] = games_df["WL"].map({"W": 1, "L": 0, 1: 1, 0: 0})

# get rid of useless columns
print(games_df.columns)
kept_columns = [
    "GAME_ID",
    "TEAM_ID",
    "TEAM_ABBREVIATION",
    "OPP_TEAM_ID",
    "OPP_TEAM_ABBREVIATION",
    "PLUS_MINUS",
    "WL",
]
games_df = games_df[kept_columns]

games_df.head(10)

(1059, 29) 1059
Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE',
       'OPP_TEAM_ABBREVIATION', 'OPP_TEAM_ID'],
      dtype='object')


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL
0,21900002,1610612746,LAC,1610612747,LAL,10,1
3,21900001,1610612761,TOR,1610612740,NOP,8,1
4,21900008,1610612755,PHI,1610612738,BOS,14,1
6,21900004,1610612754,IND,1610612765,DET,-9,0
7,21900012,1610612756,PHX,1610612758,SAC,29,1
10,21900006,1610612751,BKN,1610612750,MIN,-1,0
13,21900005,1610612753,ORL,1610612739,CLE,9,1
17,21900010,1610612759,SAS,1610612752,NYK,9,1
18,21900009,1610612742,DAL,1610612764,WAS,8,1
20,21900013,1610612757,POR,1610612743,DEN,-8,0


## Team Game Stats

### Processing

In [114]:
def process_team_df(team_df, window_size=5, debug=False):
    """Turn one team's game log into per-game features built only from earlier games.

    The log is reversed before any rolling computation, then every feature is
    shifted down by one row, so the value on a game's row summarises the games
    that precede it in the reversed order and never the game itself. The first
    row of the result therefore has NaN in every feature column.
    NOTE(review): this assumes the incoming log is ordered newest-first - confirm
    against the endpoint that produces it.

    Parameters
    ----------
    team_df : pandas.DataFrame
        Game log with at least the columns Game_ID, GAME_DATE, W, L, W_PCT, WL,
        PTS, FGM, FGA, FG3M, FG3_PCT, FTA, REB, AST, STL, BLK, TOV and PF.
        The caller's frame is not modified.
    window_size : int, default 5
        Maximum number of previous games averaged by the PAST_* columns. Fewer
        games are averaged while the history is shorter (min_periods=1).
    debug : bool, default False
        When true, print a few raw columns next to their shifted counterparts.

    Returns
    -------
    pandas.DataFrame
        Columns Game_ID, GAME_DATE, P_W_PCT, PAST_WL, PAST_PTS, P_W-L, PAST_REB,
        PAST_AST, PAST_STL, PAST_BLK, PAST_TOV, PAST_PF, PAST_FG3_PCT, PAST_TS,
        PAST_EFG, one row per game in reversed input order.
    """
    # Work on an explicit copy: adding columns to a bare slice is ambiguous in
    # pandas and only "worked" silently because warnings are suppressed globally.
    team_df = team_df.iloc[::-1].copy()

    # Per-game shooting efficiency: true shooting and effective field-goal percentage.
    team_df["TS"] = team_df["PTS"] / (2 * (team_df["FGA"] + 0.44 * team_df["FTA"]))
    team_df["EFG"] = (team_df["FGM"] + 0.5 * team_df["FG3M"]) / team_df["FGA"]
    # Accept both the raw "W"/"L" labels and already-converted 1/0 values.
    team_df["WL"] = team_df["WL"].map({"W": 1, "L": 0, 1: 1, 0: 0})

    # Record going into the game: win percentage and wins-minus-losses after the previous one.
    team_df["P_W_PCT"] = team_df["W_PCT"].shift(1)
    team_df["P_W-L"] = (team_df["W"] - team_df["L"]).shift(1)

    # Rolling mean over the last `window_size` games, shifted so the current game is excluded.
    rolling_stats = (
        "WL",
        "PTS",
        "REB",
        "AST",
        "STL",
        "BLK",
        "TOV",
        "PF",
        "FG3_PCT",
        "TS",
        "EFG",
    )
    for stat in rolling_stats:
        team_df[f"PAST_{stat}"] = (
            team_df[stat].rolling(window=window_size, min_periods=1).mean().shift(1)
        )

    if debug:
        debug_df = team_df[
            [
                "Game_ID",
                "W_PCT",
                "P_W_PCT",
                "WL",
                "PAST_WL",
                "PTS",
                "PAST_PTS",
                "W",
                "L",
                "P_W-L",
            ]
        ]
        print(debug_df)

    output_columns = [
        "Game_ID",
        "GAME_DATE",
        "P_W_PCT",
        "PAST_WL",
        "PAST_PTS",
        "P_W-L",
        "PAST_REB",
        "PAST_AST",
        "PAST_STL",
        "PAST_BLK",
        "PAST_TOV",
        "PAST_PF",
        "PAST_FG3_PCT",
        "PAST_TS",
        "PAST_EFG",
    ]
    return team_df[output_columns]

In [115]:
# # Testing process_team_df function
# test_df = teamgamelog.TeamGameLog(
#     team_id=team_to_id["Los Angeles Lakers"], season=season
# ).get_data_frames()[0]
# print(test_df.columns)
# test_df = process_team_df(test_df, debug=True)
# test_df.head(10)

### Combining

In [116]:
# Download each team's game log and store its processed features under the team abbreviation.
team_dfs = {}
team_columns = zip(teams_df["full_name"], teams_df["abbreviation"], teams_df["id"])
for team_name, team_abbr, team_id in team_columns:
    print(f"Getting game log for {team_name}")
    team_log = teamgamelog.TeamGameLog(team_id=team_id, season=season).get_data_frames()
    assert len(team_log) == 1
    team_dfs[team_abbr] = process_team_df(team_log[0], window_size=window_size)
    # Short pause between requests (presumably to stay under the API's rate limit).
    time.sleep(0.1)

Getting game log for Atlanta Hawks
Getting game log for Boston Celtics
Getting game log for Cleveland Cavaliers
Getting game log for New Orleans Pelicans
Getting game log for Chicago Bulls
Getting game log for Dallas Mavericks
Getting game log for Denver Nuggets
Getting game log for Golden State Warriors
Getting game log for Houston Rockets
Getting game log for Los Angeles Clippers
Getting game log for Los Angeles Lakers
Getting game log for Miami Heat
Getting game log for Milwaukee Bucks
Getting game log for Minnesota Timberwolves
Getting game log for Brooklyn Nets
Getting game log for New York Knicks
Getting game log for Orlando Magic
Getting game log for Indiana Pacers
Getting game log for Philadelphia 76ers
Getting game log for Phoenix Suns
Getting game log for Portland Trail Blazers
Getting game log for Sacramento Kings
Getting game log for San Antonio Spurs
Getting game log for Oklahoma City Thunder
Getting game log for Toronto Raptors
Getting game log for Utah Jazz
Getting game 

In [117]:
# Stack the per-team feature frames into one long frame, tagging every row
# with the abbreviation of the team it belongs to.
tagged_team_dfs = []
for abbr, team_features in team_dfs.items():
    tagged_team_dfs.append(team_features.assign(TEAM_ABBREVIATION=abbr))
comp_team_dfs = pd.concat(tagged_team_dfs)

In [118]:
# Join the per-team features onto each game twice: once for the team listed in
# TEAM_ABBREVIATION and once for its opponent. Left joins keep every game even
# when no feature row matches.
team_side_keys = ["GAME_ID", "TEAM_ABBREVIATION"]
opp_side_keys = ["GAME_ID", "OPP_TEAM_ABBREVIATION"]
feature_keys = ["Game_ID", "TEAM_ABBREVIATION"]

with_team_features = games_df.merge(
    comp_team_dfs, how="left", left_on=team_side_keys, right_on=feature_keys
)
# Columns that already exist keep their name; the opponent's copies get "_OPP".
with_both_features = with_team_features.merge(
    comp_team_dfs,
    how="left",
    left_on=opp_side_keys,
    right_on=feature_keys,
    suffixes=("", "_OPP"),
)
# The right-hand join keys only duplicate GAME_ID / OPP_TEAM_ABBREVIATION.
redundant_keys = ["Game_ID", "TEAM_ABBREVIATION_OPP", "Game_ID_OPP"]
full_df = with_both_features.drop(columns=redundant_keys)

In [119]:
# Preview the first five rows of the merged team/opponent feature frame.
full_df.head(5)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,OPP_TEAM_ID,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL,GAME_DATE,P_W_PCT,PAST_WL,...,P_W-L_OPP,PAST_REB_OPP,PAST_AST_OPP,PAST_STL_OPP,PAST_BLK_OPP,PAST_TOV_OPP,PAST_PF_OPP,PAST_FG3_PCT_OPP,PAST_TS_OPP,PAST_EFG_OPP
0,21900002,1610612746,LAC,1610612747,LAL,10,1,"OCT 22, 2019",,,...,,,,,,,,,,
1,21900001,1610612761,TOR,1610612740,NOP,8,1,"OCT 22, 2019",,,...,,,,,,,,,,
2,21900008,1610612755,PHI,1610612738,BOS,14,1,"OCT 23, 2019",,,...,,,,,,,,,,
3,21900004,1610612754,IND,1610612765,DET,-9,0,"OCT 23, 2019",,,...,,,,,,,,,,
4,21900012,1610612756,PHX,1610612758,SAC,29,1,"OCT 23, 2019",,,...,,,,,,,,,,


## Advanced Stats

### Combining

In [120]:
# Failed attempt at grabbing all offensive and defensive ratings + other advanced stats for each game (takes too long)
# boxscore_dfs = []
# tot = len(full_df)
# for i, game_row in full_df.iterrows():
#     print("Getting boxscore for", game_row["GAME_ID"], f"{i+1}/{tot}")
#     print(game_row["GAME_ID"])
#     boxscore_df = boxscoreadvancedv3.BoxScoreAdvancedV3(
#         game_id=game_row["GAME_ID"]
#     ).get_data_frames()[1]
#     boxscore_dfs.append(boxscore_df)
#     time.sleep(0.1)

In [121]:
# asdflasdkjf = boxscoreadvancedv3.BoxScoreAdvancedV3(
#     game_id="0021900002"
# ).get_data_frames()
# print(asdflasdkjf[1].columns)
# asdflasdkjf[1].head(10)

### ELO

In [122]:
# Every team starts from the same baseline rating. Ratings are stored as floats
# because they become fractional after the first update.
team_elo = {team: 1500.0 for team in teams_df["abbreviation"]}

# create 2 new columns for elo ratings
# (float from the start, so writing fractional ratings into them later never
# has to change the column dtype)
full_df["TEAM_ELO"] = 1500.0
full_df["OPP_TEAM_ELO"] = 1500.0


def prob_win(a, b, ratings=None):
    """Return the ELO-expected probability that team `a` beats team `b`.

    Parameters
    ----------
    a, b : hashable
        Keys into the rating table (team abbreviations in this notebook).
    ratings : mapping, optional
        Rating table to read from. Defaults to the module-level `team_elo`,
        which preserves the original two-argument behaviour.

    Returns
    -------
    float
        1 / (1 + 10 ** ((rating_b - rating_a) / 400)); 0.5 for equal ratings.

    Raises
    ------
    KeyError
        If `a` or `b` is missing from the rating table.
    """
    if ratings is None:
        ratings = team_elo
    return 1 / (1 + 10 ** ((ratings[b] - ratings[a]) / 400))


# def update_amount(winner, loser):
#     return elo_k * (1 - prob_win(winner, loser))

In [123]:
# Walk the games in row order, record each side's rating going INTO the game,
# then update both ratings from the result.
# NOTE(review): this assumes full_df rows are in chronological order - confirm.
# team_elo is mutated here, so re-run the cell that initialises it before
# re-running this one; otherwise ratings carry over from the previous pass.
pre_game_team_elo = []
pre_game_opp_elo = []
game_columns = zip(
    full_df["TEAM_ABBREVIATION"], full_df["OPP_TEAM_ABBREVIATION"], full_df["WL"]
)
for team_abbr, opp_abbr, team_won in game_columns:
    pre_game_team_elo.append(team_elo[team_abbr])
    pre_game_opp_elo.append(team_elo[opp_abbr])

    if team_won == 1:
        winner, loser = team_abbr, opp_abbr
    elif team_won == 0:
        winner, loser = opp_abbr, team_abbr
    else:
        # No recorded result for this row: leave both ratings untouched.
        continue

    # The winner gains K * (1 - expected win probability); the loser gives up
    # the same amount, so the rating total is conserved.
    d_update = elo_k * (1 - prob_win(winner, loser))
    team_elo[winner] += d_update
    team_elo[loser] -= d_update

# Assign each column once instead of writing one cell per iteration.
full_df["TEAM_ELO"] = pre_game_team_elo
full_df["OPP_TEAM_ELO"] = pre_game_opp_elo

In [124]:
# Pre-game rating gap (team minus opponent) and the win probability that gap
# implies under the ELO logistic curve.
elo_gap = full_df["TEAM_ELO"] - full_df["OPP_TEAM_ELO"]
full_df["ELO_DIFF"] = elo_gap
full_df["ELO_WIN_PROB"] = 1 / (1 + 10 ** (-elo_gap / 400))

## Final Processing

In [125]:
# Show every column currently present in full_df.
full_df.columns

Index(['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'OPP_TEAM_ID',
       'OPP_TEAM_ABBREVIATION', 'PLUS_MINUS', 'WL', 'GAME_DATE', 'P_W_PCT',
       'PAST_WL', 'PAST_PTS', 'P_W-L', 'PAST_REB', 'PAST_AST', 'PAST_STL',
       'PAST_BLK', 'PAST_TOV', 'PAST_PF', 'PAST_FG3_PCT', 'PAST_TS',
       'PAST_EFG', 'GAME_DATE_OPP', 'P_W_PCT_OPP', 'PAST_WL_OPP',
       'PAST_PTS_OPP', 'P_W-L_OPP', 'PAST_REB_OPP', 'PAST_AST_OPP',
       'PAST_STL_OPP', 'PAST_BLK_OPP', 'PAST_TOV_OPP', 'PAST_PF_OPP',
       'PAST_FG3_PCT_OPP', 'PAST_TS_OPP', 'PAST_EFG_OPP', 'TEAM_ELO',
       'OPP_TEAM_ELO', 'ELO_DIFF', 'ELO_WIN_PROB'],
      dtype='object')

In [126]:
# Report the frame's shape and the total number of missing cells (cells, not rows).
print(full_df.shape, full_df.isna().sum().sum())

# Identifiers and targets, then the per-team features, the same features for
# the opponent (suffix "_OPP"), and finally the ELO columns.
id_cols = [
    "GAME_ID",
    "TEAM_ABBREVIATION",
    "OPP_TEAM_ABBREVIATION",
    "PLUS_MINUS",
    "WL",
    "GAME_DATE",
]
feature_cols = [
    "P_W_PCT",
    "PAST_WL",
    "PAST_PTS",
    "P_W-L",
    "PAST_REB",
    "PAST_AST",
    "PAST_STL",
    "PAST_BLK",
    "PAST_TOV",
    "PAST_PF",
    "PAST_FG3_PCT",
    "PAST_TS",
    "PAST_EFG",
]
elo_cols = ["TEAM_ELO", "OPP_TEAM_ELO", "ELO_WIN_PROB"]
model_cols = id_cols + feature_cols + [f"{col}_OPP" for col in feature_cols] + elo_cols

# Drop games where either side has no history yet (the shifted features are
# NaN there). Chaining .dropna() returns a fresh frame instead of mutating a
# column subset of full_df in place.
df = full_df[model_cols].dropna()
df.tail(10)

(1059, 39) 390


Unnamed: 0,GAME_ID,TEAM_ABBREVIATION,OPP_TEAM_ABBREVIATION,PLUS_MINUS,WL,GAME_DATE,P_W_PCT,PAST_WL,PAST_PTS,P_W-L,...,PAST_STL_OPP,PAST_BLK_OPP,PAST_TOV_OPP,PAST_PF_OPP,PAST_FG3_PCT_OPP,PAST_TS_OPP,PAST_EFG_OPP,TEAM_ELO,OPP_TEAM_ELO,ELO_WIN_PROB
1049,21901313,PHX,DAL,26,1,"AUG 13, 2020",0.458,1.0,121.6,-6.0,...,4.6,3.2,11.0,22.0,0.37,0.594754,0.546243,1521.646858,1531.377075,0.486001
1050,21901312,ORL,NOP,6,1,"AUG 13, 2020",0.444,0.0,104.8,-8.0,...,6.2,4.6,17.4,21.8,0.3652,0.588516,0.555432,1467.379102,1478.129988,0.484533
1051,21901308,BOS,WAS,-6,0,"AUG 13, 2020",0.676,0.8,124.2,25.0,...,6.2,4.4,13.4,22.6,0.3326,0.51544,0.474561,1612.307092,1381.147761,0.79095
1052,21901309,BKN,POR,-1,0,"AUG 13, 2020",0.493,0.8,118.0,-1.0,...,7.2,4.4,10.0,22.4,0.4162,0.5866,0.545659,1523.302464,1515.682811,0.510964
1053,21901314,UTA,SAS,6,1,"AUG 13, 2020",0.606,0.2,117.8,15.0,...,9.8,5.4,14.6,21.6,0.4074,0.600716,0.548967,1543.064548,1510.517673,0.546702
1054,21901311,MEM,MIL,13,1,"AUG 13, 2020",0.458,0.2,108.2,-6.0,...,4.6,4.6,15.6,22.6,0.3758,0.604802,0.56631,1476.499332,1637.300183,0.283809
1055,21901316,IND,MIA,17,1,"AUG 14, 2020",0.611,0.6,107.0,16.0,...,9.0,5.2,13.6,22.2,0.354,0.589253,0.54516,1579.852603,1538.417386,0.559349
1056,21901318,TOR,DEN,8,1,"AUG 14, 2020",0.732,0.8,111.2,33.0,...,6.2,3.8,13.4,21.2,0.4136,0.623245,0.59704,1667.911603,1565.735355,0.642946
1057,21901317,LAC,OKC,4,1,"AUG 14, 2020",0.676,0.6,121.4,25.0,...,9.0,2.6,14.0,20.6,0.3372,0.545441,0.49756,1611.448856,1604.386847,0.510162
1058,21901315,HOU,PHI,-38,0,"AUG 14, 2020",0.62,0.4,110.6,17.0,...,7.0,5.0,12.6,22.6,0.4004,0.569984,0.535909,1565.9112,1531.050046,0.550002


In [127]:
# Write the cleaned dataset to a season-specific CSV in the working directory,
# leaving out the DataFrame index.
output_path = f"nba_{season}_data.csv"
df.to_csv(output_path, index=False)