In [1]:
import pandas as pd
from nba_api.stats.static import teams
from nba_api.stats.endpoints import (playergamelogs, 
                                     teamgamelogs, teamestimatedmetrics)
import numpy as np
from typing import List

In [2]:
df = pd.DataFrame()
start_year = 17

while start_year < 25:
  year_string = f"20{start_year}-{start_year+1}"
  p_df = playergamelogs.PlayerGameLogs(season_nullable=year_string).get_data_frames()[0]
  df = pd.concat([df, p_df])
  start_year += 1

In [6]:
def create_avg_over_season_columns(source: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    """
    @parameter source: A dataframe containing atleast 1 season of data from the playergamelogs endpoint in the api
    @parameter names: a list of columns from the dataframe you wish to average
    
    Will create a new dataframe with player_id, game_id (unique identifiers) and the averages of the stats you request
    """
    # Sort the DataFrame (optional depending on your needs)
    source = source.sort_values(by=['PLAYER_ID', 'SEASON_YEAR'])

    new_column_names = []
    for name in names:
        season_avg = source.groupby(['PLAYER_ID', 'SEASON_YEAR'])[name].mean()
        column_name = f"{name}_SEASON_AVG"
        new_column_names.append(column_name)
    
        # Map the calculated averages back to the original DataFrame
        source[column_name] = source.set_index(['PLAYER_ID', 'SEASON_YEAR']).index.map(season_avg)

    new_column_names.append('PLAYER_ID')
    new_column_names.append('GAME_ID')
    # Return the desired DataFrame with PLAYER_ID, GAME_ID, and the new column
    return source[new_column_names]

In [19]:
cols_to_avg = ['PTS', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'STL', 'BLK']

new_df = create_avg_over_season_columns(source=df, names=cols_to_avg)
new_df.head()

Unnamed: 0,PTS_SEASON_AVG,FG_PCT_SEASON_AVG,FG3_PCT_SEASON_AVG,FT_PCT_SEASON_AVG,REB_SEASON_AVG,AST_SEASON_AVG,STL_SEASON_AVG,BLK_SEASON_AVG,PLAYER_ID,GAME_ID
189,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701230
534,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701210
1011,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701188
1496,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701167
1831,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701155


In [7]:
def weighted_average(series):
        weights = np.arange(10, 0, -1)
        if len(series) < len(weights):
            current_weights = weights[:len(series)]
        else:
            current_weights = weights
        return np.average(series, weights=current_weights)

In [8]:
def calculate_weighted_average_column(source: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    """
    @parameter source: A dataframe from the playergamelogs endpoint of the nba_api
    @parameter names: The names of the columns we are calculating WMA for
    """
    source = source.sort_values(by=['PLAYER_ID', 'SEASON_YEAR', 'GAME_DATE'])

    wma_names = []
    for name in names:
        wma_col_name = f"WMA_{name}_LAST_10"
        wma_names.append(wma_col_name)

        source[wma_col_name] = (
            source.groupby(['PLAYER_ID', 'SEASON_YEAR'])[name]
            .rolling(window=10, min_periods=1)
            .apply(weighted_average, raw=True)
            .reset_index(level=[0, 1], drop=True)
        )
    
    wma_names.extend(['PLAYER_ID', 'GAME_ID'])

    return source[wma_names]

In [23]:
wma_names = ['PTS', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'MIN']
third_df = calculate_weighted_average_column(source=df, names=wma_names)
third_df.head()

Unnamed: 0,WMA_PTS_LAST_5,WMA_FG_PCT_LAST_5,WMA_FG3_PCT_LAST_5,WMA_FT_PCT_LAST_5,PLAYER_ID,GAME_ID
25992,6.0,0.667,0.667,0.0,1713,21700013
25688,6.0,0.518556,0.592778,0.0,1713,21700024
25502,4.5,0.388917,0.444583,0.0,1713,21700035
25204,3.857143,0.333357,0.381071,0.0,1713,21700048
24672,3.8,0.333333,0.389,0.0,1713,21700069


In [9]:
def calc_fatigue(source: pd.DataFrame) -> pd.DataFrame:
    """
    @parameter source: A dataframe from the playergamelogs nba_api endpoint

    Will give you three new columns 'HOME' a 1/0 boolean if the
    game is at home or not.

    'GAMES_LAST_7_DAYS' an int with how many games in the past 7 days
    'AWAY_GAMES_IN_A_ROW' used to determine amount of time on the row 
    """
    source['GAME_DATE'] = pd.to_datetime(source['GAME_DATE'])
    source['HOME'] = np.where(source['MATCHUP'].str.contains('@'), 0, 1)

    def calc_games_last_7(group):
        counts = []
        for i in range(len(group)):
            current_date = group.iloc[i]['GAME_DATE']
            # Filter rows strictly before the current date and within the past 7 days
            past_week_games = group[(group['GAME_DATE'] < current_date) & 
                                (group['GAME_DATE'] >= current_date - pd.Timedelta(days=7))]
            counts.append(len(past_week_games))
        return pd.Series(counts, index=group.index)

    source = source.sort_values(by=['PLAYER_ID', 'GAME_DATE'])
    source['GAMES_LAST_7_DAYS'] = (
        source
        .groupby('PLAYER_ID', group_keys=False)
        .apply(calc_games_last_7)
    )

    def calc_away_streak(group):
        # Initialize the result list to store away game streaks
        away_streak = []
        current_streak = 0  # To keep track of the ongoing streak

        for i in range(len(group)):
            if group.iloc[i]['HOME'] == 0:
                # Increment the streak if HOME = 0
                current_streak += 1
            else:
                # Reset the streak if HOME = 1
                current_streak = 0
            # Append the current streak to the result
            away_streak.append(current_streak)

        return pd.Series(away_streak, index=group.index)
    
    source['AWAY_GAMES_IN_A_ROW'] = source.groupby('PLAYER_ID', group_keys=False).apply(calc_away_streak)

    return source[['PLAYER_ID', 'GAME_ID', 'HOME', 'GAMES_LAST_7_DAYS','AWAY_GAMES_IN_A_ROW']]

In [25]:
fat_df = calc_fatigue(source=df)
fat_df.head(10)

  source['AWAY_GAMES_IN_A_ROW'] = source.groupby('PLAYER_ID', group_keys=False).apply(calc_away_streak)


Unnamed: 0,PLAYER_ID,GAME_ID,HOME,AWAY_GAMES_IN_A_ROW,GAME_DATE,MATCHUP
25938,1713,21700013,1,0,2017-10-18,SAC vs. HOU
25636,1713,21700024,0,1,2017-10-20,SAC @ DAL
25370,1713,21700035,0,2,2017-10-21,SAC @ DEN
25217,1713,21700048,0,3,2017-10-23,SAC @ PHX
24644,1713,21700069,1,0,2017-10-26,SAC vs. NOP
24181,1713,21700089,1,0,2017-10-29,SAC vs. WAS
23697,1713,21700109,0,1,2017-11-01,SAC @ BOS
23286,1713,21700132,0,2,2017-11-04,SAC @ DET
22793,1713,21700158,1,0,2017-11-07,SAC vs. OKC
20524,1713,21700266,1,0,2017-11-22,SAC vs. LAL


In [10]:
def calc_historic_performance(source: pd.DataFrame) -> pd.DataFrame:
    source.rename(columns={'TEAM_ABBREVIATION': 'P_TEAM_ABBR'}, inplace=True)
    source['A_TEAM_ABBR'] = source['MATCHUP'].str.split(' ').str[-1]
    teams_df = teams.get_teams()
    teams_df = pd.DataFrame(teams_df)
    source = source.merge(teams_df[['id', 'abbreviation']], left_on='A_TEAM_ABBR', right_on='abbreviation', how='left')
    source.rename(columns={'id': 'A_TEAM_ID'}, inplace=True)
    source.drop(columns=['abbreviation'], inplace=True)

    source['GAME_DATE'] = pd.to_datetime(source['GAME_DATE'])
    source = source.sort_values(by=['PLAYER_ID', 'A_TEAM_ID', 'GAME_DATE'])

    historic_vs_team = []
    for index, row in source.iterrows():

        past_games = source[
            (source['PLAYER_ID'] == row['PLAYER_ID']) &
             (source['A_TEAM_ID'] == row['GAME_DATE']) &
             (source['GAME_DATE'] < row['GAME_DATE'])
        ]

        if past_games.empty:
            avg = row['PTS']
        else:
            avg = past_games['PTS'].mean()
        historic_vs_team.append(avg)

    source['HISTORIC_VS_TEAM'] = historic_vs_team
    return source[['PLAYER_ID', 'GAME_ID', 'HISTORIC_VS_TEAM']]
    

In [47]:
hist_df = calc_historic_performance(source=df)
hist_df.head(10)

Unnamed: 0,PLAYER_ID,GAME_ID,HISTORIC_VS_TEAM
3266,1713,21701077,8
23697,1713,21700109,2
2860,1713,21701099,5
46285,1713,21800268,0
43201,1713,21800420,12
37589,1713,21800683,4
30212,1713,21801034,16
58904,1713,21900745,4
58273,1713,21900771,10
18392,1713,21700353,5


In [55]:
team_df = pd.DataFrame()
start_year = 17

while start_year < 25:
    year_string = f"20{start_year}-{start_year+1}"
    t_df = teamgamelogs.TeamGameLogs(season_nullable=year_string).get_data_frames()[0]
    team_df = pd.concat([team_df, t_df])
    start_year += 1

In [11]:
def calc_team_stats(t_source: pd.DataFrame) -> pd.DataFrame:
    t_source = t_source.sort_values(by=['TEAM_ID', 'SEASON_YEAR', 'GAME_DATE']).reset_index(drop=True)
    t_source['WIN_INDICATOR'] = (t_source['WL'] == 'W').astype(int)
    
    t_source['WINS_PER_LAST_10'] = (
        t_source.groupby(['TEAM_ID', 'SEASON_YEAR'])['WIN_INDICATOR']
        .rolling(window=10, min_periods=1)  # Rolling window of size 10
        .apply(lambda x: x.mean(), raw=False)
        .shift(1)  # Exclude the current game from the count
        .reset_index(drop=True)  # Align indices with the original DataFrame
    )

    t_source['OPP_TEAM_ABBR'] = t_source['MATCHUP'].apply(lambda x: x.split(' ')[-1])

    # Step 2: Create an opponent DataFrame with GAME_ID and PTS (linking via OPP_TEAM_ABBR)
    opp_df = t_source[['GAME_ID', 'TEAM_ABBREVIATION', 'PTS']].rename(columns={
        'TEAM_ABBREVIATION': 'OPP_TEAM_ABBR',  # Use 'TEAM_ABBR' to align with the extracted 'OPP_TEAM_ABBR'
        'PTS': 'PTS_ALLOWED'          # Rename PTS to PTS_ALLOWED
    })

    t_source = t_source.merge(opp_df, how='left', on=['GAME_ID', 'OPP_TEAM_ABBR'])
    
    t_source['WMA_PTS_ALLOWED_10'] = (
        t_source.groupby(['TEAM_ID', 'SEASON_YEAR'])['PTS_ALLOWED']
        .rolling(window=10, min_periods=1)
        .apply(weighted_average, raw=True)
        .reset_index(level=[0, 1], drop=True)
    )

    t_source['WMA_AST_10'] = (
        t_source.groupby(['TEAM_ID', 'SEASON_YEAR'])['AST']
        .rolling(window=10, min_periods=1)
        .apply(weighted_average, raw=True)
        .reset_index(level=[0, 1], drop=True)
    )

    season_avg = t_source.groupby(['TEAM_ID', 'SEASON_YEAR'])['PTS_ALLOWED'].mean()
    t_source['PTS_ALLOWED_OVR_SEASON'] = t_source.set_index(['TEAM_ID', 'SEASON_YEAR']).index.map(season_avg)

    return t_source[['TEAM_ID', 'GAME_ID', 'WINS_PER_LAST_10', 'WMA_PTS_ALLOWED_10', 'WMA_AST_10', 'PTS_ALLOWED_OVR_SEASON']]

    

In [12]:
def get_team_stats_for_players(p_source: pd.DataFrame, t_source) -> pd.DataFrame:
    t_source = calc_team_stats(t_source)

    p_source = p_source.merge(t_source[['TEAM_ID', 'GAME_ID', 'WMA_AST_10']], how='left', on=['GAME_ID', 'TEAM_ID'])

    p_source['OPP_TEAM_ABBR'] = p_source['MATCHUP'].apply(lambda x: x.split(' ')[-1])
    
    team_df = pd.DataFrame(teams.teams)
    p_source = p_source.merge(team_df[[0, 1]], how='left', left_on='OPP_TEAM_ABBR', right_on=1)

    p_source.rename(columns={0: 'OPP_TEAM_ID'}, inplace=True)
    p_source.drop(columns=['OPP_TEAM_ABBR', 1], inplace=True)

    p_source = p_source.merge(t_source[['GAME_ID', 'TEAM_ID', 
                                        'WINS_PER_LAST_10', 'WMA_PTS_ALLOWED_10', 
                                        'PTS_ALLOWED_OVR_SEASON']], how='left', left_on=['GAME_ID', 'OPP_TEAM_ID'], right_on=['GAME_ID', 'TEAM_ID'])

    p_source.rename(columns={
        'WINS_PER_LAST_10': 'OPP_WINS_LAST_10',
        'WMA_PTS_ALLOWED_10': 'OPP_WMA_PTS_ALLOWED',
        'PTS_ALLOWED_OVR_SEASON': 'OPP_PTS_ALLOWED'
    }, inplace=True)

    # Include player teams pace and opp teams pace
    est_df = pd.DataFrame()
    start_year = 17

    while start_year < 25:
        year_string = f"20{start_year}-{start_year+1}"
        e_df = teamestimatedmetrics.TeamEstimatedMetrics(season=year_string).get_data_frames()[0]
        e_df['SEASON_YEAR'] = year_string
        est_df = pd.concat([est_df, e_df])
        start_year += 1    

    p_source.drop(columns=['TEAM_ID_y'], inplace=True)
    p_source.rename(columns={
        'TEAM_ID_x': 'TEAM_ID'
    }, inplace=True)

    p_source = p_source.merge(est_df[['TEAM_ID', 'E_PACE', 'E_OFF_RATING', 'SEASON_YEAR']], how='left', on=['TEAM_ID', 'SEASON_YEAR'])
    p_source.rename(columns={'E_PACE': 'PLAYER_TEAM_PACE',
                           'E_OFF_RATING': 'PLAYER_TEAM_OFF_RATING'}, inplace=True)
    
    p_source = p_source.merge(est_df[['TEAM_ID', 'E_PACE', 'E_DEF_RATING', 'SEASON_YEAR']], how='left', 
                              right_on=['TEAM_ID', 'SEASON_YEAR'], left_on=['OPP_TEAM_ID', 'SEASON_YEAR'])
    p_source.rename(columns={
        'E_PACE': 'OPP_TEAM_PACE',
        'E_DEF_RATING': 'OPP_DEF_RATING'
    }, inplace=True)
    
    return p_source[['GAME_ID', 'PLAYER_ID', 'WMA_AST_10', 'OPP_WINS_LAST_10', 
                     'OPP_WMA_PTS_ALLOWED', 'OPP_PTS_ALLOWED', 'PLAYER_TEAM_PACE', 
                     'PLAYER_TEAM_OFF_RATING', 'OPP_TEAM_PACE', 'OPP_DEF_RATING']]


In [92]:
w_df = get_team_stats_for_players(df, team_df)
w_df.head()

Unnamed: 0,GAME_ID,PLAYER_ID,WMA_AST_10,OPP_WINS_LAST_10,OPP_WMA_PTS_ALLOWED,OPP_PTS_ALLOWED,PLAYER_TEAM_PACE,PLAYER_TEAM_OFF_RATING,OPP_TEAM_PACE,OPP_DEF_RATING
0,21701230,1626161,23.290909,0.8,96.8,103.878049,97.1,101.1,99.7,103.8
1,21701224,202397,25.527273,0.3,107.763636,109.963415,98.4,104.9,100.3,109.1
2,21701221,201949,23.345455,0.6,105.309091,103.878049,97.8,104.5,99.8,103.4
3,21701225,200794,26.581818,0.6,106.636364,107.280488,99.1,109.6,98.3,108.4
4,21701227,201596,22.581818,0.5,105.654545,104.439024,97.3,101.8,99.2,104.7


In [28]:
df = pd.DataFrame()
start_year = 17

while start_year < 25:
  year_string = f"20{start_year}-{start_year+1}"
  p_df = playergamelogs.PlayerGameLogs(season_nullable=year_string).get_data_frames()[0]
  df = pd.concat([df, p_df])
  start_year += 1

pd.set_option('display.max_columns', None)
final_df = df[['PLAYER_ID', 'GAME_ID', 'PTS']]
final_df.head()

Unnamed: 0,PLAYER_ID,GAME_ID,PTS
0,203463,21701227,14
1,204179,21701227,13
2,1627853,21701224,3
3,1626155,21701228,10
4,201163,21701225,0


In [3]:
team_df = pd.DataFrame()
start_year = 17

while start_year < 25:
    year_string = f"20{start_year}-{start_year+1}"
    t_df = teamgamelogs.TeamGameLogs(season_nullable=year_string).get_data_frames()[0]
    team_df = pd.concat([team_df, t_df])
    start_year += 1

In [14]:
final_df = df[['PLAYER_ID', 'GAME_ID', 'PTS']]
final_df.head()

Unnamed: 0,PLAYER_ID,GAME_ID,PTS
0,203082,21701222,8
1,201158,21701223,13
2,1627750,21701225,20
3,1628418,21701228,6
4,1628444,21701219,7


In [15]:
cols_to_avg = ['PTS', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'STL', 'BLK']
wma_names = ['PTS', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'MIN']
wma_df = calculate_weighted_average_column(source=df, names=wma_names)
avg_df = create_avg_over_season_columns(source=df, names=cols_to_avg)
fat_df = calc_fatigue(source=df)
hist_df = calc_historic_performance(df)
team_df = get_team_stats_for_players(df, team_df)

final_df = final_df.merge(avg_df, how='left', on=['PLAYER_ID', 'GAME_ID'])
final_df = final_df.merge(wma_df, how='left', on=['PLAYER_ID', 'GAME_ID'])
final_df = final_df.merge(fat_df, how='left', on=['PLAYER_ID', 'GAME_ID'])
final_df.head()

  .apply(calc_games_last_7)
  source['AWAY_GAMES_IN_A_ROW'] = source.groupby('PLAYER_ID', group_keys=False).apply(calc_away_streak)


ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)