In [1]:
import pandas as pd
from nba_api.stats.static import teams
from nba_api.stats.endpoints import playergamelogs
import numpy as np
from typing import List

In [45]:
df = pd.DataFrame()
start_year = 17

while start_year < 25:
  year_string = f"20{start_year}-{start_year+1}"
  p_df = playergamelogs.PlayerGameLogs(season_nullable=year_string).get_data_frames()[0]
  df = pd.concat([df, p_df])
  start_year += 1

print(df.columns.to_list())

['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS', 'NBA_FANTASY_PTS', 'DD2', 'TD3', 'WNBA_FANTASY_PTS', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK', 'WNBA_FANTASY_PTS_RANK', 'AVAILABLE_FLAG', 'MIN_SEC']


In [18]:
def create_avg_over_season_columns(source: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    """
    @parameter source: A dataframe containing atleast 1 season of data from the playergamelogs endpoint in the api
    @parameter names: a list of columns from the dataframe you wish to average
    
    Will create a new dataframe with player_id, game_id (unique identifiers) and the averages of the stats you request
    """
    # Sort the DataFrame (optional depending on your needs)
    source = source.sort_values(by=['PLAYER_ID', 'SEASON_YEAR'])

    new_column_names = []
    for name in names:
        season_avg = source.groupby(['PLAYER_ID', 'SEASON_YEAR'])[name].mean()
        column_name = f"{name}_SEASON_AVG"
        new_column_names.append(column_name)
    
        # Map the calculated averages back to the original DataFrame
        source[column_name] = source.set_index(['PLAYER_ID', 'SEASON_YEAR']).index.map(season_avg)

    new_column_names.append('PLAYER_ID')
    new_column_names.append('GAME_ID')
    # Return the desired DataFrame with PLAYER_ID, GAME_ID, and the new column
    return source[new_column_names]

In [19]:
cols_to_avg = ['PTS', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'STL', 'BLK']

new_df = create_avg_over_season_columns(source=df, names=cols_to_avg)
new_df.head()

Unnamed: 0,PTS_SEASON_AVG,FG_PCT_SEASON_AVG,FG3_PCT_SEASON_AVG,FT_PCT_SEASON_AVG,REB_SEASON_AVG,AST_SEASON_AVG,STL_SEASON_AVG,BLK_SEASON_AVG,PLAYER_ID,GAME_ID
189,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701230
534,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701210
1011,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701188
1496,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701167
1831,5.396552,0.356897,0.283638,0.220121,2.551724,1.189655,0.724138,0.448276,1713,21701155


In [22]:
def calculate_weighted_average_column(source: pd.DataFrame, num_of_games: int, names: List[str]) -> pd.DataFrame:
    """
    @parameter source: A dataframe from the playergamelogs endpoint of the nba_api
    @num_of_games The number of games we calculate the moving average form
    @parameter names: The names of the columns we are calculating WMA for
    """
    source = source.sort_values(by=['PLAYER_ID', 'SEASON_YEAR', 'GAME_DATE'])

    weights = np.arange(num_of_games, 0, -1)
    def weighted_average(series):
        if len(series) < len(weights):
            current_weights = weights[:len(series)]
        else:
            current_weights = weights
        return np.average(series, weights=current_weights)
    
    wma_names = []
    for name in names:
        wma_col_name = f"WMA_{name}_LAST_{num_of_games}"
        wma_names.append(wma_col_name)

        source[wma_col_name] = (
            source.groupby(['PLAYER_ID', 'SEASON_YEAR'])[name]
            .rolling(window=num_of_games, min_periods=1)
            .apply(weighted_average, raw=True)
            .reset_index(level=[0, 1], drop=True)
        )
    
    wma_names.extend(['PLAYER_ID', 'GAME_ID'])

    return source[wma_names]

In [23]:
wma_names = ['PTS', 'FG_PCT', 'FG3_PCT', 'FT_PCT']
third_df = calculate_weighted_average_column(source=df, num_of_games=5, names=wma_names)
third_df.head()

Unnamed: 0,WMA_PTS_LAST_5,WMA_FG_PCT_LAST_5,WMA_FG3_PCT_LAST_5,WMA_FT_PCT_LAST_5,PLAYER_ID,GAME_ID
25992,6.0,0.667,0.667,0.0,1713,21700013
25688,6.0,0.518556,0.592778,0.0,1713,21700024
25502,4.5,0.388917,0.444583,0.0,1713,21700035
25204,3.857143,0.333357,0.381071,0.0,1713,21700048
24672,3.8,0.333333,0.389,0.0,1713,21700069


In [26]:
def calc_fatigue(source: pd.DataFrame) -> pd.DataFrame:
    """
    @parameter source: A dataframe from the playergamelogs nba_api endpoint

    Will give you three new columns 'HOME' a 1/0 boolean if the
    game is at home or not.

    'GAMES_LAST_7_DAYS' an int with how many games in the past 7 days
    'AWAY_GAMES_IN_A_ROW' used to determine amount of time on the row 
    """
    source['GAME_DATE'] = pd.to_datetime(source['GAME_DATE'])
    source['HOME'] = np.where(source['MATCHUP'].str.contains('@'), 0, 1)

    def calc_games_last_7(group):
        counts = []
        for i in range(len(group)):
            current_date = group.iloc[i]['GAME_DATE']
            # Filter rows strictly before the current date and within the past 7 days
            past_week_games = group[(group['GAME_DATE'] < current_date) & 
                                (group['GAME_DATE'] >= current_date - pd.Timedelta(days=7))]
            counts.append(len(past_week_games))
        return pd.Series(counts, index=group.index)

    source = source.sort_values(by=['PLAYER_ID', 'GAME_DATE'])
    source['GAMES_LAST_7_DAYS'] = (
        source
        .groupby('PLAYER_ID', group_keys=False)
        .apply(calc_games_last_7)
    )

    def calc_away_streak(group):
        # Initialize the result list to store away game streaks
        away_streak = []
        current_streak = 0  # To keep track of the ongoing streak

        for i in range(len(group)):
            if group.iloc[i]['HOME'] == 0:
                # Increment the streak if HOME = 0
                current_streak += 1
            else:
                # Reset the streak if HOME = 1
                current_streak = 0
            # Append the current streak to the result
            away_streak.append(current_streak)

        return pd.Series(away_streak, index=group.index)
    
    source['AWAY_GAMES_IN_A_ROW'] = source.groupby('PLAYER_ID', group_keys=False).apply(calc_away_streak)

    return source[['PLAYER_ID', 'GAME_ID', 'HOME', 'GAMES_LAST_7_DAYS','AWAY_GAMES_IN_A_ROW']]

In [25]:
fat_df = calc_fatigue(source=df)
fat_df.head(10)

  source['AWAY_GAMES_IN_A_ROW'] = source.groupby('PLAYER_ID', group_keys=False).apply(calc_away_streak)


Unnamed: 0,PLAYER_ID,GAME_ID,HOME,AWAY_GAMES_IN_A_ROW,GAME_DATE,MATCHUP
25938,1713,21700013,1,0,2017-10-18,SAC vs. HOU
25636,1713,21700024,0,1,2017-10-20,SAC @ DAL
25370,1713,21700035,0,2,2017-10-21,SAC @ DEN
25217,1713,21700048,0,3,2017-10-23,SAC @ PHX
24644,1713,21700069,1,0,2017-10-26,SAC vs. NOP
24181,1713,21700089,1,0,2017-10-29,SAC vs. WAS
23697,1713,21700109,0,1,2017-11-01,SAC @ BOS
23286,1713,21700132,0,2,2017-11-04,SAC @ DET
22793,1713,21700158,1,0,2017-11-07,SAC vs. OKC
20524,1713,21700266,1,0,2017-11-22,SAC vs. LAL


In [46]:
def calc_historic_performance(source: pd.DataFrame) -> pd.DataFrame:
    source.rename(columns={'TEAM_ABBREVIATION': 'P_TEAM_ABBR'}, inplace=True)
    source['A_TEAM_ABBR'] = source['MATCHUP'].str.split(' ').str[-1]
    teams_df = teams.get_teams()
    teams_df = pd.DataFrame(teams_df)
    source = source.merge(teams_df[['id', 'abbreviation']], left_on='A_TEAM_ABBR', right_on='abbreviation', how='left')
    source.rename(columns={'id': 'A_TEAM_ID'}, inplace=True)
    source.drop(columns=['abbreviation'], inplace=True)

    source['GAME_DATE'] = pd.to_datetime(source['GAME_DATE'])
    source = source.sort_values(by=['PLAYER_ID', 'A_TEAM_ID', 'GAME_DATE'])

    historic_vs_team = []
    for index, row in source.iterrows():

        past_games = source[
            (source['PLAYER_ID'] == row['PLAYER_ID']) &
             (source['A_TEAM_ID'] == row['GAME_DATE']) &
             (source['GAME_DATE'] < row['GAME_DATE'])
        ]

        if past_games.empty:
            avg = row['PTS']
        else:
            avg = past_games['PTS'].mean()
        historic_vs_team.append(avg)

    source['HISTORIC_VS_TEAM'] = historic_vs_team
    return source[['PLAYER_ID', 'GAME_ID', 'HISTORIC_VS_TEAM']]
    

In [47]:
hist_df = calc_historic_performance(source=df)
hist_df.head(10)

Unnamed: 0,PLAYER_ID,GAME_ID,HISTORIC_VS_TEAM
3266,1713,21701077,8
23697,1713,21700109,2
2860,1713,21701099,5
46285,1713,21800268,0
43201,1713,21800420,12
37589,1713,21800683,4
30212,1713,21801034,16
58904,1713,21900745,4
58273,1713,21900771,10
18392,1713,21700353,5


In [6]:
df = df.sort_values(by=['PLAYER_ID', 'SEASON_YEAR', 'GAME_DATE'])

weighted_moving_avg = []
weights = np.arange(10, 0, -1)

for index, row in df.iterrows():
    # Filter the dataframe to get the last 10 games of the same player in the same season
    past_games = df[
        (df['PLAYER_ID'] == row['PLAYER_ID']) &
        (df['SEASON_YEAR'] == row['SEASON_YEAR']) &
        (df['GAME_DATE'] < row['GAME_DATE'])
    ].head(10)  # Get the last 10 games
    
    if not past_games.empty:
        # Use weights for the weighted moving average
        game_count = len(past_games)
        current_weights = weights[:game_count]  # Use only the available weights
        weighted_avg = np.average(past_games['PTS'], weights=current_weights)
    else:
        # If no past games, set to NaN
        weighted_avg = np.nan
    
    # Append the calculated value to the list
    weighted_moving_avg.append(weighted_avg)

df['WMA_PPG_LAST_10'] = weighted_moving_avg

KeyboardInterrupt: 