### Import libraries

In [1]:
from nba_api.stats.endpoints import playercareerstats, playergamelog, leaguedashteamstats, commonplayerinfo, leaguedashplayerstats
from nba_api.stats.static import players

import pandas as pd
import numpy as np
from pandas import json_normalize
import requests, time

### Identify top 200 scorers in the league for this season and return a list of player names and IDs.

In [2]:
# Get a list of all "relevant players" this will just be the players in the top 200 PPG for this year.

# Fetch player stats for the current season
player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season='2024-25')  # Update the season as needed
data = player_stats.get_data_frames()[0]

# Sort by Points Per Game (PTS) in descending order and filter the top 200 players
top_200_players = data.sort_values(by='PTS', ascending=False)[:200]

top_200_players.columns = top_200_players.columns.str.lower()

top_200_players = top_200_players[['player_id', 'player_name', 'team_id', 'team_abbreviation']]
top_200_players.to_csv('exports/top_200_players.csv')

top_200_players.head()

Unnamed: 0,player_id,player_name,team_id,team_abbreviation
433,1628983,Shai Gilgeous-Alexander,1610612760,OKC
164,203507,Giannis Antetokounmpo,1610612749,MIL
116,1628368,De'Aaron Fox,1610612758,SAC
27,203076,Anthony Davis,1610612747,LAL
236,1628369,Jayson Tatum,1610612738,BOS


### Get game logs for each player

In [None]:
def get_current_season_game_logs(season='2024-25'):
    
    gamelogs = []

    for _, player in top_200_players.iterrows():
        player_id = player['player_id']
        try:
            logs = playergamelog.PlayerGameLog(player_id=player_id, season=season, timeout=30)
            gamelog_df = logs.get_data_frames()[0]
            gamelogs.append(gamelog_df)
            time.sleep(1)
        except Exception as e:
            print(f"Error fetching game logs for {player['player_name']} (ID {player_id}): {e}")

    # Combine all game logs into a single DataFrame
    if gamelogs:
        all_gamelogs_df = pd.concat(gamelogs, ignore_index=True)
        return all_gamelogs_df
    else:
        return pd.DataFrame()

# Fetch game logs for top 200 players and save to CSV
top_player_gamelogs_season24_25 = get_current_season_game_logs()
top_player_gamelogs_season24_25.to_csv('exports/gamelogs_top200_ppg_season24_25.csv', index=False)

##### New get game logs function, allows for multiple seasons.

In [None]:
def get_multi_season_game_logs(seasons=None):
    
    # Default to the current season if no seasons are provided
    if seasons is None:
        seasons = ['2024-25']

    gamelogs = []

    for season in seasons:
        print(f"Fetching data for season: {season}")
        for _, player in top_200_players.iterrows():
            player_id = player['player_id']
            try:
                logs = playergamelog.PlayerGameLog(player_id=player_id, season=season, timeout=30)
                gamelog_df = logs.get_data_frames()[0]
                gamelog_df['season'] = season  # Add a season column to track logs by season
                gamelogs.append(gamelog_df)
                time.sleep(1)  # To avoid hitting API rate limits
            except Exception as e:
                print(f"Error fetching game logs for {player['player_name']} (ID {player_id}, Season {season}): {e}")

    # Combine all game logs into a single DataFrame
    if gamelogs:
        all_gamelogs_df = pd.concat(gamelogs, ignore_index=True)
        return all_gamelogs_df
    else:
        return pd.DataFrame()


seasons_list = ['2022-23', '2023-24']
top_player_gamelogs_historic = get_multi_season_game_logs(seasons=seasons_list)

Fetching data for season: 2023-24
Fetching data for season: 2022-23


  all_gamelogs_df = pd.concat(gamelogs, ignore_index=True)


#### Split df's by year and save to seperate csv's.

In [None]:
# Split the df by date and then check for null values, length, etc. We will need to handle rookies from 24-25 and 23-24 seasons that end up in the datasets.

top_player_gamelogs_historic.columns = top_player_gamelogs_historic.columns.str.lower()

gamelogs_top200_ppg_season22_23 = top_player_gamelogs_historic[top_player_gamelogs_historic['season'] == '2022-23']
gamelogs_top200_ppg_season23_24 = top_player_gamelogs_historic[top_player_gamelogs_historic['season'] == '2023-24']

# gamelogs_top200_ppg_season22_23.to_csv('exports/gamelogs_top200_ppg_season22_23.csv', index=False)
# gamelogs_top200_ppg_season23_24.to_csv('exports/gamelogs_top200_ppg_season23_24.csv', index=False)

display(gamelogs_top200_ppg_season22_23.head(), gamelogs_top200_ppg_season23_24.head())

NameError: name 'top_player_gamelogs_historic' is not defined

### Function 'process_gamelogs' for feature creation and further processing.

In [None]:
def process_gamelogs(df_list=None):
    
    processed_dfs = []

    for df in df_list:

        top_player_gamelogs_raw = df

        # Merge in player name
        top_200_players = pd.read_csv('exports/top_200_players.csv', index_col=False)

        # Merge the game logs with the player list
        top_player_gamelogs = top_200_players.merge(
            top_player_gamelogs_raw, 
            how='right', 
            left_on='player_id', 
            right_on='player_id'
        )



        # Add scheduling features: home v away, opponent, win v loss, back to back games, time of season.

        # Convert to game_date col to datetime
        top_player_gamelogs['date_dt'] = pd.to_datetime(top_player_gamelogs['game_date'])
        top_player_gamelogs.sort_values(by=['player_id', 'date_dt'], inplace=True)

        # Identify home and away game, use one hot encoding to make binary.
        top_player_gamelogs['home_vs_away'] = top_player_gamelogs['matchup'].apply(lambda x: 'home' if 'vs.' in x else 'away')
        top_player_gamelogs['home_vs_away_hot'] = top_player_gamelogs['matchup'].apply(lambda x: 1 if 'vs.' in x else 0)

        top_player_gamelogs['wl_hot'] = top_player_gamelogs['wl'].apply(lambda x: 1 if 'W' in x else 0)
        top_player_gamelogs['opponent'] = top_player_gamelogs['matchup'].apply(lambda x: x.split('vs. ')[-1] if 'vs. ' in x else x.split('@ ')[-1])

        # Create a column to identify if the player played in back-to-back games. Initialize the 'b2b' column with zero.
        top_player_gamelogs['b2b'] = 0

        # Iterate over groups of player_id
        for player_id, group in top_player_gamelogs.groupby('player_id'):
            group = group.reset_index()
            for i in range(1, len(group)):
                # Check if the current date is consecutive with the previous one
                if (group.loc[i, 'date_dt'] - group.loc[i - 1, 'date_dt']).days == 1:
                    top_player_gamelogs.loc[group.loc[i, 'index'], 'b2b'] = 1


        # Calculate the time of season, splitting into 3 buckets: start, middle, end.
        season_start = top_player_gamelogs['date_dt'].min()
        season_end = top_player_gamelogs['date_dt'].max()

        start_date = season_start
        middle_date = season_start + (season_end - season_start) / 3
        end_date = season_start + 2 * (season_end - season_start) / 3


        # Create tos (time of season) column.
        top_player_gamelogs['tos'] = pd.cut(
            top_player_gamelogs['date_dt'],
            bins=[season_start - pd.Timedelta(days=1), middle_date, end_date, season_end],
            labels=['start', 'middle', 'end']
            )

        # Only take the columns we need
        top_player_gamelogs = top_player_gamelogs[['player_id', 'player_name', 'team_id', 'season', 'season_id', 'tos', 'team_abbreviation', 'game_id', 'date_dt', 'home_vs_away', 'home_vs_away_hot', 'opponent', 'b2b', 'wl', 'wl_hot', 'min', 'plus_minus', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct','pts']]

        # Take a random slice to check for any errors.
        # top_player_gamelogs[1850:1860]








        # Calculate 10, 5 and 3 game rolling averages.

        def player_per_game_rolling_avg(df, num_games_list=None, cols=None):
            if num_games_list is None:
                num_games_list = [10, 5, 3]
            if cols is None:
                cols = ['min', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct',
                'ftm', 'fta', 'ft_pct', 'pts']

            for num_games in num_games_list:
                for col in cols:
                    col_name = f'{col}_{num_games}'
                    df[col_name] = (
                        df.groupby('player_id')[col]
                        .rolling(window=num_games, min_periods=num_games)
                        .mean().round(2)
                        .reset_index(level=0, drop=True)
                    )
            
            # Add rolling average for all games available
            for col in cols:
                col_name = f'{col}_season'
                df[col_name] = (
                    df.groupby('player_id')[col]
                    .expanding(min_periods=1)
                    .mean().round(2)
                    .reset_index(level=0, drop=True)
                )
            
            # Add columns for stats from the players most recent game
            for col in cols:
                col_name = f'{col}_last'
                df[col_name] = df.groupby('player_id')[col].shift(1)
            
            return df

        # Apply rolling averages
        top_player_gamelogs = player_per_game_rolling_avg(top_player_gamelogs)
        
        # Append the processed DataFrame to the list
        processed_dfs.append(top_player_gamelogs)
    
    # Concatenate all DataFrames from the list and return
    return pd.concat(processed_dfs, ignore_index=True)

#### Run the function and review the results

In [43]:
df_list = [gamelogs_top200_ppg_season22_23, gamelogs_top200_ppg_season23_24]
df_list_str = ['gamelogs_top200_ppg_season22_23', 'gamelogs_top200_ppg_season23_24']

processed_dfs = process_gamelogs(df_list=df_list)

display(processed_dfs.head(), processed_dfs.tail())

  top_player_gamelogs['date_dt'] = pd.to_datetime(top_player_gamelogs['game_date'])
  top_player_gamelogs['date_dt'] = pd.to_datetime(top_player_gamelogs['game_date'])


Unnamed: 0,player_id,player_name,team_id,season,season_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,...,fgm_last,fga_last,fg_pct_last,fg3m_last,fg3a_last,fg3_pct_last,ftm_last,fta_last,ft_pct_last,pts_last
0,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200002,2022-10-18,away,0,...,,,,,,,,,,
1,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200016,2022-10-20,home,1,...,12.0,26.0,0.462,3.0,10.0,0.3,4.0,4.0,1.0,31.0
2,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200037,2022-10-23,home,1,...,7.0,17.0,0.412,2.0,8.0,0.25,4.0,6.0,0.667,20.0
3,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200064,2022-10-26,away,0,...,12.0,22.0,0.545,2.0,9.0,0.222,5.0,7.0,0.714,31.0
4,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200076,2022-10-28,away,0,...,8.0,21.0,0.381,2.0,8.0,0.25,1.0,2.0,0.5,19.0


Unnamed: 0,player_id,player_name,team_id,season,season_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,...,fgm_last,fga_last,fg_pct_last,fg3m_last,fg3a_last,fg3_pct_last,ftm_last,fta_last,ft_pct_last,pts_last
23659,1641764,Brandin Podziemski,1610613000.0,2023-24,22023,GSW,22301142,2024-04-07,home,1,...,6.0,11.0,0.545,2.0,2.0,1.0,0.0,0.0,0.0,14.0
23660,1641764,Brandin Podziemski,1610613000.0,2023-24,22023,GSW,22301155,2024-04-09,away,0,...,6.0,10.0,0.6,2.0,4.0,0.5,2.0,2.0,1.0,16.0
23661,1641764,Brandin Podziemski,1610613000.0,2023-24,22023,GSW,22301169,2024-04-11,away,0,...,5.0,6.0,0.833,3.0,3.0,1.0,0.0,0.0,0.0,13.0
23662,1641764,Brandin Podziemski,1610613000.0,2023-24,22023,GSW,22301182,2024-04-12,home,1,...,4.0,9.0,0.444,0.0,1.0,0.0,1.0,1.0,1.0,9.0
23663,1641764,Brandin Podziemski,1610613000.0,2023-24,22023,GSW,22301198,2024-04-14,home,1,...,2.0,4.0,0.5,1.0,3.0,0.333,0.0,0.0,0.0,5.0


### Handling missing values.

In [44]:
# Show all rows of the df where player name is null
print(processed_dfs.info(), "\n\n", "Length of df:", processed_dfs.shape[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23664 entries, 0 to 23663
Data columns (total 81 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   player_id          23664 non-null  int64         
 1   player_name        23430 non-null  object        
 2   team_id            23430 non-null  float64       
 3   season             23664 non-null  object        
 4   season_id          23664 non-null  int64         
 5   team_abbreviation  23430 non-null  object        
 6   game_id            23664 non-null  int64         
 7   date_dt            23664 non-null  datetime64[ns]
 8   home_vs_away       23664 non-null  object        
 9   home_vs_away_hot   23664 non-null  int64         
 10  opponent           23664 non-null  object        
 11  b2b                23664 non-null  int64         
 12  wl                 23664 non-null  object        
 13  wl_hot             23664 non-null  int64         
 14  min   

In [45]:
# Using our eye we can identify 3 cols immediately but to ensure data accuracy, we'll check the list of columns where null values are present.
# We should expect nulls for any rolling avg window, with 10 game rolling averages (col suffix = _10) having the highest rate of nulls and last games (col suffix = _last) having the lowest rate of nulls.

# Summary of nulls in each column
null_summary = processed_dfs.isnull().sum().reset_index()
null_summary.columns = ['Column', 'Null Count']
null_summary['Null Percentage'] = ((null_summary['Null Count'] / processed_dfs.shape[0]) * 100).round(3)

# Display columns with nulls
print(null_summary[null_summary['Null Count'] > 0].sort_values(by='Null Count', ascending=False))

               Column  Null Count  Null Percentage
31            fg3a_10        3259           13.772
36             pts_10        3259           13.772
26             min_10        3259           13.772
27             fgm_10        3259           13.772
28             fga_10        3259           13.772
29          fg_pct_10        3259           13.772
30            fg3m_10        3259           13.772
32         fg3_pct_10        3259           13.772
33             ftm_10        3259           13.772
34             fta_10        3259           13.772
35          ft_pct_10        3259           13.772
44              ftm_5        1454            6.144
45              fta_5        1454            6.144
46           ft_pct_5        1454            6.144
41             fg3m_5        1454            6.144
47              pts_5        1454            6.144
43          fg3_pct_5        1454            6.144
42             fg3a_5        1454            6.144
40           fg_pct_5        14

In [46]:
# Looking at player IDs to identify the reason for the discrepancy.
nulls = processed_dfs[processed_dfs['player_name'].isnull() == True]
print(nulls['player_id'].unique())

# Output of this data shows 2 players IDs: 201143 - Al Horford and 1630170 - Devin Vassell.
# Neither of these players were identified in the top 200 scorers at the start of the script.
# We'll keep this error checking in place and drop the column to assure that all players in the resulting df's are from our top 200 list.
processed_dfs.dropna(subset=['player_name'], inplace=True)
display(processed_dfs.head())

[ 201143 1630170]


Unnamed: 0,player_id,player_name,team_id,season,season_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,...,fgm_last,fga_last,fg_pct_last,fg3m_last,fg3a_last,fg3_pct_last,ftm_last,fta_last,ft_pct_last,pts_last
0,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200002,2022-10-18,away,0,...,,,,,,,,,,
1,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200016,2022-10-20,home,1,...,12.0,26.0,0.462,3.0,10.0,0.3,4.0,4.0,1.0,31.0
2,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200037,2022-10-23,home,1,...,7.0,17.0,0.412,2.0,8.0,0.25,4.0,6.0,0.667,20.0
3,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200064,2022-10-26,away,0,...,12.0,22.0,0.545,2.0,9.0,0.222,5.0,7.0,0.714,31.0
4,2544,LeBron James,1610613000.0,2022-23,22022,LAL,22200076,2022-10-28,away,0,...,8.0,21.0,0.381,2.0,8.0,0.25,1.0,2.0,0.5,19.0


### Splitting the dataframes by season and then saving to csv's.

In [48]:
for df, df_name in zip(df_list, df_list_str):

    unique_values = df['season_id'].unique()

    for value in unique_values:
        # Filter the DataFrame based on the current unique value
        new_df = processed_dfs[processed_dfs['season_id'] == value].sort_values(by='date_dt')
        new_var_name = f"processed_{df_name}"   
        globals()[new_var_name] = new_df

        display(globals()[new_var_name])

        csv_file_path = f"exports/{new_var_name}.csv"
        new_df.to_csv(csv_file_path, index=False)

Unnamed: 0,player_id,player_name,team_id,season,season_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,...,fgm_last,fga_last,fg_pct_last,fg3m_last,fg3a_last,fg3_pct_last,ftm_last,fta_last,ft_pct_last,pts_last
0,2544,LeBron James,1.610613e+09,2022-23,22022,LAL,22200002,2022-10-18,away,0,...,,,,,,,,,,
1043,202699,Tobias Harris,1.610613e+09,2022-23,22022,DET,22200001,2022-10-18,away,0,...,,,,,,,,,,
9500,1630559,Austin Reaves,1.610613e+09,2022-23,22022,LAL,22200002,2022-10-18,away,0,...,,,,,,,,,,
1181,203076,Anthony Davis,1.610613e+09,2022-23,22022,LAL,22200002,2022-10-18,away,0,...,,,,,,,,,,
4881,1628401,Derrick White,1.610613e+09,2022-23,22022,BOS,22200001,2022-10-18,home,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2675,1626156,D'Angelo Russell,1.610613e+09,2022-23,22022,LAL,22201228,2023-04-09,home,1,...,9.0,14.0,0.643,6.0,9.0,0.667,0.0,0.0,0.000,24.0
9085,1630533,Ziaire Williams,1.610613e+09,2022-23,22022,BKN,22201226,2023-04-09,away,0,...,5.0,8.0,0.625,1.0,3.0,0.333,1.0,1.0,1.000,12.0
9048,1630532,Franz Wagner,1.610613e+09,2022-23,22022,ORL,22201219,2023-04-09,away,0,...,4.0,11.0,0.364,0.0,2.0,0.000,4.0,4.0,1.000,12.0
3060,1626181,Norman Powell,1.610613e+09,2022-23,22022,LAC,22201229,2023-04-09,away,0,...,7.0,14.0,0.500,0.0,2.0,0.000,9.0,11.0,0.818,23.0


Unnamed: 0,player_id,player_name,team_id,season,season_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,...,fgm_last,fga_last,fg_pct_last,fg3m_last,fg3a_last,fg3_pct_last,ftm_last,fta_last,ft_pct_last,pts_last
11254,2544,LeBron James,1.610613e+09,2023-24,22023,LAL,22300061,2023-10-24,away,0,...,,,,,,,,,,
20916,1630559,Austin Reaves,1.610613e+09,2023-24,22023,LAL,22300061,2023-10-24,away,0,...,,,,,,,,,,
12537,203076,Anthony Davis,1.610613e+09,2023-24,22023,LAL,22300061,2023-10-24,away,0,...,,,,,,,,,,
17231,1629008,Michael Porter Jr.,1.610613e+09,2023-24,22023,DEN,22300061,2023-10-24,home,1,...,,,,,,,,,,
12254,202691,Klay Thompson,1.610613e+09,2023-24,22023,DAL,22300062,2023-10-24,home,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19636,1630182,Josh Green,1.610613e+09,2023-24,22023,CHA,22301196,2024-04-14,away,0,...,2.0,6.0,0.333,0.0,3.0,0.000,0.0,0.0,0.000,4.0
20598,1630534,Ochai Agbaji,1.610613e+09,2023-24,22023,TOR,22301189,2024-04-14,away,0,...,3.0,5.0,0.600,1.0,2.0,0.500,0.0,0.0,0.000,7.0
21863,1630625,Dalano Banton,1.610613e+09,2023-24,22023,POR,22301200,2024-04-14,away,0,...,9.0,26.0,0.346,2.0,10.0,0.200,8.0,9.0,0.889,28.0
20469,1630532,Franz Wagner,1.610613e+09,2023-24,22023,ORL,22301191,2024-04-14,home,1,...,9.0,18.0,0.500,1.0,4.0,0.250,5.0,5.0,1.000,24.0


### Start here if new data is not needed.

In [None]:
# Merge in player name
top_200_players = pd.read_csv('exports/top_200_players.csv', index_col=False)

top_player_gamelogs_raw_season24_25 = pd.read_csv('exports/gamelogs_top200_ppg_season24_25.csv', index_col=False)
top_player_gamelogs_raw_season24_25.columns = top_player_gamelogs_raw_season24_25.columns.str.lower()

top_player_gamelogs_raw_season24_25 = top_200_players.merge(top_player_gamelogs_raw_season24_25, how='right', left_on='player_id', right_on='player_id') 
top_player_gamelogs_raw_season24_25.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4370 entries, 0 to 4369
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4303 non-null   float64
 1   player_id          4370 non-null   int64  
 2   player_name        4303 non-null   object 
 3   team_id            4303 non-null   float64
 4   team_abbreviation  4303 non-null   object 
 5   season_id          4370 non-null   int64  
 6   game_id            4370 non-null   int64  
 7   game_date          4370 non-null   object 
 8   matchup            4370 non-null   object 
 9   wl                 4370 non-null   object 
 10  min                4370 non-null   int64  
 11  fgm                4370 non-null   int64  
 12  fga                4370 non-null   int64  
 13  fg_pct             4370 non-null   float64
 14  fg3m               4370 non-null   int64  
 15  fg3a               4370 non-null   int64  
 16  fg3_pct            4370 

#### Add scheduling features: home v away, opponent, win v loss, back to back games, time of season.

In [None]:
top_player_gamelogs_season24_25 = top_player_gamelogs_raw_season24_25

top_player_gamelogs_season24_25['home_vs_away'] = top_player_gamelogs_season24_25['matchup'].apply(lambda x: 'home' if 'vs.' in x else 'away')
top_player_gamelogs_season24_25['home_vs_away_hot'] = top_player_gamelogs_season24_25['matchup'].apply(lambda x: 1 if 'vs.' in x else 0)

top_player_gamelogs_season24_25['wl_hot'] = top_player_gamelogs_season24_25['wl'].apply(lambda x: 1 if 'W' in x else 0)

top_player_gamelogs_season24_25['opponent'] = top_player_gamelogs_season24_25['matchup'].apply(lambda x: x.split('vs. ')[-1] if 'vs. ' in x else x.split('@ ')[-1])

In [None]:
# Convert to game_date col to datetime
top_player_gamelogs_season24_25['date_dt'] = pd.to_datetime(top_player_gamelogs_season24_25['game_date'])

top_player_gamelogs_season24_25.sort_values(by=['player_id', 'date_dt'], inplace=True)

# Initialize the 'b2b' column with zeros
top_player_gamelogs_season24_25['b2b'] = 0

# Iterate over groups of player_id
for player_id, group in top_player_gamelogs_season24_25.groupby('player_id'):
    # Reset the index for the group
    group = group.reset_index()
    for i in range(1, len(group)):
        # Check if the current date is consecutive with the previous one
        if (group.loc[i, 'date_dt'] - group.loc[i - 1, 'date_dt']).days == 1:
            top_player_gamelogs_season24_25.loc[group.loc[i, 'index'], 'b2b'] = 1

# Take a random slice
top_player_gamelogs_season24_25[1850:1860]

  top_player_gamelogs['date_dt'] = pd.to_datetime(top_player_gamelogs['game_date'])


Unnamed: 0.1,Unnamed: 0,player_id,player_name,team_id,team_abbreviation,season_id,game_id,game_date,matchup,wl,...,pf,pts,plus_minus,video_available,home_vs_away,home_vs_away_hot,wl_hot,opponent,date_dt,b2b
526,226.0,1628991,Jaren Jackson Jr.,1610613000.0,MEM,22024,22400345,"DEC 07, 2024",MEM @ BOS,W,...,4,27,9,1,away,0,1,BOS,2024-12-07,0
525,226.0,1628991,Jaren Jackson Jr.,1610613000.0,MEM,22024,22400353,"DEC 08, 2024",MEM @ WAS,W,...,3,21,18,1,away,0,1,WAS,2024-12-08,1
4179,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400066,"OCT 23, 2024",PHI vs. MIL,L,...,4,12,-20,1,home,1,0,MIL,2024-10-23,0
4178,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400078,"OCT 25, 2024",PHI @ TOR,L,...,4,11,2,1,away,0,0,TOR,2024-10-25,0
4177,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400097,"OCT 27, 2024",PHI @ IND,W,...,4,17,-5,1,away,0,1,IND,2024-10-27,0
4176,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400120,"OCT 30, 2024",PHI vs. DET,L,...,4,7,-1,1,home,1,0,DET,2024-10-30,0
4175,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400142,"NOV 02, 2024",PHI vs. MEM,L,...,1,9,-26,1,home,1,0,MEM,2024-11-02,0
4174,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400167,"NOV 04, 2024",PHI @ PHX,L,...,2,5,7,1,away,0,0,PHX,2024-11-04,0
4173,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400179,"NOV 06, 2024",PHI @ LAC,L,...,1,14,-11,1,away,0,0,LAC,2024-11-06,0
4172,64.0,1628997,Caleb Martin,1610613000.0,PHI,22024,22400195,"NOV 08, 2024",PHI @ LAL,L,...,0,13,-3,1,away,0,0,LAL,2024-11-08,0


In [None]:
# Get time of season
# Define the split dates for the NBA season
start_date = pd.to_datetime('2024-10-24')
middle_date = pd.to_datetime('2024-12-21')
end_date = pd.to_datetime('2025-02-17')

# Assign values to 'tos' based on the 'date_dt' column
top_player_gamelogs_season24_25['tos'] = pd.cut(
    top_player_gamelogs_season24_25['date_dt'],
    bins=[start_date - pd.Timedelta(days=1), middle_date, end_date, pd.to_datetime('2025-04-14')],
    labels=['start', 'middle', 'end']
)

# Check the result
print(top_player_gamelogs_season24_25[['date_dt', 'tos']].head())

       date_dt    tos
404 2024-10-22    NaN
403 2024-10-25  start
402 2024-10-26  start
401 2024-10-28  start
400 2024-10-30  start


In [None]:
# Only take the columns we need
top_player_gamelogs_season24_25 = top_player_gamelogs_season24_25[['player_id', 'player_name', 'team_id', 'team_abbreviation', 'game_id', 'date_dt', 'home_vs_away', 'home_vs_away_hot', 'opponent', 'b2b', 'wl', 'wl_hot', 'min', 'plus_minus', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct','pts']]

top_player_gamelogs_season24_25.tail()

Unnamed: 0,player_id,player_name,team_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,opponent,b2b,...,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,pts
2264,1642377,Jaylen Wells,1610613000.0,MEM,22400306,2024-12-01,home,1,IND,0,...,5,9,0.556,3,7,0.429,0,0,0.0,13
2263,1642377,Jaylen Wells,1610613000.0,MEM,22400056,2024-12-03,away,0,DAL,0,...,4,9,0.444,2,7,0.286,0,0,0.0,10
2262,1642377,Jaylen Wells,1610613000.0,MEM,22400329,2024-12-05,home,1,SAC,0,...,2,5,0.4,1,3,0.333,5,6,0.833,10
2261,1642377,Jaylen Wells,1610613000.0,MEM,22400345,2024-12-07,away,0,BOS,0,...,3,6,0.5,1,4,0.25,0,0,0.0,7
2260,1642377,Jaylen Wells,1610613000.0,MEM,22400353,2024-12-08,away,0,WAS,1,...,0,3,0.0,0,0,0.0,4,4,1.0,4


In [None]:
# Calculate 10, 5 and 3 game rolling averages.
# Note: need to rewerite this as:
# def per_game_rolling_avg(df, num_games_list=None, cols=None):

def player_per_game_rolling_avg(df, num_games_list=[10, 5, 3], cols=['min', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct',
'ftm', 'fta', 'ft_pct', 'pts']):

    for num_games in num_games_list:
        for col in cols:
            col_name = f'{col}_{num_games}'
            df[col_name] = (
                df.groupby('player_id')[col]
                .rolling(window=num_games, min_periods=num_games)
                .mean().round(2)
                .reset_index(level=0, drop=True)
            )
    
    # Add rolling average for all games available
    for col in cols:
        col_name = f'{col}_season'
        df[col_name] = (
            df.groupby('player_id')[col]
            .expanding(min_periods=1)
            .mean().round(2)
            .reset_index(level=0, drop=True)
        )
    
    return df

# Apply the function to the df and take a random slice to check for errors
top_player_gamelogs_season24_25 = player_per_game_rolling_avg(top_player_gamelogs_season24_25)
top_player_gamelogs_season24_25[1035:1045]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documen

Unnamed: 0,player_id,player_name,team_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,opponent,b2b,...,fgm_season,fga_season,fg_pct_season,fg3m_season,fg3a_season,fg3_pct_season,ftm_season,fta_season,ft_pct_season,pts_season
673,1627734,Domantas Sabonis,1610613000.0,SAC,22400276,2024-11-24,home,1,BKN,0,...,7.67,12.33,0.64,1.0,2.53,0.31,3.6,4.2,0.75,19.93
672,1627734,Domantas Sabonis,1610613000.0,SAC,22400285,2024-11-25,home,1,OKC,1,...,7.62,12.25,0.64,1.0,2.5,0.32,3.75,4.44,0.75,20.0
671,1627734,Domantas Sabonis,1610613000.0,SAC,22400294,2024-11-27,away,0,MIN,0,...,7.76,12.53,0.63,1.0,2.47,0.33,3.88,4.59,0.75,20.41
670,1627734,Domantas Sabonis,1610613000.0,SAC,22400049,2024-11-29,away,0,POR,0,...,7.78,12.39,0.64,1.06,2.5,0.35,3.83,4.56,0.75,20.44
669,1627734,Domantas Sabonis,1610613000.0,SAC,22400313,2024-12-01,home,1,SAS,0,...,7.79,12.58,0.64,1.11,2.58,0.36,4.0,4.84,0.75,20.68
668,1627734,Domantas Sabonis,1610613000.0,SAC,22400059,2024-12-03,home,1,HOU,0,...,8.05,12.9,0.64,1.05,2.55,0.34,3.85,4.8,0.73,21.0
667,1627734,Domantas Sabonis,1610613000.0,SAC,22400329,2024-12-05,away,0,MEM,0,...,8.0,12.81,0.64,1.05,2.52,0.35,3.76,4.67,0.74,20.81
666,1627734,Domantas Sabonis,1610613000.0,SAC,22400337,2024-12-06,away,0,SAS,1,...,7.95,12.77,0.64,1.0,2.41,0.33,3.95,4.86,0.75,20.86
665,1627734,Domantas Sabonis,1610613000.0,SAC,22400357,2024-12-08,home,1,UTA,0,...,7.96,12.78,0.63,1.04,2.43,0.35,3.83,4.7,0.76,20.78
1218,1627736,Malik Beasley,1610613000.0,DET,22400063,2024-10-23,home,1,IND,0,...,5.0,13.0,0.38,2.0,6.0,0.33,2.0,3.0,0.67,14.0


In [10]:
top_player_gamelogs_processed = top_player_gamelogs.drop(columns=['player_id', 'opponent'], inplace=False)
top_player_gamelogs_processed.to_csv('exports/processed_gamelogs_top200_ppg_season24_25.csv', index=False)

### Adding team game logs to the dataset.

#### Skip the 2 blocks below if new data is not needed.

In [135]:
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.static import teams

# Get all NBA teams
nba_teams = teams.get_teams()
nba_teams_df = pd.DataFrame(nba_teams)
nba_teams_df = nba_teams_df[['id', 'full_name', 'abbreviation']]

nba_teams_df.to_csv('exports/nba_teams.csv')

In [12]:
# Initialize an empty list to store DataFrames for each team
all_team_logs = []

# Fetch game logs for each team
for team_name, team_id in team_dict.items():
    print(f"Fetching game logs for: {team_name} (ID: {team_id})")
    team_logs = teamgamelog.TeamGameLog(season='2024-25', season_type_all_star='Regular Season', team_id=team_id)
    team_gamelogs = team_logs.get_data_frames()[0]
    team_gamelogs['team_name'] = team_name
    all_team_logs.append(team_gamelogs)

    time.sleep(10)

# Combine all team logs into a single DataFrame
team_gamelogs_season_24_25_raw = pd.concat(all_team_logs, ignore_index=True)

# Convert column names to lowercase
team_gamelogs_season_24_25_raw.columns = team_gamelogs_season_24_25_raw.columns.str.lower()

# Display unique team IDs, a sample of the data, and column names
print(team_gamelogs_season_24_25_raw['team_id'].drop_duplicates())
print("\nSample data:\n", team_gamelogs_season_24_25_raw.head())
print("\nColumns:\n", team_gamelogs_season_24_25_raw.columns)

# Save to CSV
team_gamelogs_season_24_25_raw.to_csv('exports/team_game_logs_season_24_25_raw.csv', index=False)

Fetching game logs for: Atlanta Hawks (ID: 1610612737)
Fetching game logs for: Boston Celtics (ID: 1610612738)
Fetching game logs for: Cleveland Cavaliers (ID: 1610612739)
Fetching game logs for: New Orleans Pelicans (ID: 1610612740)
Fetching game logs for: Chicago Bulls (ID: 1610612741)
Fetching game logs for: Dallas Mavericks (ID: 1610612742)
Fetching game logs for: Denver Nuggets (ID: 1610612743)
Fetching game logs for: Golden State Warriors (ID: 1610612744)
Fetching game logs for: Houston Rockets (ID: 1610612745)
Fetching game logs for: Los Angeles Clippers (ID: 1610612746)
Fetching game logs for: Los Angeles Lakers (ID: 1610612747)
Fetching game logs for: Miami Heat (ID: 1610612748)
Fetching game logs for: Milwaukee Bucks (ID: 1610612749)
Fetching game logs for: Minnesota Timberwolves (ID: 1610612750)
Fetching game logs for: Brooklyn Nets (ID: 1610612751)
Fetching game logs for: New York Knicks (ID: 1610612752)
Fetching game logs for: Orlando Magic (ID: 1610612753)
Fetching game l

### Calculate pace factor for teams and games.

In [None]:
team_gamelogs_season_24_25 = pd.read_csv('exports/team_game_logs_season_24_25_raw.csv', index_col=False)

# Identify the opponent in a seperate column.
team_gamelogs_season_24_25['opponent'] = team_gamelogs_season_24_25['matchup'].apply(lambda x: x.split('vs. ')[-1] if 'vs. ' in x else x.split('@ ')[-1])

# Calculate team pace factor.

team_gamelogs_season_24_25['pace_factor_team'] = ""

# Function to calculate pace
def calculate_team_pace_factor(fga, fta, oreb, tov, team_minutes):
    # Calculate possessions
    possessions = fga + (0.44 * fta) - oreb + tov
    # Calculate pace
    pace = 48 * (possessions / (team_minutes / 5))
    return pace

team_gamelogs_season_24_25['pace_factor_team'] = calculate_team_pace_factor(team_gamelogs_season_24_25["fga"], team_gamelogs_season_24_25["fta"], team_gamelogs_season_24_25["oreb"], team_gamelogs_season_24_25["tov"],team_gamelogs_season_24_25["min"])

team_gamelogs_season_24_25.sort_values(by='pace_factor_team', ascending=False).head(10)

Unnamed: 0,team_id,game_id,game_date,matchup,wl,w,l,w_pct,min,fgm,...,reb,ast,stl,blk,tov,pf,pts,team,opponent,pace_factor_team
659,1610612763,22400023,"NOV 19, 2024",MEM vs. DEN,L,8,7,0.533,240,41,...,55,24,9,10,22,24,110,MEM,DEN,119.64
686,1610612764,22400012,"NOV 15, 2024",WAS @ ATL,L,2,9,0.182,240,43,...,52,28,11,5,15,23,117,WAS,ATL,117.24
596,1610612760,22400091,"OCT 26, 2024",OKC @ CHI,W,2,0,1.0,240,45,...,54,21,15,5,10,18,114,OKC,CHI,116.8
126,1610612741,22400091,"OCT 26, 2024",CHI vs. OKC,L,1,2,0.333,240,32,...,61,23,6,2,26,19,95,CHI,OKC,116.68
115,1610612741,22400241,"NOV 17, 2024",CHI vs. HOU,L,5,9,0.357,240,36,...,47,26,8,7,13,21,107,CHI,HOU,116.68
210,1610612745,22400254,"NOV 20, 2024",HOU vs. IND,W,11,5,0.688,240,47,...,47,25,16,2,13,21,130,HOU,IND,116.52
512,1610612757,22400210,"NOV 10, 2024",POR vs. MEM,L,3,8,0.273,240,34,...,43,18,9,3,23,21,89,POR,MEM,116.24
27,1610612738,22400345,"DEC 07, 2024",BOS vs. MEM,L,19,5,0.792,240,42,...,51,26,6,7,13,16,121,BOS,MEM,115.32
212,1610612745,22400241,"NOV 17, 2024",HOU @ CHI,W,10,4,0.714,240,55,...,66,34,8,9,10,18,143,HOU,CHI,115.08
187,1610612744,22400258,"NOV 20, 2024",GSW vs. ATL,W,11,3,0.786,240,47,...,54,36,10,7,15,17,120,GSW,ATL,114.92


#### Calculate game pace factor.

In [12]:
## Calculate game pace factor.

# Pace factor is an estimate of the number of possessions per 48 minutes by a team. The formula is:
#       48 * ((Tm Poss + Opp Poss) / (2 * (Tm MP / 5))). 

# Function to calculate possessions
def calculate_possessions(fga, fta, oreb, tov):
    return fga + (0.44 * fta) - oreb + tov

# Function to calculate game pace factor
def calculate_pace_factor_game(df):
    # Ensure `team_id` and `opponent` have the same type
    df['team'] = df['team'].astype(str)
    df['opponent'] = df['opponent'].astype(str)

    # Merge the DataFrame with itself to get both team and opponent stats for each game
    merged = pd.merge(
        df,
        df,
        left_on=['game_id', 'team'],
        right_on=['game_id', 'opponent'],  # Match team_id with opponent for merging
        suffixes=('_team', '_opp')
    )

    # Calculate possessions for team and opponent
    merged['team_possessions'] = calculate_possessions(
        merged['fga_team'], merged['fta_team'], merged['oreb_team'], merged['tov_team']
    )
    merged['opp_possessions'] = calculate_possessions(
        merged['fga_opp'], merged['fta_opp'], merged['oreb_opp'], merged['tov_opp']
    )

    # Calculate pace factor
    merged['pace_factor_game'] = 48 * (
        (merged['team_possessions'] + merged['opp_possessions']) / 
        (2 * (merged['min_team'] / 5))
    )

    new_cols = ['pace_factor_team_opp', 'team_possessions',	'opp_possessions',	'pace_factor_game']

    for col in new_cols:
        merged[col] = merged[col].round(2)
    
    # Return relevant columns
    return merged[['game_id', 'pace_factor_game', 'pace_factor_team_opp']]

# Example usage
pace_factor_df = calculate_pace_factor_game(team_gamelogs_season_24_25)

team_gamelogs_season_24_25 = team_gamelogs_season_24_25.merge(pace_factor_df, how='left', left_on='game_id', right_on='game_id')
team_gamelogs_season_24_25.drop_duplicates(subset=['team_id', 'game_id'], inplace=True)

# Sort and display the top 10 games by pace factor
team_gamelogs_season_24_25.sort_values(by='pace_factor_game', ascending=False).head(10)

Unnamed: 0,team_id,game_id,game_date,matchup,wl,w,l,w_pct,min,fgm,...,stl,blk,tov,pf,pts,team,opponent,pace_factor_team,pace_factor_game,pace_factor_team_opp
328,1610612743,22400023,"NOV 19, 2024",DEN @ MEM,W,8,5,0.615,240,48,...,15,6,18,29,122,DEN,MEM,114.68,117.16,119.64
1318,1610612763,22400023,"NOV 19, 2024",MEM vs. DEN,L,8,7,0.533,240,41,...,9,10,22,24,110,MEM,DEN,119.64,117.16,119.64
1192,1610612760,22400091,"OCT 26, 2024",OKC @ CHI,W,2,0,1.0,240,45,...,15,5,10,18,114,OKC,CHI,116.8,116.74,116.8
252,1610612741,22400091,"OCT 26, 2024",CHI vs. OKC,L,1,2,0.333,240,32,...,6,2,26,19,95,CHI,OKC,116.68,116.74,116.8
230,1610612741,22400241,"NOV 17, 2024",CHI vs. HOU,L,5,9,0.357,240,36,...,8,7,13,21,107,CHI,HOU,116.68,115.88,115.08
424,1610612745,22400241,"NOV 17, 2024",HOU @ CHI,W,10,4,0.714,240,55,...,8,9,10,18,143,HOU,CHI,115.08,115.88,115.08
420,1610612745,22400254,"NOV 20, 2024",HOU vs. IND,W,11,5,0.688,240,47,...,16,2,13,21,130,HOU,IND,116.52,115.26,114.0
872,1610612754,22400254,"NOV 20, 2024",IND @ HOU,L,6,9,0.4,240,44,...,7,9,23,24,113,IND,HOU,114.0,115.26,114.0
1372,1610612764,22400012,"NOV 15, 2024",WAS @ ATL,L,2,9,0.182,240,43,...,11,5,15,23,117,WAS,ATL,117.24,114.72,117.24
26,1610612737,22400012,"NOV 15, 2024",ATL vs. WAS,W,6,7,0.462,240,46,...,10,10,16,17,129,ATL,WAS,112.2,114.72,117.24


In [None]:
team_gamelogs_season_24_25['date_dt'] = pd.to_datetime(team_gamelogs_season_24_25['game_date'])

team_gamelogs_season_24_25.sort_values(by=['team_id', 'date_dt'], inplace=True)

# Calculate 10, 5 and 3 game rolling averages.
def team_per_game_rolling_avg(df, num_games_list=None, cols=None):

    for num_games in num_games_list:
        for col in cols:
            col_name = f'{col}_{num_games}'
            df[col_name] = (
                df.groupby('team_id')[col]
                .rolling(window=num_games, min_periods=num_games)
                .mean().round(2)
                .reset_index(level=0, drop=True)
            )
    
    # Add rolling average for all games available
    for col in cols:
        col_name = f'{col}_season'
        df[col_name] = (
            df.groupby('team_id')[col]
            .expanding(min_periods=1)
            .mean().round(2)
            .reset_index(level=0, drop=True)
        )
    
    return df


team_per_game_rolling_avg(team_gamelogs_season_24_25, num_games_list=[10, 5, 3], cols=['pace_factor_team' , 'pace_factor_game', 'pace_factor_team_opp'])

  team_gamelogs_season_24_25['date_dt'] = pd.to_datetime(team_gamelogs_season_24_25['game_date'])


Unnamed: 0,team_id,game_id,game_date,matchup,wl,w,l,w_pct,min,fgm,...,pace_factor_team_opp_10,pace_factor_team_5,pace_factor_game_5,pace_factor_team_opp_5,pace_factor_team_3,pace_factor_game_3,pace_factor_team_opp_3,pace_factor_team_season,pace_factor_game_season,pace_factor_team_opp_season
50,1610612737,22400064,"OCT 23, 2024",ATL vs. BKN,W,1,0,1.000,240,39,...,,,,,,,,104.24,105.62,107.00
48,1610612737,22400079,"OCT 25, 2024",ATL vs. CHA,W,2,0,1.000,240,39,...,,,,,,,,103.98,104.90,105.82
46,1610612737,22400100,"OCT 27, 2024",ATL @ OKC,L,2,1,0.667,240,36,...,,,,,104.57,104.57,104.57,104.57,104.57,104.57
44,1610612737,22400103,"OCT 28, 2024",ATL vs. WAS,L,2,2,0.500,240,39,...,,,,,105.44,105.45,105.47,105.14,105.50,105.85
42,1610612737,22400121,"OCT 30, 2024",ATL @ WAS,L,2,3,0.400,240,45,...,,105.56,105.92,106.28,106.61,106.60,106.59,105.56,105.92,106.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1610612766,22400050,"DEC 03, 2024",CHA vs. PHI,L,6,15,0.286,240,34,...,97.48,95.43,95.74,95.43,94.09,94.89,94.09,99.25,99.18,99.25
1452,1610612766,22400327,"DEC 05, 2024",CHA @ NYK,L,6,16,0.273,240,40,...,97.20,94.84,95.10,94.84,93.69,94.14,93.69,98.98,98.91,98.98
1450,1610612766,22400340,"DEC 07, 2024",CHA vs. CLE,L,6,17,0.261,240,37,...,96.61,94.01,94.17,94.01,92.36,92.75,92.36,98.78,98.65,98.78
1448,1610612766,22400349,"DEC 08, 2024",CHA @ IND,W,7,17,0.292,240,40,...,96.98,95.78,95.68,95.78,97.07,96.41,97.07,98.97,98.83,98.97


In [15]:
# Merge pace info from the team game logs back into the top player game logs df.

top_player_gamelogs_processed_with_pace = top_player_gamelogs_processed.merge(team_gamelogs_season_24_25[['team_id', 'game_id',
       'pace_factor_team', 'pace_factor_game',
       'pace_factor_team_opp', 'pace_factor_team_10',
       'pace_factor_game_10', 'pace_factor_team_opp_10', 'pace_factor_team_5',
       'pace_factor_game_5', 'pace_factor_team_opp_5', 'pace_factor_team_3',
       'pace_factor_game_3', 'pace_factor_team_opp_3',
       'pace_factor_team_season', 'pace_factor_game_season',
       'pace_factor_team_opp_season']], 
       how='left',
       left_on=['game_id', 'team_id'],
       right_on=['game_id', 'team_id'])

top_player_gamelogs_processed_with_pace

Unnamed: 0,player_name,team_id,team_abbreviation,game_id,date_dt,home_vs_away,home_vs_away_hot,b2b,wl,wl_hot,...,pace_factor_team_opp_10,pace_factor_team_5,pace_factor_game_5,pace_factor_team_opp_5,pace_factor_team_3,pace_factor_game_3,pace_factor_team_opp_3,pace_factor_team_season,pace_factor_game_season,pace_factor_team_opp_season
0,LeBron James,1.610613e+09,LAL,22400062,2024-10-22,home,1,0,W,1,...,,,,,,,,98.00,98.94,99.88
1,LeBron James,1.610613e+09,LAL,22400085,2024-10-25,home,1,0,W,1,...,,,,,,,,98.08,99.13,100.18
2,LeBron James,1.610613e+09,LAL,22400096,2024-10-26,home,1,1,W,1,...,,,,,101.57,102.18,102.79,101.57,102.18,102.79
3,LeBron James,1.610613e+09,LAL,22400111,2024-10-28,away,0,0,L,0,...,,,,,102.88,102.32,101.76,101.66,101.48,101.29
4,LeBron James,1.610613e+09,LAL,22400118,2024-10-30,away,0,0,L,0,...,,102.63,102.48,102.34,105.67,104.72,103.77,102.63,102.48,102.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4365,Jaylen Wells,1.610613e+09,MEM,22400306,2024-12-01,home,1,0,W,1,...,106.21,104.83,105.65,104.62,105.80,106.17,105.45,106.20,106.00,106.06
4366,Jaylen Wells,1.610613e+09,MEM,22400056,2024-12-03,away,0,0,L,0,...,106.68,106.10,106.78,105.90,106.84,107.75,106.84,106.33,106.20,106.20
4367,Jaylen Wells,1.610613e+09,MEM,22400329,2024-12-05,home,1,0,W,1,...,105.93,106.73,107.40,106.52,107.71,109.04,107.71,106.37,106.28,106.24
4368,Jaylen Wells,1.610613e+09,MEM,22400345,2024-12-07,away,0,0,W,1,...,106.72,106.74,108.44,106.74,107.45,109.73,107.45,106.36,106.47,106.23


In [16]:
top_player_gamelogs_processed_with_pace.to_csv('exports/processed_gamelogs_top200_ppg_season24_25_with_pace.csv', index=False)