# Scraping Game Data

In this notebook we scrape data on every game played by each NBA team (1982 onwards) using the `leaguegamefinder` endpoint, which returns all the games played by a given team over the years.

In [2]:
import pandas as pd
import os
import time
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
from loguru import logger
from IPython.display import clear_output

First, retrieve the list of all teams so that their IDs can be passed to `LeagueGameFinder`, which fetches the game data for each team:

In [None]:
# Team records from nba_api's static `teams` module; later cells read the
# 'id' and 'full_name' keys of each entry.
all_teams = teams.get_teams()
# Bare last expression so the notebook renders the list for inspection.
all_teams

In [11]:
# Order the teams by ascending ID; a copy is sorted so `all_teams` keeps its original order
sorted_teams = list(all_teams)
sorted_teams.sort(key=lambda team: team['id'])

In [17]:
# Helper that scrapes the game data of a single team and stores it as a CSV

def fetch_and_save_team_games(team_id, team_name, output_dir=None, pause_seconds=1):
    """Download one team's games via ``LeagueGameFinder`` and save them to CSV.

    Parameters
    ----------
    team_id
        Team identifier, forwarded to ``LeagueGameFinder`` as
        ``team_id_nullable``.
    team_name : str
        Used in the log messages and as the prefix of the output file name
        (``<team_name>_gameData.csv``).
    output_dir : str, optional
        Directory the CSV is written to; it is created when missing.
        Defaults to ``../data/raw/team_game_data``.
    pause_seconds : float, optional
        Seconds to sleep after the file has been written. Defaults to 1.

    Returns
    -------
    None
        The result is the CSV file written to ``output_dir``.
    """
    logger.info(f"Fetching game data for team: {team_name} (ID: {team_id})")

    if output_dir is None:
        output_dir = os.path.join("..", "data", "raw", "team_game_data")
    os.makedirs(output_dir, exist_ok=True)

    game_finder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id)
    # Only the first returned frame is kept and written out.
    team_games = game_finder.get_data_frames()[0]

    output_filename = os.path.join(output_dir, f"{team_name}_gameData.csv")
    team_games.to_csv(output_filename, index=False)
    logger.info(f"Data saved to: {output_filename}")

    # wait=True postpones the clearing until new output arrives, so the
    # "Data saved" message stays readable instead of vanishing immediately
    # (and the last team's messages remain once the loop is done).
    clear_output(wait=True)

    # Pause between consecutive calls -- presumably to go easy on the stats
    # endpoint when this runs in a loop over all teams.
    time.sleep(pause_seconds)

In [18]:
# Scrape and store the games of each team, walking the teams in ascending ID order
for team in sorted_teams:
    team_id, team_name = team['id'], team['full_name']
    fetch_and_save_team_games(team_id=team_id, team_name=team_name)

In [3]:
# Merge every per-team CSV in the raw-data directory into a single DataFrame

team_game_data_dir = "../data/raw/team_game_data"

# os.listdir() returns entries in arbitrary order; sorting the file names keeps
# the row order of the combined frame identical from run to run.
team_csv_files = sorted(
    filename
    for filename in os.listdir(team_game_data_dir)
    if filename.endswith(".csv")
)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list, so stop early with a message that names the actual cause.
if not team_csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {team_game_data_dir!r}; run the scraping cells first."
    )

# NOTE(review): read_csv infers column dtypes, so ID-like columns such as
# SEASON_ID / GAME_ID presumably load as integers -- confirm that is what
# downstream code expects, otherwise pass an explicit `dtype=`.
team_gamedata_dfs = [
    pd.read_csv(os.path.join(team_game_data_dir, filename))
    for filename in team_csv_files
]

# ignore_index=True gives the combined frame one fresh 0..n-1 index.
team_gamedata_combined = pd.concat(team_gamedata_dfs, ignore_index=True)

In [4]:
# Parse the GAME_DATE column into pandas datetimes and store it back under the same name
parsed_game_dates = pd.to_datetime(team_gamedata_combined['GAME_DATE'])
team_gamedata_combined['GAME_DATE'] = parsed_game_dates

In [17]:
# Preview the first ten rows of the combined frame as a quick sanity check.
team_gamedata_combined.head(10)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22023,1610612737,ATL,Atlanta Hawks,22301076,2024-03-30,ATL vs. MIL,L,239,113,...,0.833,9.0,30.0,39.0,24,5.0,3,11,26,-9.0
1,22023,1610612737,ATL,Atlanta Hawks,22301060,2024-03-28,ATL vs. BOS,W,265,123,...,0.6,17.0,36.0,53.0,24,5.0,5,11,17,1.0
2,22023,1610612737,ATL,Atlanta Hawks,22301051,2024-03-27,ATL vs. POR,W,240,120,...,0.8,8.0,33.0,41.0,23,12.0,9,16,17,14.0
3,22023,1610612737,ATL,Atlanta Hawks,22301034,2024-03-25,ATL vs. BOS,W,241,120,...,0.588,15.0,29.0,44.0,30,5.0,3,13,18,2.0
4,22023,1610612737,ATL,Atlanta Hawks,22301021,2024-03-23,ATL vs. CHA,W,240,132,...,0.857,11.0,32.0,43.0,40,9.0,5,8,14,41.0
5,22023,1610612737,ATL,Atlanta Hawks,22301010,2024-03-21,ATL @ PHX,L,241,115,...,0.793,12.0,24.0,36.0,27,11.0,3,8,23,-13.0
6,22023,1610612737,ATL,Atlanta Hawks,22300990,2024-03-18,ATL @ LAL,L,240,105,...,0.8,12.0,30.0,42.0,32,8.0,5,9,14,-31.0
7,22023,1610612737,ATL,Atlanta Hawks,22300982,2024-03-17,ATL @ LAC,W,240,110,...,0.917,10.0,35.0,45.0,32,10.0,7,14,22,17.0
8,22023,1610612737,ATL,Atlanta Hawks,22300966,2024-03-15,ATL @ UTA,L,241,122,...,0.741,11.0,30.0,41.0,32,7.0,5,9,16,-2.0
9,22023,1610612737,ATL,Atlanta Hawks,22300953,2024-03-13,ATL @ POR,L,240,102,...,0.667,13.0,25.0,38.0,19,11.0,6,14,23,-4.0


In [18]:
# List every column name (the head() preview above elides the middle columns).
team_gamedata_combined.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')