# Scraping NBA player data

In this notebook we will scrape data on every player in the nba (including college stats for every player before they were drafted) using the `playercareerstats` endpoint

In [2]:
import pandas as pd
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.static import players
from nba_api.stats.endpoints import playercareerbycollege
from IPython.display import clear_output
import matplotlib.pyplot as plt
import os
from loguru import logger
import time

Retrieving list of all the players in the nba

In [2]:
all_players = players.get_players()
all_players

[{'id': 76001,
  'full_name': 'Alaa Abdelnaby',
  'first_name': 'Alaa',
  'last_name': 'Abdelnaby',
  'is_active': False},
 {'id': 76002,
  'full_name': 'Zaid Abdul-Aziz',
  'first_name': 'Zaid',
  'last_name': 'Abdul-Aziz',
  'is_active': False},
 {'id': 76003,
  'full_name': 'Kareem Abdul-Jabbar',
  'first_name': 'Kareem',
  'last_name': 'Abdul-Jabbar',
  'is_active': False},
 {'id': 51,
  'full_name': 'Mahmoud Abdul-Rauf',
  'first_name': 'Mahmoud',
  'last_name': 'Abdul-Rauf',
  'is_active': False},
 {'id': 1505,
  'full_name': 'Tariq Abdul-Wahad',
  'first_name': 'Tariq',
  'last_name': 'Abdul-Wahad',
  'is_active': False},
 {'id': 949,
  'full_name': 'Shareef Abdur-Rahim',
  'first_name': 'Shareef',
  'last_name': 'Abdur-Rahim',
  'is_active': False},
 {'id': 76005,
  'full_name': 'Tom Abernethy',
  'first_name': 'Tom',
  'last_name': 'Abernethy',
  'is_active': False},
 {'id': 76006,
  'full_name': 'Forest Able',
  'first_name': 'Forest',
  'last_name': 'Able',
  'is_active': Fa

To get all the stats of a particular player, we will use the `playercareerstats` endpoint and pass the `id` of that player into the predefined function `PlayerCareerStats`. Lets look at the structure of the dataset by retrieving the stats for a specific player (Nikola Jokić in this case) :

In [3]:
# Nikola Jokić
career = playercareerstats.PlayerCareerStats(player_id='203999') 
career.get_data_frames()[0]


Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,203999,2015-16,0,1610612743,DEN,21.0,80,55,1733.0,307,...,0.811,181,379,560,189,79,50,104,208,796
1,203999,2016-17,0,1610612743,DEN,22.0,73,59,2038.0,494,...,0.825,212,506,718,359,61,55,171,214,1221
2,203999,2017-18,0,1610612743,DEN,23.0,75,73,2443.0,504,...,0.85,195,608,803,458,90,61,210,212,1385
3,203999,2018-19,0,1610612743,DEN,24.0,80,80,2504.0,616,...,0.821,228,637,865,580,108,55,248,228,1604
4,203999,2019-20,0,1610612743,DEN,25.0,73,73,2335.0,565,...,0.817,166,545,711,512,85,44,226,222,1456
5,203999,2020-21,0,1610612743,DEN,26.0,72,72,2488.0,732,...,0.868,205,575,780,599,95,48,222,192,1898
6,203999,2021-22,0,1610612743,DEN,27.0,74,74,2476.0,764,...,0.81,206,813,1019,584,109,63,281,191,2004
7,203999,2022-23,0,1610612743,DEN,28.0,69,69,2323.0,646,...,0.822,167,650,817,678,87,47,247,174,1690
8,203999,2023-24,0,1610612743,DEN,29.0,75,75,2595.0,778,...,0.823,210,720,930,677,96,66,223,181,1979


As you can see all the data on Nikola Jokic over the years has been scraped which includes various stats (for eg : PTS-points, OREB- offensive rebounds etc)

Now we'll do the same for every player in the NBA (from 1985) and store the data for each player in a separate .csv file

In [9]:
# Function to fetch and save data for a player
def fetch_and_save_player_data(player_id, player_name):
    logger.info(f"Fetching data for player: {player_name} (ID: {player_id})")

    # Fetch career stats
    career = playercareerstats.PlayerCareerStats(player_id=str(player_id))
    career_df = career.get_data_frames()[0]

    output_dir = os.path.join("..", "data", "raw", "players_data")
    os.makedirs(output_dir, exist_ok=True)

    output_filename = os.path.join(output_dir, f"{player_name}_{player_id}.csv")
    
    career_df.to_csv(output_filename, index=False)
    logger.info(f"Data saved to: {output_filename}")

    time.sleep(1)

In [10]:
# sorting the players based on id to easily keep track of player data being fetched
sorted_players = sorted(all_players, key=lambda x: x['id'])

for player in sorted_players:
    player_id = player['id']
    player_name = player['full_name']
    fetch_and_save_player_data(player_id, player_name)

[32m2024-03-30 16:06:47.572[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_and_save_player_data[0m:[36m3[0m - [1mFetching data for player: Byron Scott (ID: 2)[0m
[32m2024-03-30 16:06:48.571[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_and_save_player_data[0m:[36m12[0m - [1mData saved to: Byron Scott_2.csv[0m
[32m2024-03-30 16:06:49.572[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_and_save_player_data[0m:[36m3[0m - [1mFetching data for player: Grant Long (ID: 3)[0m
[32m2024-03-30 16:06:50.925[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_and_save_player_data[0m:[36m12[0m - [1mData saved to: Grant Long_3.csv[0m
[32m2024-03-30 16:06:51.939[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_and_save_player_data[0m:[36m3[0m - [1mFetching data for player: Dan Schayes (ID: 7)[0m
[32m2024-03-30 16:06:54.318[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_and_save_player_data[0m:[36m12[0m - [1mData saved to: Dan Schaye

Merging all the csv files into one dataframe :

In [6]:
player_data_dir = "../data/raw/players_data"


player_data_frames = []


for filename in os.listdir(player_data_dir):
    if filename.endswith(".csv"):
        file_path = os.path.join(player_data_dir, filename)
        
        player_name = filename.split("_")[0]  
        df = pd.read_csv(file_path)
        df['player_name'] = player_name
        player_data_frames.append(df)
player_data_combined = pd.concat(player_data_frames, ignore_index=True)

player_data_combined

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,player_name
0,920,1985-86,0,1610612747,LAL,22.0,82,1.0,1542.0,209,...,160.0,221.0,381.0,54,49.0,49.0,99.0,229,521,A.C. Green
1,920,1986-87,0,1610612747,LAL,23.0,79,72.0,2240.0,316,...,210.0,405.0,615.0,84,70.0,80.0,102.0,171,852,A.C. Green
2,920,1987-88,0,1610612747,LAL,24.0,82,64.0,2636.0,322,...,245.0,465.0,710.0,93,87.0,45.0,120.0,204,937,A.C. Green
3,920,1988-89,0,1610612747,LAL,25.0,82,82.0,2510.0,401,...,258.0,481.0,739.0,103,94.0,55.0,119.0,172,1088,A.C. Green
4,920,1989-90,0,1610612747,LAL,26.0,82,82.0,2709.0,385,...,262.0,450.0,712.0,90,66.0,50.0,116.0,207,1061,A.C. Green
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29870,980,2008-09,0,1610612739,CLE,34.0,65,65.0,1765.0,342,...,157.0,333.0,490.0,64,28.0,84.0,90.0,183,838,Zydrunas Ilgauskas
29871,980,2009-10,0,1610612739,CLE,35.0,64,6.0,1339.0,194,...,114.0,231.0,345.0,48,14.0,50.0,63.0,183,474,Zydrunas Ilgauskas
29872,980,2010-11,0,1610612748,MIA,36.0,72,51.0,1145.0,162,...,108.0,179.0,287.0,26,23.0,58.0,52.0,185,360,Zydrunas Ilgauskas
29873,1629597,2019-20,0,1610612740,NOP,24.0,4,0.0,51.0,6,...,3.0,6.0,9.0,3,1.0,1.0,4.0,10,12,Zylan Cheatham


In [7]:
player_data_combined.columns

Index(['PLAYER_ID', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'player_name'],
      dtype='object')

The above column names are very vague and will be rectified when we clean the data ("`cleaning_data`" folder). 

# Scraping player stats in College (before they were drafted)

To scrape college data for every player we will use the `playercareerbycollege` endpoint. The function `PlayerCareerByCollege` defined in the library takes the name of a college as input and returns the stats of every player who has previously played for that college. 

From the draft dataset (refer to notebook where draft data is scraped) we have retrieved a list of all the colleges in the USA which we will pass into the function mentioned above and save the data for each college to a separate csv file : 

In [3]:
draft_df = pd.read_csv('../data/raw/draft_data/draft_data.csv')

In [4]:
temp_draft_df= draft_df[draft_df['ORGANIZATION_TYPE']=='College/University'].copy()
list_college = temp_draft_df['ORGANIZATION'].unique()
list_college

array(['Alabama', 'Arkansas', 'Houston', 'Central Florida', 'Kentucky',
       'Michigan', 'Duke', 'Kansas', 'Connecticut', 'Baylor', 'Indiana',
       'California-Los Angeles', 'Santa Clara', 'Villanova', 'Iowa',
       'Marquette', 'Belmont', 'Ohio State', 'Gonzaga', 'Missouri',
       'Penn State', 'Xavier', 'Tennessee', 'Clemson', 'Washington State',
       'Pepperdine', 'South Carolina', 'Miami (FL)', 'Eastern Michigan',
       'Kansas State', 'Dayton', 'Furman', 'Auburn', 'Purdue', 'Arizona',
       'Wisconsin', 'Memphis', 'Louisiana State', 'Arizona State',
       'Wake Forest', 'Colorado State', 'Notre Dame',
       'Wisconsin-Milwaukee', 'Michigan State', 'Nebraska', 'Toledo',
       'Virginia Commonwealth', 'Southern California', 'Colorado',
       'Oklahoma State', 'Florida State', 'Stanford', 'Oregon',
       'Virginia', 'Florida', 'Texas', 'Madrid (ESP)', 'North Carolina',
       'Loyola-Maryland', 'Ohio', 'West Virginia', 'Illinois',
       'Utah State', 'Louisville', 'Cr

In [25]:
# function to scrape college player data

def get_college_players_data(college):
    logger.info(f"Fetching data for college: {college}")


    college_data = playercareerbycollege.PlayerCareerByCollege(college=college)
    college_data_df = college_data.get_data_frames()[0]

    output_dir = os.path.join("..", "data", "raw", "players_college_stats")
    os.makedirs(output_dir, exist_ok=True)

    output_filename = os.path.join(output_dir, f"{college}_data.csv")
    
    college_data_df.to_csv(output_filename, index=False)
    logger.info(f"Data saved to: {output_filename}")
    clear_output()
    time.sleep(1)

In [None]:
for college in list_college:
    get_college_players_data(college)

Merging all individual csv files into a single dataframe

In [3]:
college_data_dir = "../data/raw/players_college_stats"


college_data_dfs = []


for filename in os.listdir(college_data_dir):
    if filename.endswith(".csv"):
        file_path = os.path.join(college_data_dir, filename)
        df = pd.read_csv(file_path)
        college_data_dfs.append(df)
        
college_data_combined = pd.concat(college_data_dfs, ignore_index=True)


In [7]:
college_data_combined.head(10)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,COLLEGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76984,Brian Heaney,Acadia (CAN),14,70.0,13,24,0.541666,,,...,0.5,,,4.0,6,,,,17,28
1,78385,Bill Turner,Akron,294,4060.0,603,1481,0.407157,,,...,0.728119,,,1039.0,167,,,,605,1597
2,77684,Fred Nagy,Akron,50,,94,271,0.346863,,,...,0.670103,,,,68,,,,84,253
3,76669,Ned Endress,Akron,16,,3,25,0.12,,,...,0.533333,,,,4,,,,13,14
4,202148,Mickell Gladness,Alabama A&M,26,252.0,25,59,0.423728,0.0,0.0,...,0.5,14.0,44.0,58.0,5,4.0,20.0,7.0,32,56
5,77401,Kevin Loder,Alabama State,148,2094.0,365,791,0.461441,6.0,23.0,...,0.695,113.0,225.0,338.0,174,67.0,43.0,143.0,261,875
6,202407,Elijah Millsap,Alabama-Birmingham,69,1120.0,93,284,0.327464,29.0,104.0,...,0.679245,37.0,154.0,191.0,78,63.0,19.0,78.0,159,287
7,101182,Donell Taylor,Alabama-Birmingham,98,834.0,109,276,0.394927,7.0,34.0,...,0.640625,30.0,76.0,106.0,90,47.0,7.0,59.0,81,266
8,1628,Alan Ogg,Alabama-Birmingham,80,657.0,75,152,0.493421,0.0,2.0,...,0.568181,48.0,85.0,133.0,13,12.0,58.0,30.0,132,175
9,1641998,Trey Jemison,Alabama-Birmingham,18,351.0,49,83,0.590361,0.0,0.0,...,0.823529,38.0,38.0,76.0,16,7.0,16.0,22.0,48,112


In [9]:
college_data_combined.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'COLLEGE', 'GP', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')