In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
# Seasons to collect NBA data for: the current calendar year followed by the
# four years before it, newest first.
current_year = date.today().year
last_five_years = [int(current_year) - offset for offset in range(5)]

In [3]:
# Display the list of seasons that will be scraped (newest year first).
last_five_years

[2025, 2024, 2023, 2022, 2021]

In [4]:
# Accumulates one list of cell texts (plus the season year) per table row,
# across every season in last_five_years.
data_rows = []
for year in last_five_years:
    # Season totals page for this year
    URL = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() surfaces HTTP errors (e.g. rate limiting) right here
    # instead of letting an error page be parsed as if it held the table.
    r = requests.get(URL, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')

    # Container in the HTML that holds the totals table
    results = soup.find('div', id = 'div_totals_stats')
    if results is None:
        # soup.find returns None when the element is missing; fail with a
        # message that names the page rather than an opaque AttributeError.
        raise RuntimeError(f"Could not find 'div_totals_stats' in {URL}")

    # Every <tr> in the container; the first one is skipped
    # (presumably the column-header row -- confirm against the page markup).
    table_data = results.find_all('tr')
    for row in table_data[1:]:
        data = row.find_all('td')
        if not data:
            # A row without any <td> cells carries no stats; keeping it would
            # add a one-element [year] row that misaligns the columns.
            continue
        individual_data = [cell.text for cell in data]
        # Tag the row with the season it was scraped from
        individual_data.append(year)
        data_rows.append(individual_data)

In [5]:
# Inspect every <th> that carries a data-stat attribute: show its visible
# text, its CSS classes and its data-stat value, one header per line.
data_headers = results.find_all("th", attrs={"data-stat": True})

header_lines = [
    f"Text: '{th.get_text(strip=True)}' | Class: {th.get('class')} | Data-Stat: {th.get('data-stat')}"
    for th in data_headers
]
for line in header_lines:
    print(line)

Text: 'Rk' | Class: ['ranker', 'poptip', 'center'] | Data-Stat: ranker
Text: 'Player' | Class: ['poptip', 'sort_default_asc', 'center'] | Data-Stat: name_display
Text: 'Age' | Class: ['poptip', 'center'] | Data-Stat: age
Text: 'Team' | Class: ['poptip', 'sort_default_asc', 'center'] | Data-Stat: team_name_abbr
Text: 'Pos' | Class: ['poptip', 'center'] | Data-Stat: pos
Text: 'G' | Class: ['poptip', 'center'] | Data-Stat: games
Text: 'GS' | Class: ['poptip', 'center'] | Data-Stat: games_started
Text: 'MP' | Class: ['poptip', 'center'] | Data-Stat: mp
Text: 'FG' | Class: ['poptip', 'center'] | Data-Stat: fg
Text: 'FGA' | Class: ['poptip', 'center'] | Data-Stat: fga
Text: 'FG%' | Class: ['poptip', 'hide_non_quals', 'center'] | Data-Stat: fg_pct
Text: '3P' | Class: ['poptip', 'center'] | Data-Stat: fg3
Text: '3PA' | Class: ['poptip', 'center'] | Data-Stat: fg3a
Text: '3P%' | Class: ['poptip', 'hide_non_quals', 'center'] | Data-Stat: fg3_pct
Text: '2P' | Class: ['poptip', 'center'] | Data-St

In [6]:
# Select the header cells whose class attribute matches one of these class
# strings, then collect their text as the column titles.
data_headers = results.find_all('th', class_ = ['poptip center','poptip sort_default_asc center','ranker poptip sort_default_asc show_partial_when_sorting center','poptip hide_non_quals center','poptip sort_col center'])
# Build the list exactly once. Wrapping this in a loop over data_headers would
# rebuild the same list for every header and would leave `titles` unassigned
# (or stale from an earlier run) when no header matches.
titles = [header.text for header in data_headers]

In [7]:
# Each scraped row ends with its season year, so the header list needs a
# matching 'Year' entry. The membership guard keeps this cell safe to re-run:
# without it every extra run would append another 'Year' and the header count
# would no longer match the row width.
if 'Year' not in titles:
    titles.append('Year')
titles

['Player',
 'Age',
 'Team',
 'Pos',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'Trp-Dbl',
 'Awards',
 'Year']

In [8]:
# Sanity check before building the frame: the number of values in the first
# scraped row should equal the number of column headers.
print(
    f"Data Rows: {len(data_rows[0])}",  # Number of columns in data
    f"Headers: {len(titles)}",          # Number of headers
    sep="\n",
)

Data Rows: 32
Headers: 32


In [9]:
# Assemble all scraped rows into a single frame, labelling the columns with
# the collected header titles, then preview the first five rows.
df = pd.DataFrame(data=data_rows, columns=titles)
df.head(n=5)

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,TRB,AST,STL,BLK,TOV,PF,PTS,Trp-Dbl,Awards,Year
0,Shai Gilgeous-Alexander,26,OKC,PG,70,70,2402,795,1522,0.522,...,353,440,123,71,175,153,2301,0,,2025
1,Anthony Edwards,23,MIN,SG,70,70,2539,628,1425,0.441,...,401,320,80,42,225,126,1912,0,,2025
2,Nikola Jokić,29,DEN,C,63,63,2301,706,1226,0.576,...,806,647,109,43,206,145,1845,30,,2025
3,Jayson Tatum,26,BOS,PF,66,66,2404,612,1345,0.455,...,573,393,73,34,198,145,1791,2,,2025
4,Giannis Antetokounmpo,30,MIL,PF,59,59,2004,703,1172,0.6,...,708,350,50,72,189,144,1784,7,,2025


In [32]:
# Remove the limit on displayed columns so wide frames render every column
# instead of eliding the middle ones; this is a global setting and stays in
# effect for later cells.
pd.options.display.max_columns = None
df

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Trp-Dbl,Awards,Year
0,Shai Gilgeous-Alexander,26,OKC,PG,68,68,2335,770,1474,.522,145,392,.370,625,1082,.578,.572,547,606,.903,60,282,342,429,120,70,170,151,2232,0,,2025
1,Anthony Edwards,23,MIN,SG,69,69,2505,621,1406,.442,278,695,.400,343,711,.482,.541,375,446,.841,53,346,399,316,79,42,222,124,1895,0,,2025
2,Nikola Jokić,29,DEN,C,62,62,2263,690,1201,.575,117,283,.413,573,918,.624,.623,309,383,.807,179,617,796,637,109,43,203,143,1806,29,,2025
3,Jayson Tatum,26,BOS,PF,65,65,2377,604,1330,.454,231,661,.349,373,669,.558,.541,327,401,.815,44,522,566,385,73,34,195,144,1766,2,,2025
4,Giannis Antetokounmpo,30,MIL,PF,58,58,1967,691,1153,.599,8,45,.178,683,1108,.616,.603,363,600,.605,136,562,698,345,49,72,187,141,1753,7,,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,Will Magnay,22,NOP,C,1,0,3,0,1,.000,0,1,.000,0,0,,.000,0,0,,0,0,0,0,0,0,1,1,0,0,,2021
3640,Anžejs Pasečņiks,25,WAS,C,1,0,6,0,1,.000,0,1,.000,0,0,,.000,0,0,,1,0,1,1,0,0,5,2,0,0,,2021
3641,Noah Vonleh,25,BRK,C,4,0,11,0,3,.000,0,2,.000,0,1,.000,.000,0,0,,0,1,1,1,0,0,2,2,0,0,,2021
3642,Greg Whittington,27,DEN,PF,4,0,12,0,3,.000,0,2,.000,0,1,.000,.000,0,0,,0,0,0,0,0,0,0,0,0,0,,2021


In [38]:
# Order the rows by games played ('G'), smallest first, and show the first 20
# to check whether any players are listed with no games played.
fewest_games_first = df.sort_values(by='G')
fewest_games_first.head(20)

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Trp-Dbl,Awards,Year
689,Jamaree Bouyea,25,MIL,PG,1,0,5,1,1,1.0,0,0,,1,1,1.0,1.0,0,0,,0,0,0,1,1,0,1,0,2,0,,2025
701,Riley Minix,24,SAS,SF,1,0,7,0,1,0.0,0,1,0.0,0,0,,0.0,0,0,,0,2,2,0,0,0,0,0,0,0,,2025
702,Miles Norris,24,BOS,PF,1,0,4,0,0,,0,0,,0,0,,,0,0,,0,2,2,0,0,0,0,0,0,0,,2025
60,Anthony Davis,31,DAL,PF,1,1,31,10,18,0.556,2,2,1.0,8,16,0.5,0.611,4,6,0.667,4,12,16,7,0,3,1,0,26,0,,2025
703,Jahlil Okafor,29,IND,C,1,0,3,0,0,,0,0,,0,0,,,0,0,,0,1,1,1,0,0,0,0,0,0,,2025
706,P.J. Tucker,39,NYK,SF,1,0,2,0,0,,0,0,,0,0,,,0,0,,0,0,0,0,0,0,0,0,0,0,,2025
707,Jahmir Young,24,CHI,PG,1,0,2,0,0,,0,0,,0,0,,,0,0,,0,1,1,1,0,0,0,0,0,0,,2025
2586,Tomáš Satoranský,30,SAS,SF,1,0,9,0,0,,0,0,,0,0,,,3,4,0.75,0,1,1,0,0,0,0,0,3,0,,2022
2839,Justin Jackson,26,BOS,SF,1,0,2,0,1,0.0,0,0,,0,1,0.0,0.0,2,2,1.0,0,0,0,0,0,0,0,0,2,0,,2022
1416,Izaiah Brockington,24,NOP,PG,1,0,3,2,5,0.4,0,1,0.0,2,4,0.5,0.4,0,0,,1,1,2,0,0,0,0,0,4,0,,2024


In [37]:
# Drop the summary rows whose Player value is 'League Average' so that only
# the remaining rows are kept in df.
is_league_average = df['Player'] == 'League Average'
df = df[~is_league_average]