In [1]:
!pip install nba_api

Collecting nba_api
  Downloading nba_api-1.6.1-py3-none-any.whl.metadata (5.5 kB)
Downloading nba_api-1.6.1-py3-none-any.whl (279 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/279.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/279.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.4/279.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api
Successfully installed nba_api-1.6.1


In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import nba_api
from nba_api.stats.endpoints import playerindex

from google.colab import drive

## Extracting Data

We will want to extract player data from the NBA api to obtain ID values for individual players and teams so we can easily index and sort them.

We will then scrape html data from basketball-reference.com to get box score statistics for every lakers game played so far this season (2024-2025).

We will take the scraped data and tack on the ID values from the nba api

In [3]:
players_df=playerindex.PlayerIndex().get_data_frames()[0]

In [4]:
players_df

Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_ID,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,...,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,ROSTER_STATUS,FROM_YEAR,TO_YEAR,PTS,REB,AST,STATS_TIMEFRAME
0,1630173,Achiuwa,Precious,precious-achiuwa,1610612752,knicks,0,New York,Knicks,NYK,...,2020.0,1.0,20.0,1.0,2020,2024,5.6,6.0,0.9,Season
1,203500,Adams,Steven,steven-adams,1610612745,rockets,0,Houston,Rockets,HOU,...,2013.0,1.0,12.0,1.0,2013,2024,3.0,4.2,1.1,Season
2,1628389,Adebayo,Bam,bam-adebayo,1610612748,heat,0,Miami,Heat,MIA,...,2017.0,1.0,14.0,1.0,2017,2024,16.1,9.9,4.6,Season
3,1630534,Agbaji,Ochai,ochai-agbaji,1610612761,raptors,0,Toronto,Raptors,TOR,...,2022.0,1.0,14.0,1.0,2022,2024,10.9,3.9,1.8,Season
4,1630583,Aldama,Santi,santi-aldama,1610612763,grizzlies,0,Memphis,Grizzlies,MEM,...,2021.0,1.0,30.0,1.0,2021,2024,13.0,7.3,3.0,Season
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,203469,Zeller,Cody,cody-zeller,1610612737,hawks,0,Atlanta,Hawks,ATL,...,2013.0,1.0,4.0,1.0,2013,2024,,,,Season
537,1627826,Zubac,Ivica,ivica-zubac,1610612746,clippers,0,LA,Clippers,LAC,...,2016.0,2.0,32.0,1.0,2016,2024,15.1,12.6,2.5,Season
538,1641783,da Silva,Tristan,tristan-da-silva,1610612753,magic,0,Orlando,Magic,ORL,...,2024.0,1.0,18.0,1.0,2024,2024,8.9,3.9,1.7,Season
539,1628427,Čančar,Vlatko,vlatko-čančar,1610612743,nuggets,0,Denver,Nuggets,DEN,...,2017.0,2.0,49.0,1.0,2019,2024,2.3,2.0,0.0,Season


In [5]:
# Step 1: Fetch the Lakers' 2025 season page
base_url = "https://www.basketball-reference.com"
team_url = f"{base_url}/teams/LAL/2025_games.html"

response = requests.get(team_url)
if response.status_code != 200:
    raise Exception("Failed to fetch the Lakers' season page!")

# Parse the main team page for box score links
team_soup = BeautifulSoup(response.content, 'html.parser')
box_score_table = team_soup.find('table', {'id': 'games'})

# Extract box score links
box_score_links = []

opps = []

pa = [] #points against
pf = [] #points for

for row in box_score_table.find_all('tr'):
    box_score_cell = row.find('td', {'data-stat': 'box_score_text'})
    if box_score_cell and box_score_cell.find('a'):
        relative_link = box_score_cell.find('a')['href']
        box_score_links.append(f"{base_url}{relative_link}")

    opp_cell = row.find('td', {'data-stat': 'opp_name'})
    if opp_cell and opp_cell.find('a'):
        opps.append(opp_cell.find('a')['href'].split('/')[2])

    pa_cell = row.find('td', {'data-stat': 'opp_pts'})

    pa.append(pa_cell.text if pa_cell else 0)

    pf_cell = row.find('td', {'data-stat': 'pts'})
    pf.append(pf_cell.text if pf_cell else 0)


print(f"Found {len(box_score_links)} box score links.")


Found 82 box score links.


## create a list that subtracts 'points for' from 'points against' for each match

negative points == L

positive points == W

In [6]:
pa_arr = []
pf_arr = []

for i in list(filter(None,pf)):
    pf_arr.append(int(i))
pf_arr=pf_arr[1:]

for i in list(filter(None,pa)):
    pa_arr.append(int(i))
pa_arr=pa_arr[1:]

In [7]:
pa_arr=np.array(pa_arr)
pf_arr = np.array(pf_arr)

In [8]:
tp_array=np.subtract(pf_arr, pa_arr)
print(tp_array)
tp=tp_array.tolist()


[  7   4  -4 -24   6 -12 -17  10  20   5   5   5   6  -1 -25 -27  18  -8
   1 -29 -41  -2   9 -10   6  13   4  -3   2  10 -12   8  17  -4]


response = requests.get('https://www.basketball-reference.com/teams/BOS/2025_games.html')
if response.status_code != 200:
    raise Exception("Failed to fetch box score stats!")
box_soup = BeautifulSoup(response.content, 'html.parser')

In [9]:
# Step 2: Parse the local game stats file

all_game_stats = []

for url in box_score_links[:len(tp)]:
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception("Failed to fetch box score stats!")
    box_soup = BeautifulSoup(response.content, 'html.parser')

    box_score_stats = box_soup.find('table', {'id': 'box-LAL-game-basic'})
    home_team = box_soup.title.string.split(',')[0].split('vs')[0].strip()


    box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table

    box_score_df['Game'] = box_score_links.index(url) + 1

    box_score_df['Home'] = 1 if home_team == 'Lakers' else 0 # Boolean value representing whether they played home or away

    box_score_df['Opponet'] = opps[box_score_links.index(url)]

    box_score_df['Final_Score_Delta'] = tp[box_score_links.index(url)]

    all_game_stats.append(box_score_df)

# Combine all game stats into a single DataFrame
lakers_df = pd.concat(all_game_stats, ignore_index=True)

  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table
  box_score_df = pd.read_html(str(box_score_stats))[0].droplevel(0, axis=1) # remove hierarchial columns from the html table


In [10]:
type(box_score_df.loc[1,'MP']) # minutes played (MP) should be a timestamp column not string

str

In [11]:
lakers_df

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,BLK,TOV,PF,PTS,GmSc,+/-,Game,Home,Opponet,Final_Score_Delta
0,Anthony Davis,37:35,11,23,.478,1,3,.333,13,15,...,3,1,1,36,34.0,+1,1,0,MIN,7
1,Austin Reaves,36:02,6,14,.429,0,5,.000,0,1,...,1,0,4,12,11.4,+12,1,0,MIN,7
2,Rui Hachimura,35:05,7,14,.500,1,4,.250,3,4,...,1,0,2,18,15.9,+19,1,0,MIN,7
3,LeBron James,34:39,7,16,.438,1,4,.250,1,1,...,2,2,3,16,10.1,-6,1,0,MIN,7
4,D'Angelo Russell,34:13,4,12,.333,1,7,.143,0,0,...,0,1,2,9,6.2,+15,1,0,MIN,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,Cam Reddish,12:50,1,2,.500,1,2,.500,0,0,...,0,0,1,3,3.0,+1,34,0,ATL,-4
480,Jaxson Hayes,10:42,1,2,.500,0,0,,1,2,...,0,0,0,3,1.9,0,34,0,ATL,-4
481,Bronny James,1:41,0,1,.000,0,0,,0,0,...,0,0,0,0,0.0,+2,34,0,ATL,-4
482,Gabe Vincent,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,...,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,Did Not Dress,34,0,ATL,-4


## Check if the team abbreviations in the players dataframe and lakers dataframe use the same naming conventions



In [12]:
players_df.TEAM_ABBREVIATION.unique()

array(['NYK', 'HOU', 'MIA', 'TOR', 'MEM', 'DEN', 'MIN', 'PHX', 'CLE',
       'NOP', 'GSW', 'MIL', 'ORL', 'POR', 'WAS', 'CHA', 'CHI', 'LAC',
       'ATL', 'SAS', 'DET', 'BKN', 'PHI', 'BOS', None, 'IND', 'OKC',
       'SAC', 'LAL', 'UTA', 'DAL'], dtype=object)

In [13]:
lakers_df.Opponet.unique()

array(['MIN', 'PHO', 'SAC', 'CLE', 'TOR', 'DET', 'MEM', 'PHI', 'SAS',
       'NOP', 'UTA', 'ORL', 'DEN', 'OKC', 'MIA', 'ATL', 'POR', 'GSW'],
      dtype=object)

The abbreviation for Phoenix does not match

'PHX' != 'PHO'



In [14]:
#replace 'PHO' in lakers_df Opponet column with 'PHX' to align with TEAM_ABBREVIATION in players_df

lakers_df['Opponet'] = lakers_df['Opponet'].replace('PHO', 'PHX')

## Merge datasets together

Join Lakers roster dataset to the nba players dataframe on 'Starters' = 'full_name' to get player IDs.

Join new dataframe on the players dataframe on 'Opponet' = 'TEAM_ABBREVIATION' to get the team IDs of the Lakers opponets

In [15]:

players_df['full_name'] = players_df['PLAYER_FIRST_NAME'] + ' ' + players_df['PLAYER_LAST_NAME']

# Merge team_df with roster_df based on the concatenated full_name column
indexed_df = pd.merge(lakers_df, players_df[['full_name', 'PERSON_ID']].drop_duplicates(), left_on='Starters', right_on='full_name', how='inner', validate='m:1')
indexed_df = pd.merge(indexed_df, players_df[['TEAM_ABBREVIATION', 'TEAM_ID']].drop_duplicates(), left_on='Opponet', right_on='TEAM_ABBREVIATION', how='inner', validate='m:1')


final_df = indexed_df.drop(['Starters', 'Opponet', 'TEAM_ABBREVIATION', 'full_name'], axis=1)

final_df=final_df.replace(r'[a-zA-Z]', np.nan, regex=True)
#final_df.dropna(inplace=True, how='all')

In [16]:
indexed_df

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,GmSc,+/-,Game,Home,Opponet,Final_Score_Delta,full_name,PERSON_ID,TEAM_ABBREVIATION,TEAM_ID
0,Anthony Davis,37:35,11,23,.478,1,3,.333,13,15,...,34.0,+1,1,0,MIN,7,Anthony Davis,203076,MIN,1610612750
1,Austin Reaves,36:02,6,14,.429,0,5,.000,0,1,...,11.4,+12,1,0,MIN,7,Austin Reaves,1630559,MIN,1610612750
2,Rui Hachimura,35:05,7,14,.500,1,4,.250,3,4,...,15.9,+19,1,0,MIN,7,Rui Hachimura,1629060,MIN,1610612750
3,LeBron James,34:39,7,16,.438,1,4,.250,1,1,...,10.1,-6,1,0,MIN,7,LeBron James,2544,MIN,1610612750
4,D'Angelo Russell,34:13,4,12,.333,1,7,.143,0,0,...,6.2,+15,1,0,MIN,7,D'Angelo Russell,1626156,MIN,1610612750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Shake Milton,13:29,0,3,.000,0,2,.000,2,2,...,3.2,+1,34,0,ATL,-4,Shake Milton,1629003,ATL,1610612737
391,Cam Reddish,12:50,1,2,.500,1,2,.500,0,0,...,3.0,+1,34,0,ATL,-4,Cam Reddish,1629629,ATL,1610612737
392,Jaxson Hayes,10:42,1,2,.500,0,0,,1,2,...,1.9,0,34,0,ATL,-4,Jaxson Hayes,1629637,ATL,1610612737
393,Bronny James,1:41,0,1,.000,0,0,,0,0,...,0.0,+2,34,0,ATL,-4,Bronny James,1642355,ATL,1610612737


In [17]:
for i in players_df.full_name.duplicated():
    if i == True:
        print(True)
    else:
        pass

In [18]:
indexed_df

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,GmSc,+/-,Game,Home,Opponet,Final_Score_Delta,full_name,PERSON_ID,TEAM_ABBREVIATION,TEAM_ID
0,Anthony Davis,37:35,11,23,.478,1,3,.333,13,15,...,34.0,+1,1,0,MIN,7,Anthony Davis,203076,MIN,1610612750
1,Austin Reaves,36:02,6,14,.429,0,5,.000,0,1,...,11.4,+12,1,0,MIN,7,Austin Reaves,1630559,MIN,1610612750
2,Rui Hachimura,35:05,7,14,.500,1,4,.250,3,4,...,15.9,+19,1,0,MIN,7,Rui Hachimura,1629060,MIN,1610612750
3,LeBron James,34:39,7,16,.438,1,4,.250,1,1,...,10.1,-6,1,0,MIN,7,LeBron James,2544,MIN,1610612750
4,D'Angelo Russell,34:13,4,12,.333,1,7,.143,0,0,...,6.2,+15,1,0,MIN,7,D'Angelo Russell,1626156,MIN,1610612750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Shake Milton,13:29,0,3,.000,0,2,.000,2,2,...,3.2,+1,34,0,ATL,-4,Shake Milton,1629003,ATL,1610612737
391,Cam Reddish,12:50,1,2,.500,1,2,.500,0,0,...,3.0,+1,34,0,ATL,-4,Cam Reddish,1629629,ATL,1610612737
392,Jaxson Hayes,10:42,1,2,.500,0,0,,1,2,...,1.9,0,34,0,ATL,-4,Jaxson Hayes,1629637,ATL,1610612737
393,Bronny James,1:41,0,1,.000,0,0,,0,0,...,0.0,+2,34,0,ATL,-4,Bronny James,1642355,ATL,1610612737


In [19]:
final_df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,TOV,PF,PTS,GmSc,+/-,Game,Home,Final_Score_Delta,PERSON_ID,TEAM_ID
0,37:35,11,23,.478,1,3,.333,13,15,.867,...,1,1,36,34.0,+1,1,0,7,203076,1610612750
1,36:02,6,14,.429,0,5,.000,0,1,.000,...,0,4,12,11.4,+12,1,0,7,1630559,1610612750
2,35:05,7,14,.500,1,4,.250,3,4,.750,...,0,2,18,15.9,+19,1,0,7,1629060,1610612750
3,34:39,7,16,.438,1,4,.250,1,1,1.000,...,2,3,16,10.1,-6,1,0,7,2544,1610612750
4,34:13,4,12,.333,1,7,.143,0,0,,...,1,2,9,6.2,+15,1,0,7,1626156,1610612750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,13:29,0,3,.000,0,2,.000,2,2,1.000,...,1,1,2,3.2,+1,34,0,-4,1629003,1610612737
391,12:50,1,2,.500,1,2,.500,0,0,,...,0,1,3,3.0,+1,34,0,-4,1629629,1610612737
392,10:42,1,2,.500,0,0,,1,2,.500,...,0,0,3,1.9,0,34,0,-4,1629637,1610612737
393,1:41,0,1,.000,0,0,,0,0,,...,0,0,0,0.0,+2,34,0,-4,1642355,1610612737


In [20]:

played = []

for i in final_df.loc[:,'MP'].to_list():
   if i not in ["00:00", np.nan]:
       played.append(1)
   else:
       played.append(0)

final_df["Played"]=played

In [21]:
final_df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,PF,PTS,GmSc,+/-,Game,Home,Final_Score_Delta,PERSON_ID,TEAM_ID,Played
0,37:35,11,23,.478,1,3,.333,13,15,.867,...,1,36,34.0,+1,1,0,7,203076,1610612750,1
1,36:02,6,14,.429,0,5,.000,0,1,.000,...,4,12,11.4,+12,1,0,7,1630559,1610612750,1
2,35:05,7,14,.500,1,4,.250,3,4,.750,...,2,18,15.9,+19,1,0,7,1629060,1610612750,1
3,34:39,7,16,.438,1,4,.250,1,1,1.000,...,3,16,10.1,-6,1,0,7,2544,1610612750,1
4,34:13,4,12,.333,1,7,.143,0,0,,...,2,9,6.2,+15,1,0,7,1626156,1610612750,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,13:29,0,3,.000,0,2,.000,2,2,1.000,...,1,2,3.2,+1,34,0,-4,1629003,1610612737,1
391,12:50,1,2,.500,1,2,.500,0,0,,...,1,3,3.0,+1,34,0,-4,1629629,1610612737,1
392,10:42,1,2,.500,0,0,,1,2,.500,...,0,3,1.9,0,34,0,-4,1629637,1610612737,1
393,1:41,0,1,.000,0,0,,0,0,,...,0,0,0.0,+2,34,0,-4,1642355,1610612737,1


In [22]:
final_df["MP"]=pd.to_datetime('00:'+final_df["MP"], format='%H:%M:%S').dt.time
final_df["MP"]

Unnamed: 0,MP
0,00:37:35
1,00:36:02
2,00:35:05
3,00:34:39
4,00:34:13
...,...
390,00:13:29
391,00:12:50
392,00:10:42
393,00:01:41


In [24]:
drive.mount("/content/drive")
from datetime import datetime

final_df.to_csv('/content/drive/MyDrive/NBA_Models/lakers_augmented_data_{timestamp}.csv'.format(timestamp=datetime.now().strftime('%m-%d-%Y')))

Mounted at /content/drive


In [None]:
indexed_df[indexed_df["Opponet"] == "SAC"]