# Data collection

We will scrape the following data from the 1990-91 season up through the 2023-24 season from [basketball-reference.com](https://www.basketball-reference.com):
* Per game statistics
* Advanced statistics
* Salaries
* Transactions (waives, releases, trades)

In [1]:
from nba_api.stats.static import players
from nba_api.stats.static import teams
# from selenium import webdriver
# from selenium.webdriver.common.action_chains import ActionChains
# from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup, Comment
import requests
import csv
import re
import os
from io import StringIO
from time import sleep
from tqdm import tqdm
import numpy as np
import pandas as pd

pd.options.mode.copy_on_write = True
pd.options.display.max_columns = None

### Per game statistics and advanced statistics

In [7]:
# create DataFrame containing all per game and advanced stats
# for seasons 1990-91 through 2023-24

stats_df = pd.DataFrame()

for year in range(1990, 2024):
    # get per game stats
    per_game_df = pd.read_csv(f'./per_game_stats/per_game_stats_{year}.csv')
    per_game_df = per_game_df.drop(len(per_game_df)-1) # drop last row containing league averages
    per_game_df = per_game_df.drop(columns=['Rk', 'Awards'])
    per_game_df['SEASON_START'] = year
    per_game_df['Team'] = per_game_df['Team'].apply(
        lambda x: 'TOT' if 'TM' in x else x
    )

    # get advanced stats
    advanced_df = pd.read_csv(f'./advanced_stats/advanced_stats_{year}.csv')
    advanced_df = advanced_df.drop(columns=['Rk', 'Player', 'Pos', 'Age', 'G', 'MP',
                                            'Unnamed: 19', 'Unnamed: 24'])
    advanced_df = advanced_df.rename(columns={'Tm': 'Team'})

    # merge per game stats and advanced stats
    temp_stats_df = pd.merge(per_game_df, advanced_df, how='left', on=['Player-additional', 'Team'])

    # concatenate this years stats to total DataFrame
    stats_df = pd.concat([stats_df, temp_stats_df], ignore_index=True)

In [8]:
# organize/rename columns (drop irrelevant ones)

stats_columns = stats_df.columns
stats_columns = stats_columns.drop(['Player-additional', 'SEASON_START', 'Player'])
stats_columns = stats_columns.insert(0, 'Player-additional')
stats_columns = stats_columns.insert(1, 'SEASON_START')
stats_columns = stats_columns.insert(2, 'Player')

stats_df = stats_df[stats_columns]

stats_df = stats_df.rename(columns={'Player': 'PLAYER_NAME', 'Age': 'AGE', 'Team': 'TEAM', 'Player-additional': 'PLAYER_ID', 'Pos': 'POS'})

In [9]:
stats_df

Unnamed: 0,PLAYER_ID,SEASON_START,PLAYER_NAME,AGE,TEAM,POS,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,jordami01,1990,Michael Jordan,27.0,CHI,SG,82.0,82.0,37.0,12.1,22.4,0.539,0.4,1.1,0.312,11.7,21.3,0.551,0.547,7.0,8.2,0.851,1.4,4.6,6.0,5.5,2.7,1.0,2.5,2.8,31.5,31.6,0.605,0.051,0.365,4.6,14.3,9.5,25.2,3.7,1.7,8.7,32.9,14.9,5.4,20.3,0.321,8.9,3.2,12.0,10.8
1,malonka01,1990,Karl Malone,27.0,UTA,PF,82.0,82.0,40.3,10.3,19.6,0.527,0.0,0.2,0.286,10.3,19.4,0.529,0.528,8.3,10.8,0.770,2.9,8.9,11.8,3.3,1.1,1.0,3.0,3.3,29.0,24.8,0.596,0.009,0.552,9.0,24.5,17.2,14.7,1.4,1.5,10.9,30.1,9.9,5.6,15.5,0.225,4.8,0.6,5.4,6.2
2,kingbe01,1990,Bernard King,34.0,WSB,SF,64.0,64.0,37.5,11.1,23.6,0.472,0.1,0.6,0.216,11.0,23.0,0.478,0.475,6.0,7.6,0.790,1.8,3.2,5.0,4.6,0.9,0.3,4.0,2.9,28.4,19.1,0.527,0.024,0.321,5.2,9.4,7.3,21.8,1.1,0.4,12.9,34.4,1.9,1.6,3.5,0.070,2.8,-1.7,1.1,1.8
3,barklch01,1990,Charles Barkley,27.0,PHI,SF,67.0,67.0,37.3,9.9,17.4,0.570,0.7,2.3,0.284,9.3,15.1,0.614,0.589,7.1,9.8,0.722,3.9,6.3,10.1,4.2,1.6,0.5,3.1,2.6,27.6,28.9,0.635,0.133,0.564,11.8,18.6,15.3,20.6,2.2,0.8,12.6,29.1,10.3,3.1,13.4,0.258,8.3,1.0,9.3,7.0
4,ewingpa01,1990,Patrick Ewing,28.0,NYK,C,81.0,81.0,38.3,10.4,20.3,0.514,0.0,0.1,0.000,10.4,20.2,0.516,0.514,5.7,7.7,0.745,2.4,8.8,11.2,3.0,1.0,3.2,3.6,3.5,26.6,23.7,0.561,0.004,0.379,7.4,25.7,16.8,14.1,1.3,5.0,13.2,31.1,4.4,5.6,10.0,0.155,2.5,1.7,4.2,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19624,greenda02,2023,Danny Green,36.0,PHI,SG,2.0,0.0,9.0,0.0,1.0,0.000,0.0,0.5,0.000,0.0,0.5,0.000,0.000,0.0,0.0,,0.0,1.0,1.0,0.5,0.5,0.0,0.0,0.5,0.0,0.7,0.000,0.500,0.000,0.0,12.5,6.1,6.5,2.7,0.0,0.0,4.8,0.0,0.0,0.0,-0.035,-7.4,0.8,-6.6,0.0
19625,harpero02,2023,Ron Harper Jr.,23.0,TOR,PF,1.0,0.0,4.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,,,,0.0,0.0,0.0,28.6,0.0,0.0,,0.0,0.0,0.0,0.0,0.087,-11.7,2.8,-8.8,0.0
19626,jacksju01,2023,Justin Jackson,28.0,MIN,SF,2.0,0.0,0.5,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.031,-6.3,-1.2,-7.5,0.0
19627,skapidm01,2023,Dmytro Skapintsev,25.0,NYK,C,2.0,0.0,1.0,0.0,0.5,0.000,0.0,0.0,,0.0,0.5,0.000,0.000,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-19.3,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,0.0,0.0,0.0,-0.483,-16.0,-9.8,-25.9,0.0


In [10]:
stats_df[stats_df['TEAM']=='TOT']

Unnamed: 0,PLAYER_ID,SEASON_START,PLAYER_NAME,AGE,TEAM,POS,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
20,piercri01,1990,Ricky Pierce,31.0,TOT,SG,78.0,2.0,27.8,7.2,14.8,0.485,0.6,1.5,0.397,6.6,13.3,0.495,0.505,5.5,6.0,0.913,0.9,1.6,2.4,2.2,0.8,0.2,1.9,2.2,20.5,21.0,0.586,0.100,0.407,3.6,6.9,5.3,13.0,1.4,0.4,9.7,29.7,6.7,1.2,7.9,0.174,4.1,-1.8,2.3,2.4
54,mcdanxa01,1990,Xavier McDaniel,27.0,TOT,SF,81.0,79.0,32.5,7.3,14.6,0.497,0.0,0.1,0.000,7.3,14.5,0.501,0.497,2.4,3.3,0.723,2.1,4.7,6.9,2.3,0.9,0.6,2.3,3.3,17.0,15.2,0.527,0.007,0.225,7.6,15.5,11.7,10.5,1.4,1.0,12.4,23.2,2.2,3.0,5.1,0.094,-0.1,-0.4,-0.5,1.0
62,ellisda01,1990,Dale Ellis,30.0,TOT,SG,51.0,24.0,27.9,6.7,14.1,0.474,1.1,3.1,0.363,5.5,11.0,0.504,0.513,2.4,3.3,0.723,1.3,2.1,3.4,1.9,1.0,0.2,1.6,2.2,16.8,17.0,0.542,0.219,0.231,5.4,9.1,7.3,10.6,1.7,0.4,9.3,26.0,2.2,1.0,3.2,0.109,1.9,-1.6,0.3,0.8
65,johnsed03,1990,Eddie Johnson,31.0,TOT,SG,81.0,27.0,25.7,6.7,13.9,0.484,0.5,1.5,0.325,6.2,12.4,0.503,0.501,2.8,3.2,0.891,1.3,2.0,3.3,1.4,0.7,0.1,1.5,2.2,16.7,17.8,0.548,0.107,0.229,6.0,9.3,7.7,8.5,1.4,0.3,9.0,27.2,4.3,1.2,5.5,0.127,1.9,-2.1,-0.3,0.9
68,gilliar01,1990,Armen Gilliam,26.0,TOT,PF,75.0,75.0,35.3,6.5,13.3,0.487,0.0,0.0,0.000,6.5,13.3,0.487,0.487,3.6,4.4,0.815,2.9,5.0,8.0,1.4,0.9,0.7,2.3,2.5,16.6,15.5,0.542,0.002,0.329,9.4,16.2,12.8,6.2,1.3,1.2,13.2,21.6,2.3,2.3,4.6,0.083,-0.3,-1.5,-1.8,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19540,wainris01,2023,Ish Wainright,29.0,TOT,SF,11.0,0.0,5.6,0.6,2.2,0.292,0.6,1.8,0.350,0.0,0.4,0.000,0.438,0.2,0.2,1.000,0.3,1.0,1.3,0.1,0.4,0.1,0.2,0.7,2.1,8.9,0.462,0.833,0.083,5.8,20.8,12.7,2.0,3.2,1.5,7.4,18.5,0.0,0.1,0.0,0.029,-2.9,0.5,-2.4,0.0
19544,diakima01,2023,Mamadi Diakite,27.0,TOT,PF,6.0,0.0,4.0,0.7,1.0,0.667,0.0,0.2,0.000,0.7,0.8,0.800,0.667,0.7,1.0,0.667,0.2,0.5,0.7,0.3,0.2,0.2,0.2,0.3,2.0,18.6,0.694,0.167,1.000,4.5,13.8,9.1,13.5,2.1,3.5,10.4,17.0,0.1,0.0,0.1,0.168,0.8,2.7,3.5,0.0
19560,gibsota01,2023,Taj Gibson,38.0,TOT,PF,20.0,1.0,10.2,0.8,1.9,0.405,0.1,0.5,0.200,0.7,1.4,0.481,0.432,0.1,0.1,1.000,0.6,1.3,1.9,0.6,0.2,0.4,0.3,1.6,1.7,5.9,0.449,0.270,0.054,6.6,13.8,10.2,6.9,0.7,3.2,11.7,9.0,0.0,0.2,0.2,0.044,-5.3,0.1,-5.3,-0.2
19563,tuckepj01,2023,P.J. Tucker,38.0,TOT,PF,31.0,10.0,15.7,0.6,1.6,0.360,0.4,1.1,0.371,0.2,0.5,0.333,0.490,0.1,0.1,1.000,0.9,1.8,2.7,0.5,0.5,0.2,0.3,1.7,1.7,5.9,0.507,0.700,0.060,6.7,12.9,9.8,3.9,1.6,1.3,13.5,5.3,0.2,0.5,0.7,0.068,-4.0,0.4,-3.6,-0.2


In [11]:
stats_df.to_csv('./bbref_stats.csv', index=None)

### Salaries

Now for each player, we will scrape their salary data and merge it into `stats_df`.

In [5]:
stats_df['PLAYER_ID'].nunique()

2985

In [6]:
def currency_to_float(x):
    """Converts a currency string to a float."""
    try:
        return float(x.replace('$', '').replace(',', ''))
    except:
        return None

In [27]:
# for each player_id, scrape their salary data and concatenate into salary_df

# salary_df = pd.DataFrame()

for player_id in tqdm(set(stats_df['PLAYER_ID'].unique()).difference(set(salary_df['PLAYER_ID'].unique()))):
# for player_id in tqdm(stats_df['PLAYER_ID'].unique()):
    response = requests.get(f'https://www.basketball-reference.com/players/{player_id[0]}/{player_id}.html')
    html = response.content
    sleep(3)
    soup = BeautifulSoup(html, 'html.parser')

    # find comment where salary data is hidden
    found_table = False
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if 'all_salaries' in comment:
            table = BeautifulSoup(comment).find('table', attrs={'id': 'all_salaries'})
            found_table = True
            break

    if found_table:
        # get salary data for given player_id
        salary_dict = {'SEASON_START': [],
                    'TEAM': [],
                    'PLAYER_ID': [],
                    'SALARY': []}
        for tr in table.find('tbody').find_all('tr'):
            salary_dict['SEASON_START'].append(int(tr.find('th').get_text()[:4]))
            salary_dict['TEAM'].append(
                re.search(r'/teams/([^/]+)/', tr.find('td', attrs={'data-stat': 'team_name'})
                        .find('a')['href'])
                        .group(1)
            )
            salary_dict['PLAYER_ID'].append(player_id)
            salary_dict['SALARY'].append(currency_to_float(tr.find('td', attrs={'data-stat': 'salary'}).get_text()))
        salary_df = pd.concat([salary_df, pd.DataFrame(salary_dict)])

    else:
        print(f'No salary data for {player_id}')

  salary_df = pd.concat([salary_df, pd.DataFrame(salary_dict)])
  1%|▏         | 16/1203 [00:55<1:07:22,  3.41s/it]

No salary data for hendrita0


  4%|▎         | 44/1203 [02:30<1:06:02,  3.42s/it]

No salary data for newbeme01


  4%|▍         | 46/1203 [02:37<1:05:04,  3.37s/it]

No salary data for edwarbi01


  5%|▌         | 64/1203 [03:38<1:03:22,  3.34s/it]

No salary data for davisty01


  6%|▌         | 69/1203 [03:55<1:03:24,  3.36s/it]

No salary data for zimmede01


  6%|▋         | 77/1203 [04:23<1:04:01,  3.41s/it]

No salary data for fullehi01


  7%|▋         | 81/1203 [04:36<1:03:43,  3.41s/it]

No salary data for higgimi01


  8%|▊         | 94/1203 [05:20<1:03:20,  3.43s/it]

No salary data for pondssh01


  9%|▉         | 106/1203 [06:01<1:02:39,  3.43s/it]

No salary data for mitchto03


 10%|█         | 125/1203 [07:06<1:00:55,  3.39s/it]

No salary data for knighbr02


 12%|█▏        | 144/1203 [08:12<1:00:34,  3.43s/it]

No salary data for claxtch01


 13%|█▎        | 159/1203 [09:02<59:16,  3.41s/it]  

No salary data for whitfdw01


 14%|█▍        | 171/1203 [09:43<58:55,  3.43s/it]  

No salary data for dentmju01


 14%|█▍        | 172/1203 [09:47<58:44,  3.42s/it]

No salary data for smithst02


 15%|█▌        | 184/1203 [10:27<57:53,  3.41s/it]

No salary data for colsose01


 16%|█▌        | 189/1203 [10:45<56:53,  3.37s/it]  

No salary data for wareca01


 16%|█▋        | 197/1203 [11:12<56:18,  3.36s/it]

No salary data for marticu01


 18%|█▊        | 215/1203 [12:14<56:39,  3.44s/it]

No salary data for chrispa01


 18%|█▊        | 216/1203 [12:17<55:49,  3.39s/it]

No salary data for henrysk01


 20%|█▉        | 237/1203 [13:30<54:00,  3.35s/it]

No salary data for webstje01


 20%|██        | 245/1203 [13:57<54:22,  3.41s/it]

No salary data for felixno01


 21%|██        | 252/1203 [14:21<53:48,  3.39s/it]

No salary data for vanteda01


 23%|██▎       | 278/1203 [15:48<50:59,  3.31s/it]

No salary data for edwarsh01


 23%|██▎       | 282/1203 [16:02<52:31,  3.42s/it]

No salary data for heggsal01


 24%|██▍       | 290/1203 [16:29<51:45,  3.40s/it]

No salary data for nathaho01


 26%|██▌       | 312/1203 [17:44<49:58,  3.37s/it]

No salary data for loftoza01


 27%|██▋       | 320/1203 [18:11<49:35,  3.37s/it]

No salary data for simsal01


 28%|██▊       | 334/1203 [19:00<48:33,  3.35s/it]

No salary data for harriel01


 30%|███       | 363/1203 [20:40<48:48,  3.49s/it]

No salary data for blackja01


 32%|███▏      | 388/1203 [22:11<48:21,  3.56s/it]

No salary data for cousima01


 33%|███▎      | 398/1203 [22:46<45:46,  3.41s/it]

No salary data for dysonje01


 34%|███▍      | 407/1203 [23:16<45:44,  3.45s/it]

No salary data for smithcl01


 35%|███▌      | 423/1203 [24:11<44:22,  3.41s/it]

No salary data for wrighch02


 36%|███▋      | 439/1203 [25:06<43:10,  3.39s/it]

No salary data for churcro01


 37%|███▋      | 443/1203 [25:20<43:18,  3.42s/it]

No salary data for jordath01


 37%|███▋      | 450/1203 [25:44<43:26,  3.46s/it]

No salary data for benneel01


 38%|███▊      | 462/1203 [26:25<42:06,  3.41s/it]

No salary data for smithje01


 40%|███▉      | 480/1203 [27:27<41:36,  3.45s/it]

No salary data for calipde01


 40%|████      | 484/1203 [27:41<41:11,  3.44s/it]

No salary data for smithch05


 41%|████▏     | 497/1203 [28:25<40:12,  3.42s/it]

No salary data for wheelty01


 41%|████▏     | 499/1203 [28:32<40:21,  3.44s/it]

No salary data for akoonde01


 42%|████▏     | 503/1203 [28:46<40:28,  3.47s/it]

No salary data for bakerma01


 43%|████▎     | 512/1203 [29:18<41:24,  3.60s/it]

No salary data for byarsde01


 46%|████▌     | 553/1203 [31:44<41:02,  3.79s/it]

No salary data for fontais01


 47%|████▋     | 567/1203 [32:33<39:03,  3.68s/it]

No salary data for fergude01


 49%|████▊     | 584/1203 [33:35<35:45,  3.47s/it]

No salary data for jeffedo01


 50%|█████     | 606/1203 [34:53<34:09,  3.43s/it]

No salary data for hansore01


 51%|█████▏    | 618/1203 [35:35<33:36,  3.45s/it]

No salary data for kiddwa01


 52%|█████▏    | 625/1203 [35:59<33:13,  3.45s/it]

No salary data for harmoje01


 52%|█████▏    | 626/1203 [36:02<32:43,  3.40s/it]

No salary data for steveba01


 53%|█████▎    | 632/1203 [36:24<33:25,  3.51s/it]

No salary data for allrela01


 53%|█████▎    | 641/1203 [36:55<32:07,  3.43s/it]

No salary data for dawsoto01


 53%|█████▎    | 642/1203 [36:59<33:43,  3.61s/it]

No salary data for southja01


 54%|█████▍    | 655/1203 [37:44<31:54,  3.49s/it]

No salary data for jacksst01


 56%|█████▌    | 673/1203 [38:48<30:56,  3.50s/it]

No salary data for mckinca01


 58%|█████▊    | 702/1203 [40:29<28:01,  3.36s/it]

No salary data for graceri01


 61%|██████    | 730/1203 [42:08<28:07,  3.57s/it]

No salary data for jeffeot01


 61%|██████    | 734/1203 [42:23<27:31,  3.52s/it]

No salary data for steigma01


 63%|██████▎   | 757/1203 [43:43<25:56,  3.49s/it]

No salary data for lettcl01


 63%|██████▎   | 762/1203 [44:01<25:54,  3.53s/it]

No salary data for anderan02


 65%|██████▌   | 787/1203 [45:28<23:45,  3.43s/it]

No salary data for huntece01


 66%|██████▌   | 792/1203 [45:45<24:02,  3.51s/it]

No salary data for ewingpa02


 66%|██████▌   | 794/1203 [45:52<23:36,  3.46s/it]

No salary data for lewisce01


 69%|██████▊   | 826/1203 [47:44<21:22,  3.40s/it]

No salary data for grayev01


 70%|███████   | 844/1203 [48:47<21:11,  3.54s/it]

No salary data for marblro01


 71%|███████   | 849/1203 [49:05<20:41,  3.51s/it]

No salary data for lafayol01


 71%|███████▏  | 859/1203 [49:40<19:58,  3.48s/it]

No salary data for crawfjo01


 72%|███████▏  | 862/1203 [49:51<20:09,  3.55s/it]

No salary data for mcclida01


 74%|███████▍  | 892/1203 [51:36<17:47,  3.43s/it]

No salary data for phelpde01


 74%|███████▍  | 895/1203 [51:45<17:12,  3.35s/it]

No salary data for pinknke01


 75%|███████▌  | 908/1203 [52:30<16:53,  3.43s/it]

No salary data for dinkiby01


 76%|███████▌  | 915/1203 [52:53<16:18,  3.40s/it]

No salary data for cartema01


 76%|███████▋  | 918/1203 [53:04<16:15,  3.42s/it]

No salary data for ballce01


 78%|███████▊  | 937/1203 [54:08<14:54,  3.36s/it]

No salary data for thompst01


 78%|███████▊  | 943/1203 [54:28<14:33,  3.36s/it]

No salary data for grundan01


 79%|███████▉  | 950/1203 [54:52<14:28,  3.43s/it]

No salary data for viannjo01


 79%|███████▉  | 954/1203 [55:05<13:44,  3.31s/it]

No salary data for carruji01


 80%|████████  | 964/1203 [55:39<13:36,  3.42s/it]

No salary data for wiltjky01


 80%|████████  | 966/1203 [55:46<13:18,  3.37s/it]

No salary data for fordal01


 84%|████████▎ | 1006/1203 [58:05<11:25,  3.48s/it]

No salary data for sykesla01


 84%|████████▍ | 1011/1203 [58:23<11:14,  3.51s/it]

No salary data for ubileed01


 85%|████████▍ | 1020/1203 [58:54<10:21,  3.39s/it]

No salary data for bookeme01


 85%|████████▌ | 1024/1203 [59:08<10:27,  3.50s/it]

No salary data for djordal01


 87%|████████▋ | 1044/1203 [1:00:18<09:02,  3.41s/it]

No salary data for alumape01


 89%|████████▉ | 1074/1203 [1:02:03<07:30,  3.49s/it]

No salary data for johnsda02


 90%|█████████ | 1088/1203 [1:02:53<06:48,  3.55s/it]

No salary data for daviebr01


 91%|█████████ | 1093/1203 [1:03:10<06:23,  3.48s/it]

No salary data for driggna01


 91%|█████████ | 1097/1203 [1:03:24<05:58,  3.38s/it]

No salary data for mcdonmi01


 93%|█████████▎| 1116/1203 [1:04:32<05:11,  3.58s/it]

No salary data for lorthry01


 93%|█████████▎| 1120/1203 [1:04:46<04:49,  3.48s/it]

No salary data for gallach01


 95%|█████████▍| 1137/1203 [1:05:46<03:46,  3.44s/it]

No salary data for boneyde01


 95%|█████████▌| 1147/1203 [1:06:22<03:22,  3.61s/it]

No salary data for penigde01


 96%|█████████▌| 1151/1203 [1:06:37<03:07,  3.61s/it]

No salary data for munkch01


 98%|█████████▊| 1179/1203 [1:08:16<01:23,  3.47s/it]

No salary data for jonesma03


 98%|█████████▊| 1181/1203 [1:08:22<01:14,  3.40s/it]

No salary data for bakerla01


 99%|█████████▉| 1190/1203 [1:08:54<00:45,  3.49s/it]

No salary data for powelka01


100%|██████████| 1203/1203 [1:09:39<00:00,  3.47s/it]


In [28]:
salary_df

Unnamed: 0,SEASON_START,TEAM,PLAYER_ID,SALARY
0,1984,CHI,jordami01,550000.0
1,1985,CHI,jordami01,630000.0
2,1986,CHI,jordami01,737500.0
3,1987,CHI,jordami01,845000.0
4,1988,CHI,jordami01,2000000.0
...,...,...,...,...
4,2021,CLE,markkla01,15690909.0
5,2022,UTA,markkla01,16475454.0
6,2023,UTA,markkla01,17259999.0
0,2023,LAC,millejo02,


In [30]:
stats_df

Unnamed: 0,PLAYER_ID,SEASON_START,PLAYER_NAME,AGE,TEAM,POS,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,jordami01,1990,Michael Jordan,27.0,CHI,SG,82.0,82.0,37.0,12.1,22.4,0.539,0.4,1.1,0.312,11.7,21.3,0.551,0.547,7.0,8.2,0.851,1.4,4.6,6.0,5.5,2.7,1.0,2.5,2.8,31.5,31.6,0.605,0.051,0.365,4.6,14.3,9.5,25.2,3.7,1.7,8.7,32.9,14.9,5.4,20.3,0.321,8.9,3.2,12.0,10.8
1,malonka01,1990,Karl Malone,27.0,UTA,PF,82.0,82.0,40.3,10.3,19.6,0.527,0.0,0.2,0.286,10.3,19.4,0.529,0.528,8.3,10.8,0.770,2.9,8.9,11.8,3.3,1.1,1.0,3.0,3.3,29.0,24.8,0.596,0.009,0.552,9.0,24.5,17.2,14.7,1.4,1.5,10.9,30.1,9.9,5.6,15.5,0.225,4.8,0.6,5.4,6.2
2,kingbe01,1990,Bernard King,34.0,WSB,SF,64.0,64.0,37.5,11.1,23.6,0.472,0.1,0.6,0.216,11.0,23.0,0.478,0.475,6.0,7.6,0.790,1.8,3.2,5.0,4.6,0.9,0.3,4.0,2.9,28.4,19.1,0.527,0.024,0.321,5.2,9.4,7.3,21.8,1.1,0.4,12.9,34.4,1.9,1.6,3.5,0.070,2.8,-1.7,1.1,1.8
3,barklch01,1990,Charles Barkley,27.0,PHI,SF,67.0,67.0,37.3,9.9,17.4,0.570,0.7,2.3,0.284,9.3,15.1,0.614,0.589,7.1,9.8,0.722,3.9,6.3,10.1,4.2,1.6,0.5,3.1,2.6,27.6,28.9,0.635,0.133,0.564,11.8,18.6,15.3,20.6,2.2,0.8,12.6,29.1,10.3,3.1,13.4,0.258,8.3,1.0,9.3,7.0
4,ewingpa01,1990,Patrick Ewing,28.0,NYK,C,81.0,81.0,38.3,10.4,20.3,0.514,0.0,0.1,0.000,10.4,20.2,0.516,0.514,5.7,7.7,0.745,2.4,8.8,11.2,3.0,1.0,3.2,3.6,3.5,26.6,23.7,0.561,0.004,0.379,7.4,25.7,16.8,14.1,1.3,5.0,13.2,31.1,4.4,5.6,10.0,0.155,2.5,1.7,4.2,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19624,greenda02,2023,Danny Green,36.0,PHI,SG,2.0,0.0,9.0,0.0,1.0,0.000,0.0,0.5,0.000,0.0,0.5,0.000,0.000,0.0,0.0,,0.0,1.0,1.0,0.5,0.5,0.0,0.0,0.5,0.0,0.7,0.000,0.500,0.000,0.0,12.5,6.1,6.5,2.7,0.0,0.0,4.8,0.0,0.0,0.0,-0.035,-7.4,0.8,-6.6,0.0
19625,harpero02,2023,Ron Harper Jr.,23.0,TOR,PF,1.0,0.0,4.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,,,,0.0,0.0,0.0,28.6,0.0,0.0,,0.0,0.0,0.0,0.0,0.087,-11.7,2.8,-8.8,0.0
19626,jacksju01,2023,Justin Jackson,28.0,MIN,SF,2.0,0.0,0.5,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.031,-6.3,-1.2,-7.5,0.0
19627,skapidm01,2023,Dmytro Skapintsev,25.0,NYK,C,2.0,0.0,1.0,0.0,0.5,0.000,0.0,0.0,,0.0,0.5,0.000,0.000,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-19.3,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,0.0,0.0,0.0,-0.483,-16.0,-9.8,-25.9,0.0


In [32]:
temp_salary_df = salary_df.groupby(['SEASON_START', 'TEAM', 'PLAYER_ID'], as_index=False)['SALARY'].agg('sum').reset_index(drop=True)

In [40]:
stats_df = pd.merge(stats_df,
                    temp_salary_df,
                    how='left',
                    on=['PLAYER_ID', 'SEASON_START', 'TEAM'])

In [36]:
temp_stats_df[(temp_stats_df['SALARY']==0) | (temp_stats_df['SALARY'].isnull())]

Unnamed: 0,PLAYER_ID,SEASON_START,PLAYER_NAME,AGE,TEAM,POS,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,SALARY
20,piercri01,1990,Ricky Pierce,31.0,2TM,SG,78.0,2.0,27.8,7.2,14.8,0.485,0.6,1.5,0.397,6.6,13.3,0.495,0.505,5.5,6.0,0.913,0.9,1.6,2.4,2.2,0.8,0.2,1.9,2.2,20.5,,,,,,,,,,,,,,,,,,,,,
21,piercri01,1990,Ricky Pierce,31.0,MIL,SG,46.0,0.0,28.8,7.8,15.7,0.499,0.8,2.0,0.398,7.0,13.6,0.514,0.524,6.1,6.8,0.907,0.8,1.7,2.5,2.1,0.8,0.2,2.0,2.0,22.5,22.7,0.605,0.129,0.432,3.3,7.3,5.3,12.7,1.4,0.5,9.8,30.7,4.5,0.9,5.3,0.192,5.3,-1.6,3.7,1.9,
54,mcdanxa01,1990,Xavier McDaniel,27.0,2TM,SF,81.0,79.0,32.5,7.3,14.6,0.497,0.0,0.1,0.000,7.3,14.5,0.501,0.497,2.4,3.3,0.723,2.1,4.7,6.9,2.3,0.9,0.6,2.3,3.3,17.0,,,,,,,,,,,,,,,,,,,,,
55,mcdanxa01,1990,Xavier McDaniel,27.0,SEA,SF,15.0,15.0,35.3,9.3,19.3,0.479,0.0,0.2,0.000,9.3,19.1,0.484,0.479,3.3,4.6,0.710,2.4,3.0,5.4,2.5,1.7,0.3,2.7,3.3,21.8,17.3,0.510,0.010,0.238,8.0,10.3,9.2,11.6,2.5,0.5,11.1,28.6,0.4,0.5,0.9,0.082,1.6,-0.8,0.8,0.4,
62,ellisda01,1990,Dale Ellis,30.0,2TM,SG,51.0,24.0,27.9,6.7,14.1,0.474,1.1,3.1,0.363,5.5,11.0,0.504,0.513,2.4,3.3,0.723,1.3,2.1,3.4,1.9,1.0,0.2,1.6,2.2,16.8,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19622,funkan01,2023,Andrew Funk,24.0,CHI,SG,5.0,0.0,2.6,0.0,0.8,0.000,0.0,0.6,0.000,0.0,0.2,0.000,0.000,0.0,0.0,,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,-5.0,0.000,0.750,0.000,0.0,0.0,0.0,0.0,3.8,7.9,0.0,13.5,-0.1,0.0,-0.1,-0.252,-12.4,-2.0,-14.4,0.0,0.0
19623,gateska01,2023,Kaiser Gates,27.0,NOP,SF,1.0,0.0,7.0,0.0,4.0,0.000,0.0,2.0,0.000,0.0,2.0,0.000,0.000,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-19.9,0.000,0.500,0.000,0.0,15.7,8.0,0.0,0.0,0.0,0.0,24.9,-0.1,0.0,-0.1,-0.558,-18.6,-10.1,-28.8,-0.1,0.0
19625,harpero02,2023,Ron Harper Jr.,23.0,TOR,PF,1.0,0.0,4.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,,,,0.0,0.0,0.0,28.6,0.0,0.0,,0.0,0.0,0.0,0.0,0.087,-11.7,2.8,-8.8,0.0,0.0
19627,skapidm01,2023,Dmytro Skapintsev,25.0,NYK,C,2.0,0.0,1.0,0.0,0.5,0.000,0.0,0.0,,0.0,0.5,0.000,0.000,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-19.3,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,0.0,0.0,0.0,-0.483,-16.0,-9.8,-25.9,0.0,0.0


In [41]:
stats_collapsed_df = (
    stats_df
    .groupby(['PLAYER_ID', 'SEASON_START'], as_index=False)[stats_df.columns]
    .apply(lambda group: group[group['TEAM'].str.contains('TM')]
           if group['TEAM'].str.contains('TM').any()
           else group)
    .reset_index(drop=True)
)

In [42]:
stats_collapsed_df

Unnamed: 0,PLAYER_ID,SEASON_START,PLAYER_NAME,AGE,TEAM,POS,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,SALARY
0,abdelal01,1990,Alaa Abdelnaby,22.0,POR,PF,43.0,0.0,6.7,1.3,2.7,0.474,0.0,0.0,,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,13.1,0.499,0.000,0.379,10.4,23.4,17.0,5.8,0.7,2.5,14.0,22.1,0.0,0.5,0.5,0.079,-3.4,-1.2,-4.6,-0.2,395000.0
1,abdelal01,1991,Alaa Abdelnaby,23.0,POR,PF,71.0,1.0,13.2,2.5,5.1,0.493,0.0,0.0,,2.5,5.1,0.493,0.493,1.1,1.4,0.752,1.1,2.5,3.7,0.4,0.4,0.2,0.9,1.9,6.1,13.5,0.533,0.000,0.280,9.5,20.9,15.2,4.7,1.3,1.1,14.0,20.6,0.6,1.5,2.1,0.110,-2.3,-0.4,-2.6,-0.1,494000.0
2,abdelal01,1992,Alaa Abdelnaby,24.0,2TM,PF,75.0,52.0,17.5,3.3,6.3,0.518,0.0,0.0,0.00,3.3,6.3,0.519,0.518,1.2,1.5,0.759,1.7,2.8,4.5,0.4,0.3,0.3,1.3,2.5,7.7,,,,,,,,,,,,,,,,,,,,,
3,abdelal01,1993,Alaa Abdelnaby,25.0,BOS,PF,13.0,0.0,12.2,1.8,4.2,0.436,0.0,0.0,,1.8,4.2,0.436,0.436,1.2,1.9,0.640,0.9,2.6,3.5,0.2,0.2,0.2,1.3,1.5,4.9,9.2,0.485,0.000,0.455,8.5,24.2,16.3,2.7,0.6,1.2,20.5,22.6,-0.2,0.1,-0.1,-0.032,-5.3,-2.2,-7.4,-0.2,805000.0
4,abdelal01,1994,Alaa Abdelnaby,26.0,2TM,PF,54.0,0.0,9.4,2.2,4.3,0.511,0.0,0.0,0.00,2.2,4.2,0.515,0.511,0.4,0.6,0.571,0.7,1.4,2.1,0.2,0.3,0.2,0.8,1.9,4.7,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15827,zubaciv01,2019,Ivica Zubac,22.0,LAC,C,72.0,70.0,18.4,3.3,5.3,0.613,0.0,0.0,0.00,3.3,5.3,0.616,0.613,1.7,2.3,0.747,2.7,4.8,7.5,1.1,0.2,0.9,0.8,2.3,8.3,21.7,0.651,0.005,0.431,15.9,26.4,21.3,9.1,0.6,4.4,11.8,16.4,4.4,2.3,6.6,0.241,1.9,0.9,2.8,1.6,6481482.0
15828,zubaciv01,2020,Ivica Zubac,23.0,LAC,C,72.0,33.0,22.3,3.6,5.5,0.652,0.0,0.1,0.25,3.6,5.4,0.656,0.654,1.9,2.4,0.789,2.6,4.6,7.2,1.3,0.3,0.9,1.1,2.6,9.0,19.1,0.693,0.010,0.434,13.6,22.4,18.1,7.9,0.7,3.4,14.7,15.1,4.8,2.1,6.9,0.206,0.6,0.4,1.0,1.2,7000000.0
15829,zubaciv01,2021,Ivica Zubac,24.0,LAC,C,76.0,76.0,24.4,4.1,6.5,0.626,0.0,0.0,,4.1,6.5,0.626,0.626,2.2,3.0,0.727,2.9,5.6,8.5,1.6,0.5,1.0,1.5,2.7,10.3,19.2,0.660,0.000,0.459,12.8,23.7,18.4,9.8,1.0,3.5,16.1,16.8,4.3,2.9,7.2,0.187,-0.1,0.8,0.7,1.3,7518518.0
15830,zubaciv01,2022,Ivica Zubac,25.0,LAC,C,76.0,76.0,28.6,4.3,6.8,0.634,0.0,0.0,0.00,4.3,6.7,0.637,0.634,2.2,3.1,0.697,3.1,6.8,9.9,1.0,0.4,1.3,1.5,2.9,10.8,16.7,0.661,0.004,0.463,12.3,26.5,19.5,5.1,0.7,4.0,15.9,14.8,3.8,2.9,6.7,0.149,-1.1,0.2,-0.9,0.6,10123457.0


In [43]:
stats_collapsed_df['TEAM'] = (
    stats_df
    .groupby(['PLAYER_ID', 'SEASON_START'])['TEAM']
    .agg(list)
    .reset_index(drop=True)
)

stats_collapsed_df['SALARY'] = (
    stats_df
    .groupby(['PLAYER_ID', 'SEASON_START'])['SALARY']
    .agg(list)
    .reset_index(drop=True)
)

In [44]:
stats_collapsed_df

Unnamed: 0,PLAYER_ID,SEASON_START,PLAYER_NAME,AGE,TEAM,POS,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,SALARY
0,abdelal01,1990,Alaa Abdelnaby,22.0,[POR],PF,43.0,0.0,6.7,1.3,2.7,0.474,0.0,0.0,,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,13.1,0.499,0.000,0.379,10.4,23.4,17.0,5.8,0.7,2.5,14.0,22.1,0.0,0.5,0.5,0.079,-3.4,-1.2,-4.6,-0.2,[395000.0]
1,abdelal01,1991,Alaa Abdelnaby,23.0,[POR],PF,71.0,1.0,13.2,2.5,5.1,0.493,0.0,0.0,,2.5,5.1,0.493,0.493,1.1,1.4,0.752,1.1,2.5,3.7,0.4,0.4,0.2,0.9,1.9,6.1,13.5,0.533,0.000,0.280,9.5,20.9,15.2,4.7,1.3,1.1,14.0,20.6,0.6,1.5,2.1,0.110,-2.3,-0.4,-2.6,-0.1,[494000.0]
2,abdelal01,1992,Alaa Abdelnaby,24.0,"[2TM, MIL, BOS]",PF,75.0,52.0,17.5,3.3,6.3,0.518,0.0,0.0,0.00,3.3,6.3,0.519,0.518,1.2,1.5,0.759,1.7,2.8,4.5,0.4,0.3,0.3,1.3,2.5,7.7,,,,,,,,,,,,,,,,,,,,,"[nan, nan, 500000.0]"
3,abdelal01,1993,Alaa Abdelnaby,25.0,[BOS],PF,13.0,0.0,12.2,1.8,4.2,0.436,0.0,0.0,,1.8,4.2,0.436,0.436,1.2,1.9,0.640,0.9,2.6,3.5,0.2,0.2,0.2,1.3,1.5,4.9,9.2,0.485,0.000,0.455,8.5,24.2,16.3,2.7,0.6,1.2,20.5,22.6,-0.2,0.1,-0.1,-0.032,-5.3,-2.2,-7.4,-0.2,[805000.0]
4,abdelal01,1994,Alaa Abdelnaby,26.0,"[2TM, SAC, PHI]",PF,54.0,0.0,9.4,2.2,4.3,0.511,0.0,0.0,0.00,2.2,4.2,0.515,0.511,0.4,0.6,0.571,0.7,1.4,2.1,0.2,0.3,0.2,0.8,1.9,4.7,,,,,,,,,,,,,,,,,,,,,"[nan, 650000.0, nan]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15827,zubaciv01,2019,Ivica Zubac,22.0,[LAC],C,72.0,70.0,18.4,3.3,5.3,0.613,0.0,0.0,0.00,3.3,5.3,0.616,0.613,1.7,2.3,0.747,2.7,4.8,7.5,1.1,0.2,0.9,0.8,2.3,8.3,21.7,0.651,0.005,0.431,15.9,26.4,21.3,9.1,0.6,4.4,11.8,16.4,4.4,2.3,6.6,0.241,1.9,0.9,2.8,1.6,[6481482.0]
15828,zubaciv01,2020,Ivica Zubac,23.0,[LAC],C,72.0,33.0,22.3,3.6,5.5,0.652,0.0,0.1,0.25,3.6,5.4,0.656,0.654,1.9,2.4,0.789,2.6,4.6,7.2,1.3,0.3,0.9,1.1,2.6,9.0,19.1,0.693,0.010,0.434,13.6,22.4,18.1,7.9,0.7,3.4,14.7,15.1,4.8,2.1,6.9,0.206,0.6,0.4,1.0,1.2,[7000000.0]
15829,zubaciv01,2021,Ivica Zubac,24.0,[LAC],C,76.0,76.0,24.4,4.1,6.5,0.626,0.0,0.0,,4.1,6.5,0.626,0.626,2.2,3.0,0.727,2.9,5.6,8.5,1.6,0.5,1.0,1.5,2.7,10.3,19.2,0.660,0.000,0.459,12.8,23.7,18.4,9.8,1.0,3.5,16.1,16.8,4.3,2.9,7.2,0.187,-0.1,0.8,0.7,1.3,[7518518.0]
15830,zubaciv01,2022,Ivica Zubac,25.0,[LAC],C,76.0,76.0,28.6,4.3,6.8,0.634,0.0,0.0,0.00,4.3,6.7,0.637,0.634,2.2,3.1,0.697,3.1,6.8,9.9,1.0,0.4,1.3,1.5,2.9,10.8,16.7,0.661,0.004,0.463,12.3,26.5,19.5,5.1,0.7,4.0,15.9,14.8,3.8,2.9,6.7,0.149,-1.1,0.2,-0.9,0.6,[10123457.0]


In [46]:
salary_df.to_csv('./bbref_salary_data.csv')

In [22]:
for i, comment in enumerate(soup.find_all(string=lambda text: isinstance(text, Comment))):
    if 'all_salaries' in comment:
        print(i)

55


In [21]:
soup.find_all(string=lambda text: isinstance(text, Comment))[42]

'     <div class="section_content" id="div_9815445181">\n\t    <ul class="news_stories">\n<li><strong>11/5</strong> <a rel="nofollow noopener" target="_blank"  href="https://theleadsm.com/the-worst-trades-in-magic-history/?utm_source=rss&utm_medium=rss&utm_campaign=the-worst-trades-in-magic-history" onclick="sr_record_analytics_event(\'newsfeed_click\',\'The Lead\', sr_record_directory(),\'sr_tracker\');">The Lead: The Worst Trades in Magic History</a>:&nbsp;<em>Welcome to the Worst Trades for Each Team in NBA History— a series</em>...\n</li><li><strong>10/18</strong> <a rel="nofollow noopener" target="_blank"  href="https://hoopswire.com/pistons-preview-cade-cunningham-nba-news-analysis/" onclick="sr_record_analytics_event(\'newsfeed_click\',\'HoopsWire\', sr_record_directory(),\'sr_tracker\');">HoopsWire: Pistons Preview: Is Cade Cunningham Cornerstone Detroit Needs, Or Just Another False Start?</a>:&nbsp;<em>Pistons Preview A look at the Detroit Pistons entering the 2024-25</em>...\

In [16]:
player_id

'fournev01'

In [15]:
salary_df

Unnamed: 0,SEASON_START,TEAM,PLAYER_ID,SALARY
0,1984,CHI,jordami01,550000.0
1,1985,CHI,jordami01,630000.0
2,1986,CHI,jordami01,737500.0
3,1987,CHI,jordami01,845000.0
4,1988,CHI,jordami01,2000000.0
...,...,...,...,...
7,2019,MIA,leoname01,11286515.0
8,2020,OKC,leoname01,9400000.0
9,2022,MIL,leoname01,284911.0
10,2022,MIL,leoname01,105522.0


In [None]:
# for each player_id, scrape their salary data and merge into stats_df

stats_df['SALARY'] = 0.0

for player_id in tqdm(['gibsota01']):
# for player_id in tqdm(stats_df[stats_df['SALARY']==0]['PLAYER_ID'].unique()):
    response = requests.get(f'https://www.basketball-reference.com/players/{player_id[0]}/{player_id}.html')
    html = response.content
    sleep(3)
    soup = BeautifulSoup(html, 'html.parser')

    # salary data is hidden in comments
    found_table = False
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if 'salaries' in comment:
            table = BeautifulSoup(comment).find('table', attrs={'id': 'all_salaries'})
            found_table = True
            break

    if found_table:
        # get salary data for given player_id
        salary_dict = {'SEASON_START': [],
                    'TEAM': [],
                    'PLAYER_ID': [],
                    'SALARY': []}
        for tr in table.find('tbody').find_all('tr'):
            salary_dict['SEASON_START'].append(int(tr.find('th').get_text()[:4]))
            salary_dict['TEAM'].append(
                re.search(r'/teams/([^/]+)/', tr.find('td', attrs={'data-stat': 'team_name'})
                        .find('a')['href'])
                        .group(1)
            )
            salary_dict['PLAYER_ID'].append(player_id)
            salary_dict['SALARY'].append(currency_to_float(tr.find('td', attrs={'data-stat': 'salary'}).get_text()))
        salary_df = pd.DataFrame(salary_dict)

        # in case the same season_start, team pair has more than one entry
        if len(salary_df) != len(stats_df.loc[stats_df.index[stats_df['PLAYER_ID']==player_id], 'SALARY']):
            salary_df = (
                salary_df
                .groupby(['SEASON_START', 'TEAM', 'PLAYER_ID'], as_index=False)['SALARY']
                .agg('sum')
                .reset_index(drop=True)
            )

        # merge salary data for given player_id into stats_df
        stats_df.loc[stats_df.index[stats_df['PLAYER_ID']==player_id], 'SALARY'] = (
            pd.merge(stats_df[stats_df['PLAYER_ID']==player_id].drop(columns=['SALARY']),
                     salary_df,
                     how='left',
                     on=['SEASON_START', 'TEAM'])['SALARY'].values
        )
    else:
        print(f'No salary data for {player_id}')

In [None]:
stats_df[(stats_df['PLAYER_ID']=='gibsota01')]

In [None]:
salary_df

In [None]:
stats_df['PLAYER_ID'].unique()[1640:1645]

In [None]:
salary_df

In [None]:
salary_df.groupby(['SEASON_START', 'TEAM', 'PLAYER_ID'], as_index=False)['SALARY'].agg('sum').reset_index(drop=True)

In [None]:
stats_df[stats_df['SALARY']!=0]

In [None]:
stats_df[stats_df['PLAYER_ID']=='gibsota01']

In [None]:
stats_df.iloc[10120:10130]

In [None]:
stats_df.to_csv('./bbref_data.csv')

In [None]:
player_id = 'duranke01'

response = requests.get(f'https://www.basketball-reference.com/players/{player_id[0]}/{player_id}.html')
html = response.content
sleep(3)
soup = BeautifulSoup(html, 'html.parser')

salary_df = pd.DataFrame()

In [None]:
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    if 'table' in str(comment):
        try:
            salary_df = pd.read_html(StringIO(comment), attrs={'id': 'all_salaries'})[0]
            break
        except:
            continue

In [None]:
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    if 'salaries' in comment:
        table = BeautifulSoup(comment).find('table', attrs={'id': 'all_salaries'})
        break

In [None]:
salary_dict = {'SEASON_START': [],
               'TEAM': [],
               'PLAYER_ID': []}

for tr in table.find('tbody').find_all('tr'):
    salary_dict['SEASON_START'].append(int(tr.find('th').get_text()[:4]))
    salary_dict['TEAM'].append(
        re.search(r'/teams/([^/]+)/', tr.find('td', attrs={'data-stat': 'team_name'})
                  .find('a')['href'])
                  .group(1)
    )
    salary_dict['PLAYER_ID'] = int(tr.find('td', attrs={'data-stat': 'salary'})['csk'])

In [None]:
pd.DataFrame(salary_dict)

In [None]:
data

In [None]:
print(table.find('tbody').find_all('tr')[0].prettify())

In [None]:
tr = table.find('tbody').find_all('tr')[0]

In [None]:
int(tr.find('td', attrs={'data-stat': 'salary'})['csk'])
# .find('a')['href']

In [None]:
path = tr.find('td', attrs={'data-stat': 'team_name'}).find('a')['href']
re.search(r'/teams/([^/]+)/', path).group(1)

In [None]:
salary_df

In [None]:
salary_df = salary_df.drop(len(salary_df)-1)
salary_df = salary_df.drop(columns=['Lg'])
salary_df['Season'] = salary_df['Season'].apply(lambda x: x[:4]).astype(int)
salary_df = salary_df[salary_df['Season']>=1990]
salary_df['Salary'] = salary_df['Salary'].apply(currency_to_float)
salary_df = salary_df.rename(columns={'Season': 'SEASON_START', 'Team': 'TEAM', 'Salary': 'SALARIES_LIST'})

In [None]:
salary_df

##### Collapsing rows for each PLAYER_ID, SEASON_START pair

In [None]:
stats_collapsed_df = (
    stats_df
    .groupby(['PLAYER_ID', 'SEASON_START'], as_index=False)[stats_df.columns]
    .apply(lambda group: group[group['TEAM'].str.contains('TM')]
           if group['TEAM'].str.contains('TM').any()
           else group)
    .reset_index(drop=True)
)

In [None]:
stats_collapsed_df['TEAM'] = (
    stats_df
    .groupby(['PLAYER_ID', 'SEASON_START'])['TEAM']
    .agg(list)
    .reset_index(drop=True)
)

stats_collapsed_df = stats_collapsed_df.rename(columns={'TEAM': 'TEAMS_LIST'})

In [None]:
stats_collapsed_df

In [None]:
stats_collapsed_df['TEAMS_LIST'] = stats_collapsed_df['TEAMS_LIST'].apply(
    lambda x: x[1:] if 'TM' in x[0] else x
)

In [None]:
stats_collapsed_df

### Salaries

Now for each player, we will scrape their salary data and merge it into `stats_collapsed_df`.

In [None]:
stats_collapsed_df['PLAYER_ID'].nunique()

In [None]:
def currency_to_float(x):
    """Converts a currency string to a float."""
    try:
        return float(x.replace('$', '').replace(',', ''))
    except:
        return None

In [None]:
stats_collapsed_df['SALARIES_LIST'] = np.empty((len(stats_collapsed_df), 0)).tolist()

for player_id in stats_collapsed_df['PLAYER_ID'].unique():
    response = requests.get(f'https://www.basketball-reference.com/players/{player_id[0]}/{player_id}.html')
    html = response.content
    sleep(3)
    soup = BeautifulSoup(html, 'html.parser')

    salary_df = pd.DataFrame()

    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if 'table' in str(comment):
            try:
                salary_df = pd.read_html(StringIO(comment), attrs={'id': 'all_salaries'})[0]
                break
            except:
                continue

    print(f'Attempting {player_id}')

    if len(salary_df) > 0:
        salary_df = salary_df.drop(len(salary_df)-1)
        salary_df = salary_df.drop(columns=['Team', 'Lg'])
        salary_df['Season'] = salary_df['Season'].apply(lambda x: x[:4]).astype(int)
        salary_df = salary_df[salary_df['Season']>=1990]
        salary_df['Salary'] = salary_df['Salary'].apply(currency_to_float)
        salary_df = salary_df.groupby(['Season'])['Salary'].apply(list).reset_index()
        salary_df = salary_df.rename(columns={'Season': 'SEASON_START', 'Salary': 'SALARIES_LIST'})

        stats_collapsed_df.loc[stats_collapsed_df.index[stats_collapsed_df['PLAYER_ID']==player_id], 'SALARIES_LIST'] = (
            pd.merge(stats_collapsed_df[stats_collapsed_df['PLAYER_ID']==player_id].drop(columns=['SALARIES_LIST']),
                    salary_df,
                    how='left',
                    on='SEASON_START')['SALARIES_LIST'].values
        )
        print(f'Completed {player_id}')
    else:
        print(f'No salary data for {player_id}')

In [None]:
salary_df