# Mining NBA Contracts
Source: [Spotrac](https://www.spotrac.com/nba/contracts/)

In [1]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pprint
import re 
import pandas as pd
import numpy as np
import time

# fix ssl certificate (needed for MacOS sometimes)
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so certificate verification is switched off for every
# later urlopen() call in this kernel, not only on macOS.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Sample of one page

In [20]:
# Fetch one listing page and parse it into a BeautifulSoup object.
# `start` fills the `start-{start}` URL segment (a year in this notebook).
start = 2016
url = f'https://www.spotrac.com/nba/contracts/sort-value/all-time/start-{start}/limit-500/'
# Read the body inside a `with` block so the HTTP response is closed
# as soon as the bytes have been consumed.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')

In [16]:
# Walk every <tr> of the parsed page and flatten its <td> cells into one
# list of strings per row, tagged with `category` as the first element.
category = 'other'
players = []
for table_row in soup.find_all('tr'):
    fields = [category]
    for td in table_row.find_all('td'):
        if td is None:
            continue
        cell_value = td.text.strip()
        if '\n' not in cell_value:
            # single-line cell: keep the stripped text as one field
            fields.append(cell_value)
            continue
        # multi-line cell (the 'name' cell): every line becomes its own field
        for piece in cell_value.split('\n'):
            fields.append(piece)
            if len(fields) > 14:
                break

    # discard zero-length fields (whitespace-only fields are kept)
    players.append([field for field in fields if len(field) > 0])

# drop first row
players = players[1:]

In [17]:
# Label the scraped fields; the 'empty' filler column is discarded right away.
contract_columns = ['category', 'categoryRank', 'lastName', 'fullName', 'position', 'empty',
                    'term', 'signAge', 'contractLength', 'totalValue', 'aav', 'signBonus']
df = pd.DataFrame(players)
df.columns = contract_columns
df = df.drop(columns='empty')

In [18]:
# Show the sample frame as the cell's rich output.
df

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
0,other,1,Conley,Mike Conley,Point Guard,2016-2020 (FA: 2021),28,5,"$152,605,578","$30,521,116",0-
1,other,2,Lillard,Damian Lillard,Point Guard,2016-2020 (FA: 0),24,5,"$139,888,445","$27,977,689",0-
2,other,3,DeRozan,DeMar DeRozan,Shooting Guard,2016-2020 (FA: 2021),26,5,"$139,000,000","$27,800,000",0-
3,other,4,Davis,Anthony Davis,Power Forward,2016-2020 (FA: 2021),22,5,"$127,171,313","$25,434,263",0-
4,other,4,Beal,Bradley Beal,Shooting Guard,2016-2020 (FA: 2021),23,5,"$127,171,313","$25,434,263",0-
...,...,...,...,...,...,...,...,...,...,...,...
410,other,411,Williams,Troy Williams,Small Forward,2016-2016 (FA: 2017),22,1,"$76,725","$76,725",0-
411,other,412,Alexander,Cliff Alexander,Power Forward,2016-2016 (FA: 0),20,1,"$75,000","$75,000",0-
412,other,413,Ibeh,Prince Ibeh,Center,2016-2016 (FA: 2017),22,1,"$28,772","$28,772",0-
413,other,414,Jones,Dahntay Jones,Small Forward,2016-2016 (FA: 2017),36,1,"$9,127","$9,127",0-


### Automate for all pages - All Contracts

In [73]:
def download_html(start):
    """Download one all-contracts listing page and parse it.

    Parameters
    ----------
    start
        Value substituted into the ``start-{start}`` segment of the URL;
        this notebook passes a year.

    Returns
    -------
    BeautifulSoup
        The page body, decoded as UTF-8 and parsed with ``'html.parser'``.
    """
    url = f'https://www.spotrac.com/nba/contracts/sort-value/all-time/start-{start}/limit-500/'
    # The context manager closes the HTTP response once the body is read,
    # so calling this in a loop does not leave responses open.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    return BeautifulSoup(html, 'html.parser')

def extract_players(soup, category):
    """Flatten every table row of a parsed page into a list of strings.

    Parameters
    ----------
    soup
        Object exposing ``find_all('tr')``; each row must expose
        ``find_all('td')`` and each cell a ``text`` attribute.
    category
        Label placed at the front of every row.

    Returns
    -------
    list of list of str
        One list per ``<tr>`` except the first, which is dropped
        (presumably the header row). Zero-length fields are removed;
        whitespace-only fields are kept.
    """
    table_rows = []
    for tr in soup.find_all('tr'):
        fields = [category]
        for td in tr.find_all('td'):
            if td is None:
                continue
            cell_value = td.text.strip()
            if '\n' not in cell_value:
                # single-line cell: keep the stripped text as one field
                fields.append(cell_value)
                continue
            # multi-line cell (the 'name' cell): one field per line,
            # stopping once the row holds more than 14 fields
            for piece in cell_value.split('\n'):
                fields.append(piece)
                if len(fields) > 14:
                    break

        table_rows.append([field for field in fields if len(field) > 0])

    # drop first row
    return table_rows[1:]

In [112]:
# Scrape the all-contracts listing for each start value 2014..2021 and pool
# the extracted rows into `players`.
years = list(range(2014, 2022))
players = []

for year in years:
    try:
        soup = download_html(year)
        players.extend(extract_players(soup, 'other'))
    except Exception as exc:
        # Report the year that actually failed. The old message printed the
        # global `category`, which is unrelated to this loop and undefined
        # unless the sample cell was run first. Catching Exception (not a
        # bare except) lets Ctrl-C still interrupt the scrape.
        print(f'failed on {year}: {exc!r}')
    # pause between requests, including after a failure, to go easy on the server
    time.sleep(0.5)
    

In [113]:
# Build the all-contracts frame: label the fields, drop the filler column,
# trim the text columns and normalise the sign-bonus placeholder.
contract_columns = ['category', 'categoryRank', 'lastName', 'fullName', 'position', 'empty',
                    'term', 'signAge', 'contractLength', 'totalValue', 'aav', 'signBonus']
df = pd.DataFrame(players)
df.columns = contract_columns
df = df.drop(columns='empty')
for text_col in ('position', 'term'):
    df[text_col] = df[text_col].str.strip()
# rows whose bonus text is exactly '0-' are rewritten as '$0'
no_bonus = df['signBonus'] == '0-'
df['signBonus'] = np.where(no_bonus, '$0', df['signBonus'])

In [114]:
# Preview the first rows of the cleaned all-contracts frame.
df.head()

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
0,other,1,Anthony,Carmelo Anthony,Power Forward,2014-2018 (FA: 2019),30,5,"$124,064,681","$24,812,936",$0
1,other,2,Bosh,Chris Bosh,Power Forward,2014-2018 (FA: 2019),29,5,"$118,705,300","$23,741,060",$0
2,other,3,George,Paul George,Shooting Guard,2014-2018 (FA: 2018),23,5,"$91,572,660","$18,314,532",$0
3,other,4,Wall,John Wall,Point Guard,2014-2018 (FA: 0),22,5,"$84,789,500","$16,957,900",$0
4,other,5,Bledsoe,Eric Bledsoe,Point Guard,2014-2018 (FA: 0),24,5,"$70,000,000","$14,000,000",$0


### Automate for all pages - Rookie Contracts

In [115]:
def download_html_rookie(start):
    """Download one entry-level contracts listing page and parse it.

    Parameters
    ----------
    start
        Value substituted into the ``start-{start}`` segment of the URL;
        this notebook passes a year.

    Returns
    -------
    BeautifulSoup
        The page body, decoded as UTF-8 and parsed with ``'html.parser'``.
    """
    url = f'https://www.spotrac.com/nba/contracts/sort-value/type-entry-level/all-time/start-{start}/limit-500/'
    # The context manager closes the HTTP response once the body is read,
    # so calling this in a loop does not leave responses open.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    return BeautifulSoup(html, 'html.parser')

In [116]:
# Scrape the entry-level listing for each start value 2014..2021 and pool
# the extracted rows into `players_rookies`.
years = list(range(2014, 2022))
players_rookies = []

for year in years:
    try:
        soup = download_html_rookie(year)
        players_rookies.extend(extract_players(soup, 'entry-level'))
    except Exception as exc:
        # Report the year that actually failed. The old message printed the
        # global `category`, which is unrelated to this loop and undefined
        # unless the sample cell was run first. Catching Exception (not a
        # bare except) lets Ctrl-C still interrupt the scrape.
        print(f'failed on {year}: {exc!r}')
    # pause between requests, including after a failure, to go easy on the server
    time.sleep(0.5)

In [117]:
# Build the entry-level frame: label the fields, drop the filler column,
# trim the text columns and normalise the sign-bonus placeholder.
rookie_columns = ['category', 'categoryRank', 'lastName', 'fullName', 'position', 'empty',
                  'term', 'signAge', 'contractLength', 'totalValue', 'aav', 'signBonus']
df_rookie = pd.DataFrame(players_rookies)
df_rookie.columns = rookie_columns
df_rookie = df_rookie.drop(columns='empty')
for text_col in ('position', 'term'):
    df_rookie[text_col] = df_rookie[text_col].str.strip()
# rows whose bonus text is exactly '0-' are rewritten as '$0'
no_bonus = df_rookie['signBonus'] == '0-'
df_rookie['signBonus'] = np.where(no_bonus, '$0', df_rookie['signBonus'])

In [118]:
# Preview the first rows of the cleaned entry-level frame.
df_rookie.head()

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
0,entry-level,1,Wiggins,Andrew Wiggins,Small Forward,2014-2017 (FA: 2018),18,4,"$24,850,243","$6,212,561",$0
1,entry-level,2,Parker,Jabari Parker,Power Forward,2014-2016 (FA: 2018),18,3,"$22,239,712","$7,413,237",$0
2,entry-level,3,Embiid,Joel Embiid,Center,2014-2017 (FA: 0),19,4,"$19,981,026","$4,995,257",$0
3,entry-level,4,Gordon,Aaron Gordon,Power Forward,2014-2017 (FA: 2018),18,4,"$18,019,460","$4,504,865",$0
4,entry-level,5,Mirotic,Nikola Mirotic,Power Forward,2014-2016 (FA: 2017),23,3,"$16,631,175","$5,543,725",$0


In [119]:
# Spot-check a single player in the entry-level frame.
df_rookie[df_rookie['fullName']=='Lonzo Ball']

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
226,entry-level,2,Ball,Lonzo Ball,Point Guard,2017-2020 (FA: 2021),19,4,"$33,471,622","$8,367,906",$0


In [120]:
# Same player in the all-contracts frame, where every row was tagged 'other'
# at scrape time regardless of contract type.
df[df['fullName']=='Lonzo Ball']

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
1215,other,33,Ball,Lonzo Ball,Point Guard,2017-2020 (FA: 2021),19,4,"$33,471,622","$8,367,906",$0
2620,other,20,Ball,Lonzo Ball,Point Guard,2021-2024 (FA: 2025),23,4,"$80,000,000","$20,000,000",$0


### Merge the two DataFrames to identify rookie contracts

In [121]:
# Flag entry-level contracts: left-join the all-contracts rows to the rookie
# rows on (fullName, term); a match brings in category == 'entry-level'.
# NOTE(review): if df_rookie ever holds the same (fullName, term) twice, the
# left join would duplicate the matching df rows -- confirm the keys are unique.
rookie_keys = df_rookie[['category', 'fullName', 'term']]
df_tmp = df.drop(columns='category').merge(rookie_keys, how='left', on=['fullName', 'term'])
df_tmp['isEntryLevel'] = np.where(df_tmp['category'] == 'entry-level', 1, 0)
# the helper columns are no longer needed once the flag exists
df_tmp = df_tmp.drop(columns=['categoryRank', 'category'])

In [122]:
# Preview rows that the merge flagged as entry-level contracts.
df_tmp[df_tmp['isEntryLevel']==1].head()

Unnamed: 0,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus,isEntryLevel
24,Wiggins,Andrew Wiggins,Small Forward,2014-2017 (FA: 2018),18,4,"$24,850,243","$6,212,561",$0,1
28,Parker,Jabari Parker,Power Forward,2014-2016 (FA: 2018),18,3,"$22,239,712","$7,413,237",$0,1
31,Embiid,Joel Embiid,Center,2014-2017 (FA: 0),19,4,"$19,981,026","$4,995,257",$0,1
35,Gordon,Aaron Gordon,Power Forward,2014-2017 (FA: 2018),18,4,"$18,019,460","$4,504,865",$0,1
40,Mirotic,Nikola Mirotic,Power Forward,2014-2016 (FA: 2017),23,3,"$16,631,175","$5,543,725",$0,1


In [123]:
# Spot-check the isEntryLevel flag on one player's rows after the merge.
df_tmp[df_tmp['fullName']=='Lonzo Ball']

Unnamed: 0,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus,isEntryLevel
1215,Ball,Lonzo Ball,Point Guard,2017-2020 (FA: 2021),19,4,"$33,471,622","$8,367,906",$0,1
2620,Ball,Lonzo Ball,Point Guard,2021-2024 (FA: 2025),23,4,"$80,000,000","$20,000,000",$0,0


In [124]:
# Print the column dtypes and non-null counts of the merged frame.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2973 entries, 0 to 2972
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   lastName        2973 non-null   object
 1   fullName        2973 non-null   object
 2   position        2973 non-null   object
 3   term            2973 non-null   object
 4   signAge         2973 non-null   object
 5   contractLength  2973 non-null   object
 6   totalValue      2973 non-null   object
 7   aav             2973 non-null   object
 8   signBonus       2973 non-null   object
 9   isEntryLevel    2973 non-null   int64 
dtypes: int64(1), object(9)
memory usage: 255.5+ KB


In [130]:
# Spot-check the isEntryLevel flag for a second player.
df_tmp[df_tmp['fullName']=='LaMelo Ball']

Unnamed: 0,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus,isEntryLevel
2333,Ball,LaMelo Ball,Point Guard,2020-2023 (FA: 2024),19,4,"$35,596,275","$8,899,069",$0,1


In [131]:
# Rebind `df` to the merged frame. After this, `df` no longer has a
# 'category' column, so re-running the merge cell without rebuilding `df`
# first will fail on its drop('category').
df = df_tmp

In [132]:
# Write the final frame to CSV without the index column; the path is
# relative to the notebook's working directory.
df.to_csv('../data/contracts.csv', index=False)