# Mining NBA Contracts
Source: [Basketball-Reference](https://www.basketball-reference.com/players/j/jamesle01.html)

In [1]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pprint
import re 
import pandas as pd
import numpy as np
import time

# fix ssl certificate (needed for MacOS sometimes)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Get links to all active player pages

In [2]:
# get html, convert to bs4 object
url = 'https://www.basketball-reference.com/contracts/players.html'
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')

In [14]:
players_links = {}

# get table containing player names
table = soup.find("table", {"id": "player-contracts"})
for row in soup.find_all('td', {"data-stat": "player"}):
    players_links[row.text] = row.find('a').get('href')

In [237]:
# pprint.pprint(players_links)

### Get stats for each player

In [174]:
def download_html(link):
    url = f'https://www.basketball-reference.com{link}'
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def get_bio(soup, name):
    bio = {}
    info = soup.find('div', {"id": "info"})
    
    bio['name'] = name
    bio['height'] = info.find('span', {"itemprop": "height"}).text
    bio['weight'] = info.find('span', {"itemprop": "weight"}).text
    bio['birthDate'] = info.find('span', {"itemprop": "birthDate"}).get('data-birth')
    
    # get draft info
    bio['draftRound'] = -1
    bio['draftOverall'] = -1
    bio['draftYear'] = -1
    bio['undrafted'] = False
    draft_text = ''
    for p in info.find_all('p'):
        if 'NBA Draft' in p.text:
            draft_text = p.text
    if draft_text == '':
        bio['undrafted'] = True
    else:
        draft_data = draft_text.split(',')[1:]
        bio['draftRound'] = draft_data[0].split()[0]
        bio['draftOverall'] = draft_data[1].split()[0]
        bio['draftYear'] = draft_data[2].split()[0]
    
    return bio

# def get_awards(soup):
#     awards_list = soup.find('div', {"id": "info"}).find('ul', {'id': 'bling'}).find_all('li')
#     return [award.text for award in awards_list]

def get_stats_headers(soup):
    return ['Name'] + [header.text for header in soup.find('table', {"id": "per_game"}).find('thead').find_all('th')]

def get_stats(soup, name):
    table_data = []
    for row in soup.find('table', {"id": "per_game"}).find('tbody').find_all('tr'):
        row_data = [name] + [element.text for element in row.find_all(['th','td'])]
        table_data.append(row_data)
    return table_data

def get_adv_stats_headers(soup):
    headers = ['Name'] + [header.text for header in soup.find('table', {"id": "advanced"}).find('thead').find_all('th')]
    headers = [h for h in headers if h != '\xa0']
    return headers 

def get_adv_stats(soup, name):
    table_data = []
    for row in soup.find('table', {"id": "advanced"}).find('tbody').find_all('tr'):
        row_data = [name] + [element.text for element in row.find_all(['th','td'])]
        row_data = [row for row in row_data if row != '']
        table_data.append(row_data)
    return table_data

In [284]:
# pl = {'John Wall': '/players/w/walljo01.html'}

In [303]:
players_bios = []
players_stats_headers = []
players_adv_stats_headers = []
players_stats = []
players_adv_stats = []

for player, link in players_links.items():    
    try:
        soup = download_html(link)
        players_bios.append(get_bio(soup, player))
        players_stats.extend(get_stats(soup, player))
        players_adv_stats.extend(get_adv_stats(soup, player))

        # get headers for tables if we don't have them yet
        if len(players_stats_headers) == 0:
            players_stats_headers = get_stats_headers(soup)
            players_adv_stats_headers = get_adv_stats_headers(soup)

        if len(players_bios) % 50 == 0:
            print(str(len(player_bios)) + 'th player paged scraped')
    except:
        print('failed at ' + player)

1th player paged scraped
1th player paged scraped
1th player paged scraped
1th player paged scraped
1th player paged scraped
1th player paged scraped
1th player paged scraped
1th player paged scraped
failed at Jason Preston
1th player paged scraped
1th player paged scraped
failed at Luca Vildoza


In [313]:
len(players_bios)

515

### Create df and calculate lags

In [306]:
df_basic = pd.DataFrame(players_stats, columns=players_stats_headers)
df_adv = pd.DataFrame(players_adv_stats, columns=players_adv_stats_headers)
df_adv.drop(['MP'], axis=1, inplace=True)

df_stats = df_basic.merge(df_adv, on=['Name', 'Season', 'Age', 'Tm', 'Pos', 'G', 'Lg'])
df_stats.dropna(inplace=True)
df_stats.drop_duplicates(['Name', 'Season', 'Lg'], keep='first', inplace=True)

In [307]:
# convert cols to numeric where possible
cols = df_stats.columns.drop(['Name', 'Season', 'Tm', 'Pos', 'Lg'])
df_stats[cols] = df_stats[cols].apply(pd.to_numeric, errors='coerce')

In [308]:
# save stats to csv
df_stats.to_csv('../data/playerStats.csv',index=False)

In [309]:
# calculate lagged effects
df_complete = pd.concat([df_stats, 
                         df_stats.groupby('Name').shift().add_suffix('_prev1'), 
                         df_stats.groupby('Name').shift(2).add_suffix('_prev2')], axis=1).fillna(0)


df_complete.drop(['Season_prev1', 'Age_prev1', 'Tm_prev1', 'Lg_prev1', 'Pos_prev1',
                  'Season_prev2', 'Age_prev2', 'Tm_prev2', 'Lg_prev2', 'Pos_prev2'],
                axis=1, inplace=True)


df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 3548
Columns: 141 entries, Name to VORP_prev2
dtypes: float64(133), int64(3), object(5)
memory usage: 3.1+ MB


In [310]:
# df_complete.info(verbose=True)

In [311]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_complete)
df_complete[['Name', 'Season','G', 'G_prev1', 'G_prev2']]
# df_complete

Unnamed: 0,Name,Season,G,G_prev1,G_prev2
0,Stephen Curry,2009-10,80,0.0,0.0
1,Stephen Curry,2010-11,74,80.0,0.0
2,Stephen Curry,2011-12,26,74.0,80.0
3,Stephen Curry,2012-13,78,26.0,74.0
4,Stephen Curry,2013-14,78,78.0,26.0
...,...,...,...,...,...
3542,Malcolm Hill,2021-22,19,0.0,0.0
3545,Skylar Mays,2020-21,33,0.0,0.0
3546,Skylar Mays,2021-22,28,33.0,0.0
3547,Terry Taylor,2021-22,33,0.0,0.0


In [312]:
df_complete.to_csv('../data/playerStatsWithLags.csv',index=False)