In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sys import argv
from urllib.request import urlopen
from urllib.error import HTTPError
import requests
import time

In [13]:
def get_info(soup: BeautifulSoup):
    
    # Input: BeautifulSoup webpage
    # Output: List of Dictionaries of Each Team on the Page's Info
    
    
    team_dict_list = list()
    
    # Find List of Teams Under 'Division Standings'
    teams = soup.find_all('th', scope='row', class_='left')[:30]
    
    for team in teams:
        
        # Get URL of Team
        URL = re.search('<a href="(.*.html)', str(team)).group(1)
        
        # Get Year
        year = re.search('/(\d{4}).html', str(team)).group(1)
        
        # Get Team Abbreviation
        team_abb = re.search('teams/(.*)/2', str(team)).group(1)
        
        # Make Soup of the Team's Page
        new_url = 'https://www.basketball-reference.com' + URL
        
        time.sleep(1)
        
        season_usgs, playoff_usgs = get_usages(new_url, playoffs)
        
        team_dict = {'Team': team_abb,
                     'Year': year,}
        
        # Add Information to List
        team_dict_list.append(team_dict)
        
    return team_dict_list


In [17]:
def scrape_players(url):
    # Make Soup
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Save These Subsections for Later
    advanced_html = page.text[page.text.find('<h2>Advanced</h2>'):]
    salary_html = page.text[page.text.find('<h2>Salaries</h2>'):]
    
    # List for Storing Player Dictionaries
    dict_list = []
      
    for i in range(len(soup.find_all('tr'))):
        if i==0: continue
            
        player_str = str(soup.find_all('tr')[i])
        
        # Get Player Name
        name = re.search('.html">([\D\'\.\-]{1,50})</a>', player_str).group(1)
        
        # Get Team Abbreviation
        team_abb = re.search('teams/(.*)/\d{4}', url).group(1)
        
        # Get Year
        year = re.search('/(\d{4}).html', url).group(1)
        
        # Get Position
        pos = re.search('"pos">([A-Z]{1,2})<', player_str).group(1)
        
        # Get Height (George Kar, SAS 1977 is 6-2, not 0)
        if re.search('"height">([\d,-]+)<', player_str) is None: height = 0
        else: height = re.search('"height">([\d,-]+)<', player_str).group(1)
        
        # Get Years of Experience
        exp = re.search('"years_experience">([\d,R]+)<', player_str).group(1)
        
        
        #### Get Win Shares ####
        
        # Trim to Relevant HTML
        player_html = advanced_html[advanced_html.find(name):]
        
        if re.search('data-stat="ows" >([\d\.]{1,5})<', player_html) is None: ows = 0.0
        else: ows = float(re.search('data-stat="ows" >([\d\.]{1,5})<', player_html).group(1))
            
        if re.search('data-stat="dws" >([\d\.]{1,5})<', player_html) is None: dws = 0.0
        else: dws = float(re.search('data-stat="dws" >([\d\.]{1,5})<', player_html).group(1))
            
        if re.search('data-stat="ws_per_48" >([\d\.]{1,5})<', player_html) is None: ws_per_48 = 0.0
        else: ws_per_48 = float(re.search('data-stat="ws_per_48" >([\d\.]{1,5})<', player_html).group(1))
            
        
        #### Get Salaries ####
    
        # Trim to Relevant HTML
        player_html = salary_html[salary_html.find(name):]
        
        if re.search('csk="(\d{1,10})', player_html) is None: continue
        else: salary = int(re.search('csk="(\d{1,10})', player_html).group(1))
            
        ##########################################    
    
        # Write Data to Dictionary
        player_dict = {'Player': name,
                       'Team': team_abb,
                       'Year': year,
                       'Pos': pos,
                       'Height': height,
                       'Exp': exp,
                       'Salary': salary,
                       'OWS': ows,
                       'DWS': dws,
                       'WS': ows+dws,
                       'WS_Per_48': ws_per_48}
        
        # Add New Row to DataFrame
        dict_list.append(player_dict)
        
    return pd.DataFrame(data=dict_list)


In [18]:
def get_team_urls(year: int):

    # Make Soup
    url = 'https://www.basketball-reference.com/leagues/NBA_' + str(year) + '.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    url_list = []
    
    # For Each Team
    teams = soup.find_all('th', scope='row', class_='left')
    if len(teams) > 32: teams = teams[:int(len(teams)/2)]
        
    for team in teams:
        
        # Get URL of Team
        team_url = 'https://www.basketball-reference.com' + re.search('<a href="(.*.html)', str(team)).group(1)
        
        # Append to List
        url_list.append(team_url)
        
    return url_list

In [19]:
# DataFrame Containing Player Info, Salaries
players_df = pd.DataFrame()

for year in range(1977,2019):
    for url in get_team_urls(year):
        
        # Append Info to DataFrame
        players_df = players_df.append(scrape_players(url))
        time.sleep(2)
players_df.head(20)

Unnamed: 0,DWS,Exp,Height,OWS,Player,Pos,Salary,Team,WS,WS_Per_48,Year
0,2.8,3,6-4,3.9,Danny Ainge,SG,400000,BOS,6.7,0.124,1985
1,5.2,5,6-9,10.5,Larry Bird,SF,1800000,BOS,15.7,0.238,1985
2,1.2,8,6-3,0.0,Quinn Buckner,PG,239000,BOS,1.2,0.027,1985
3,0.5,9,6-6,0.0,M.L. Carr,SF,175000,BOS,0.5,0.05,1985
4,0.7,1,6-4,0.0,Carlos Clark,SG,65000,BOS,0.7,0.044,1985
5,2.8,8,6-4,3.5,Dennis Johnson,PG,405000,BOS,6.3,0.103,1985
6,0.4,1,6-11,0.0,Greg Kite,C,120000,BOS,0.4,0.006,1985
7,1.4,7,6-8,3.2,Cedric Maxwell,SF,830000,BOS,4.6,0.147,1985
8,3.4,4,6-10,7.6,Kevin McHale,PF,1000000,BOS,11.0,0.199,1985
9,4.0,8,7-0,5.0,Robert Parish,C,700000,BOS,9.0,0.151,1985


In [20]:
# Write To CSV
players_df.to_csv('PlayerSalaries_v2.csv')