## Import Libraries

---

In [1]:
import io
import json
import urllib
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

## Get Data

---

In [16]:
# take year, return dataframe of player salary for that year
def player_salary(year):
    for page in range(1, 20):
        salary_url = f'http://www.espn.com/nba/salaries/_/year/{year}/page/{page}/seasontype/4'
        req = urllib.request.Request(salary_url , headers={'User-Agent': 'Mozilla/5.0'})
        content = urllib.request.urlopen(req).read()
        table = pd.read_html(content)[0]
        if page == 1:
            df = table.copy()
        else:
            df = pd.concat([df, table], axis = 0)
    
    # set df columns and drop index
    salary_cols = df.iloc[0,:].values
    df.columns = salary_cols
    df = df.loc[df['RK'] != 'RK']
    df.reset_index(drop = True, inplace = True)
    
    df['year'] = year
    
    return df

In [19]:
# get salary data from 2000 to 2020
salary_df = player_salary('2000')
for year in range(2001, 2021):
    temp_df =  player_salary(str(year))
    salary_df = pd.concat([salary_df, temp_df], axis = 0)

salary_df.drop(['RK'], axis = 1, inplace = True)
salary_df['NAME'] = salary_df['NAME'].apply(lambda x: x.split(',')[0])
salary_df['SALARY'] = salary_df['SALARY'].replace('[\$,]', '', regex=True).astype(float)
salary_df.reset_index(inplace = True)
salary_df.to_csv('nba_salary_Mayer.csv', index=False)

In [2]:
# take year, return dataframe of players stats for that year
def nba_player_data(year):
    # get player data from nba website in json format
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Referer': 'https://stats.nba.com/players/drives/',
        'x-nba-stats-origin': 'stats',
        'x-nba-stats-token': 'true',
    }
    player_url = f'https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season={year}&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight='
    req_url = requests.get(player_url, headers=headers)
    player_data = req_url.json()
    
    # get data by key: resultSets
    player_results = player_data['resultSets'][0]
    
    # get data feature names
    player_col_headers = player_results['headers']
    
    # get observations and set feature names
    player_results = json.dumps(player_results['rowSet'])
    df = pd.read_json(player_results)
    df.columns = player_col_headers
    df['year'] = year
    
    return df

In [None]:
# get player data from 1996 to 2020
player_df = nba_player_data('1996-97')
for year in range(1997, 2020):
    temp_df =  nba_player_data(str(year) + '-' + str(year + 1)[-2:])
    player_df = pd.concat([player_df, temp_df], axis = 0)

# drop null values and reset index
player_df.dropna(axis = 0, inplace = True)
player_df.reset_index(inplace = True)
player_df.drop(['index'], axis = 1, inplace = True)
player_df.to_csv('nba_player_stats_Mayer.csv', index=False)