In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import Counter
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import numpy as np

In [7]:
### Scrape Data from 1980 through this year
### Function can get any available stat type from Basketball-Reference
from datetime import datetime


def _numeric_where_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Text columns are returned as-is; this is what the deprecated
        # errors="ignore" option of pd.to_numeric used to do.
        return column


def get_stats(stat_type = 'advanced', start_year = 1980, end_year = None):
    """Scrape one Basketball-Reference league stat table per season and stack them.

    Parameters
    ----------
    stat_type : str
        Value substituted into the page URL (NBA_{year}_{stat_type}.html),
        e.g. 'advanced' or 'per_game'.
    start_year : int
        First season page to request. Defaults to 1980.
    end_year : int or None
        Last season page to request (inclusive). None means the current
        calendar year.

    Returns
    -------
    pandas.DataFrame
        Rows of every successfully scraped page, with a 'Year' column added,
        columns converted to numeric where every value parses, and columns
        that are entirely missing removed.

    Raises
    ------
    ValueError
        If no page at all could be scraped.

    Pages that fail to download or parse are skipped; a RuntimeWarning lists
    how many were skipped and the first failure.
    """
    import warnings

    if end_year is None:
        end_year = int(datetime.now().year)
    url_template = 'https://www.basketball-reference.com/leagues/NBA_{year}_{stat_type}.html'
    player_pg_list = []
    errors_list = []

    for year in range(start_year, end_year + 1):
        # Built outside the try block so the except clause can always report it
        url = url_template.format(year=year, stat_type=stat_type)

        # Use try/except block to catch and record any urls that cause an error
        try:
            # Download and parse the page, closing the HTTP response afterwards
            with urlopen(url) as html:
                soup = BeautifulSoup(html, "lxml")

            # Scrape the first table element of the html
            table = soup.find('table')
            table_rows = table.find_all('tr')
            # Only <td> cells are collected, so rows made purely of <th> cells
            # come out as empty lists (all-missing rows in the frame)
            row_list = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]
            col_head = [h.text for h in table_rows[0].find_all('th')]

            # Drop the first (header) row; the first header label is skipped
            # because its cells are <th>, not <td>
            df = pd.DataFrame(row_list).iloc[1:].copy()
            df.columns = col_head[1:]
            df['Year'] = year

            ## For Players that changed teams, make sure that their season totals are taken:
            ## a repeated Player name is dropped unless its Tm is 'TOT'
            ## (presumably the 'TOT' row is listed first for each such player -- TODO confirm)
            extra_team_row = df.Player.duplicated() & (df.Tm != 'TOT')
            player_pg_list.append(df[~extra_team_row])

        except Exception as e:
            # Store the url and the error it causes, then keep going
            errors_list.append([url, e])

    if errors_list:
        first_url, first_error = errors_list[0]
        warnings.warn(
            "get_stats({!r}): skipped {} season page(s); first failure was {} ({!r})".format(
                stat_type, len(errors_list), first_url, first_error),
            RuntimeWarning,
            stacklevel=2,
        )
    if not player_pg_list:
        raise ValueError(
            "get_stats({!r}): no season page could be scraped".format(stat_type))

    player_pg = pd.concat(player_pg_list, ignore_index=True)
    player_pg = player_pg.apply(_numeric_where_possible)

    # Remove columns that hold no data at all
    player_pg.dropna(axis = 1, how = 'all',inplace=True)

    return player_pg

In [8]:
# Scrape both stat families: advanced first, then traditional per-game
adv_stats, gen_stats = [get_stats(kind) for kind in ('advanced', 'per_game')]

In [9]:
# Keep independent snapshots of the scraped frames. A plain assignment would
# only create a second name for the same object, so the in-place cleaning done
# in later cells would alter these "copies" as well.
adv_stat_copy = adv_stats.copy()
gen_stats_copy = gen_stats.copy()

In [10]:
# Strip the "*" markers from player names
def clean_names(df):
    """Return the values of df.Player as a list of strings with every '*' removed.

    Each value is passed through str() first, so non-string entries come back
    as their string form (e.g. None becomes 'None').
    """
    # Plain str.replace avoids the regex escape: "\*" in a non-raw string
    # literal is an invalid escape sequence and triggers a warning.
    return [str(player).replace("*", "") for player in df.Player]
# Overwrite the Player column of both frames with the cleaned names
for stats_frame in (adv_stats, gen_stats):
    stats_frame.Player = clean_names(stats_frame)

In [11]:
## Get Experience level of player
# NOTE(review): this counts every row carrying the same Player value across the
# whole frame, so each of a player's rows gets the same total -- confirm that
# total seasons, rather than seasons played up to that year, is what is wanted.
rows_per_player = gen_stats.groupby('Player')['Player']
gen_stats['Exp'] = rows_per_player.transform('count')


In [12]:
## This removes everything from the first hyphen on, keeping the first listed
## position as the player's main position
def define_position(df):
    """Reduce multi-position labels in df['Pos'] to the first listed position.

    The frame is modified in place and the same object is returned.

    - Rows whose Pos is missing (None or NaN) are dropped.
    - A string such as 'SF-PF' becomes 'SF': the first hyphen and everything
      after it are removed.
    - Non-string values are left untouched.
    """
    # isna() catches NaN as well as None, so positions that went missing through
    # a numeric conversion or a CSV round trip are dropped too.
    missing_pos = df["Pos"].isna()
    df.drop(index=df.index[missing_pos], inplace=True)

    df["Pos"] = [
        re.sub(r"-.+", "", pos) if isinstance(pos, str) else pos
        for pos in df["Pos"]
    ]
    return df

# Traditional stats first, then advanced. define_position returns the frame it
# was given, so each *1 name refers to the same object as its input.
gen_stats1, adv_stats1 = (define_position(frame) for frame in (gen_stats, adv_stats))

In [13]:
# How many rows remain for each position label
adv_stats1['Pos'].value_counts()

PF    3460
C     3405
SG    3354
PG    3207
SF    3168
Name: Pos, dtype: int64

In [14]:
#### The next few cells find how many total All-NBA selections a player has
# Load the All-NBA table and discard its 'Lg' and 'Tm' columns
all_nba = pd.read_csv('ALL-NBA.csv').drop(columns=['Lg', 'Tm'])
# Filled in by the next cell
all_nba_players = []

In [15]:
def _season_end_year(season):
    """Turn a 'YYYY-YY' season label into the year it ends in, as a string."""
    # Derived from the start year so that "1999-00" becomes "2000"; splicing the
    # century onto the two-digit suffix would give "1900".
    match = re.fullmatch(r"(\d{4})-\d{2}", str(season))
    # Anything not shaped like YYYY-YY (including values already converted by an
    # earlier run of this cell) is returned unchanged.
    return str(int(match.group(1)) + 1) if match else season


all_nba.Season = [_season_end_year(season) for season in all_nba.Season]

# Strip a trailing " X" (whitespace plus one capital letter) from every value in
# the columns after the first. The list is rebuilt from scratch, so re-running
# this cell does not accumulate rows; non-string cells are passed through.
all_nba_players = [
    [re.sub(r"\s[A-Z]$", "", player) if isinstance(player, str) else player
     for player in player_list]
    for player_list in all_nba.iloc[:, 1:].values
]
all_nba.iloc[:, 1:] = all_nba_players

In [16]:
from collections import Counter
# Tally how often each value occurs across all columns after the first
all_NBA_count = Counter(
    name
    for selections in all_nba.iloc[:, 1:].values
    for name in selections
)

In [17]:
# Match each player with the number of times he was selected
selection_counts = adv_stats1['Player'].map(all_NBA_count)
adv_stats1["All-NBA Selections"] = selection_counts

In [18]:
## Joining datasets from 'traditional' and 'advanced' stats
# The join below is purely positional (row index against row index). Check first
# that both frames describe the same player-season on every shared index label;
# otherwise stats of different players would be paired without any error.
key_cols = ['Player', 'Year']
shared_rows = adv_stats1.index.intersection(gen_stats1.index)
misaligned = (
    adv_stats1.loc[shared_rows, key_cols].values
    != gen_stats1.loc[shared_rows, key_cols].values
).any(axis=1)
if misaligned.any():
    raise ValueError(
        "advanced and traditional stats disagree on Player/Year for {} shared row(s); "
        "first mismatch at index {!r}".format(
            int(misaligned.sum()), shared_rows[misaligned][0]))

# Only bring over the columns the advanced frame does not already have
cols_to_use = gen_stats1.columns.difference(adv_stats1.columns)
df = pd.merge(adv_stats1, gen_stats1[cols_to_use],left_index=True,right_index=True, how='outer')

In [19]:
## Save for later use
#adv_stats1.to_csv('Advanced Player Stats.csv')
#gen_stats1.to_csv('Traditional Player Stats.csv')
joined_stats_path = 'Joined Player Stats.csv'
df.to_csv(joined_stats_path)