# Preliminary Imports and Constant Definitions

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Create a Custom Error for Scraped Rows That Do Not Fit the Table

In [2]:
class UnableToCreateTableError(Exception):
    '''
    Exception signalling that scraped entries cannot be laid out as a table.

    The text passed in (or the default text) is stored on the `message`
    attribute and is also handed to Exception, so str(error) yields that text.
    '''

    def __init__(self, message = (
            'The number of entries in the scraped column does not equal '
            'the number of columns provided; thus, Pandas won\'t be able '
            'to create this table.')):
        #Initialise the base Exception first, then expose the text as an attribute
        super().__init__(message)
        self.message = message

## Create the Column Names for the Table 

In [3]:
#Column headers as they appear on the website, written as comma-separated text.
#A blank entry (', , ') yields an empty-string column name after the split.

#Headers for the "advanced" table
advanced = (
    'Player, Pos, Age, Tm, G, MP, '
    'PER, TS%, 3PAr, FTr, ORB%, DRB%, TRB%, AST%, STL%, BLK%, TOV%, USG%, , '
    'OWS, DWS, WS, WS/48, , '
    'OBPM, DBPM, BPM, VORP'
)
adv_cols = advanced.split(', ')

#Headers for the "per 100 possessions" table
per100 = (
    'Player, Pos, Age, Tm, G, GS, MP, '
    'FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, '
    'FT, FTA, FT%, ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS, , '
    'ORtg, DRtg'
)
cols_100 = per100.split(', ')

# Scrape the Data for Two Different Relevant Pages

In [4]:
def scrape_to_df(nba_url, columns):
    '''
    Scrape the first HTML table on a page into a pandas DataFrame.

    Inputs:
    nba_url - A string representing the URL of the NBA page you are trying to scrape
    columns - A list of strings containing the names of the columns in the DataFrame
              you're trying to create

    This function downloads the page given by the URL, takes the first <table>
    element, and reads every row of its <tbody>. Rows without any <td> cells are
    skipped. Every cell except those at positions 0, 1 and 3 (Player, Pos and Tm
    in this notebook's column lists) is converted to a float, with an empty cell
    becoming 0.0. Rows sharing a 'Player' value are then collapsed to the last one.

    Returns:
    A pandas DataFrame with the given columns and one row per distinct 'Player'.

    Raises:
    UnableToCreateTableError - if the page has no table / table body, or if a data
                               row does not contain exactly len(columns) cells
    requests.HTTPError - if the server answers with an error status code
    ValueError - if a non-empty numeric cell cannot be parsed as a float
    '''

    #Positions of the text cells that must stay strings
    text_positions = (0, 1, 3)

    #Get the html for the given url (the parameter, not a module-level variable)
    page = requests.get(nba_url)
    #Fail loudly on 4xx/5xx instead of trying to parse an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    #Find the first table in the html
    table = soup.find('table')
    if table is None:
        raise UnableToCreateTableError('No table was found on the page at {}'.format(nba_url))
    table_body = table.find('tbody')
    if table_body is None:
        raise UnableToCreateTableError('The table at {} has no body to read rows from'.format(nba_url))

    #Parse each row of data and convert entries to the correct data types
    data = []
    for row in table_body.find_all('tr'):
        cells = [ele.text.strip() for ele in row.find_all('td')]
        if not cells:
            #Rows without <td> cells carry no data
            continue
        #Explicit check: unlike an assert, this still runs under "python -O"
        if len(cells) != len(columns):
            raise UnableToCreateTableError()
        for i, value in enumerate(cells):
            if i not in text_positions:
                cells[i] = float(value) if value != '' else 0.0
        data.append(cells)

    nba = pd.DataFrame(data = data, columns = columns)
    #When a player appears on several rows, keep only the last of them
    nba = nba.drop_duplicates(subset = 'Player', keep = 'last')
    return nba

In [5]:
#Pages to scrape on basketball-reference.com for the 2020 NBA season:
#the "per_poss" and "advanced" pages, once under /leagues/ and once under /playoffs/
season100 = 'https://www.basketball-reference.com/leagues/NBA_2020_per_poss.html'
season_adv = 'https://www.basketball-reference.com/leagues/NBA_2020_advanced.html'
playoffs100 = 'https://www.basketball-reference.com/playoffs/NBA_2020_per_poss.html'
playoffs_adv = 'https://www.basketball-reference.com/playoffs/NBA_2020_advanced.html'

#urls and cols are parallel lists: cols[k] holds the column names for the page at urls[k]
urls = [season100, season_adv, playoffs100, playoffs_adv]
cols = [cols_100, adv_cols, cols_100, adv_cols]

#Scrape every page in order, so dfs[k] is the DataFrame built from urls[k]
dfs = []
for url, names in zip(urls, cols):
    temp = scrape_to_df(url, names)
    dfs.append(temp)

#Sanity check: one DataFrame per URL (the failure message shows the actual count)
assert len(dfs) == 4, len(dfs)

In [6]:
#dfs holds two frames per part of the season: index 2*i is the per-possession
#frame and index 2*i + 1 is the advanced frame (i == 0 season, i == 1 playoffs)
for i in range(2):
    #Take an explicit copy of the column subset so that adding a column below
    #modifies an independent frame and does not raise SettingWithCopyWarning
    temp = dfs[2*i][['Player', 'Tm', 'ORtg', 'DRtg']].copy()
    #The assignment aligns the usage values with temp on the row index
    temp['%Usage'] = dfs[2*i + 1]['USG%']
    if i == 0:
        temp.to_csv('Star_Player_Statistics_Season.csv', index = False)
    else:
        temp.to_csv('Star_Player_Statistics_Playoffs.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['%Usage'] = dfs[2*i + 1]['USG%']
