# Preliminary Imports and Defining Constants

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Create a Custom Error to Handle Entries Incompatible with the Table

In [2]:
class UnableToCreateTableError(Exception):
    '''
    Error signalling that scraped entries cannot be arranged into the table.

    The text is stored on the `message` attribute and is also passed to
    Exception, so it is what str(error) and tracebacks display.
    '''
    #Text used when the caller does not supply a message of their own
    _DEFAULT_MESSAGE = (
        "The number of entries in the scraped column does not equal "
        "the number of columns provided; thus, Pandas won't be able "
        "to create this table."
    )

    def __init__(self, message = _DEFAULT_MESSAGE):
        super().__init__(message)
        self.message = message

## Create the Column Names for the Table 

In [3]:
#Column names from the website, listed in the order the cells appear in a table row.
#Every label must be unique: the 17th entry is 'eFG%' rather than a second 'FG%',
#because a repeated label would leave the DataFrame/CSV with two columns that
#cannot be told apart by name.
#NOTE(review): 'eFG%' is the label basketball-reference uses for this column -
#confirm against the live page header.
column_names = 'Player, Position, Age, Team, Games, GS, Minutes, '
column_names += 'FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT, FTA, FT%, '
column_names += 'ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PT'

#List form of the names above; this is what is handed to pandas as the header
columns = column_names.split(', ')

# Scrape the Data for Two Different Relevant Pages

In [4]:
def scrape_and_write(nba_url, columns, timeline):
    '''
    Scrape one statistics table and save it as a .csv file.

    Inputs:
    nba_url - A string representing the URL of the NBA page you are trying to scrape
    columns - A list of strings containing the names of the columns in the DataFrame
              you're trying to create
    timeline - A string with the part of the season in which you're scraping the data;
               it is inserted into the name of the output file

    This function downloads the page at nba_url, takes the first <table> in the HTML
    and reads the <td> cells of every row in its <tbody>. Rows without any <td> cell
    are skipped. Cells 0, 1 and 3 are kept as text, cells 2, 4 and 5 are converted to
    int, and every other cell is converted to float (an empty cell becomes 0.0). The
    rows are loaded into a pandas DataFrame, only the last row for each 'Player' value
    is kept, and the result is written to
    'nba_player_statistics_2019_2020_<timeline>.csv' in the current directory.

    Returns:
    None

    Raises:
    UnableToCreateTableError - a non-empty row does not have exactly len(columns) cells
    requests.HTTPError - the server answered with an error status code
    ValueError - a cell that should be numeric cannot be converted
    '''
    out_filename = 'nba_player_statistics_2019_2020_{}.csv'.format(timeline)

    #Positions (within a row) of the cells that are not parsed as floats
    text_positions = {0, 1, 3}
    int_positions = {2, 4, 5}

    #Get the html for the url passed in (the parameter, not a global variable)
    page = requests.get(nba_url)
    #Fail here with a clear HTTP error instead of later on a missing table
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    #Find the first table in the html and the rows of its body
    table = soup.find('table')
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')

    #Parse each row of data and convert entries to the correct data types
    data = []
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('td')]
        if not cols:
            #No <td> cells in this row, so there is nothing to record
            continue

        #Explicit check rather than assert, so it still runs under `python -O`
        if len(cols) != len(columns):
            raise UnableToCreateTableError()

        for i, entry in enumerate(cols):
            if i in text_positions:
                continue
            if i in int_positions:
                cols[i] = int(entry)
            elif entry == '':
                #Empty statistic cells are stored as 0.0
                cols[i] = 0.0
            else:
                cols[i] = float(entry)
        data.append(cols)

    nba = pd.DataFrame(data = data, columns = columns)
    #When a player name occurs in several rows, only the last of them is kept
    nba = nba.drop_duplicates(subset = 'Player', keep = 'last')
    nba.to_csv(out_filename, index = False)
    print('Finished ' + timeline)

In [5]:
#Per-game statistics pages to scrape: one for the regular season, one for the playoffs
season = 'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html'
playoffs = 'https://www.basketball-reference.com/playoffs/NBA_2020_per_game.html'

#Parallel lists: the page at urls[k] is saved under the label timelines[k]
timelines = ['reg_season', 'playoffs']
urls = [season, playoffs]

#Scrape each page in turn and write its table to a .csv file
for time, url in zip(timelines, urls):
    scrape_and_write(nba_url = url, columns = columns, timeline = time)

Finished reg_season
Finished playoffs
