In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Display every column and every row of a DataFrame; None switches truncation off
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# PER GAME STATS

In [3]:
# NBA season we will be analyzing
year = 2020

# Per-game stats page to scrape; {} is replaced by the season year
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)

# Download the page. The context manager closes the HTTP connection as soon as
# the body has been read, instead of leaving the response open.
with urlopen(url) as response:
    html = response.read()

# Parse the page into a BeautifulSoup object. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed
# (and so bs4 does not warn about having to guess one).
soup = BeautifulSoup(html, "html.parser")

In [4]:
# The first <tr> on the page is used as the header row: take the text of each
# of its <th> cells as a column name.
headers = [th.get_text() for th in soup.find_all('tr', limit=1)[0].find_all('th')]

# Drop the first header: the ranking column from Basketball Reference is not
# needed for the analysis (and ranking cells are not <td>, so the data rows
# built from <td> cells have no value for it).
headers = headers[1:]

In [5]:
# Extract the cell data for the DataFrame. Unlike the column headers, the
# player stats are two-dimensional, so we build a list of rows where each row
# is the list of its <td> cell texts.

# Skip the first <tr>, which is the header row
rows = soup.find_all('tr')[1:]

# Rows that contain no <td> cells (for example header rows repeated inside the
# table body) are left out. Otherwise each would become an all-None row in the
# DataFrame, and those null keys would match each other when the tables are merged.
player_stats = [
    cells
    for cells in ([td.get_text() for td in row.find_all('td')] for row in rows)
    if cells
]

In [6]:
# Build the per-game DataFrame: one row per scraped table row, labelled with the column headers
stats = pd.DataFrame(data=player_stats, columns=headers)

# ADVANCED STATS

In [7]:
# NBA season we will be analyzing
year = 2020

# Advanced stats page to scrape; {} is replaced by the season year
url_adv = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)

# Download the page. The context manager closes the HTTP connection as soon as
# the body has been read, instead of leaving the response open.
with urlopen(url_adv) as response_adv:
    html_adv = response_adv.read()

# Parse the page into a BeautifulSoup object. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed
# (and so bs4 does not warn about having to guess one).
soup_adv = BeautifulSoup(html_adv, "html.parser")

In [8]:
# The first <tr> on the page is used as the header row: take the text of each
# of its <th> cells as a column name.
headers_adv = [th.get_text() for th in soup_adv.find_all('tr', limit=1)[0].find_all('th')]

# Drop the first header: the ranking column from Basketball Reference is not
# needed for the analysis (and ranking cells are not <td>, so the data rows
# built from <td> cells have no value for it).
headers_adv = headers_adv[1:]

In [9]:
# Extract the cell data for the DataFrame. Unlike the column headers, the
# player stats are two-dimensional, so we build a list of rows where each row
# is the list of its <td> cell texts.

# Skip the first <tr>, which is the header row
rows_adv = soup_adv.find_all('tr')[1:]

# Rows that contain no <td> cells (for example header rows repeated inside the
# table body) are left out. Otherwise each would become an all-None row in the
# DataFrame, and those null keys would match each other when the tables are merged.
player_stats_adv = [
    cells
    for cells in ([td.get_text() for td in row.find_all('td')] for row in rows_adv)
    if cells
]

In [10]:
# Build the advanced-stats DataFrame: one row per scraped table row, labelled with the column headers
stats_adv = pd.DataFrame(data=player_stats_adv, columns=headers_adv)

# COMBINE

In [11]:
# Left-join the advanced stats onto the per-game stats, matching rows on player name and team
stats_2020 = stats.merge(stats_adv, how='left', on=['Player', 'Tm'])

In [12]:
# Pos, Age, G and MP exist in both tables, so the merge suffixed them with
# _x (per-game side) and _y (advanced side). Keep the _x copies and give them
# back their plain names.
stats_2020 = (
    stats_2020
    .drop(columns=['Pos_y', 'Age_y', 'G_y', 'MP_y'])
    .rename(columns={'Pos_x': 'Pos', 'Age_x': 'Age', 'G_x': 'G', 'MP_x': 'MP'})
)

In [13]:
# Discard fully identical rows, then renumber the index from 0 without keeping the old one
stats_2020 = (
    stats_2020
    .drop_duplicates()
    .reset_index(drop=True)
)

In [14]:
# Treat blank cells ('') and missing values alike: both become 0
stats_2020 = stats_2020.replace('', 0).fillna(0)

In [15]:
# Store the descriptive columns with the pandas 'category' dtype
for label_column in ['Player', 'Pos', 'Age', 'Tm']:
    stats_2020[label_column] = stats_2020[label_column].astype('category')

In [16]:
# Columns that should hold numbers; each one is passed through pd.to_numeric
numeric_columns = [
    'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
]
for stat_name in numeric_columns:
    stats_2020[stat_name] = pd.to_numeric(stats_2020[stat_name])

In [17]:
# Drop every column that still has the generic 'object' dtype after the
# conversions above (intended to remove the empty columns)
object_columns = stats_2020.select_dtypes(include=['object']).columns
stats_2020 = stats_2020.drop(columns=object_columns)

In [18]:
# Tag every row with the season being analysed. Reusing `year` (set where the
# pages are downloaded) keeps this column in step with the scraped season
# instead of repeating the literal 2020.
stats_2020['Year'] = year

In [19]:
# Discard the rows whose team code is 'TOT' (presumably the combined line for
# players who appeared for several teams - verify), then rename Tm to Team
stats_2020 = (
    stats_2020
    .loc[stats_2020['Tm'] != 'TOT']
    .rename(columns={'Tm': 'Team'})
)

In [20]:
# Keep only the rows with G of at least 10 and MP of at least 5 (both bounds inclusive)
enough_games = stats_2020['G'].ge(10)
enough_minutes = stats_2020['MP'].ge(5)
stats_2020 = stats_2020[enough_games & enough_minutes]

In [21]:
# Dtype names treated as numeric for the subset below
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Copy out only the columns whose dtype is one of the names listed above
test = stats_2020.select_dtypes(include=numerics)

In [22]:
# Remove the empty spacer columns by header text rather than by position.
# A positional drop of columns 41 and 46 is only correct the first time it
# runs: running the cell again would silently delete two real stat columns.
# Selecting on the header makes the cell safe to re-run.
# NOTE(review): assumes the two empty columns have blank (empty or
# whitespace-only) headers - confirm with stats_2020.columns.
has_label = [bool(str(name).strip()) for name in stats_2020.columns]

# .copy() so later edits act on an independent frame, not on a view of the filtered one
stats_2020 = stats_2020.loc[:, has_label].copy()

In [23]:
# Optional export, left disabled: uncomment to write the cleaned table to a CSV file
#stats_2020.to_csv('/data/Season_Stats(2020).csv')