In [1]:
## Importing the relevant packages and modules
from pathlib import Path
from urllib.request import urlopen

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
# Scrape configuration.
# `season_range` covers the 20 seasons before `current_season`
# (range() excludes its upper bound, so `current_season` itself is left out).
current_season = 2023
season_range = range(current_season - 20, current_season)

# Table names that scrape_season interpolates into the page URL.
stats = ['per_game', 'advanced']

def scrape_season(season,stats):
    """Scrape one season's league-wide stat tables from basketball-reference.com.

    For every table name in ``stats`` the page
    ``https://www.basketball-reference.com/leagues/NBA_{season}_{table}.html``
    is downloaded and parsed into a DataFrame whose first column, ``Season``,
    holds ``season`` formatted as a string. The per-table frames are then
    placed side by side.

    Parameters
    ----------
    season : int or str
        Season identifier interpolated into the URL.
    stats : iterable of str
        Table names interpolated into the URL (e.g. ``'per_game'``).

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation (``axis=1``) of one frame per table. Every
        value is the cell text as scraped (strings). Column names that occur
        in more than one table (``Season`` at least) appear once per table.

    Raises
    ------
    ValueError
        If no ``<th>`` cell with text ``'1'`` is found on a page (the marker
        used to tell header cells from row-rank cells), or if ``stats`` is
        empty (raised by ``pd.concat``).
    """
    table_frames = []

    # NOTE(review): one HTTP request per table with no pause in between;
    # confirm this stays within the site's request limits.
    for stat in stats:

        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{stat}.html'

        # Parse inside the context manager so the HTTP response is always closed.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, features = 'lxml')

        ## Getting the column headers
        # find_all('th') returns the header cells followed by each data row's
        # rank cell; the first rank cell ('1') marks the end of the header.
        th_text = [th.get_text() for th in soup.find_all('th')]
        try:
            first_rank = th_text.index('1')
        except ValueError:
            raise ValueError(
                f"could not find the first data row (rank '1') in {url}"
            ) from None
        column_names = th_text[:first_rank]
        # The rank lives in <th> cells, which are not collected with the <td>
        # data below, so its header has to go too.
        if 'Rk' in column_names:
            column_names.remove('Rk')
        column_names.insert(0, 'Season')

        ## Extracting rows from the table
        # Skip the first <tr> (the header row). Rows without any <td> cells
        # are dropped by building a new list rather than deleting from the
        # list being iterated.
        rows_data = []
        for tr in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in tr.find_all('td')]
            if cells:
                rows_data.append([f'{season}'] + cells)

        # Append the rows_data into the intermediate frame
        table_frames.append(pd.DataFrame(rows_data, columns = column_names))

    # axis=1 aligns the tables on their 0-based row position, so this assumes
    # every table lists the same rows in the same order -- TODO confirm.
    return pd.concat(table_frames, axis = 1)


In [3]:
# Scraping data from the past 20 NBA seasons

# One scraped frame per season in `season_range`.
agg_frames = [scrape_season(season, stats) for season in season_range]

# Stack the seasons row-wise; each season keeps its own 0-based row index.
final_df = pd.concat(agg_frames)


In [4]:
## Cut the redundant columns in the dataframe

# scrape_season adds a `Season` column to every table, so column names repeat
# in the concatenated frame. Keep only the first occurrence of each name:
# dropping 'Season' by label would remove every copy, not just the repeats.
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Drop every column that contains at least one missing value.
final_df2 = final_df.dropna(axis=1)
final_df2.head()


Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,Unnamed: 13,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,2003,Tariq Abdul-Wahad,SG,28,DAL,14,0,14.6,1.9,4.1,...,15.0,,0.2,0.2,0.4,0.104,-1.6,0.2,-1.4,0.0
1,2003,Shareef Abdur-Rahim,PF,26,ATL,81,81,38.1,7.0,14.6,...,24.2,,7.4,2.3,9.7,0.151,2.3,-0.7,1.6,2.8
2,2003,Courtney Alexander,PG,25,NOH,66,7,20.6,2.9,7.7,...,21.3,,0.1,1.0,1.1,0.04,-3.3,-1.2,-4.5,-0.9
3,2003,Malik Allen,PF,24,MIA,80,73,29.0,4.2,9.9,...,19.7,,-1.7,2.6,0.9,0.018,-3.9,-0.4,-4.4,-1.4
4,2003,Ray Allen*,SG,27,TOT,76,75,37.9,7.9,17.9,...,27.8,,7.6,1.5,9.1,0.152,4.7,-1.0,3.6,4.1


In [25]:
## EDA on the NBA Dataset ##
# `nba_df` is another name for `final_df2` (no copy is made).
nba_df = final_df2

## Saving the dataframe to the current user's Downloads folder
# The path is built from the home directory so the cell does not depend on
# one machine's user name.
nba_csv_path = Path.home() / 'Downloads' / 'nba_df.csv'
nba_df.to_csv(nba_csv_path, index=False)

# Last expression of the cell, so the preview is actually rendered.
nba_df.head()

In [7]:
## Data cleaning checks ##

# Checking for empty data.
# info() reports no null values in any of the columns listed below. Note that every
# column has dtype `object` (the scraped values are text), so a blank cell would be an
# empty string rather than a null and would not be counted here -- TODO confirm.
# Column 42 has no name; this still needs to be fixed (no solution found yet).

nba_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12211 entries, 0 to 811
Data columns (total 51 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Season  12211 non-null  object
 1   Player  12211 non-null  object
 2   Pos     12211 non-null  object
 3   Age     12211 non-null  object
 4   Tm      12211 non-null  object
 5   G       12211 non-null  object
 6   GS      12211 non-null  object
 7   MP      12211 non-null  object
 8   FG      12211 non-null  object
 9   FGA     12211 non-null  object
 10  FG%     12211 non-null  object
 11  3P      12211 non-null  object
 12  3PA     12211 non-null  object
 13  3P%     12211 non-null  object
 14  2P      12211 non-null  object
 15  2PA     12211 non-null  object
 16  2P%     12211 non-null  object
 17  eFG%    12211 non-null  object
 18  FT      12211 non-null  object
 19  FTA     12211 non-null  object
 20  FT%     12211 non-null  object
 21  ORB     12211 non-null  object
 22  DRB     12211 non-null  

In [8]:
# Troubleshooting for unique values

# describe() on these object columns reports count / unique / top / freq per column.
# Some peculiarities need a closer look: for example, `Pos` has 17 unique values,
# which might indicate that something is wrong with the data formatting.

nba_df.describe()

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,Unnamed: 13,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
count,12211,12211,12211,12211,12211,12211,12211,12211.0,12211.0,12211.0,...,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0
unique,20,2053,17,27,36,85,84,415.0,114.0,236.0,...,373.0,1.0,158.0,82.0,187.0,688.0,340.0,204.0,415.0,115.0
top,2022,Trevor Ariza,SG,24,TOT,82,0,3.0,1.0,3.0,...,17.2,,0.0,0.0,0.0,0.105,-0.9,-0.3,-0.5,0.0
freq,812,26,2546,1262,1227,488,3445,65.0,399.0,193.0,...,118.0,12211.0,1279.0,1353.0,1115.0,100.0,215.0,435.0,194.0,2002.0
