In [1]:
## Importing the relevant packages and modules 
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Step 1: Building the web scraping algorithm for NBA statistics 

## 1.a : Defining the question that we want to answer 

We want to predict the 2023 NBA MVP using a player's seasonal statistical performance (a player's statistical averages over an NBA season). To answer this question we need to define two parameters: the statistical features we want to include in the dataset that we want to build and how many seasons do we want to cover? 

### What data source did I use to build the dataset?  
If we want to scrape seasonal player data, the logical starting place would be to start at this web page: 
<a id=https://www.basketball-reference.com/leagues/NBA_2022_per_game.html > 2021-22 NBA Player Stats </a>.
<br>
This webpage contains different statistical features (per-game, advanced and shooting) for every NBA player in a given season (in the above link, the 2021-2022 NBA season). 
<br>
I chose this website because it represents the most comprehensive raw basketball database. It is perfect for scraping and building customized datasets for further analysis. It is also rudimentary in terms of HTML structure, making it easier for beginners such as myself to scrape it. 

### What features did I choose to build the dataset? 
To build the dataset, we had the choice of features across several categories of data: totals, per-game, per 36-min, per 100 poss, advanced, play-by-play, shooting and adjusted shooting. I chose to only look at per-game and advanced because they cover all of the different performance areas (scoring, shooting, passing, defense, efficiency, etc.), from different angles. This gives us a statistical overview of each player's season while keeping the dataset relatively lean (in terms of columns). I could have used all of the above-mentioned categories but I don't believe it would make for a significantly more meaningful analysis and one that could be easily communicable to audiences. 


### How many seasons did I choose to scrape? 
I think that 20 seasons is a big enough sample size to make for interesting time series analyses (of different statistical trends) while staying in the 'current' era of basketball, where players are more comparable to each other. I could have arguably extended my analysis to the last 30 years but I think you also in yearly comparability once you go back to the 90's and the insights would probably not have been that much more interesting. 
<br>
It is also interesting to mention that the chosen timespan also coincides with the career length of LeBron James. So, If I want to compare the career achievements of various of today's 'great' players, LeBron would force me to look back at least 20 years in the past.  

### How is the scraping algorithm structured? 
The algorithm works in the following way: 
<ol>
    <li>loop through a list of seasons
    <li>open the html link of that season's player stats page
    <li>parse the html link using BeautifulSoup
    <li>extract column headers and table data to build a dataset array
    <li>turn the array into a pandas dataframe and append it to a list of dataframes (one per season)
    <li>merge of all the seasonal dataframes into a single 20-year dataframe
</ol>
   


In [2]:
current_season = 2023
season_range = range(current_season - 20,current_season)
stats = ['per_game','advanced']

def scrape_season(season,stats):

    int_frames = []
    
    for i in stats:

        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{i}.html'

        html = urlopen(url)
        soup = BeautifulSoup(html,features = 'lxml')

        ## Getting the column headers
        headers = soup.findAll('th')
        header_text = [th.text for th in headers]
        header_text.insert(0,'Season')
        header_text.remove('Rk')
        header_final = header_text[:header_text.index('1')]

        ## Extracting rows from the table 
        rows = soup.findAll('tr')[1:]
        rows_data = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]


        # Cleaning the rows data by removing empty rows and adding the season data
        for i in range(len(rows_data)): 
            try: 
                rows_data[i].insert(0, f'{season}')
            except: 
                rows_data.remove(rows_data[i])

        for i in range(len(rows_data)): 
            try:
                if len(rows_data[i])==1: 
                    rows_data.remove(rows_data[i])
            except: 
                pass

        # Append the rows_data into the intermediate frame
        df = pd.DataFrame(rows_data, columns = header_final)
        int_frames.append(df) 
        
    return pd.concat(int_frames, axis = 1)


In [3]:
# Scraping data from the past 20 NBA seasons 

agg_frames = []

for i in season_range: 
    result = scrape_season(i,stats)
    agg_frames.append(result)

final_df = pd.concat(agg_frames)


In [4]:
## Cleaning the dataframe by cutting any redundant columns and null columns 

final_df.drop('Season', axis=1)
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

final_df2 = final_df.dropna(axis=1)
final_df2.head()


Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,Unnamed: 13,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,2003,Tariq Abdul-Wahad,SG,28,DAL,14,0,14.6,1.9,4.1,...,15.0,,0.2,0.2,0.4,0.104,-1.6,0.2,-1.4,0.0
1,2003,Shareef Abdur-Rahim,PF,26,ATL,81,81,38.1,7.0,14.6,...,24.2,,7.4,2.3,9.7,0.151,2.3,-0.7,1.6,2.8
2,2003,Courtney Alexander,PG,25,NOH,66,7,20.6,2.9,7.7,...,21.3,,0.1,1.0,1.1,0.04,-3.3,-1.2,-4.5,-0.9
3,2003,Malik Allen,PF,24,MIA,80,73,29.0,4.2,9.9,...,19.7,,-1.7,2.6,0.9,0.018,-3.9,-0.4,-4.4,-1.4
4,2003,Ray Allen*,SG,27,TOT,76,75,37.9,7.9,17.9,...,27.8,,7.6,1.5,9.1,0.152,4.7,-1.0,3.6,4.1


## 1.b Inspecting the dataset for quality issues 

### Saving the dataframe to a csv file on my computer 
I encountered numerous instances of the HTTP 429 (sending too many requests to the server) error when refreshing the scraping program, which paralyzed my progress for two days at a time. In order to eliminate this risk and pursue the project, I decided to save the dataframe to a csv file, which I could read back into a dataframe from another notebook. 

### Checking the dataframe for empty data 
We see that there are no discrepancies representing null data in the dataframe. 
<br>
However, we also notice that all of the statistical features that we will later want to analyze/visualize have the Object datatype. We will later turn these into numeric data in order to manipulate them with certain methods. 

### Checking the dataframe for anomalous values
We use the describe() method to check for anomalous values, such as totals that might be too low or too high for what me might initially expect. In this dataset, we notice that the column 'Pos' (Position) has 17 distinct values, while there are only 5 registered NBA positions (PG/SG/SF/PF/C). In Part 2 of this project we will explore why this may be the case and see how we could fix this problem. 

In [25]:
## Inspecting the data for quality issues 

# Renaming the dataset for greater readability 
nba_df = final_df2

## Saving the dataframe to a local path on this computer 
nba_df.to_csv('/Users/krist/Downloads/nba_df.csv', index=False) 

In [7]:
 # Checking for empty data 

nba_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12211 entries, 0 to 811
Data columns (total 51 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Season  12211 non-null  object
 1   Player  12211 non-null  object
 2   Pos     12211 non-null  object
 3   Age     12211 non-null  object
 4   Tm      12211 non-null  object
 5   G       12211 non-null  object
 6   GS      12211 non-null  object
 7   MP      12211 non-null  object
 8   FG      12211 non-null  object
 9   FGA     12211 non-null  object
 10  FG%     12211 non-null  object
 11  3P      12211 non-null  object
 12  3PA     12211 non-null  object
 13  3P%     12211 non-null  object
 14  2P      12211 non-null  object
 15  2PA     12211 non-null  object
 16  2P%     12211 non-null  object
 17  eFG%    12211 non-null  object
 18  FT      12211 non-null  object
 19  FTA     12211 non-null  object
 20  FT%     12211 non-null  object
 21  ORB     12211 non-null  object
 22  DRB     12211 non-null  

In [8]:
# Checking the dataset for anomalous values, eg. total values that are either too low or too high

nba_df.describe()

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,Unnamed: 13,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
count,12211,12211,12211,12211,12211,12211,12211,12211.0,12211.0,12211.0,...,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0,12211.0
unique,20,2053,17,27,36,85,84,415.0,114.0,236.0,...,373.0,1.0,158.0,82.0,187.0,688.0,340.0,204.0,415.0,115.0
top,2022,Trevor Ariza,SG,24,TOT,82,0,3.0,1.0,3.0,...,17.2,,0.0,0.0,0.0,0.105,-0.9,-0.3,-0.5,0.0
freq,812,26,2546,1262,1227,488,3445,65.0,399.0,193.0,...,118.0,12211.0,1279.0,1353.0,1115.0,100.0,215.0,435.0,194.0,2002.0
