In [88]:
# 2022 will not be included in the range function
years = list(range(1991, 2022))
print(years, end = " ")

[1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] 

In [89]:
# original url is awards_2001.html or the year chosen.
# {} is used to replace the year in the range in the url when scraping
# Replace the url with each year we want to scrape data for

url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [90]:
import requests # make a request to the webpage to download it
import time

for year in years:
    # create a url for a specific year
    url = url_start.format(year)
    data = requests.get(url)
    
    # W+ opens file in write mode and if it already exists it will just override.
    with open("MVP/{}.html".format(year), "w+", encoding = "utf-8") as f:
        time.sleep(3)
        f.write(data.text) #text saves files as html

### Parsing the votes table with Beautiful Soup

Extract the data in the table from each html file.

In [91]:
!pip install beautifulsoup4



In [92]:
from bs4 import BeautifulSoup

### Example of parsing a single page

- Remove top row of the table as it will create an extra header row when loaded in pandas
- An id is a globally unique property in html that only one element should have. We'll find the table we want using the id

In [93]:
# read the html data
with open("MVP/1991.html", encoding="utf-8") as f:
    page = f.read()

In [94]:
# create a parser class to extract table from the page
soup = BeautifulSoup(page, "html.parser")

In [95]:
# remove top row of the table
soup.find("tr", class_="over_header").decompose()
print("Header row removed successfully")

Header row removed successfully


In [96]:
# remove all other page elements and only find specific table we want
mvp_table = soup.find(id="mvp")

In [97]:
# read table into pandas
import pandas as pd

# convert table into a string
# you'll get a list of dataframes so just get the first index.
mvp_1991 = pd.read_html(str(mvp_table))[0]

In [98]:
mvp_1991

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,19.4,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,25.6,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,27.6,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,29.0,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225
5,6,Clyde Drexler,28,POR,1.0,75.0,960,0.078,82,34.8,21.5,6.7,6.0,1.8,0.7,0.482,0.319,0.794,12.4,0.209
6,7,Kevin Johnson,24,PHO,0.0,32.0,960,0.033,77,36.0,22.2,3.5,10.1,2.1,0.1,0.516,0.205,0.843,12.7,0.22
7,8,Dominique Wilkins,31,ATL,0.0,29.0,960,0.03,81,38.0,25.9,9.0,3.3,1.5,0.8,0.47,0.341,0.829,11.4,0.177
8,9T,Larry Bird,34,BOS,0.0,25.0,960,0.026,60,38.0,19.4,8.5,7.2,1.8,1.0,0.454,0.389,0.891,6.6,0.14
9,9T,Terry Porter,27,POR,0.0,25.0,960,0.026,81,32.9,17.0,3.5,8.0,2.0,0.1,0.515,0.415,0.823,13.0,0.235


### Load all the other years

After confirming it has worked well for one year, do the same for the rest of the years.

In [99]:
dfs = []
for year in years:
    # read the html data
    with open("MVP/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

        # create a parser class to extract table from the page
        soup = BeautifulSoup(page, "html.parser")

        # remove top row of the table
        soup.find("tr", class_="over_header").decompose()

        # remove all other page elements and only find specific table we want
        mvp_table = soup.find(id="mvp")

        # read table into pandas dataframe
        mvp = pd.read_html(str(mvp_table))[0]
        
        # create year column to know where data came from
        mvp["Year"] = year

        dfs.append(mvp)

In [100]:
mvps = pd.concat(dfs)

In [101]:
pd.pandas.set_option('display.max_columns', None) #display all column names
mvps.sample(5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
4,5,Patrick Ewing,29,NYK,0.0,100.0,960,0.104,82,38.4,24.0,11.2,1.9,1.1,3.0,0.522,0.167,0.738,13.0,0.198,1992
5,6,Gary Payton,33,SEA,1.0,54.0,1260,0.043,82,40.3,22.1,4.8,9.0,1.6,0.3,0.467,0.314,0.797,12.6,0.183,2002
6,7,Karl Malone,32,UTA,1.0,85.0,1130,0.075,82,38.0,25.7,9.8,4.2,1.7,0.7,0.519,0.4,0.723,15.1,0.233,1996
0,1,LeBron James,27,MIA,85.0,1074.0,1210,0.888,62,37.5,27.1,7.9,6.2,1.9,0.8,0.531,0.362,0.771,14.5,0.298,2012
12,13,Isiah Thomas,29,DET,0.0,11.0,960,0.011,48,34.5,16.2,3.3,9.3,1.6,0.2,0.435,0.292,0.782,3.4,0.098,1991


In [102]:
# store in a csv file
mvps.to_csv("mvps.csv")

### Download Player Stats

In [103]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

url = player_stats_url.format(1991)
data = requests.get(url)
with open("PLAYERS/1991.html", "w+", encoding="utf-8") as f:
    f.write(data.text)

### Using selenium to scrape a JavaScript page

- Install selenium
- Download chrome driver for your browsers' version from [this website](chromedriver.chromium.org/downloads). To find your chrome browser version, click the three dor menu at the top right corner of your browser, go to settings and go to About Chrome

In [104]:
from selenium import webdriver

driver = webdriver.Chrome(executable_path="C:/Users/Wekesa/Downloads/chromedriver_win32/chromedriver")

  driver = webdriver.Chrome(executable_path="C:/Users/Wekesa/Downloads/chromedriver_win32/chromedriver")


In [106]:
import time

year = 1991

url = player_stats_url.format(year)

# render url in the browser
driver.get(url)

# add js to tell the browser to scroll down to be able to render the entire table
driver.execute_script("window.scrollTo(1, 1000)")

# get the html of the page
html = driver.page_source

In [107]:
with open("PLAYERS/{}.html".format(year), "w+", encoding="utf-8") as f:
    f.write(html)

In [108]:
for year in years:
    url = player_stats_url.format(year)

    # render url in the browser
    driver.get(url)

    # add js to tell the browser to scroll down to be able to render the entire table
    driver.execute_script("window.scrollTo(1, 1000)")
    time.sleep(2)

    # get the html of the page
    html = driver.page_source
    
    with open("PLAYERS/{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(html)

In [109]:
player_df = []
for year in years:

    with open("PLAYERS/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # create a parser class to extract table from the page
    soup = BeautifulSoup(page, "html.parser")

    # remove top row of the table
    soup.find("tr", class_="thead").decompose()

    # remove all other page elements and only find specific table we want
    player_table = soup.find(id="per_game_stats")

    # convert table into a string
    # you'll get a list of dataframes so just get the first index.
    player = pd.read_html(str(player_table))[0]
    player["Year"] = year
    player_df.append(player)

AttributeError: 'NoneType' object has no attribute 'decompose'

In [110]:
players = pd.concat(player_df)

In [111]:
# pd.pandas.set_option('display.max_columns', None) #display all column names
players.sample(5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
218,173,Al Harrington,C,32,ORL,10,0,11.9,2.0,5.7,0.351,0.8,3.0,0.267,1.2,2.7,0.444,0.421,0.3,0.4,0.75,0.6,2.1,2.7,1.0,0.4,0.1,0.7,1.8,5.1,2013
47,45,Chauncey Billups,SG,35,LAC,20,20,30.4,4.2,11.4,0.364,2.4,6.3,0.384,1.8,5.2,0.34,0.469,4.3,4.8,0.895,0.2,2.3,2.5,4.0,0.5,0.2,1.9,2.0,15.0,2012
498,424,Qyntel Woods,PF,21,POR,53,0,6.3,1.1,2.2,0.5,0.1,0.2,0.333,1.1,2.1,0.514,0.513,0.1,0.4,0.35,0.3,0.7,1.0,0.2,0.3,0.0,0.4,0.7,2.4,2003
333,285,Mark Randall,PF,25,TOT,37,0,6.7,1.1,2.2,0.5,0.0,0.2,0.125,1.1,1.9,0.542,0.506,0.4,0.7,0.615,0.7,0.8,1.5,0.3,0.1,0.1,0.5,0.9,2.6,1993
604,442,Wang Zhizhi,C,26,LAC,2,0,4.5,0.0,0.5,0.0,0.0,0.0,,0.0,0.5,0.0,0.0,2.0,2.0,1.0,0.0,2.0,2.0,0.0,0.0,0.5,0.0,0.0,2.0,2004


In [112]:
players.shape

(13324, 31)

In [113]:
players.to_csv("player_stats.csv")

### Scraping Team Data

In [114]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [115]:
for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url)

    with open("TEAM/{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(data.text)

### Parsing with BeautifulSoup

In [126]:
dfs = []
for year in years:
    with open("team/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    e_table = soup.find_all(id="divs_standings_E")[0]
    e_df = pd.read_html(str(e_table))[0]
    e_df["Year"] = year
    e_df["Team"] = e_df["Eastern Conference"]
    del e_df["Eastern Conference"]
    dfs.append(e_df)
    
    w_table = soup.find_all(id="divs_standings_W")[0]
    w_df = pd.read_html(str(w_table))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)

In [127]:
teams = pd.concat(dfs)

In [124]:
teams

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year,Eastern Conference,W,L,W/L%,GB,PS/G,PA/G,SRS,Team,Western Conference
0,1,Michael Jordan,27.0,CHI,77.0,891.0,960.0,0.928,82.0,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991,,,,,,,,,,
1,2,Magic Johnson,31.0,LAL,10.0,497.0,960.0,0.518,79.0,37.1,19.4,7.0,12.5,1.3,0.2,0.477,0.320,0.906,15.4,0.251,1991,,,,,,,,,,
2,3,David Robinson,25.0,SAS,6.0,476.0,960.0,0.496,82.0,37.7,25.6,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991,,,,,,,,,,
3,4,Charles Barkley,27.0,PHI,2.0,222.0,960.0,0.231,67.0,37.3,27.6,10.1,4.2,1.6,0.5,0.570,0.284,0.722,13.4,0.258,1991,,,,,,,,,,
4,5,Karl Malone,27.0,UTA,0.0,142.0,960.0,0.148,82.0,40.3,29.0,11.8,3.3,1.1,1.0,0.527,0.286,0.770,15.5,0.225,1991,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,,,,,,,,,,,,,,,,,,,,,2021,,42,30,.583,—,112.4,110.2,2.26,Dallas Mavericks*,
14,,,,,,,,,,,,,,,,,,,,,2021,,38,34,.528,4.0,113.3,112.3,1.07,Memphis Grizzlies*,
15,,,,,,,,,,,,,,,,,,,,,2021,,33,39,.458,9.0,111.1,112.8,-1.58,San Antonio Spurs,
16,,,,,,,,,,,,,,,,,,,,,2021,,31,41,.431,11.0,114.6,114.9,-0.20,New Orleans Pelicans,


In [128]:
teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [129]:
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
13,42,30,0.583,—,112.4,110.2,2.26,2021,Dallas Mavericks*
14,38,34,0.528,4.0,113.3,112.3,1.07,2021,Memphis Grizzlies*
15,33,39,0.458,9.0,111.1,112.8,-1.58,2021,San Antonio Spurs
16,31,41,0.431,11.0,114.6,114.9,-0.2,2021,New Orleans Pelicans
17,17,55,0.236,25.0,108.8,116.7,-7.5,2021,Houston Rockets


In [130]:
teams.to_csv("teams.csv")