In [68]:
# I will be scraping data from basketballreference.com on MVP votes, Player stats, and Team stats
import pandas as pd
# This is where I choose which years to scrape data for
# I chose the last 42 years since the 3pt shot became a part of the NBA
years = list(range(1980,2023))

In [47]:
# This is the url of the website that I will get the data from
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [48]:
import requests

# This for loop is used to iterate through the years
for year in years:
    url = url_start.format(year)
    data = requests.get(url)
# This is where I created the mvp folder which contains the NBA MVP voting webpage for each year     
    with open("mvp/{}.html".format(year), "w+") as f:
        f.write(data.text)

In [49]:
# BeautifulSoup is used to extract the data in the table from each html file
from bs4 import BeautifulSoup

In [50]:
# Reading only the 1980 mvp page
with open("mvp/1980.html") as f:
    page = f.read()

In [51]:
# Parser class is used to extract the table from the page
soup = BeautifulSoup(page, "html.parser")

In [52]:
# Removing overheader 
soup.find('tr', class_= "over_header").decompose()

In [53]:
# Find only the mvp table
mvp_table = soup.find(id = "mvp")

In [54]:
mvp_1980 = pd.read_html(str(mvp_table))[0]

In [55]:
# Here is the 1991 MVP table
mvp_1980

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Kareem Abdul-Jabbar,32,LAL,147.0,147.0,221,0.665,82,38.3,24.8,10.8,4.5,1.0,3.4,0.604,0.0,0.765,14.8,0.227
1,2,Julius Erving,29,PHI,31.5,31.5,221,0.143,78,36.1,26.9,7.4,4.6,2.2,1.8,0.519,0.2,0.787,12.5,0.213
2,3,George Gervin,27,SAS,19.0,19.0,221,0.086,78,37.6,33.1,5.2,2.6,1.4,1.0,0.528,0.314,0.852,10.6,0.173
3,4,Larry Bird,23,BOS,15.0,15.0,221,0.068,82,36.0,21.3,10.4,4.5,1.7,0.6,0.474,0.406,0.836,11.2,0.182
4,5T,Tiny Archibald,31,BOS,2.0,2.0,221,0.009,80,35.8,14.1,2.5,8.4,1.3,0.1,0.482,0.222,0.83,8.9,0.148
5,5T,Dennis Johnson,25,SEA,2.0,2.0,221,0.009,81,36.3,19.0,5.1,4.1,1.8,1.0,0.422,0.207,0.78,7.4,0.12
6,5T,Dan Roundfield,26,ATL,2.0,2.0,221,0.009,81,32.0,16.5,10.3,2.3,1.2,1.7,0.499,0.0,0.71,9.1,0.169
7,8,Gus Williams,26,SEA,1.5,1.5,221,0.007,82,36.2,22.1,3.4,4.8,2.4,0.5,0.482,0.194,0.788,11.6,0.187
8,9,Moses Malone,24,HOU,1.0,1.0,221,0.005,82,38.3,25.8,14.5,1.8,1.0,1.3,0.502,0.0,0.719,11.9,0.183


In [56]:
# This is where I set dfs to be a list of data frames for each year
dfs = []

# Same steps as before but the for loop is used to read all the mvp pages.
for year in years:
    with open("mvp/{}.html".format(year)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_= "over_header").decompose()
    mvp_table = soup.find(id = "mvp")
    mvp = pd.read_html(str(mvp_table))[0]
    mvp["Year"] = year
    
    dfs.append(mvp)

In [57]:
# Used concat to combine all the dataframes into one for the mvp votings
mvps = pd.concat(dfs)

In [72]:
mvps.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
10,11,Russell Westbrook,32,WAS,0.0,5.0,1010,0.005,65,36.4,...,11.5,11.7,1.4,0.4,0.439,0.315,0.656,3.7,0.075,2021
11,12,Ben Simmons,24,PHI,0.0,3.0,1010,0.003,58,32.4,...,7.2,6.9,1.6,0.6,0.557,0.3,0.613,6.0,0.153,2021
12,13T,James Harden,31,TOT,0.0,1.0,1010,0.001,44,36.6,...,7.9,10.8,1.2,0.8,0.466,0.362,0.861,7.0,0.208,2021
13,13T,LeBron James,36,LAL,0.0,1.0,1010,0.001,45,33.4,...,7.7,7.8,1.1,0.6,0.513,0.365,0.698,5.6,0.179,2021
14,13T,Kawhi Leonard,29,LAC,0.0,1.0,1010,0.001,52,34.1,...,6.5,5.2,1.6,0.4,0.512,0.398,0.885,8.8,0.238,2021


In [59]:
# Saved mvps dataframe in csv format
mvps.to_csv("nbaMvps.csv")

In [73]:
# Here is where I get the all of the players stats from 1991 to 2021 through basketballreference.com
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

url = player_stats_url.format(1980)
data = requests.get(url)
# This is where the I created the player folder
with open("player/1980.html", "w+") as f:
    f.write(data.text)
    
# All of the rows are not showing in the table due to a rendering issue

In [74]:
# Added in selenium to fix a rendering issue
from selenium import webdriver

In [75]:
# Initialized the web driver to automate the browser
driver = webdriver.Chrome(executable_path = "/Users/harveyjames/Downloads/chromedriver")

  driver = webdriver.Chrome(executable_path = "/Users/harveyjames/Downloads/chromedriver")


In [83]:
# Used to executed the javacript
import time

year = 1980
url = player_stats_url.format(year)

# Executing Javascript
driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(2)

html = driver.page_source

In [84]:
with open("player/{}.html".format(year), "w+") as f:
    f.write(html)
# Now all of the rows are showing for 1980

In [78]:
# This used to download all of the pages from 1980 to 2022
for year in years:
    url = player_stats_url.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)

    html = driver.page_source
    with open("player/{}.html".format(year), "w+") as f:
        f.write(html)

WebDriverException: Message: target frame detached
  (Session info: chrome=100.0.4896.88)
Stacktrace:
0   chromedriver                        0x00000001063563d9 chromedriver + 5104601
1   chromedriver                        0x00000001062e6bf3 chromedriver + 4647923
2   chromedriver                        0x0000000105ed79ff chromedriver + 391679
3   chromedriver                        0x0000000105ec2a5c chromedriver + 305756
4   chromedriver                        0x0000000105ec1b6a chromedriver + 301930
5   chromedriver                        0x0000000105ec206c chromedriver + 303212
6   chromedriver                        0x0000000105ec1fc4 chromedriver + 303044
7   chromedriver                        0x0000000105ec912c chromedriver + 332076
8   chromedriver                        0x0000000105ec3358 chromedriver + 308056
9   chromedriver                        0x0000000105ec3918 chromedriver + 309528
10  chromedriver                        0x0000000105ec36ca chromedriver + 308938
11  chromedriver                        0x0000000105ec2c67 chromedriver + 306279
12  chromedriver                        0x0000000105ec2423 chromedriver + 304163
13  chromedriver                        0x0000000105ec22b3 chromedriver + 303795
14  chromedriver                        0x0000000105ed8fe2 chromedriver + 397282
15  chromedriver                        0x0000000105f3c83c chromedriver + 804924
16  chromedriver                        0x0000000105f2a5b3 chromedriver + 730547
17  chromedriver                        0x0000000105f00139 chromedriver + 557369
18  chromedriver                        0x0000000105f01165 chromedriver + 561509
19  chromedriver                        0x000000010631539d chromedriver + 4838301
20  chromedriver                        0x000000010632dcde chromedriver + 4938974
21  chromedriver                        0x0000000106332b5e chromedriver + 4959070
22  chromedriver                        0x000000010632e94a chromedriver + 4942154
23  chromedriver                        0x000000010630a33c chromedriver + 4793148
24  chromedriver                        0x00000001063483b8 chromedriver + 5047224
25  chromedriver                        0x000000010634853f chromedriver + 5047615
26  chromedriver                        0x000000010635d705 chromedriver + 5134085
27  libsystem_pthread.dylib             0x00007ff812afe4f4 _pthread_start + 125
28  libsystem_pthread.dylib             0x00007ff812afa00f thread_start + 15


In [85]:
# concat dataframes
dfs = []
for year in years:
    with open("player/{}.html".format(year)) as f:
        page = f.read()
    
# Initialize beautifulsoup on the page
    soup = BeautifulSoup(page, "html.parser")
# Delete unneeded rows
    soup.find('tr', class_= "thead").decompose()
# Assign per_game_stats to player_table
    player_table = soup.find(id = "per_game_stats")
# Read it into pandas
    player = pd.read_html(str(player_table))[0]
# Assign the year
    player["Year"] = year
    dfs.append(player)

In [86]:
# Used concat to combine all the dataframes into one for the player stats
players = pd.concat(dfs)

In [87]:
# Shows the 23,000 rows of players from 1800 - 2022
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Kareem Abdul-Jabbar*,C,32,LAL,82,,38.3,10.2,16.9,...,2.3,8.5,10.8,4.5,1.0,3.4,3.6,2.6,24.8,1980
1,2,Tom Abernethy,PF,25,GSW,67,,18.2,2.3,4.7,...,0.9,1.9,2.9,1.3,0.5,0.2,0.6,1.8,5.4,1980
2,3,Alvan Adams,C,25,PHO,75,,28.9,6.2,11.7,...,2.1,6.0,8.1,4.3,1.4,0.7,2.9,3.2,14.9,1980
3,4,Tiny Archibald*,PG,31,BOS,80,80,35.8,4.8,9.9,...,0.7,1.7,2.5,8.4,1.3,0.1,3.0,2.7,14.1,1980
4,5,Dennis Awtrey,C,31,CHI,26,,21.5,1.0,2.3,...,1.1,3.3,4.4,1.5,0.5,0.6,1.0,2.5,3.3,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,601,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022
837,602,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022
838,603,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022
839,604,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022


In [89]:
# Saved players dataframe in csv format
players.to_csv("nbaPlayers.csv")

In [90]:
# Here is the team record page which is broken down with standings
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [91]:
for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url)
# This is where the I created the team folder
    with open("team/{}.html".format(year), "w+") as f:
        f.write(data.text)

In [92]:
# Here is where the team data is processed to extract the tables
# concat dataframes
dfs = []

for year in years:
    with open("team/{}.html".format(year)) as f:
        page = f.read()

# Parsing the 1st table (Eastern Conference)
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_= "thead").decompose()
    team_table = soup.find(id = "divs_standings_E")
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year
# Assigning eastern conference column to the team column
    team["Team"] = team["Eastern Conference"]
    dfs.append(team)
# Deleting Eastern Conference column because it was assigned to the team colummn
    del team["Eastern Conference"]
    
# Parsing the 2nd table (Western Conference)
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_= "thead").decompose()
    team_table = soup.find(id = "divs_standings_W")
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year
# Assigning western conference column to the team column
    team["Team"] = team["Western Conference"]
# Deleting Eastern Conference column because it was assigned to the team colummn
    del team["Western Conference"]
    dfs.append(team)

In [93]:
# Used concat to combine all the dataframes into one for the team stats
teams = pd.concat(dfs)

In [94]:
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,61,21,.744,—,113.5,105.7,7.37,1980,Boston Celtics*
1,59,23,.720,2.0,109.1,104.9,4.04,1980,Philadelphia 76ers*
2,39,43,.476,22.0,107.0,109.5,-2.27,1980,Washington Bullets*
3,39,43,.476,22.0,114.0,115.1,-0.96,1980,New York Knicks
4,34,48,.415,27.0,108.3,109.5,-0.98,1980,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,56,26,.683,—,115.6,109.9,5.37,2022,Memphis Grizzlies* (2)
14,52,30,.634,4.0,108.0,104.7,3.12,2022,Dallas Mavericks* (4)
15,36,46,.439,20.0,109.3,110.3,-0.84,2022,New Orleans Pelicans* (9)
16,34,48,.415,22.0,113.2,113.0,0.02,2022,San Antonio Spurs (10)


In [95]:
# Saved teams dataframe in csv format
teams.to_csv("nbaTeams.csv")