In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from io import StringIO
import warnings
from selenium import webdriver
import time

In [2]:
years = list(range(1980,2025))
start_url = "https://www.basketball-reference.com/awards/awards_{}.html"

In [10]:
for year in years:
    url = start_url.format(year)
    data = requests.get(url)

    with open("mvp/{}.html".format(year), "w+") as f:
        f.write(data.text)

In [28]:
warnings.filterwarnings("ignore",category = FutureWarning)
data = []
for year in years:
    with open("mvp/{}.html".format(year)) as f:
          page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "over_header").decompose()
    mvp_table = soup.find(id = "mvp")
    mvp = pd.read_html(str(mvp_table))[0]
    mvp["Year"] = year
    data.append(mvp)

In [29]:
mvps = pd.concat(data)
mvps.to_csv("Historic-MVP-Data.csv")
mvps

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Kareem Abdul-Jabbar,32,LAL,147.0,147.0,221,0.665,82,38.3,...,10.8,4.5,1.0,3.4,0.604,0.000,0.765,14.8,0.227,1980
1,2,Julius Erving,29,PHI,31.5,31.5,221,0.143,78,36.1,...,7.4,4.6,2.2,1.8,0.519,0.200,0.787,12.5,0.213,1980
2,3,George Gervin,27,SAS,19.0,19.0,221,0.086,78,37.6,...,5.2,2.6,1.4,1.0,0.528,0.314,0.852,10.6,0.173,1980
3,4,Larry Bird,23,BOS,15.0,15.0,221,0.068,82,36.0,...,10.4,4.5,1.7,0.6,0.474,0.406,0.836,11.2,0.182,1980
4,5T,Tiny Archibald,31,BOS,2.0,2.0,221,0.009,80,35.8,...,2.5,8.4,1.3,0.1,0.482,0.222,0.830,8.9,0.148,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,5,Jalen Brunson,27,NYK,0.0,142.0,990,0.143,77,35.4,...,3.6,6.7,0.9,0.2,0.479,0.401,0.847,11.2,0.198,2024
5,6,Jayson Tatum,25,BOS,0.0,86.0,990,0.087,74,35.7,...,8.1,4.9,1.0,0.6,0.471,0.376,0.833,10.4,0.189,2024
6,7,Anthony Edwards,22,MIN,0.0,18.0,990,0.018,79,35.1,...,5.4,5.1,1.3,0.5,0.461,0.357,0.836,7.5,0.130,2024
7,8,Domantas Sabonis,27,SAC,0.0,3.0,990,0.003,82,35.7,...,13.7,8.2,0.9,0.6,0.594,0.379,0.704,12.6,0.206,2024


In [11]:
player_stat_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [23]:
def fetch_page(years):
    driver = webdriver.Safari()
    try:
        for year in years:
            url = player_stat_url.format(year)
            driver.get(url)
            driver.execute_script("window.scrollTo(1,1000000)")
            time.sleep(2)
            html = driver.page_source
            
            with open("player/{}.html".format(year), "w+") as f:
                    f.write(html)
    except Exception as e:
        print(f"Error has occured: {e}")
    finally:
        driver.quit()
fetch_page(years)

In [35]:
player_data = []
for year in years:
    with open("player/{}.html".format(year)) as f:
          page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "thead").decompose()
    player_data_table = soup.find(id = "per_game_stats")
    player_table = pd.read_html(str(player_data_table))[0]
    player_table["Year"] = year
    player_data.append(player_table)
    

In [37]:
player_df = pd.concat(player_data)
player_df
player_df.to_csv("Historic-Player-Data.csv")

In [39]:
standings_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

for year in years:
    url = standings_url.format(year)
    data = requests.get(url)

    with open("standings/{}.html".format(year), "w+") as f:
        f.write(data.text)

In [43]:
standings_data = []
for year in years:
    with open("standings/{}.html".format(year)) as f:
          page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "thead").decompose()
    standings_table = soup.find(id = "divs_standings_E")
    standings = pd.read_html(str(standings_table))[0]
    standings["Year"] = year
    standings["Team"] = standings["Eastern Conference"]
    del standings["Eastern Conference"]
    standings_data.append(standings)
    

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "thead").decompose()
    standings_table = soup.find(id = "divs_standings_W")
    standings = pd.read_html(str(standings_table))[0]
    standings["Year"] = year
    standings["Team"] = standings["Western Conference"]
    del standings["Western Conference"]
    standings_data.append(standings)

In [44]:
standings_df = pd.concat(standings_data)
standings_df

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,61,21,.744,—,113.5,105.7,7.37,1980,Boston Celtics*
1,59,23,.720,2.0,109.1,104.9,4.04,1980,Philadelphia 76ers*
2,39,43,.476,22.0,107.0,109.5,-2.27,1980,Washington Bullets*
3,39,43,.476,22.0,114.0,115.1,-0.96,1980,New York Knicks
4,34,48,.415,27.0,108.3,109.5,-0.98,1980,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,50,32,.610,—,117.9,115.6,2.30,2024,Dallas Mavericks* (5)
14,49,33,.598,1.0,115.1,110.7,4.46,2024,New Orleans Pelicans* (7)
15,41,41,.500,9.0,114.3,113.2,1.24,2024,Houston Rockets (11)
16,27,55,.329,23.0,105.8,112.8,-6.57,2024,Memphis Grizzlies (13)


In [45]:
standings_df.to_csv("Historic-Standings-Data.csv")