In [3]:
import os
import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Create the directory that caches the downloaded HTML files (no-op if it exists).
os.makedirs("mvp", exist_ok=True)

# Download the awards page for each season from 1991 through 2023.
years = list(range(1991, 2024))
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"
for year in years:
    filename = "mvp/{}.html".format(year)
    if os.path.exists(filename):
        # Cache hit: no request is sent, so there is nothing to throttle.
        print(f"{filename} already exists, skipping download")
        continue

    url = url_start.format(year)
    # A timeout keeps one stalled connection from hanging the whole cell.
    response = requests.get(url, timeout=30)
    if response.ok:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"{filename} downloaded")
    else:
        # Never cache an error body: the exists() check above would treat it
        # as a valid page on every later run and the year would stay broken.
        print(f"{filename} not downloaded: HTTP {response.status_code} for {url}")

    # Pause only after a real request, to stay polite to the remote server.
    time.sleep(random.randint(6, 10))

# Extract the MVP voting table from each cached page into a list of dataframes.
# Everything here is read from local disk, so no delay between years is needed.
mvp_dataframes = []
for year in years:
    try:
        with open("mvp/{}.html".format(year), encoding="utf-8") as f:
            page = f.read()
        soup = BeautifulSoup(page, "html.parser")

        # Drop the "over_header" row (when present) before pandas sees the table.
        over_header = soup.find("tr", class_="over_header")
        if over_header is not None:
            over_header.decompose()

        mvp_table = soup.find(id="mvp")
        if mvp_table is None:
            print(f"No MVP voting results found for {year}")
            continue

        # read_html expects a path/URL or file-like object; passing raw markup
        # as a string is deprecated, so wrap it in StringIO.
        mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
        mvp_df.insert(0, "Year", year)
        mvp_dataframes.append(mvp_df)
    except Exception as e:
        # Best effort: report the failing year and keep going with the rest.
        print(f"Error processing {year}: {e}")

# Merge the per-year frames, normalise column dtypes, then save and show the result.
if mvp_dataframes:
    mvp_df = pd.concat(mvp_dataframes, ignore_index=True)

    # Whole-number columns: coerce unparsable entries to missing, then store
    # them in the nullable integer dtype.
    int_cols = ["Age", "Pts Won", "Pts Max"]
    mvp_df[int_cols] = (
        mvp_df[int_cols]
        .apply(pd.to_numeric, errors="coerce")
        .astype("Int64")
    )

    # Per-game and rate statistics are kept as plain floats.
    float_cols = ["G", "MP", "PTS", "TRB", "AST", "STL", "BLK", "FG%", "3P%", "FT%", "WS"]
    mvp_df[float_cols] = mvp_df[float_cols].astype(float)

    # Persist the cleaned table without the row index.
    mvp_df.to_csv("mvp_results.csv", index=False)

    # Echo the cleaned table.
    print(mvp_df)
else:
    print("No data to concatenate")

mvp/1991.html already exists, skipping download
mvp/1992.html already exists, skipping download
mvp/1993.html already exists, skipping download
mvp/1994.html already exists, skipping download
mvp/1995.html already exists, skipping download
mvp/1996.html already exists, skipping download
mvp/1997.html already exists, skipping download
mvp/1998.html already exists, skipping download
mvp/1999.html already exists, skipping download
mvp/2000.html already exists, skipping download
mvp/2001.html already exists, skipping download
mvp/2002.html already exists, skipping download
mvp/2003.html already exists, skipping download
mvp/2004.html already exists, skipping download
mvp/2005.html already exists, skipping download
mvp/2006.html already exists, skipping download
mvp/2007.html already exists, skipping download
mvp/2008.html already exists, skipping download
mvp/2009.html already exists, skipping download
mvp/2010.html already exists, skipping download
mvp/2011.html already exists, skipping d

In [4]:
import requests
import os
import shutil
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
# Load the cached 1991 page from the project-relative cache directory.
# The download step writes these files as UTF-8, so they must be read back as
# UTF-8; decoding them as iso-8859-1 would garble any non-ASCII characters.
with open("mvp/1991.html", encoding="utf-8") as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser')

# Drop the "over_header" row before the table is handed to pandas. Guarded
# because find() returns None when the page has no such row.
over_header = soup.find('tr', class_="over_header")
if over_header is not None:
    over_header.decompose()

In [8]:
# Collect every element whose id is "mvp" and keep the first match.
mvp_matches = soup.find_all(id="mvp")
mvp_table = mvp_matches[0]

In [9]:
# Parse the table markup into a dataframe. read_html expects a path/URL or a
# file-like object (raw HTML strings are deprecated), hence the StringIO wrapper.
mvp_1991 = pd.read_html(StringIO(str(mvp_table)))[0]

In [10]:
# Preview only the first row to check the parsed columns.
mvp_1991.head(n=1)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321


In [11]:
# Tag every row with the season it belongs to.
mvp_1991 = mvp_1991.assign(Year=1991)

In [12]:
# Show the first five rows, now including the Year column.
mvp_1991.head(n=5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


In [18]:
# Build one MVP-voting dataframe per season from the cached pages.
dfs = []
for year in years:
    # The download step writes these files as UTF-8 under the relative "mvp"
    # directory; read them back the same way so non-ASCII text stays intact.
    with open("mvp/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Drop the "over_header" row when present; find() returns None otherwise.
    over_header = soup.find('tr', class_="over_header")
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        # Report and skip the season instead of aborting the whole loop.
        print(f"No MVP voting results found for {year}")
        continue

    # read_html expects a file-like object; raw HTML strings are deprecated.
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_df["Year"] = year

    dfs.append(mvp_df)

In [24]:
# Stack the per-season frames into one table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every season's rows reuse the same
# labels, so a label lookup such as mvps.loc[0] would return one row per season.
mvps = pd.concat(dfs, ignore_index=True)

mvps.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


In [20]:
# Write the combined table to disk; the row index is written too (pandas default).
mvps.to_csv("mvps.csv", index=True)

FileNotFoundError: [Errno 2] No such file or directory: 'player/1991.html'