In [None]:
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup
import time
import random

# Create a directory to store the downloaded HTML files
if not os.path.exists("mvp"):
    os.makedirs("mvp")

# Download the MVP voting results for each year from 1991 to 2021
years = list(range(1991, 2024))
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"
for year in years:
    filename = "mvp/{}.html".format(year)
    if os.path.exists(filename):
        print(f"{filename} already exists, skipping download")
    else:
        url = url_start.format(year)
        response = requests.get(url)
        with open(filename, "w+", encoding="utf-8") as f:
            f.write(response.text)
        print(f"{filename} downloaded")
    time.sleep(random.randint(6, 10))

# Extract the MVP voting results for each year and store them in a list of dataframes
mvp_dataframes = []
for year in years:
    try:
        with open("mvp/{}.html".format(year), encoding="utf-8") as f:
            page = f.read()
        soup = BeautifulSoup(page, "html.parser")
        over_header = soup.find("tr", class_="over_header")
        if over_header is not None:
            over_header.decompose()
        mvp_table = soup.find_all(id="mvp")
        if len(mvp_table) > 0:
            mvp_table = mvp_table[0]
            mvp_df = pd.read_html(str(mvp_table))[0]
            mvp_df.insert(0, "Year", year)
            mvp_dataframes.append(mvp_df)
        else:
            print(f"No MVP voting results found for {year}")
    except Exception as e:
        print(f"Error processing {year}: {e}")
    time.sleep(random.randint(6, 10))

# Combine all the dataframes into a single dataframe
if len(mvp_dataframes) == 0:
    print("No data to concatenate")
else:
    mvp_df = pd.concat(mvp_dataframes, ignore_index=True)

    # Convert columns to the correct data types
    mvp_df[["Age", "Pts Won", "Pts Max"]] = mvp_df[["Age", "Pts Won", "Pts Max"]].apply(pd.to_numeric, errors="coerce")
    mvp_df[["Age", "Pts Won", "Pts Max"]] = mvp_df[["Age", "Pts Won", "Pts Max"]].astype("Int64")
    mvp_df[["G", "MP", "PTS", "TRB", "AST", "STL", "BLK", "FG%", "3P%", "FT%", "WS"]] = mvp_df[["G", "MP", "PTS", "TRB", "AST", "STL", "BLK", "FG%", "3P%", "FT%", "WS"]].astype(float)

    # Save the cleaned data to a CSV file
    mvp_df.to_csv("mvp_results.csv", index=False)
    
    # Print the cleaned data
    print(mvp_df)

mvp/1991.html already exists, skipping download
mvp/1992.html already exists, skipping download


In [1]:
import requests
import os
import shutil
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
with open("D:\\pythonProject\\ML_Project\\mvp\\1991.html", encoding="iso-8859-1") as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser')
soup.find('tr', class_="over_header").decompose()

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\pythonProject\\ML_Project\\mvp\\1991.html'

In [None]:
mvp_table = soup.find_all(id="mvp")[0]

In [None]:
mvp_1991 = pd.read_html(str(mvp_table))[0]

In [None]:
mvp_1991.head(1)

In [None]:
mvp_1991["Year"] = 1991

In [None]:
mvp_1991.head()

In [None]:
dfs = []
for year in years:
    with open("D:\\pythonProject\\ML_Project\\mvp\\{}.html".format(year), encoding='iso-8859-1') as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="over_header").decompose()
    
    mvp_table = soup.find_all(id="mvp")[0]
    mvp_df = pd.read_html(str(mvp_table))[0]
    mvp_df["Year"] = year
    
    dfs.append(mvp_df)

In [None]:
mvps = pd.concat(dfs)

mvps.head()

In [None]:
mvps.to_csv("mvps.csv")