<a href="https://colab.research.google.com/github/HanLi05/Predicting-NBA-MVP/blob/main/Han_Li_NBA_MVP_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount to google drive, import libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import zipfile
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import r2_score
import os

# Scrape basketball reference for team stats from 2001-2023

In [3]:
data = []
# loop over every season from 2000-01 to 2022-23
for year in range(2001, 2023):
  # create url
  url = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"

  # send HTTP request and parse HTML content
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')

  # loop over the two tables on the webpage
  for num in range(2):
    # extract table headers
    table = soup.find_all('table')[num]
    headers = table.find_all('thead')[0].find_all('th')
    header_names = ['Year'] + [header.get_text() for header in headers[0:-1]] + ['League']

    # extract table rows
    rows = table.find_all('tbody')[0].find_all('tr')
    for row in rows:
      # skip rows with no data
      if len(row.find_all('td')) == 0:
        continue
      # extract data from cells in row
      row_data = [year] + [cell.get_text() for cell in row.find_all('td')] + [row.find_all('td')[-1].get_text()] + [cell.get_text() for cell in row.find_all('th')] + [row.find_all('th')[-1].get_text()]
      data.append(row_data)

  # add data to dataframe, drop unnecessary columns
  team_stats_df = pd.DataFrame(data, columns=['Year', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'League', '?', 'Team Name', 'yfv'])
  team_stats_df = team_stats_df.drop(columns=['League', '?', 'yfv'])

# save to csv
team_stats_df.to_csv('team_stats.csv', index=False)

# Read data from google drive datasets (player stats, MVP voting)


In [4]:
# read data from all_seasons file - player statistics for each year
zf = zipfile.ZipFile('/content/drive/MyDrive/ML/archive (1).zip')
adv_df = pd.read_csv(zf.open('all_seasons.csv'))

# read data from folder - contains mvp voting results for each year in separate csv file
folder_path = '/content/drive/MyDrive/ML/folder'

file_list = os.listdir(folder_path)
df_list = []

# for every file in folder, append it to a dataframe df
for file_name in file_list:
  df = pd.read_csv(os.path.join(folder_path, file_name))
  df_list.append(df)
df = pd.concat(df_list, ignore_index=True)

# fix columns
df.columns = df.iloc[0]
df = df.drop(df.index[0])

# files were ordered from most recently to least recent
# add a season value to each row when the rank resets to 1 (indicates start of new season)
num = 23
df['Season'] = 0
for index, row in df.iterrows():
    if row['Rank'] == '1':
        num-=1
    # account for different logic during 2000-2009 versus 2010 onwards
    if num > 10:
      df.at[index, 'Season'] = '20'+str(num-1)+'-'+str(num)
    else:
      df.at[index, 'Season'] = '200'+str(num-1)+'-0'+str(num)

# drop rows that are not useful
for index, row in df.iterrows():
  if row['Tm'] == 'TOT':
    df = df.drop(index)
  if row['Rank'] == 'Rank':
    df = df.drop(index)

In [5]:
# dictionary to convert datasets which use
nba_teams = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHA": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "WAS": "Washington Wizards"
}

# team_stats_df = pd.read_csv('team_stats.csv')
df1 = adv_df.drop(adv_df.columns[0], axis=1)

# remove *'s in the Team Name column for team_stats_df
for i in range(len(team_stats_df)):
    team_stats_df.loc[i, 'Team Name'] = team_stats_df.loc[i, 'Team Name'].replace('*', '')

# map team abbreviations in df to their full names
df['Tm'] = df['Tm'].map(nba_teams)
df = df.dropna()

# construct 'Yr' from 'Season' so dataframes can merge on years (eg: 'Yr' = 2000, 'Season' = 2000-01)
df['Yr'] = df['Season'].str[:2] + df['Season'].str[-2:]
team_stats_df['Year'] = team_stats_df['Year'].astype(str)

# merge to get combined df with team that the mvp candidates played on
merged_df = pd.merge(df, team_stats_df, left_on=['Tm', 'Yr'], right_on=['Team Name', 'Year'])
# extract the W/L% of each of the mvp candidates
df['Wins'] = merged_df['W/L%']
df = df.drop(columns = ['Yr'])

# merge to get combined df with stats of mvp candidates
df['Net Rating'] = 0
df['TS%'] = 0
df['USG%'] = 0
merged_df = pd.merge(df, adv_df, left_on=['Player', 'Season'], right_on=['player_name', 'season'])
# extract the net rating, true shooting percentages, and usage percentages
# these statistics are more indicative of one's value, and is some of what I used for analysis
df['Net Rating'] = merged_df['net_rating']
df['TS%'] = merged_df['ts_pct']
df['USG%'] = merged_df['usg_pct']

df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Yr'] = df['Season'].str[:2] + df['Season'].str[-2:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Wins'] = merged_df['W/L%']


In [6]:
# select features (X) and target variable (y)
# features include points, win shares, field goal %, their plus/minus rating, true shooting %, usage %, and team wins
X = df[['PTS', 'WS', 'FG%', 'Net Rating', 'TS%', 'USG%', 'Wins']]
y = df['Share']

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=28)

# create, train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# performance of linear regression model on training and testing sets
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

# R-squared scores - how well variance in target variable is explained by features of regression model
print("Train R2:", train_score)
print("Test R2:", test_score)

# perform lasso regression with different alphas
# penalize less relevant features, favors those with stronger relationship, prevents overfitting
# different alphas dictate how much to penalize weaker features
alphas = [0, 0.0001, 0.1]
for alpha in alphas:
  print()
  # create and train lasso model
  lasso = Lasso(alpha=alpha)
  lasso.fit(X_train, y_train)

  # print r-squared score for training data
  train_pred = lasso.predict(X_train)
  train_r2 = r2_score(y_train, train_pred)
  print('Train, r2 for alpha=', alpha, ': ', train_r2)

  # print r-squared score for testing data
  test_pred = lasso.predict(X_test)
  test_r2 = r2_score(y_test, test_pred)
  print('Test, r2 for alpha=', alpha, ':', test_r2)

Train R2: 0.5088332170910203
Test R2: 0.4808847437947158

Train, r2 for alpha= 0 :  0.5088332170910204
Test, r2 for alpha= 0 : 0.4808847437947159

Train, r2 for alpha= 0.0001 :  0.5086112277533077
Test, r2 for alpha= 0.0001 : 0.48484175714162425

Train, r2 for alpha= 0.1 :  0.45552719460775337
Test, r2 for alpha= 0.1 : 0.4998028982681524


  lasso.fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
