### Data Collection & Pre Proccessing

In [153]:
import numpy as np
import pandas as pd
from datetime import date 
from tqdm import tqdm
import time
from urllib.error import HTTPError
from basketball_reference_web_scraper import client

import warnings
warnings.filterwarnings('ignore')

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#tools/metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import mutual_info_regression

#modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge



### Get MVP Table

In [4]:
def get_mvp_candidates(year):
    url = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    try:
        time.sleep(3)
        mvp_table = pd.read_html(url)[0]
        if isinstance(mvp_table.columns, pd.MultiIndex):
                mvp_table.columns = mvp_table.columns.get_level_values(-1)
        #mvp_table = mvp_table[['Player','Tm','Share', 'Rank']]
        mvp_table['year'] = year
        print(f"Got MVP data for {year}")
        time.sleep(3)
    except HTTPError as err:
        print(f'no mvp race has been found for year {year}')
    return mvp_table 


### Get Team Info

In [5]:
team_abbrev = {"Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Charlotte Bobcats": "CHA",
    "Charlotte Hornets": "CHH",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Kansas City Kings": "KCK",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves":"MIN",
    "New Jersey Nets": "NJN",
    "New Orleans Hornets": "NOH",
    "New Orleans Pelicans": "NOP",
    "New York Knicks": "NYK",
    "Brooklyn Nets": "BKN",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "San Diego Clippers": "SDC",
    "Seattle SuperSonics": "SEA",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Washington Bullets": "WSB",
    "Washington Wizards": "WAS"
}

In [6]:
def get_team_stats(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
    #east
    time.sleep(3)
    team_east_standing_table = pd.read_html(url)[0]
    team_east_standing_table = team_east_standing_table.rename({'Eastern Conference': 'team'}, axis=1)
    team_east_standing_table['seed'] = pd.Series(list(range(1,16)))

    #west
    team_west_standing_table = pd.read_html(url)[1]
    team_west_standing_table = team_west_standing_table.rename({'Western Conference': 'team'}, axis=1)
    team_west_standing_table['seed'] = pd.Series(list(range(1,16)))

    #combine east and west 
    team_standing_table = pd.concat([team_east_standing_table, team_west_standing_table])
    
    #remove * in team column
    team_standing_table.team = team_standing_table.team.str.replace('*', '')
    
    #map abbreviation to full team name
    team_standing_table['Tm'] = team_standing_table['team'].map(team_abbrev)
    
    #filter only needed columns
    team_standing_table_sub = team_standing_table[['Tm','team', 'W', 'W/L%', 'seed']]
    return team_standing_table_sub

### Get Advanced Stats

In [7]:
def get_advanced_stats(year):
    advanced_stats_df = pd.DataFrame(client.players_advanced_season_totals(season_end_year=year))
    advanced_stats_df['year'] = year
    
    #advanced_stats_df = advanced_stats_df[filter_advanced]
    advanced_stats_df = advanced_stats_df.rename(columns={'name':'Player'})
    return advanced_stats_df

### Get All Historical Data

In [8]:
cols = ['Player','games_played','minutes_played','player_efficiency_rating','true_shooting_percentage',
        'three_point_attempt_rate','free_throw_attempt_rate','offensive_rebound_percentage',
        'defensive_rebound_percentage','total_rebound_percentage','assist_percentage','steal_percentage',
        'block_percentage','turnover_percentage','usage_percentage','offensive_win_shares','defensive_win_shares',
        'win_shares','win_shares_per_48_minutes','offensive_box_plus_minus','defensive_box_plus_minus',
        'box_plus_minus','value_over_replacement_player']

In [91]:
tables = []
max_year = date.today().year
years = list(range(2000,max_year+1))

print("extracting historical data for MVP candidates")

for year in tqdm(years):
    time.sleep(1)
    mvp_table = get_mvp_candidates(year)
    time.sleep(1)
    team_stats = get_team_stats(year)

    table = pd.merge(mvp_table,team_stats,how="left",on="Tm")
    time.sleep(1)
    advanced_stats = get_advanced_stats(year)
    advanced_stats = advanced_stats[cols]

    table = pd.merge(table,advanced_stats,how="left",on="Player")

    tables.append(table)

print("Historical Data Acquired!")
    

extracting historical data for MVP candidates


  0%|          | 0/26 [00:00<?, ?it/s]

Got MVP data for 2000


  4%|▍         | 1/26 [00:12<05:24, 12.96s/it]

Got MVP data for 2001


  8%|▊         | 2/26 [00:25<05:11, 12.97s/it]

Got MVP data for 2002


 12%|█▏        | 3/26 [00:38<04:57, 12.95s/it]

Got MVP data for 2003


 15%|█▌        | 4/26 [00:51<04:44, 12.95s/it]

Got MVP data for 2004


 19%|█▉        | 5/26 [01:04<04:32, 12.98s/it]

Got MVP data for 2005


 23%|██▎       | 6/26 [01:17<04:20, 13.01s/it]

Got MVP data for 2006


 27%|██▋       | 7/26 [01:30<04:06, 13.00s/it]

Got MVP data for 2007


 31%|███       | 8/26 [01:44<03:55, 13.06s/it]

Got MVP data for 2008


 35%|███▍      | 9/26 [01:57<03:42, 13.10s/it]

Got MVP data for 2009


 38%|███▊      | 10/26 [02:10<03:29, 13.09s/it]

Got MVP data for 2010


 42%|████▏     | 11/26 [02:23<03:16, 13.08s/it]

Got MVP data for 2011


 46%|████▌     | 12/26 [02:36<03:04, 13.16s/it]

Got MVP data for 2012


 50%|█████     | 13/26 [02:50<02:52, 13.30s/it]

Got MVP data for 2013


 54%|█████▍    | 14/26 [03:03<02:39, 13.26s/it]

Got MVP data for 2014


 58%|█████▊    | 15/26 [03:16<02:25, 13.25s/it]

Got MVP data for 2015


 62%|██████▏   | 16/26 [03:30<02:13, 13.33s/it]

Got MVP data for 2016


 65%|██████▌   | 17/26 [03:44<02:02, 13.63s/it]

Got MVP data for 2017


 69%|██████▉   | 18/26 [03:57<01:48, 13.56s/it]

Got MVP data for 2018


 73%|███████▎  | 19/26 [04:11<01:34, 13.45s/it]

Got MVP data for 2019


 77%|███████▋  | 20/26 [04:24<01:20, 13.40s/it]

Got MVP data for 2020


 81%|████████  | 21/26 [04:37<01:06, 13.39s/it]

Got MVP data for 2021


 85%|████████▍ | 22/26 [04:51<00:53, 13.39s/it]

Got MVP data for 2022


 88%|████████▊ | 23/26 [05:04<00:39, 13.33s/it]

Got MVP data for 2023


 92%|█████████▏| 24/26 [05:17<00:26, 13.28s/it]

Got MVP data for 2024


 96%|█████████▌| 25/26 [05:30<00:13, 13.24s/it]

Got MVP data for 2025


100%|██████████| 26/26 [05:43<00:00, 13.22s/it]

Historical Data Acquired!





In [92]:
master_table = pd.concat(tables)

In [93]:
master_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365 entries, 0 to 11
Data columns (total 46 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Rank                           365 non-null    object 
 1   Player                         365 non-null    object 
 2   Age                            365 non-null    int64  
 3   Tm                             365 non-null    object 
 4   First                          365 non-null    int64  
 5   Pts Won                        365 non-null    int64  
 6   Pts Max                        365 non-null    int64  
 7   Share                          365 non-null    float64
 8   G                              365 non-null    int64  
 9   MP                             365 non-null    float64
 10  PTS                            365 non-null    float64
 11  TRB                            365 non-null    float64
 12  AST                            365 non-null    float64
 

In [95]:
#check rows of NaN seeding stats
master_table[master_table['team'].isna()]

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player
12,13,Vince Carter,28,TOT,0,3,1270,0.002,77,36.7,...,6.2,26.7,0.5,0.4,0.9,0.068,1.9,0.4,2.3,0.7
13,13,Vince Carter,28,TOT,0,3,1270,0.002,77,36.7,...,9.4,32.9,5.4,3.1,8.5,0.184,6.2,0.8,7.0,5.1
5,6,Chauncey Billups,32,TOT,0,33,1210,0.027,79,35.3,...,13.6,20.0,0.1,0.1,0.2,0.124,0.4,0.5,1.0,0.1
6,6,Chauncey Billups,32,TOT,0,33,1210,0.027,79,35.3,...,13.0,21.8,7.8,2.1,9.9,0.176,3.9,-0.5,3.4,3.7
13,12T,Stephen Jackson,31,TOT,0,1,1230,0.001,81,38.6,...,14.8,25.0,-0.1,0.2,0.0,0.008,0.2,-1.5,-1.3,0.1
14,12T,Stephen Jackson,31,TOT,0,1,1230,0.001,81,38.6,...,13.8,27.8,0.6,4.4,5.0,0.085,0.3,0.3,0.6,1.8
8,9,Derrick Rose,32,TOT,1,10,1010,0.01,50,25.6,...,12.3,30.0,0.0,0.3,0.4,0.051,0.7,-0.2,0.6,0.2
9,9,Derrick Rose,32,TOT,1,10,1010,0.01,50,25.6,...,9.6,24.3,1.5,1.2,2.8,0.142,1.9,0.5,2.4,1.0
13,13T,James Harden,31,TOT,0,1,1010,0.001,44,36.6,...,17.4,28.7,0.8,0.2,1.0,0.159,5.4,-0.6,4.8,0.5
14,13T,James Harden,31,TOT,0,1,1010,0.001,44,36.6,...,16.7,28.4,4.5,1.5,6.0,0.219,6.4,1.3,7.7,3.2


In [None]:
#drop rows of players who were traded mid season
master_table = master_table[master_table['Tm'].str.contains('TOT')==False]
master_table['3P%'] = master_table['3P%'].fillna(0)

In [97]:
master_table.to_csv("master_table.csv")

### Get Current Year MVP Candidates

In [None]:
advanced_stats_df = get_advanced_stats(2026)

In [143]:
advanced_stats_df.columns

Index(['slug', 'Player', 'positions', 'age', 'team', 'games_played',
       'minutes_played', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player', 'is_combined_totals',
       'year'],
      dtype='object')

In [None]:
current_team_standings = get_team_stats(2026)
current_team_standings['team'] = current_team_standings['team'].str.replace(r"\s*\(\d+\)$", "", regex=True)
current_team_standings['Tm'] = current_team_standings['team'].map(team_abbrev)

Unnamed: 0,Tm,team,W,W/L%,seed
0,DET,Detroit Pistons,15,0.833,1
1,TOR,Toronto Raptors,14,0.737,2
2,MIA,Miami Heat,13,0.684,3
3,NYK,New York Knicks,11,0.647,4
4,CLE,Cleveland Cavaliers,12,0.632,5
5,ATL,Atlanta Hawks,11,0.579,6
6,ORL,Orlando Magic,11,0.579,7
7,BOS,Boston Celtics,10,0.556,8
8,CHI,Chicago Bulls,9,0.529,9
9,PHI,Philadelphia 76ers,9,0.529,10


In [None]:
#pulled as of 11/28/2025
url = "https://www.basketball-reference.com/friv/mvp.html"
current_mvp_table = pd.read_html(url)[0]
current_mvp_table = current_mvp_table.rename(columns={"Team":"Tm"})
current_mvp_table = current_mvp_table.iloc[:,:-2]
current_mvp_table

Unnamed: 0,Rk,Player,Tm,W,L,W/L%,G,GS,MP,FG,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Nikola Jokić,DEN,13,4,0.765,17,17,34.8,10.8,...,0.853,3.3,9.5,12.8,11.1,1.6,0.8,3.5,3.0,29.6
1,2,Shai Gilgeous-Alexander,OKC,18,1,0.947,19,19,33.3,10.9,...,0.897,0.5,4.4,4.9,6.6,1.6,0.8,1.7,1.8,32.6
2,3,Luka Dončić,LAL,13,4,0.765,13,13,37.2,10.8,...,0.789,0.8,7.9,8.8,9.2,1.9,0.5,4.2,2.7,35.2
3,4,Cade Cunningham,DET,15,3,0.833,15,15,36.3,9.7,...,0.828,1.3,4.8,6.1,9.3,1.3,0.7,3.6,3.7,28.1
4,5,Austin Reaves,LAL,13,4,0.765,14,14,36.3,8.6,...,0.879,0.9,4.9,5.7,7.0,1.4,0.1,3.2,2.5,27.9
5,6,Isaiah Hartenstein,OKC,18,1,0.947,19,19,27.8,5.3,...,0.64,3.3,7.4,10.7,3.4,1.3,0.8,2.1,2.8,12.2
6,7,Jalen Duren,DET,15,3,0.833,16,16,29.0,7.1,...,0.752,4.4,7.4,11.8,2.0,0.9,0.9,2.4,3.3,19.8
7,8,Alperen Şengün,HOU,12,4,0.75,16,16,36.6,8.0,...,0.726,3.0,6.4,9.4,7.1,1.1,0.9,3.4,3.3,22.0
8,9,Giannis Antetokounmpo,MIL,8,11,0.421,13,13,31.8,12.0,...,0.636,3.7,7.2,10.8,6.8,0.9,1.2,3.5,2.6,31.2
9,10,Tyrese Maxey,PHI,9,8,0.529,17,17,39.9,10.8,...,0.878,0.3,4.1,4.4,7.5,1.6,0.8,2.7,2.4,32.2


In [135]:
current_mvp_candidates = pd.merge(current_mvp_table,team_stats[['Tm','seed']],on="Tm",how="left")


In [136]:
current_mvp_candidates

Unnamed: 0,Rk,Player,Tm,W,L,W/L%,G,GS,MP,FG,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,seed
0,1,Nikola Jokić,DEN,13,4,0.765,17,17,34.8,10.8,...,3.3,9.5,12.8,11.1,1.6,0.8,3.5,3.0,29.6,4
1,2,Shai Gilgeous-Alexander,OKC,18,1,0.947,19,19,33.3,10.9,...,0.5,4.4,4.9,6.6,1.6,0.8,1.7,1.8,32.6,1
2,3,Luka Dončić,LAL,13,4,0.765,13,13,37.2,10.8,...,0.8,7.9,8.8,9.2,1.9,0.5,4.2,2.7,35.2,3
3,4,Cade Cunningham,DET,15,3,0.833,15,15,36.3,9.7,...,1.3,4.8,6.1,9.3,1.3,0.7,3.6,3.7,28.1,6
4,5,Austin Reaves,LAL,13,4,0.765,14,14,36.3,8.6,...,0.9,4.9,5.7,7.0,1.4,0.1,3.2,2.5,27.9,3
5,6,Isaiah Hartenstein,OKC,18,1,0.947,19,19,27.8,5.3,...,3.3,7.4,10.7,3.4,1.3,0.8,2.1,2.8,12.2,1
6,7,Jalen Duren,DET,15,3,0.833,16,16,29.0,7.1,...,4.4,7.4,11.8,2.0,0.9,0.9,2.4,3.3,19.8,6
7,8,Alperen Şengün,HOU,12,4,0.75,16,16,36.6,8.0,...,3.0,6.4,9.4,7.1,1.1,0.9,3.4,3.3,22.0,2
8,9,Giannis Antetokounmpo,MIL,8,11,0.421,13,13,31.8,12.0,...,3.7,7.2,10.8,6.8,0.9,1.2,3.5,2.6,31.2,5
9,10,Tyrese Maxey,PHI,9,8,0.529,17,17,39.9,10.8,...,0.3,4.1,4.4,7.5,1.6,0.8,2.7,2.4,32.2,13


In [137]:
current_mvp_candidates = pd.merge(current_mvp_candidates,advanced_stats_df,on="Player",how="left")

In [138]:
current_mvp_candidates

Unnamed: 0,Rk,Player,Tm,W,L,W/L%,G,GS,MP,FG,...,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player,is_combined_totals,year
0,1,Nikola Jokić,DEN,13,4,0.765,17,17,34.8,10.8,...,4.0,1.1,5.1,0.413,12.8,5.4,18.2,3.0,False,2026
1,2,Shai Gilgeous-Alexander,OKC,18,1,0.947,19,19,33.3,10.9,...,3.8,1.5,5.2,0.396,10.3,3.8,14.0,2.5,False,2026
2,3,Luka Dončić,LAL,13,4,0.765,13,13,37.2,10.8,...,1.7,0.7,2.4,0.237,8.2,1.7,9.9,1.5,False,2026
3,4,Cade Cunningham,DET,15,3,0.833,15,15,36.3,9.7,...,1.2,0.7,1.9,0.167,4.5,0.2,4.8,0.9,False,2026
4,5,Austin Reaves,LAL,13,4,0.765,14,14,36.3,8.6,...,1.8,0.5,2.3,0.217,5.0,0.0,5.0,0.9,False,2026
5,6,Isaiah Hartenstein,OKC,18,1,0.947,19,19,27.8,5.3,...,1.3,1.5,2.8,0.253,1.4,2.8,4.1,0.8,False,2026
6,7,Jalen Duren,DET,15,3,0.833,16,16,29.0,7.1,...,1.6,0.8,2.5,0.257,3.5,-0.1,3.4,0.6,False,2026
7,8,Alperen Şengün,HOU,12,4,0.75,16,16,36.6,8.0,...,1.4,0.8,2.2,0.18,4.1,1.4,5.5,1.1,False,2026
8,9,Giannis Antetokounmpo,MIL,8,11,0.421,13,13,31.8,12.0,...,1.8,0.5,2.3,0.27,9.2,2.2,11.3,1.4,False,2026
9,10,Tyrese Maxey,PHI,9,8,0.529,17,17,39.9,10.8,...,2.2,0.6,2.8,0.194,6.3,0.4,6.7,1.5,False,2026


In [145]:
current_mvp_candidates.to_csv("current_mvp_candidates.csv")