In [1]:
import pandas as pd

In [2]:
# Load the dataset; index_col=0 uses the first CSV column as the row index
stats = pd.read_csv("player_mvp_stats.csv", index_col=0)
# Bare expression so the notebook displays the loaded frame
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,Doc Rivers,PG,29,ATL,79,79,32.7,5.6,12.9,0.435,...,0.0,0.00,Atlanta Hawks,43,39,0.524,18.0,109.8,109.0,0.72
1,Dominique Wilkins,SF,31,ATL,81,81,38.0,9.5,20.2,0.470,...,960.0,0.03,Atlanta Hawks,43,39,0.524,18.0,109.8,109.0,0.72
2,Duane Ferrell,SF,25,ATL,78,2,14.9,2.2,4.6,0.489,...,0.0,0.00,Atlanta Hawks,43,39,0.524,18.0,109.8,109.0,0.72
3,Gary Leonard,C,23,ATL,4,0,2.3,0.0,0.0,0.000,...,0.0,0.00,Atlanta Hawks,43,39,0.524,18.0,109.8,109.0,0.72
4,John Battle,SG,28,ATL,79,2,23.6,5.0,10.9,0.461,...,0.0,0.00,Atlanta Hawks,43,39,0.524,18.0,109.8,109.0,0.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15231,Monte Morris,PG,27,WAS,62,61,27.3,4.0,8.3,0.480,...,0.0,0.00,Washington Wizards,35,47,0.427,9.0,113.2,114.4,-1.06
15232,Quenton Jackson,PG,24,WAS,9,0,15.0,2.1,4.7,0.452,...,0.0,0.00,Washington Wizards,35,47,0.427,9.0,113.2,114.4,-1.06
15233,Taj Gibson,C,37,WAS,49,2,9.8,1.3,2.6,0.520,...,0.0,0.00,Washington Wizards,35,47,0.427,9.0,113.2,114.4,-1.06
15234,Vernon Carey Jr.,C,21,WAS,11,0,2.5,0.2,0.7,0.250,...,0.0,0.00,Washington Wizards,35,47,0.427,9.0,113.2,114.4,-1.06


In [3]:
# I cleaned this during scraping, so I don't need to amend anything here.
# Count the null values in each column to confirm.
pd.isnull(stats).sum()

Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

In [4]:
# Training the machine learning model
# List every available column before choosing the predictors
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [5]:
# Use only the numeric columns that are conducive to predicting an MVP.
# 'Pts Won' and 'Pts Max' are left out to prevent overfitting, as they correlate with Share.
predictors = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%',
       'GB', 'PS/G', 'PA/G', 'SRS']


In [6]:
# Split by season: train on every year before 2023, hold out 2023 for testing
train = stats[stats["Year"] < 2023]
test = stats[stats["Year"] == 2023]

In [7]:
# Ridge: a form of linear regression designed to prevent overfitting
from sklearn.linear_model import Ridge
# alpha sets the regularization strength
reg = Ridge(alpha=.1)

In [8]:
# Fit the ridge model on the training seasons, with the Share column as the target
reg.fit(train[predictors], train["Share"])

In [9]:
# Predict Share for every row of the held-out 2023 season
predictions = reg.predict(test[predictors])

In [10]:
# predict() gives back a NumPy array, so wrap it in a DataFrame.
# Reusing test.index keeps each prediction lined up with its player row.
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [11]:
# Display the predictions frame
predictions

Unnamed: 0,predictions
506,-0.018614
507,-0.023026
508,-0.005941
509,-0.010201
510,0.027308
...,...
15231,0.002135
15232,-0.003488
15233,-0.013604
15234,-0.004369


In [12]:
# Put each test player's actual Share next to the predicted value (column-wise concat on the shared index)
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [13]:
# Display actual vs. predicted values
combination

Unnamed: 0,Player,Share,predictions
506,AJ Griffin,0.0,-0.018614
507,Aaron Holiday,0.0,-0.023026
508,Bogdan Bogdanović,0.0,-0.005941
509,Bruno Fernando,0.0,-0.010201
510,Clint Capela,0.0,0.027308
...,...,...,...
15231,Monte Morris,0.0,0.002135
15232,Quenton Jackson,0.0,-0.003488
15233,Taj Gibson,0.0,-0.013604
15234,Vernon Carey Jr.,0.0,-0.004369


In [14]:
# Rows with the highest actual Share first (up to twenty)
combination.sort_values("Share", ascending=False).head(20)

Unnamed: 0,Player,Share,predictions
11333,Joel Embiid,0.915,0.20273
3821,Nikola Jokić,0.674,0.173181
8327,Giannis Antetokounmpo,0.606,0.225694
1031,Jayson Tatum,0.28,0.136305
10296,Shai Gilgeous-Alexander,0.046,0.150951
2763,Donovan Mitchell,0.03,0.087769
12843,Domantas Sabonis,0.027,0.092824
3302,Luka Dončić,0.01,0.194363
4855,Stephen Curry,0.005,0.106164
7820,Jimmy Butler,0.003,0.112081


In [15]:
# Identifying an error metric: mean squared error between actual and predicted Share
from sklearn.metrics import mean_squared_error
mean_squared_error(combination["Share"], combination["predictions"])

0.0026601929079442537

In [16]:
# The vast majority of players didn't get any MVP votes, so MSE is not that helpful:
# count how often each Share value occurs in the test season
combination["Share"].value_counts()

Share
0.000    526
0.001      2
0.280      1
0.030      1
0.010      1
0.674      1
0.005      1
0.003      1
0.606      1
0.046      1
0.915      1
0.002      1
0.027      1
Name: count, dtype: int64

In [17]:
# Add an Rk column: rank by actual Share, numbered from 1 (highest Share = 1).
# Ranks are just positions after the sort, so tied shares still get distinct consecutive ranks.
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1,combination.shape[0]+1))

In [18]:
# First ten rows of the current ordering
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
11333,Joel Embiid,0.915,0.20273,1
3821,Nikola Jokić,0.674,0.173181,2
8327,Giannis Antetokounmpo,0.606,0.225694,3
1031,Jayson Tatum,0.28,0.136305,4
10296,Shai Gilgeous-Alexander,0.046,0.150951,5
2763,Donovan Mitchell,0.03,0.087769,6
12843,Domantas Sabonis,0.027,0.092824,7
3302,Luka Dončić,0.01,0.194363,8
4855,Stephen Curry,0.005,0.106164,9
7820,Jimmy Butler,0.003,0.112081,10


In [19]:
# Add a Predicted_Rk column: rank by predicted value, numbered from 1 (highest prediction = 1).
# This leaves the frame sorted by predictions rather than by Share.
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [20]:
# First ten rows of the current ordering
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
8327,Giannis Antetokounmpo,0.606,0.225694,3,1
11333,Joel Embiid,0.915,0.20273,1,2
3302,Luka Dončić,0.01,0.194363,8,3
3821,Nikola Jokić,0.674,0.173181,2,4
10296,Shai Gilgeous-Alexander,0.046,0.150951,5,5
11839,Kevin Durant,0.0,0.140954,234,6
6913,Anthony Davis,0.0,0.140513,415,7
12332,Damian Lillard,0.0,0.13672,225,8
1031,Jayson Tatum,0.28,0.136305,4,9
6920,LeBron James,0.0,0.132678,441,10


In [21]:
# Error metric: average precision
# We only care about the top 5 vote-getters

# Show the ten highest actual shares alongside their actual and predicted ranks
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
11333,Joel Embiid,0.915,0.20273,1,2
3821,Nikola Jokić,0.674,0.173181,2,4
8327,Giannis Antetokounmpo,0.606,0.225694,3,1
1031,Jayson Tatum,0.28,0.136305,4,9
10296,Shai Gilgeous-Alexander,0.046,0.150951,5,5
2763,Donovan Mitchell,0.03,0.087769,6,22
12843,Domantas Sabonis,0.027,0.092824,7,18
3302,Luka Dončić,0.01,0.194363,8,3
4855,Stephen Curry,0.005,0.106164,9,14
7820,Jimmy Butler,0.003,0.112081,10,13


In [22]:
# Function to sort and take top 5 winners

def find_ap(combination, top_n=5):
    """Average precision of the predicted ordering against the actual top vote-getters.

    The ``top_n`` rows with the highest ``Share`` are the targets. Rows are then
    walked in descending ``predictions`` order; each time a target row is reached,
    the precision so far (targets found / rows seen) is recorded. The mean of
    those precisions is returned.

    Parameters
    ----------
    combination : pandas.DataFrame
        Needs ``Share`` and ``predictions`` columns, one row per player-season.
        The frame is not modified.
    top_n : int, default 5
        How many of the highest-``Share`` rows count as targets.

    Returns
    -------
    float
        Average precision in [0, 1]; ``0.0`` when there are no rows or
        ``top_n`` is not positive (instead of dividing by zero).
    """
    if len(combination) == 0 or top_n <= 0:
        return 0.0

    # Work on positional labels: they are unique per row, so a target is matched
    # by row rather than by player name (two rows sharing a name would otherwise
    # both be counted as hits).
    frame = combination.reset_index(drop=True)
    target_rows = frame.sort_values("Share", ascending=False).head(top_n).index
    predicted_order = frame.sort_values("predictions", ascending=False).index

    hits = predicted_order.isin(target_rows)
    ps = []
    found = 0
    for seen, is_hit in enumerate(hits, start=1):
        if is_hit:
            found += 1
            ps.append(found / seen)
            if found == len(target_rows):
                # Every target has been located; later rows cannot add precisions.
                break

    return sum(ps) / len(ps) if ps else 0.0

In [23]:
# Average precision of the predictions currently held in `combination`
ap = find_ap(combination)
ap

0.8211111111111112

In [24]:
# Backtesting to predict each year
years = list(range(1991,2024))

# One average-precision score and one actual-vs-predicted frame per backtested season
aps = []
all_predictions = []
# years[5:] starts at 1996, so 1991-1995 are only ever used as training data
for year in years[5:]:
    # Train on every earlier season, test on the current one
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    # reg is refit in place each iteration
    reg.fit(train[predictors],train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))


In [25]:
# Mean average precision over all backtested seasons
sum(aps) / len(aps)

0.7189767597180119

In [26]:
def add_ranks(predictions):
    """Return a copy of ``predictions`` with rank columns, sorted by actual Share.

    Adds three columns, in this order:

    * ``Predicted_Rk`` - 1-based position when sorted by ``predictions`` descending.
    * ``Rk`` - 1-based position when sorted by ``Share`` descending.
    * ``Diff`` - ``Rk - Predicted_Rk``; negative when the predicted rank number
      is larger (worse) than the actual one.

    The input frame is left untouched; the returned frame is ordered by
    ``Share`` descending.
    """
    by_prediction = predictions.sort_values("predictions", ascending=False)
    row_count = by_prediction.shape[0]
    by_prediction["Predicted_Rk"] = [position + 1 for position in range(row_count)]

    by_share = by_prediction.sort_values("Share", ascending=False)
    by_share["Rk"] = [position + 1 for position in range(row_count)]

    by_share["Diff"] = by_share["Rk"] - by_share["Predicted_Rk"]
    return by_share

In [27]:
# Rank the most recent backtested season once, keep the actual top five, and
# list them by Diff (Rk - Predicted_Rk), largest first.
# At this point all_predictions is still the list built by the manual loop, so [-1] is its last season.
add_ranks(all_predictions[-1]).loc[lambda ranked: ranked["Rk"] <= 5].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
8327,Giannis Antetokounmpo,0.606,0.225694,1,3,2
10296,Shai Gilgeous-Alexander,0.046,0.150951,5,5,0
11333,Joel Embiid,0.915,0.20273,2,1,-1
3821,Nikola Jokić,0.674,0.173181,4,2,-2
1031,Jayson Tatum,0.28,0.136305,9,4,-5


In [28]:
def backtest(stats, model, years, predictors):
    """Refit ``model`` season by season and score each season's predictions.

    For every entry of ``years`` the model is fit on all rows with an earlier
    ``Year`` (target: ``Share``) and used to predict the rows of that year.
    The predictions are ranked with ``add_ranks`` and scored with ``find_ap``.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain ``Year``, ``Player``, ``Share`` and every column in ``predictors``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refit in place on each iteration, so afterwards it holds the fit for
        the last year processed.
    years : iterable
        Years to evaluate, in the order given.
    predictors : list of str
        Feature column names.

    Returns
    -------
    tuple
        ``(mean average precision, list of per-year average precisions,
        one DataFrame concatenating every year's ranked predictions)``.
    """
    season_scores = []
    season_frames = []
    for season in years:
        history = stats[stats["Year"] < season]
        holdout = stats[stats["Year"] == season]

        model.fit(history[predictors], history["Share"])
        predicted = pd.DataFrame(
            model.predict(holdout[predictors]),
            columns=["predictions"],
            index=holdout.index,
        )

        # Actual and predicted values side by side, then ranked
        ranked = add_ranks(pd.concat([holdout[["Player", "Share"]], predicted], axis=1))
        season_frames.append(ranked)
        season_scores.append(find_ap(ranked))

    mean_score = sum(season_scores) / len(season_scores)
    return mean_score, season_scores, pd.concat(season_frames)

In [29]:
# Run the same backtest through the helper (same seasons as the manual loop).
# Note: this rebinds all_predictions from a list of per-season frames to one concatenated DataFrame.
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)
mean_ap

0.7189767597180119

In [30]:
# Biggest misses among the top vote-getters across all backtested seasons:
# most negative Diff first, i.e. the predicted rank number was far larger than the actual one.
# NOTE(review): `< 5` keeps actual ranks 1-4 only, while the other top-five checks in this notebook use 5 / `<= 5` - confirm intent.
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
9018,Jason Kidd,0.712,0.027957,52,2,-50
11555,Steve Nash,0.839,0.032686,47,1,-46
12546,Peja Stojaković,0.228,0.035581,39,4,-35
11573,Steve Nash,0.739,0.051338,35,1,-34
2073,Joakim Noah,0.258,0.04739,37,4,-33
11588,Steve Nash,0.785,0.070791,23,2,-21
7397,Tim Hardaway,0.207,0.059992,20,4,-16
11814,Devin Booker,0.216,0.092999,16,4,-12
6727,Kobe Bryant,0.291,0.077829,14,4,-10
13476,Gary Payton,0.372,0.076145,13,3,-10


In [31]:
# Pair each fitted coefficient (column 0) with its predictor name (column 1), largest coefficient first.
# reg was last refit inside backtest(), so these values come from the fit for the final backtested season.
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.099448,eFG%
18,0.032759,DRB
28,0.028856,W/L%
17,0.020541,ORB
10,0.016192,2P
21,0.012445,STL
22,0.010485,BLK
15,0.010386,FTA
12,0.00765,2P%
25,0.006978,PTS


In [32]:
# Adding more predictors
# Divide each selected stat by its mean within the same season, giving values relative to that year's average.
# NOTE(review): "Year" is both the grouping key and one of the selected columns, so it is divided by its own
# mean as well (1.0 everywhere in the recorded output), and the result is indexed by (Year, original row label).
# The recorded output also echoes this line, presumably from a pandas warning about apply operating on the
# grouping column - confirm, and check that later cells expect this index shape.
stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())
stat_ratios

  stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())


Unnamed: 0_level_0,Unnamed: 1_level_0,PTS,AST,STL,BLK,3P,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1991,0,1.692601,2.010078,2.608773,1.346939,5.594452,1.0
1991,1,2.884104,1.542618,2.059558,1.795918,5.085865,1.0
1991,2,0.679268,0.327222,0.549215,0.673469,0.000000,1.0
1991,3,0.055678,0.000000,0.000000,0.673469,0.000000,1.0
1991,4,1.514433,1.262142,0.823823,0.224490,1.017173,1.0
...,...,...,...,...,...,...,...
2023,15231,1.129221,2.561833,1.148205,0.523047,1.312418,1.0
2023,15232,0.679725,0.821720,0.656117,0.261524,0.100955,1.0
2023,15233,0.372752,0.338355,0.492088,0.523047,0.201910,1.0
2023,15234,0.054817,0.145009,0.328058,0.523047,0.000000,1.0
