In [99]:
import pandas as pd

In [100]:
# Load the dataset; the first CSV column is used as the DataFrame index.
stats = pd.read_csv("player_mvp_stats.csv", index_col=0)
# Display the frame as a quick sanity check of the load.
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.340,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15231,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,0.484,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
15232,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,0.286,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
15233,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
15234,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [101]:
# Missing values were cleaned during scraping, so nothing needs to be amended here.
# Count the nulls per column to confirm.
pd.isnull(stats).sum()

Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

In [102]:
# Training the machine learning model
# Start by listing the available columns to choose predictors from.
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [103]:
# Use only numeric columns that are conducive to predicting the MVP.
# 'Pts Won' and 'Pts Max' are left out because they correlate with the target
# ('Share'); keeping them would encourage overfitting.
predictors = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%',
       'GB', 'PS/G', 'PA/G', 'SRS']


In [104]:
# Split by season: train on every year before 2023, test on 2023 only.
train = stats[stats["Year"] < 2023]
test = stats[stats["Year"] == 2023]

In [105]:
# Ridge regression: a form of linear regression designed to limit overfitting.
# alpha sets the regularization strength.
from sklearn.linear_model import Ridge
reg = Ridge(alpha=.1)

In [106]:
# Fit the ridge model on the training seasons, with "Share" as the target.
reg.fit(train[predictors], train["Share"])

In [107]:
# Predict "Share" for the held-out test rows.
predictions = reg.predict(test[predictors])

In [108]:
# predict() returns a NumPy array, so wrap it in a DataFrame. Reusing the test
# index lets the predictions be lined up with the test rows later.
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [109]:
# Inspect the predicted shares.
predictions

Unnamed: 0,predictions
211,0.003184
212,0.031703
213,0.042324
214,0.225694
215,0.000021
...,...
15111,-0.012346
15112,0.004314
15113,-0.014854
15114,0.019197


In [110]:
# Place each test player's actual "Share" next to the model's prediction
# (column-wise concat, aligned on the shared index).
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [111]:
# Inspect actual vs. predicted share.
combination

Unnamed: 0,Player,Share,predictions
211,A.J. Green,0.000,0.003184
212,Bobby Portis,0.000,0.031703
213,Brook Lopez,0.000,0.042324
214,Giannis Antetokounmpo,0.606,0.225694
215,Goran Dragić,0.000,0.000021
...,...,...,...
15111,Mitchell Robinson,0.000,-0.012346
15112,Obi Toppin,0.000,0.004314
15113,Quentin Grimes,0.000,-0.014854
15114,RJ Barrett,0.000,0.019197


In [112]:
# Show the 20 rows with the highest actual vote share alongside their predictions.
combination.sort_values("Share", ascending=False).head(20)

Unnamed: 0,Player,Share,predictions
14849,Joel Embiid,0.915,0.20273
736,Nikola Jokić,0.674,0.173181
214,Giannis Antetokounmpo,0.606,0.225694
2898,Jayson Tatum,0.28,0.136305
1322,Shai Gilgeous-Alexander,0.046,0.150951
13682,Donovan Mitchell,0.03,0.087769
4274,Domantas Sabonis,0.027,0.092824
306,Luka Dončić,0.01,0.194363
6686,Stephen Curry,0.005,0.106164
10869,Jimmy Butler,0.003,0.112081


In [113]:
# Identifying an error metric: start with the mean squared error between the
# actual and predicted share.
from sklearn.metrics import mean_squared_error
mean_squared_error(combination["Share"], combination["predictions"])

0.0026601929079443304

In [114]:
# The vast majority of players didn't get any MVP votes (Share == 0), so MSE is
# not that helpful here. Count how often each share value occurs to confirm.
combination["Share"].value_counts()

Share
0.000    526
0.001      2
0.606      1
0.010      1
0.674      1
0.046      1
0.280      1
0.002      1
0.027      1
0.005      1
0.003      1
0.030      1
0.915      1
Name: count, dtype: int64

In [115]:
# Order by actual vote share (highest first) and attach a 1-based "Rk" column
combination = combination.sort_values("Share", ascending=False).assign(
    Rk=list(range(1, len(combination) + 1))
)

In [116]:
# Top 10 by actual share, now carrying their actual rank.
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
14849,Joel Embiid,0.915,0.20273,1
736,Nikola Jokić,0.674,0.173181,2
214,Giannis Antetokounmpo,0.606,0.225694,3
2898,Jayson Tatum,0.28,0.136305,4
1322,Shai Gilgeous-Alexander,0.046,0.150951,5
13682,Donovan Mitchell,0.03,0.087769,6
4274,Domantas Sabonis,0.027,0.092824,7
306,Luka Dončić,0.01,0.194363,8
6686,Stephen Curry,0.005,0.106164,9
10869,Jimmy Butler,0.003,0.112081,10


In [117]:
# Order by predicted share (highest first) and attach a 1-based "Predicted_Rk" column
combination = combination.sort_values("predictions", ascending=False).assign(
    Predicted_Rk=list(range(1, len(combination) + 1))
)

In [118]:
# Top 10 by predicted share, showing the actual rank next to the predicted rank.
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
214,Giannis Antetokounmpo,0.606,0.225694,3,1
14849,Joel Embiid,0.915,0.20273,1,2
306,Luka Dončić,0.01,0.194363,8,3
736,Nikola Jokić,0.674,0.173181,2,4
1322,Shai Gilgeous-Alexander,0.046,0.150951,5,5
11528,Kevin Durant,0.0,0.140954,50,6
8563,Anthony Davis,0.0,0.140513,139,7
8083,Damian Lillard,0.0,0.13672,90,8
2898,Jayson Tatum,0.28,0.136305,4,9
8570,LeBron James,0.0,0.132678,132,10


In [119]:
# Error metric: average precision.
# We only care about how well the model ranks the top 5 vote-getters, so look at
# where the highest-share players landed in the predicted ranking.

combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
14849,Joel Embiid,0.915,0.20273,1,2
736,Nikola Jokić,0.674,0.173181,2,4
214,Giannis Antetokounmpo,0.606,0.225694,3,1
2898,Jayson Tatum,0.28,0.136305,4,9
1322,Shai Gilgeous-Alexander,0.046,0.150951,5,5
13682,Donovan Mitchell,0.03,0.087769,6,22
4274,Domantas Sabonis,0.027,0.092824,7,18
306,Luka Dončić,0.01,0.194363,8,3
6686,Stephen Curry,0.005,0.106164,9,14
10869,Jimmy Butler,0.003,0.112081,10,13


In [120]:
def find_ap(combination, top_n=5):
    """Average precision of the predicted ranking against the actual top vote-getters.

    Rows are walked in descending order of ``predictions``. Each time a row's
    player is one of the ``top_n`` players with the highest ``Share``, the
    precision so far (hits found / rows seen) is recorded. The result is the
    mean of the recorded precisions.

    Parameters
    ----------
    combination : pandas.DataFrame
        Must contain the columns "Player", "Share" and "predictions".
    top_n : int, default 5
        How many of the highest-``Share`` rows count as relevant.

    Returns
    -------
    float
        The average precision, or 0.0 when no relevant player is found
        (for example an empty frame) instead of raising ZeroDivisionError.
    """
    actual = combination.sort_values("Share", ascending=False).head(top_n)
    predicted = combination.sort_values("predictions", ascending=False)

    # One vectorised membership test replaces a per-row scan of the top players.
    is_hit = predicted["Player"].isin(actual["Player"]).tolist()

    precisions = []
    found = 0
    for seen, hit in enumerate(is_hit, start=1):
        if hit:
            found += 1
            precisions.append(found / seen)

    # Nothing relevant was found: avoid dividing by zero.
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [121]:
# Average precision of the current `combination` frame (actual vs. predicted share).
ap = find_ap(combination)
ap

0.821111111111111

In [122]:
# Backtesting: predict each season using only the seasons before it
years = [*range(1991, 2024)]

aps, all_predictions = [], []
# years[5:] starts at 1996, which leaves 1991-1995 as the first training window
for year in years[5:]:
    train, test = stats[stats["Year"] < year], stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = pd.DataFrame(
        reg.predict(test[predictors]),
        columns=["predictions"],
        index=test.index,
    )
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))


In [123]:
# Mean average precision across all backtested seasons.
sum(aps) / len(aps)

0.7189767597180119

In [124]:
def add_ranks(predictions):
    """Return a copy of ``predictions`` with rank columns, ordered by actual share.

    Adds three columns:
    - "Predicted_Rk": 1-based position when sorted by "predictions", descending.
    - "Rk": 1-based position when sorted by "Share", descending.
    - "Diff": ``Rk - Predicted_Rk``.

    The input frame is not modified; the returned frame is sorted by "Share",
    highest first.
    """
    positions = list(range(1, len(predictions) + 1))
    by_prediction = predictions.sort_values("predictions", ascending=False).assign(
        Predicted_Rk=positions
    )
    by_share = by_prediction.sort_values("Share", ascending=False).assign(Rk=positions)
    return by_share.assign(Diff=by_share["Rk"] - by_share["Predicted_Rk"])

In [125]:
# Rank the last backtested season once (the original expression called
# add_ranks twice), then show the actual top 5 ordered by rank difference,
# largest Diff first.
latest_ranked = add_ranks(all_predictions[-1])
latest_ranked[latest_ranked["Rk"] <= 5].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
214,Giannis Antetokounmpo,0.606,0.225694,1,3,2
1322,Shai Gilgeous-Alexander,0.046,0.150951,5,5,0
14849,Joel Embiid,0.915,0.20273,2,1,-1
736,Nikola Jokić,0.674,0.173181,4,2,-2
2898,Jayson Tatum,0.28,0.136305,9,4,-5


In [126]:
def backtest(stats, model, years, predictors):
    """Fit and score ``model`` season by season, training only on earlier seasons.

    For each entry in ``years`` the model is fitted on the rows of ``stats``
    whose "Year" is smaller, used to predict "Share" for the rows of that year,
    and scored with ``find_ap`` after ``add_ranks`` has added the rank columns.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain "Year", "Player", "Share" and every column in ``predictors``.
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``.
    years : iterable
        The seasons to evaluate.
    predictors : list
        Column names used as model inputs.

    Returns
    -------
    tuple
        (mean average precision, list of per-year average precisions,
        one DataFrame concatenating every year's ranked predictions).
    """
    yearly_scores = []
    yearly_frames = []
    for season in years:
        history = stats[stats["Year"] < season]
        holdout = stats[stats["Year"] == season]
        model.fit(history[predictors], history["Share"])
        predicted = pd.DataFrame(
            model.predict(holdout[predictors]),
            columns=["predictions"],
            index=holdout.index,
        )
        ranked = add_ranks(pd.concat([holdout[["Player", "Share"]], predicted], axis=1))
        yearly_frames.append(ranked)
        yearly_scores.append(find_ap(ranked))
    mean_score = sum(yearly_scores) / len(yearly_scores)
    return mean_score, yearly_scores, pd.concat(yearly_frames)

In [127]:
# Run the same backtest through the reusable function.
# NOTE: this rebinds `aps` and `all_predictions`. `all_predictions` changes from a
# list of per-year frames to the single concatenated DataFrame that backtest
# returns, so the earlier cell that indexes `all_predictions[-1]` will not work
# if it is re-run after this one.
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)
mean_ap

0.7189767597180119

In [128]:
# Biggest misses among the top vote-getters across all backtested seasons.
# Diff = Rk - Predicted_Rk, so the most negative values are players the model
# ranked far lower than their actual rank.
# NOTE(review): `< 5` keeps ranks 1-4 only, whereas find_ap and the earlier
# `Rk <= 5` cell treat the top 5 as relevant; confirm which cutoff is intended.
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
1441,Jason Kidd,0.712,0.027957,52,2,-50
5674,Steve Nash,0.839,0.032686,47,1,-46
9277,Peja Stojaković,0.228,0.035581,39,4,-35
5692,Steve Nash,0.739,0.051338,35,1,-34
13800,Joakim Noah,0.258,0.04739,37,4,-33
5707,Steve Nash,0.785,0.070791,23,2,-21
5151,Tim Hardaway,0.207,0.059992,20,4,-16
980,Devin Booker,0.216,0.092999,16,4,-12
7330,Kobe Bryant,0.291,0.077829,14,4,-10
1374,Gary Payton,0.372,0.076145,13,3,-10


In [129]:
# Pair each fitted coefficient (column 0) with its predictor name (column 1),
# largest coefficient first. `reg` holds whatever fit ran last; here that is the
# final iteration of the backtest above.
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.099448,eFG%
18,0.032759,DRB
28,0.028856,W/L%
17,0.020541,ORB
10,0.016192,2P
21,0.012445,STL
22,0.010485,BLK
15,0.010386,FTA
12,0.00765,2P%
25,0.006978,PTS


In [139]:
# Adding more predictors
# Express each selected stat as a ratio to that season's mean (value / yearly mean).
# NOTE(review): "Year" is one of the selected columns, so it is divided by its own
# group mean as well and shows up as 1.0 in the result; presumably it is only
# needed as the grouping key. Confirm before using the "Year" column downstream.
stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())
stat_ratios

Unnamed: 0_level_0,Unnamed: 1_level_0,PTS,AST,STL,BLK,3P,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1991,0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1991,1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
1991,2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
1991,3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
1991,4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...,...
2023,15111,0.811285,0.435028,1.476263,4.707424,0.000000,1.0
2023,15112,0.811285,0.483365,0.492088,0.523047,1.312418,1.0
2023,15113,1.238854,1.015066,1.148205,1.046094,2.221015,1.0
2023,15114,2.148808,1.353421,0.656117,0.523047,1.716239,1.0
