In [1]:
import pandas as pd

In [2]:
stats = pd.read_csv("player_mvp_stats.csv", index_col=0)

In [3]:
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.340,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13018,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,0.484,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
13019,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,0.286,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
13020,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
13021,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


# Data Cleaning

In [4]:
# find all the null values so they can be handled, since ML models typically cannot work with null values
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          47
3P            0
3PA           0
3P%        1996
2P            0
2PA           0
2P%          76
eFG%         47
FT            0
FTA           0
FT%         419
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [5]:
# select the two columns "Player" and "3PA" from the rows of the df where "3P%" is null
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]].head()
# the list of players outputed have 0 3PA and null values on 3P%

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0


In [6]:
# select the two columns "Player" and "FTA" from the rows of the df where "FT%" is null
stats[pd.isnull(stats["FT%"])][["Player", "FTA"]].head()
# the list of players outputed have 0 FTA and null values on FT%

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0


In [7]:
# solution: replace the null values with 0
stats = stats.fillna(0)

# Train the Machine Learning Model

In [8]:
# find the possible predictors
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [9]:
# select the relevant predictors (do not include variable that is directly correlated to the prediction such as 'Pts Won', 
# 'Pts Max', 'Share')
predictors = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 
              'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 
              'PTS', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [10]:
# train on the seasons before 2019 and hold out the 2019 season to predict its MVP
# ("<" rather than "!=" so that any season after 2019 in the data cannot leak into training;
# this also matches the split used by the backtests further down)
train = stats[stats["Year"] < 2019]
test = stats[stats["Year"] == 2019]

In [11]:
from sklearn.linear_model import Ridge

# initialize the Ridge regression model; alpha controls how much the coefficients are shrunk to prevent overfitting
reg = Ridge(alpha=.1)

In [12]:
# fit the model: use the predictor columns in the train df to predict the variable "Share"
reg.fit(train[predictors],train["Share"])


Ridge(alpha=0.1)

In [13]:
# make prediciton using the model: use the predictor columns in the test df to make predictions
predictions = reg.predict(test[predictors])
# convert it to a pd df for a better look
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [14]:
predictions

Unnamed: 0,predictions
595,0.020067
596,-0.003308
597,0.012158
598,-0.001707
599,-0.018801
...,...
12810,0.002494
12811,0.024479
12812,0.021157
12813,-0.009482


In [15]:
# join the "Player" and "Share" columns of the test df with the predictions df
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
# outcome: every player in the 2019 season will have its mvp shares and the its predictions value 

In [16]:
combination

Unnamed: 0,Player,Share,predictions
595,Aaron Gordon,0.0,0.020067
596,Amile Jefferson,0.0,-0.003308
597,D.J. Augustin,0.0,0.012158
598,Evan Fournier,0.0,-0.001707
599,Isaiah Briscoe,0.0,-0.018801
...,...,...,...
12810,Ray Spalding,0.0,0.002494
12811,Richaun Holmes,0.0,0.024479
12812,T.J. Warren,0.0,0.021157
12813,Troy Daniels,0.0,-0.009482


In [17]:
# but we are only interested in the players that have positive mvp shares
combination.sort_values("Share", ascending=False).head(20)


Unnamed: 0,Player,Share,predictions
10609,Giannis Antetokounmpo,0.932,0.218571
8974,James Harden,0.768,0.193765
1271,Paul George,0.352,0.117975
10828,Nikola Jokić,0.21,0.117185
3814,Stephen Curry,0.173,0.114872
2732,Damian Lillard,0.068,0.103557
5021,Joel Embiid,0.049,0.167789
3808,Kevin Durant,0.025,0.134524
12019,Kawhi Leonard,0.013,0.131126
1273,Russell Westbrook,0.008,0.132794


# Identify an Error Metric 

In [18]:
from sklearn.metrics import mean_squared_error

# the function takes the actual values("Share") and the predicted values("predictions") -> return an error metric
# error metric: the mean of the squared differences between the predictions and the actual values
mean_squared_error(combination["Share"], combination["predictions"])

0.002637764085940138

In [19]:
combination["Share"].value_counts()
# problem: most of the players do not get any mvp votes
# it does not make sense to improve the accuracy of the prediction for the players that have no mvp votes -> we only care
# about the top 10-15 players

0.000    518
0.001      2
0.352      1
0.008      1
0.068      1
0.025      1
0.173      1
0.049      1
0.768      1
0.932      1
0.210      1
0.013      1
Name: Share, dtype: int64

In [20]:
# solution: order the players by "Share" (highest first) and number them 1..n in a new "Rk" column
actual = (
    combination
    .sort_values(by="Share", ascending=False)
    .assign(Rk=lambda ranked: list(range(1, len(ranked) + 1)))
)
actual.head(10)

Unnamed: 0,Player,Share,predictions,Rk
10609,Giannis Antetokounmpo,0.932,0.218571,1
8974,James Harden,0.768,0.193765,2
1271,Paul George,0.352,0.117975,3
10828,Nikola Jokić,0.21,0.117185,4
3814,Stephen Curry,0.173,0.114872,5
2732,Damian Lillard,0.068,0.103557,6
5021,Joel Embiid,0.049,0.167789,7
3808,Kevin Durant,0.025,0.134524,8
12019,Kawhi Leonard,0.013,0.131126,9
1273,Russell Westbrook,0.008,0.132794,10


In [21]:
# order the players by "predictions" (highest first) and number them 1..n in a new "Predicted_Rk" column
predicted = (
    combination
    .sort_values(by="predictions", ascending=False)
    .assign(Predicted_Rk=lambda ranked: list(range(1, len(ranked) + 1)))
)
predicted.head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk
10609,Giannis Antetokounmpo,0.932,0.218571,1
8974,James Harden,0.768,0.193765,2
3434,LeBron James,0.001,0.174532,3
5021,Joel Embiid,0.049,0.167789,4
6493,Anthony Davis,0.0,0.163169,5
3808,Kevin Durant,0.025,0.134524,6
1273,Russell Westbrook,0.008,0.132794,7
12019,Kawhi Leonard,0.013,0.131126,8
1271,Paul George,0.352,0.117975,9
10828,Nikola Jokić,0.21,0.117185,10


In [22]:
# merge actual with predicted on "Player" so that each player's actual rank ("Rk") and
# predicted rank ("Predicted_Rk") sit side by side
# note: both frames also carry "Share" and "predictions", which is why those columns
# come back duplicated with _x / _y suffixes in the output
actual.merge(predicted, on="Player").head(20)

Unnamed: 0,Player,Share_x,predictions_x,Rk,Share_y,predictions_y,Predicted_Rk
0,Giannis Antetokounmpo,0.932,0.218571,1,0.932,0.218571,1
1,James Harden,0.768,0.193765,2,0.768,0.193765,2
2,Paul George,0.352,0.117975,3,0.352,0.117975,9
3,Nikola Jokić,0.21,0.117185,4,0.21,0.117185,10
4,Stephen Curry,0.173,0.114872,5,0.173,0.114872,11
5,Damian Lillard,0.068,0.103557,6,0.068,0.103557,13
6,Joel Embiid,0.049,0.167789,7,0.049,0.167789,4
7,Kevin Durant,0.025,0.134524,8,0.025,0.134524,6
8,Kawhi Leonard,0.013,0.131126,9,0.013,0.131126,8
9,Russell Westbrook,0.008,0.132794,10,0.008,0.132794,7


In [23]:
# average precision: an error metric that suits ranked data
def find_ap(combination, top_n=5):
    """Average precision of the predicted ranking against the real top vote-getters.

    Parameters
    ----------
    combination : pandas.DataFrame
        One row per player with the columns "Player", "Share" (actual MVP
        share) and "predictions" (predicted share).
    top_n : int, default 5
        How many of the highest-"Share" players count as the players to find.

    Returns
    -------
    float
        Players are walked from the highest to the lowest prediction; each time
        one of the real top `top_n` players is met, the precision so far
        (found / seen) is recorded, and the mean of those precisions is
        returned. 1.0 means the real top players were all predicted first.
        0.0 is returned when nothing can be found (e.g. an empty frame)
        instead of dividing by zero.
    """
    # the players that actually finished in the top `top_n`; a set gives
    # constant-time membership tests inside the loop
    top_players = set(
        combination.sort_values("Share", ascending=False).head(top_n)["Player"]
    )
    # every player, ordered from the highest to the lowest predicted share
    ranked_players = combination.sort_values("predictions", ascending=False)["Player"]

    precisions = []
    found = 0
    for seen, player in enumerate(ranked_players, start=1):
        # the longer it takes to meet the real top players, the lower each
        # recorded precision and therefore the lower the final score
        if player in top_players:
            found += 1
            precisions.append(found / seen)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [24]:
ap = find_ap(combination)


In [25]:
# 1 would be perfect, the smaller the value -> the longer it takes for our predictions to find the correct mvps
ap

0.6375757575757576

# Backtesting to predict each year

In [26]:
years = list(range(1991,2020))


In [27]:
aps = []
all_predictions = []
# the first 5 entries of `years` are skipped; every remaining year is predicted by a
# model trained on all the rows from earlier years
for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]

    # fit on the earlier years, then predict "Share" for the held-out year
    reg.fit(train[predictors], train["Share"])
    predictions = pd.DataFrame(
        reg.predict(test[predictors]),
        columns=["predictions"],
        index=test.index,
    )

    # put the real shares next to the predictions and score this year
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [28]:
# calculate the mean average precision 
sum(aps) / len(aps)


0.6992185272708215

In [29]:
def add_ranks(predictions):
    """Return a copy of `predictions` with rank columns, ordered by actual share.

    Adds "Predicted_Rk" (1 = highest "predictions"), "Rk" (1 = highest
    "Share") and "Diff" (= Rk - Predicted_Rk). The frame that was passed in is
    left untouched because sorting produces a new frame.
    """
    positions = list(range(1, predictions.shape[0] + 1))

    # number the rows 1..n in order of predicted share
    by_prediction = predictions.sort_values("predictions", ascending=False)
    by_prediction["Predicted_Rk"] = positions

    # re-order by the real share and number the rows 1..n again
    by_share = by_prediction.sort_values("Share", ascending=False)
    by_share["Rk"] = positions

    # negative: the player really finished higher than the model ranked him
    by_share["Diff"] = by_share["Rk"] - by_share["Predicted_Rk"]
    return by_share

In [30]:
add_ranks(all_predictions[1])
# 

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
1476,Karl Malone,0.857,0.192360,2,1,-1
9825,Michael Jordan,0.832,0.167672,3,2,-1
835,Grant Hill,0.327,0.128664,6,3,-3
4278,Tim Hardaway,0.207,0.059992,20,4,-16
7681,Glen Rice,0.117,0.033122,53,5,-48
...,...,...,...,...,...,...
9437,Horacio Llamas,0.000,0.010225,156,437,281
3346,Ennis Whatley,0.000,0.010287,155,438,283
9895,Kevin Salvadori,0.000,0.010594,154,439,285
1048,Aaron Williams,0.000,0.010608,153,440,287


In [31]:
# create a function so that you can backtest on every model easily 
def backtest(stats, model, years, predictors):
    """Backtest `model` one year at a time.

    For every entry of `years`, the model is fitted on the rows of `stats`
    whose "Year" is earlier and then predicts "Share" for the rows of that
    year, using the `predictors` columns.

    Returns a tuple: (mean of the yearly average precisions, the list of
    yearly average precisions, all yearly prediction frames concatenated,
    including the rank columns added by `add_ranks`).
    """
    yearly_aps = []
    yearly_frames = []
    for season in years:
        history = stats[stats["Year"] < season]
        holdout = stats[stats["Year"] == season]

        model.fit(history[predictors], history["Share"])
        predicted = pd.DataFrame(
            model.predict(holdout[predictors]),
            columns=["predictions"],
            index=holdout.index,
        )

        # real shares next to the predictions, plus the rank columns
        ranked = add_ranks(
            pd.concat([holdout[["Player", "Share"]], predicted], axis=1)
        )
        yearly_frames.append(ranked)
        yearly_aps.append(find_ap(ranked))

    mean_ap = sum(yearly_aps) / len(yearly_aps)
    return mean_ap, yearly_aps, pd.concat(yearly_frames)

In [32]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)


In [33]:
mean_ap


0.6992185272708215

# Diagnose Model Performance 

In [34]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(10)


Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
1134,Jason Kidd,0.712,0.027957,52,2,-50
4771,Steve Nash,0.839,0.032686,47,1,-46
7949,Peja Stojaković,0.228,0.035581,39,4,-35
4789,Steve Nash,0.739,0.051338,35,1,-34
11730,Joakim Noah,0.258,0.04739,37,4,-33
4804,Steve Nash,0.785,0.070791,23,2,-21
4278,Tim Hardaway,0.207,0.059992,20,4,-16
864,Allen Iverson,0.27,0.071899,14,4,-10
1067,Gary Payton,0.372,0.076145,13,3,-10
6249,Kobe Bryant,0.291,0.077829,14,4,-10


In [35]:
# reg.coef_ is an array with one coefficient per predictor: the higher the coefficient -> the more important the regression thinks that predictor is
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)


Unnamed: 0,0,1
13,0.037195,eFG%
18,0.03266,DRB
28,0.027545,W/L%
17,0.020669,ORB
10,0.017794,2P
22,0.01221,BLK
21,0.011484,STL
7,0.010801,3P
15,0.009766,FTA
4,0.008543,FG


# Add more Predictors

In [37]:
# for every column below, divide the values by that column's mean for the same year
# if the player's stat is above the yearly mean -> greater than 1
# if the player's stat is below the yearly mean -> smaller than 1
# transform("mean") returns the yearly means aligned to the original rows, so the ratios
# keep the same index as stats (the grouping column "Year" itself is not divided)
stat_ratios = (
    stats[["PTS", "AST", "STL", "BLK", "3P"]]
    / stats.groupby("Year")[["PTS", "AST", "STL", "BLK", "3P"]].transform("mean")
)

In [38]:
# add the stat_ratio columns to stats df
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [43]:
stats.head(5)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [39]:
# add them to the list of predictors; columns that are already in the list are skipped
# so that re-running this cell does not add duplicate predictors
predictors += [
    col
    for col in ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]
    if col not in predictors
]

In [40]:
# run backtest on the model again to see if there is any improvement
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [41]:
mean_ap

0.70464179827673

In [44]:
# position and team are categorical strings
# we first turn them into a category and assign each category a code
stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [45]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,2,15
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,11,15
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,2,15
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,2,15
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,8,15


In [46]:
# since the position and team codes are categorical and have no linear relationship with "Share" -> use Random Forest
# Random Forest creates a series of decision trees and averages the predictions from the decision trees
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors + ["NPos", "NTm"])

In [47]:
mean_ap

0.7731092436974789

In [48]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)


In [49]:
mean_ap


0.6375757575757576

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [None]:
def backtest(stats, model, years, predictors, scaler=None):
    """Backtest `model` one year at a time on scaled predictors.

    For every entry of `years`, the scaler is fitted on the rows of `stats`
    whose "Year" is earlier, the predictors of both the training rows and the
    rows of that year are transformed with it, and the model is fitted on the
    former and predicts "Share" for the latter.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain "Year", "Player", "Share" and every column in `predictors`.
    model : object with `fit(X, y)` and `predict(X)`
    years : iterable
        The years to predict, one fold per entry.
    predictors : list of str
        Columns used as model inputs.
    scaler : object with `fit(X)` and `transform(X)`, optional
        Defaults to the notebook-level `sc`, which is what this function
        always used before the parameter existed. The scaler is refitted on
        every fold, so after the call it holds the fit of the last fold.

    Returns
    -------
    tuple
        (mean of the yearly average precisions, the list of yearly average
        precisions, all yearly prediction frames concatenated, including the
        rank columns added by `add_ranks`).

    Raises
    ------
    ValueError
        If `years` is empty (there would be nothing to average).
    """
    if scaler is None:
        # keep the previous behaviour: fall back to the shared notebook scaler
        scaler = sc

    years = list(years)
    if not years:
        raise ValueError("backtest needs at least one year to predict")

    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]

        # fit the scaler on the training rows only, so nothing about the year
        # being predicted leaks into the scaling
        scaler.fit(train[predictors])
        # only the predictor columns are scaled, into fresh frames; the full
        # train/test frames no longer need to be copied and overwritten
        train_scaled = pd.DataFrame(
            scaler.transform(train[predictors]), columns=predictors, index=train.index
        )
        test_scaled = pd.DataFrame(
            scaler.transform(test[predictors]), columns=predictors, index=test.index
        )

        model.fit(train_scaled, train["Share"])
        predictions = model.predict(test_scaled)
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [None]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [None]:
mean_ap


In [None]:
sc.transform(stats[predictors])
