In [59]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [60]:
# Load the per-player season table (box-score stats, MVP voting columns, team record) from CSV.
player_stats = pd.read_csv("player_mvp_stats.csv")

In [61]:
# Preview the loaded table.
player_stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14088,14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14089,14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14090,14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [62]:
# Drop the leftover "Unnamed: 0" column (presumably an index saved into the CSV).
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
player_stats = player_stats.drop(columns=["Unnamed: 0"], errors="ignore")

In [63]:
# Count missing values per column. The gaps are confined to the percentage
# columns (FG%, 3P%, 2P%, eFG%, FT%); per the author's note these are players
# who never attempted that kind of shot.
player_stats.isnull().sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          50
3P            0
3PA           0
3P%        2042
2P            0
2PA           0
2P%          84
eFG%         50
FT            0
FTA           0
FT%         462
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [64]:
# Players with a missing 3P%: the preview shows 3PA == 0 for them, i.e. they are
# not three-point shooters, so filling the percentage with 0 looks reasonable.
player_stats.loc[player_stats["3P%"].isnull(), ["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14061,Evan Eschmeyer,0.0
14062,Gheorghe Mureșan,0.0
14064,Jim McIlvaine,0.0
14070,Mark Hendrickson,0.0


In [65]:
# Replace every remaining NaN with 0. Only the shooting-percentage columns
# contained NaNs (see the null counts above).
player_stats = player_stats.fillna(0)

In [66]:
# List every column available in the player stats table.
player_stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [67]:
# Predictor columns: all numeric player and team stats.
# Excluded: identifiers (Player, Pos, Tm, Team), Year, and the voting columns
# (Pts Won, Pts Max, Share). Share is the regression target.
valiables = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', 
             '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 
             'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%','GB', 'PS/G', 'PA/G', 'SRS']

In [68]:
# Train only on seasons before 2021: it is important not to use stats from the
# season we are trying to predict.
train = player_stats.loc[player_stats["Year"] < 2021]

In [69]:
# Hold out the 2021 season as the test set.
test = player_stats.loc[player_stats["Year"] == 2021]

In [70]:
# alpha sets the strength of the L2 penalty: larger values shrink the
# coefficients more, which helps avoid overfitting.
reg = Ridge(alpha=.1)

In [71]:
# Fit the ridge model to predict Share from the predictor columns.
reg.fit(train[valiables], train["Share"])

Ridge(alpha=0.1)

In [72]:
# Predict Share for every player in the held-out 2021 season.
predictions = reg.predict(test[valiables])

In [73]:
# Wrap the raw prediction array in a DataFrame that reuses test's index, so it
# can be aligned with the test rows when concatenated.
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [74]:
# Preview the predictions. A linear model is unbounded, so negative values appear.
predictions

Unnamed: 0,predictions
630,0.014645
631,-0.012943
632,0.003629
633,-0.002785
634,0.012618
...,...
13897,-0.012088
13898,-0.009945
13899,0.016991
13900,-0.018812


In [75]:
# Put player name, actual Share and predicted Share side by side (aligned on index).
combined_db = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [76]:
# Preview actual vs. predicted Share.
combined_db

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.014645
631,Austin Rivers,0.0,-0.012943
632,Bol Bol,0.0,0.003629
633,Facundo Campazzo,0.0,-0.002785
634,Greg Whittington,0.0,0.012618
...,...,...,...
13897,Patty Mills,0.0,-0.012088
13898,Quinndary Weatherspoon,0.0,-0.009945
13899,Rudy Gay,0.0,0.016991
13900,Tre Jones,0.0,-0.018812


In [77]:
# Ten highest actual Share values in 2021, next to the model's predictions.
combined_db.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
641,Nikola Jokić,0.961,0.155154
8624,Joel Embiid,0.58,0.163637
3651,Stephen Curry,0.449,0.14249
9907,Giannis Antetokounmpo,0.345,0.206428
1389,Chris Paul,0.138,0.073983
10997,Luka Dončić,0.042,0.15026
7464,Damian Lillard,0.038,0.116646
3536,Julius Randle,0.02,0.089415
3531,Derrick Rose,0.01,0.035417
11358,Rudy Gobert,0.008,0.095294


In [78]:
# Evaluation

In [79]:
# Mean squared error between actual and predicted Share.
# NOTE(review): nearly all players have Share == 0 (see the value counts in the
# next cell), so a low MSE says little about how well the top candidates are ranked.
mean_squared_error(combined_db["Share"], combined_db["predictions"])

0.002666683632003887

In [80]:
# Distribution of the target: how many players have each Share value.
combined_db["Share"].value_counts()

0.000    525
0.001      3
0.961      1
0.138      1
0.010      1
0.020      1
0.449      1
0.005      1
0.038      1
0.003      1
0.580      1
0.345      1
0.042      1
0.008      1
Name: Share, dtype: int64

In [81]:
# Actual rank: 1 = highest Share. Players tied on Share still receive distinct
# consecutive ranks, in whatever order the sort leaves them.
combined_db = combined_db.sort_values(by="Share", ascending=False)
combined_db["Rk"] = range(1, len(combined_db) + 1)

In [82]:
# Top 10 by actual rank (the frame is currently sorted by Share).
combined_db.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola Jokić,0.961,0.155154,1
8624,Joel Embiid,0.58,0.163637,2
3651,Stephen Curry,0.449,0.14249,3
9907,Giannis Antetokounmpo,0.345,0.206428,4
1389,Chris Paul,0.138,0.073983,5
10997,Luka Dončić,0.042,0.15026,6
7464,Damian Lillard,0.038,0.116646,7
3536,Julius Randle,0.02,0.089415,8
3531,Derrick Rose,0.01,0.035417,9
11358,Rudy Gobert,0.008,0.095294,10


In [83]:
# Predicted rank: 1 = highest predicted Share.
combined_db = combined_db.sort_values(by="predictions", ascending=False)
combined_db["Predicted_Rank"] = range(1, len(combined_db) + 1)

In [84]:
# Top 10 by predicted rank (the frame is currently sorted by predictions).
combined_db.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rank
9907,Giannis Antetokounmpo,0.345,0.206428,4,1
8624,Joel Embiid,0.58,0.163637,2,2
641,Nikola Jokić,0.961,0.155154,1,3
10997,Luka Dončić,0.042,0.15026,6,4
3736,LeBron James,0.001,0.147459,15,5
3651,Stephen Curry,0.449,0.14249,3,6
4177,Kevin Durant,0.0,0.141363,531,7
4174,James Harden,0.001,0.139974,13,8
11784,Zion Williamson,0.0,0.129811,251,9
3876,Russell Westbrook,0.005,0.120053,11,10


In [85]:
# The actual top 10 again, now with both rank columns for comparison.
combined_db.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rank
641,Nikola Jokić,0.961,0.155154,1,3
8624,Joel Embiid,0.58,0.163637,2,2
3651,Stephen Curry,0.449,0.14249,3,6
9907,Giannis Antetokounmpo,0.345,0.206428,4,1
1389,Chris Paul,0.138,0.073983,5,33
10997,Luka Dončić,0.042,0.15026,6,4
7464,Damian Lillard,0.038,0.116646,7,12
3536,Julius Randle,0.02,0.089415,8,23
3531,Derrick Rose,0.01,0.035417,9,71
11358,Rudy Gobert,0.008,0.095294,10,19


In [86]:
def find_ap(combined_db, top_n=5):
    """Average precision of the predicted ranking against the true top players.

    The ``top_n`` rows with the highest ``Share`` are the relevant players.
    Rows are walked in descending ``predictions`` order; each time a relevant
    player is met, the precision so far (hits / rows seen) is recorded. The
    mean of those precisions is returned.

    Parameters
    ----------
    combined_db : pandas.DataFrame
        Must contain ``Player``, ``Share`` and ``predictions`` columns.
    top_n : int, default 5
        How many of the highest-``Share`` players count as relevant. The
        default reproduces the original hard-coded top five.

    Returns
    -------
    float
        Mean of the recorded precisions, or ``0.0`` when no relevant player is
        found (for example an empty frame) instead of raising
        ``ZeroDivisionError``.
    """
    # A set gives O(1) membership tests instead of scanning an array per row.
    relevant = set(
        combined_db.sort_values("Share", ascending=False).head(top_n)["Player"]
    )
    ranked_players = combined_db.sort_values("predictions", ascending=False)["Player"]

    precisions = []
    # Iterate over the Player column only; iterrows() would build a Series per row.
    for seen, player in enumerate(ranked_players, start=1):
        if player in relevant:
            # len(precisions) + 1 is the number of hits including this one.
            precisions.append((len(precisions) + 1) / seen)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [87]:
# Average precision of the 2021 predictions against the actual top five.
find_ap(combined_db)

0.7636363636363636

In [88]:
# Seasons 1991 through 2021 inclusive.
years = list(range(1991,2022))

In [89]:
# Backtesting: for each season, train on every earlier season and predict that
# season, collecting the average precision and the prediction frame per season.
aps = []
all_predictions = []
# years[5:] starts at 1996, presumably so the first fit has five seasons of history.
for year in years[5:]:
    train = player_stats[player_stats["Year"] < year]
    test = player_stats[player_stats["Year"] == year]
    reg.fit(train[valiables],train["Share"])
    predictions = reg.predict(test[valiables])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combined_db = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(combined_db)
    aps.append(find_ap(combined_db))

In [90]:
# Mean average precision across all backtested seasons.
sum(aps) / len(aps)

0.7110668523458931

In [91]:
def add_ranks(combined_db):
    """Return a copy of ``combined_db`` with rank columns, sorted by actual Share.

    Added columns:
      * ``Predicted_Rk`` - 1-based position when ordered by ``predictions`` (descending)
      * ``Rk``           - 1-based position when ordered by ``Share`` (descending)
      * ``Diff``         - ``Rk - Predicted_Rk``; negative means the model
                           placed the player lower than the actual result

    The input frame is not modified.
    """
    positions = list(range(1, len(combined_db) + 1))
    return (
        combined_db
        .sort_values("predictions", ascending=False)
        .assign(Predicted_Rk=positions)
        .sort_values("Share", ascending=False)
        .assign(Rk=positions)
        .assign(Diff=lambda ranked: ranked["Rk"] - ranked["Predicted_Rk"])
    )

In [92]:
# all_predictions is still the per-season list here; index 1 is the second
# backtested season (1997, since the loop started at 1996).
ranking = add_ranks(all_predictions[1])
# Actual top five, ordered by how the predicted rank compares with the actual rank.
ranking[ranking["Rk"] < 6].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
1600,Karl Malone,0.857,0.19236,2,1,-1
10524,Michael Jordan,0.832,0.167672,3,2,-1
908,Grant Hill,0.327,0.128664,6,3,-3
4682,Tim Hardaway,0.207,0.059992,20,4,-16
8248,Glen Rice,0.117,0.033122,53,5,-48


In [93]:
def backtest(player_stats, model, years, valiables):
    """Walk-forward evaluation of ``model`` over ``years``.

    For each year the model is refit on all rows with an earlier ``Year`` and
    used to predict ``Share`` for that year's rows. The predictions are ranked
    with ``add_ranks`` and scored with ``find_ap``.

    Returns a tuple ``(mean_ap, aps, all_predictions)``: the mean of the
    per-year average precisions, the list of per-year values, and one
    DataFrame concatenating every year's ranked predictions.
    """
    yearly_aps = []
    yearly_frames = []
    for year in years:
        history = player_stats[player_stats["Year"] < year]
        season = player_stats[player_stats["Year"] == year]

        model.fit(history[valiables], history["Share"])
        predicted = pd.DataFrame(
            model.predict(season[valiables]),
            index=season.index,
            columns=["predictions"],
        )

        ranked = add_ranks(pd.concat([season[["Player", "Share"]], predicted], axis=1))
        yearly_frames.append(ranked)
        yearly_aps.append(find_ap(ranked))

    return sum(yearly_aps) / len(yearly_aps), yearly_aps, pd.concat(yearly_frames)

In [94]:
# Same backtest as the manual loop (seasons from 1996 on), now via the helper.
# Note: all_predictions becomes a single DataFrame here, no longer a list.
mean_ap, aps, all_predictions = backtest(player_stats, reg, years[5:], valiables)

In [95]:
# Baseline mean average precision for the ridge model.
mean_ap

0.7110668523458931

In [96]:
# Largest differences between predicted and actual rank among actual top-five
# finishers. Diff = Rk - Predicted_Rk, so the most negative values (shown first)
# are the players the model ranked far too low.
all_predictions[all_predictions["Rk"] <= 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
1224,Jason Kidd,0.712,0.027957,52,2,-50
8248,Glen Rice,0.117,0.033122,53,5,-48
5175,Steve Nash,0.839,0.032686,47,1,-46
8516,Peja Stojaković,0.228,0.035581,39,4,-35
5193,Steve Nash,0.739,0.051338,35,1,-34
12726,Joakim Noah,0.258,0.04739,37,4,-33
3657,Chauncey Billups,0.344,0.051437,34,5,-29
1389,Chris Paul,0.138,0.073983,33,5,-28
5208,Steve Nash,0.785,0.070791,23,2,-21
4682,Tim Hardaway,0.207,0.059992,20,4,-16


In [97]:
# Which variables the model is keying on: coefficients from the most recent fit
# of reg, paired with their feature names, largest first.
# NOTE(review): the features are not standardized at this point, so coefficient
# magnitudes are not directly comparable across features.
pd.concat([pd.Series(reg.coef_), pd.Series(valiables)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.065312,eFG%
18,0.034362,DRB
28,0.0285,W/L%
17,0.02179,ORB
10,0.015914,2P
21,0.012054,STL
15,0.011474,FTA
22,0.011175,BLK
20,0.007435,AST
25,0.006141,PTS


In [98]:
# Ratio between a player's stats and the average of those stats for that season.
# transform("mean") returns the per-season mean broadcast back onto the original
# row index, so the result always lines up row-for-row with player_stats.
# groupby.apply does not guarantee that: on pandas >= 2.0 it prepends the group
# keys to the index, and it also divided the Year column by itself.
ratio_cols = ["PTS", "AST", "STL", "BLK", "3P"]
stat_ratios = player_stats[ratio_cols] / player_stats.groupby("Year")[ratio_cols].transform("mean")


In [99]:
# Add the season-relative ratios to the main table as new *_R columns
# (assignment aligns on the row index).
player_stats[["PTS_R", "AST_R", "STL_R","BLK_R","3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]


In [100]:
# Add the ratio features to the predictors. Only names not already present are
# appended, so re-running this cell does not duplicate columns in the list.
valiables += [
    col
    for col in ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]
    if col not in valiables
]


In [101]:
# Re-run the backtest with the ratio features included.
# The table is named player_stats; the previous `stats` is not defined anywhere
# in this notebook and raises NameError on a fresh kernel.
mean_ap, aps, all_predictions = backtest(player_stats, reg, years[5:], valiables)


In [102]:
mean_ap
# Slight improvement over the run without the ratio features (0.711 -> 0.718).

0.7180143406004313

In [103]:
# Turn position and team into integer category codes so they can be fed to a model.
# NOTE(review): the codes are arbitrary integers with no meaningful order.
player_stats["NPos"] = player_stats["Pos"].astype("category").cat.codes
player_stats["NTm"] = player_stats["Tm"].astype("category").cat.codes


In [104]:


# Random forest alternative; random_state is fixed so the run is reproducible.
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

# years[28:] covers 2019-2021 only (presumably to keep the run time down).
# The encoded position/team columns are added for this model only.
mean_ap, aps, all_predictions = backtest(player_stats, rf, years[28:], valiables + ["NPos", "NTm"])


In [105]:
# Mean average precision of the random forest over 2019-2021.
mean_ap


0.8544707580001697

In [106]:
# Run the ridge model on the same seasons (2019-2021) for a like-for-like
# comparison with the random forest. It does not get the NPos/NTm features.
mean_ap, aps, all_predictions = backtest(player_stats, reg, years[28:], valiables)


In [107]:
# Mean average precision of the ridge regression over 2019-2021.
mean_ap


0.7981818181818182

In [108]:
# Shared scaler instance; the backtest defined in the next cell refits it on every fold.
sc = StandardScaler()

In [109]:
def backtest(player_stats, model, years, valiables):
    """Walk-forward backtest with feature standardization.

    Replaces the earlier ``backtest`` of the same name in this notebook. For
    each year, rows from earlier years form the training set and that year's
    rows form the test set. The module-level scaler ``sc`` is fit on the
    training predictors only and then applied to both sets before the model is
    fit and used to predict ``Share``.

    Side effect: ``sc`` is refit on every iteration, so after the call it holds
    the statistics of the last year's training set.

    Returns ``(mean_ap, aps, all_predictions)``: the mean of the per-year
    average precisions, the per-year list, and the concatenated ranked
    prediction frames.
    """
    aps = []
    all_predictions = []
    for year in years:
        # .copy() so the scaled values below do not write back into player_stats.
        train = player_stats[player_stats["Year"] < year].copy()
        test = player_stats[player_stats["Year"] == year].copy()
        # Fit the scaler on training rows only, then apply the same transform to test.
        sc.fit(train[valiables])
        train[valiables] = sc.transform(train[valiables])
        test[valiables] = sc.transform(test[valiables])
        model.fit(train[valiables],train["Share"])
        predictions = model.predict(test[valiables])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combined_db = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combined_db = add_ranks(combined_db)
        all_predictions.append(combined_db)
        aps.append(find_ap(combined_db))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [110]:
# Ridge again on 2019-2021, this time through the standardizing backtest.
mean_ap, aps, all_predictions = backtest(player_stats, reg, years[28:], valiables)


In [111]:
# Mean average precision with standardized features (same value as the unscaled run shown earlier).
mean_ap


0.7981818181818182

In [112]:
# Scale the full table with sc as it was last fit inside backtest, i.e. on the
# training rows of the final fold. Requires backtest to have been run first.
sc.transform(player_stats[valiables])


array([[ 0.04758869,  1.15207811, -0.1657698 , ..., -0.05582165,
        -0.27719578, -0.38154061],
       [ 0.51919526,  1.15207811,  1.91917626, ...,  0.93001944,
        -0.27719578,  2.77745153],
       [-1.13142771, -0.03932364, -0.88353811, ..., -1.04166274,
         0.48509261, -0.77641463],
       ...,
       [-0.42401787,  0.04010314, -0.47338479, ...,  0.17152606,
         1.31341113, -0.37208834],
       [-1.83883755,  0.15924332,  0.27856297, ..., -0.97670625,
         0.23224953, -0.27100677],
       [-0.42401787,  1.07265133,  1.85081738, ...,  0.17152606,
        -0.41644743,  1.04305367]])

In [113]:
# Actual top-five finishers in 2019-2021, with the players the model ranked
# furthest below their actual rank first (most negative Diff).
all_predictions[all_predictions["Rk"] <= 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
1389,Chris Paul,0.138,0.063173,33,5,-28
1361,Paul George,0.352,0.115984,10,3,-7
11696,Nikola Jokić,0.21,0.1075,11,4,-7
4149,Stephen Curry,0.173,0.119782,9,5,-4
3720,LeBron James,0.746,0.155818,4,2,-2
641,Nikola Jokić,0.961,0.143924,3,1,-2
3651,Stephen Curry,0.449,0.140931,5,3,-2
5311,Kawhi Leonard,0.166,0.125352,6,5,-1
11443,Giannis Antetokounmpo,0.932,0.203543,1,1,0
9634,James Harden,0.768,0.189648,2,2,0


In [121]:
# Rows ordered by actual rank. all_predictions spans several seasons, so each
# rank value appears once per season.
all_predictions.sort_values("Rk", ascending=True).head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
11443,Giannis Antetokounmpo,0.932,0.203543,1,1,0
12314,Giannis Antetokounmpo,0.952,0.233183,1,1,0
641,Nikola Jokić,0.961,0.143924,3,1,-2
8624,Joel Embiid,0.58,0.150906,2,2,0
9634,James Harden,0.768,0.189648,2,2,0
3720,LeBron James,0.746,0.155818,4,2,-2
9651,James Harden,0.363,0.183661,2,3,1
1361,Paul George,0.352,0.115984,10,3,-7
3651,Stephen Curry,0.449,0.140931,5,3,-2
9280,Luka Dončić,0.198,0.168997,3,4,1


In [125]:
# Up to 10 rows where the predicted rank equals the actual rank.
results = all_predictions[all_predictions["Diff"] == 0].head(10)

In [127]:
# Show the exact-match rows.
results.head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
11443,Giannis Antetokounmpo,0.932,0.203543,1,1,0
9634,James Harden,0.768,0.189648,2,2,0
9744,Avery Bradley,0.0,-0.050237,530,530,0
12314,Giannis Antetokounmpo,0.952,0.233183,1,1,0
11757,Josh Gray,0.0,-0.06703,529,529,0
8624,Joel Embiid,0.58,0.150906,2,2,0
3876,Russell Westbrook,0.005,0.104391,11,11,0
11768,Didi Louzada,0.0,-0.067469,540,540,0


In [128]:
# Rows with Diff == 1: the model placed the player one spot higher than the actual rank.
results = all_predictions[all_predictions["Diff"] == 1].head(10)

In [129]:
# Show the rows predicted one spot too high.
results.head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
5900,Ish Smith,0.0,-0.00092,271,272,1
9651,James Harden,0.363,0.183661,2,3,1
9280,Luka Dončić,0.198,0.168997,3,4,1
5328,Kawhi Leonard,0.001,0.104183,12,13,1


In [130]:
# Rows with Diff == -1: the model placed the player one spot lower than the actual rank.
results = all_predictions[all_predictions["Diff"] == -1].head(10)

In [131]:
# Show the rows predicted one spot too low.
results.head(10)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
5311,Kawhi Leonard,0.166,0.125352,6,5,-1
3708,Anthony Davis,0.081,0.125214,7,6,-1
