In [23]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [16]:
#!pip install pybaseball

In [25]:
START = 2002
END = 2022

In [26]:
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(START, END, qual=200)
    batting.to_csv("batting.csv")

In [28]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [29]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
78,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,,0.0,,0,0.200,0.266,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6861,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,,0.0,,0,0.166,0.252,,,
7019,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
6655,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,,0.0,,0,0.169,0.295,,,
6962,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,,0,0.130,0.187,,,


In [30]:
#Takes data for single player and creates target
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player


In [32]:
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [33]:
batting[["Name", "Season","WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5549,Alfredo Amezaga,2006,1.1,2.0
5000,Alfredo Amezaga,2007,2.0,1.2
5243,Alfredo Amezaga,2008,1.2,
1168,Garret Anderson,2002,3.7,5.1
866,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
5980,Owen Miller,2022,0.6,
4880,Andrew Vaughn,2021,-0.3,0.5
2097,Andrew Vaughn,2022,0.5,
6604,Ha-seong Kim,2021,0.5,2.6


In [34]:
null_count = batting.isnull().sum()
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
CSW%           0
xBA         6737
xSLG        6737
xwOBA       6737
Next_WAR    1174
Length: 320, dtype: int64

In [35]:
complete_cols = list(batting.columns[null_count == 0])

In [37]:
#REmove nulls by indexing on complete cols
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [38]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR
5549,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,86,107,113,143,109,63,0,0.188,0.256,2.0
5000,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,92,101,112,109,113,75,0,0.175,0.227,1.2
5243,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,99,101,101,123,111,64,0,0.178,0.244,
1168,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,118,91,80,65,97,129,0,0.137,0.232,5.1
866,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,112,101,80,90,99,109,0,0.164,0.252,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5980,24655,2022,Owen Miller,CLE,25,119,391,433,97,66,...,92,111,99,127,102,82,315,0.191,0.269,
4880,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,87,104,116,84,99,110,321,0.185,0.285,0.5
2097,26197,2022,Andrew Vaughn,CHW,24,118,456,497,132,88,...,88,108,108,93,99,105,382,0.205,0.287,
6604,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,126,99,59,137,96,88,201,0.216,0.303,2.6


In [40]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Hard%+        int64
Events        int64
CStr%       float64
CSW%        float64
Next_WAR    float64
Length: 132, dtype: object

In [41]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [42]:
batting["Dol"]

5549      $5.5
5000     $11.2
5243      $7.2
1168     $14.6
866      $22.0
         ...  
5980      $4.7
4880    ($2.6)
2097      $3.6
6604      $3.9
4920     $21.1
Name: Dol, Length: 6737, dtype: object

In [43]:
del batting["Dol"]

In [44]:
batting["Age Rng"]

5549    28 - 28
5000    29 - 29
5243    30 - 30
1168    30 - 30
866     31 - 31
         ...   
5980    25 - 25
4880    23 - 23
2097    24 - 24
6604    25 - 25
4920    26 - 26
Name: Age Rng, Length: 6737, dtype: object

In [45]:
del batting["Age Rng"]

In [46]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [47]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [48]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit


rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [49]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [50]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])

In [52]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR,team_code
count,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,...,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0,5563.0
mean,5346.361136,2011.143268,0.360701,0.653156,0.479133,0.481446,0.366375,0.290768,0.399673,0.103557,...,0.457554,0.403273,0.410782,0.511008,0.478735,0.172547,0.498866,0.545701,1.787758,0.474051
std,5116.526623,5.601356,0.147526,0.255806,0.242278,0.262085,0.182445,0.13871,0.171662,0.105912,...,0.113984,0.131154,0.121118,0.130367,0.134085,0.273872,0.137239,0.120687,1.989465,0.305009
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.4,0.0
25%,1129.0,2006.0,0.269231,0.478632,0.276978,0.259516,0.211207,0.179245,0.258621,0.043478,...,0.382022,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.3,0.205882
50%,3516.0,2011.0,0.346154,0.709402,0.507194,0.508651,0.37069,0.287736,0.37931,0.086957,...,0.460674,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,1.5,0.470588
75%,8722.0,2016.0,0.461538,0.871795,0.688849,0.711073,0.508621,0.391509,0.517241,0.130435,...,0.52809,0.488722,0.483146,0.594203,0.564626,0.345576,0.591489,0.625551,2.9,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [53]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [58]:
selected_columns[sfs.get_support()]

Index(['Age', 'IBB', 'SO', 'SB', 'BU', 'BABIP', 'IFH%', 'WAR', 'Spd', 'PH',
       'CB%', 'Z-Contact%', 'SwStr%', 'wGDP', 'Oppo%', 'SLG+', 'LD+%',
       'Pull%+', 'Soft%+', 'Hard%+'],
      dtype='object')

In [59]:
predictors = list(selected_columns[sfs.get_support()])

In [60]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index) #change to series for easy
        
        combined = pd.concat([test["Next_WAR"], preds], axis=1) #combine preds with actual values
        combined.columns = ["actual", "prediction"] #asign col names
        
        all_predictions.append(combined) #add df to all preds list
        
    return pd.concat(all_predictions) #return single df from results

In [61]:
predictions = backtest(batting, rr, predictors)

In [62]:
predictions

Unnamed: 0,actual,prediction
5000,1.2,1.514187
1923,1.4,0.804184
3108,-0.1,0.587281
5782,0.6,0.890092
1108,4.8,2.307446
...,...,...
1913,1.2,2.697911
5863,1.0,1.926963
7009,0.6,1.545744
4880,0.5,1.646229


In [63]:
#Use summary statistic to create error metric to tell how good our model is
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.7671807143292715

In [64]:
batting["Next_WAR"].describe()

count    5563.000000
mean        1.787758
std         1.989465
min        -3.400000
25%         0.300000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

As a general rule, we want the square root of MSE to be lower than our STD

In [65]:
2.7671807143292715 ** .5

1.6634845097954087

Which is lower than the STD, which is good. It is not that much lower, but it is a direction. 

In [67]:
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    df["war_corr"].fillna(1, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

In [68]:
batting = batting.groupby("IDfg", group_keys=False).apply(player_history)
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR,team_code,player_season,war_corr,war_diff
5549,1,2006,Alfredo Amezaga,FLA,0.346154,0.735043,0.312950,0.307958,0.245690,0.278302,...,0.652174,0.210884,0.000000,0.582979,0.524229,2.0,0.352941,0,1.000000,1.000000
5000,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,0.323276,0.316038,...,0.710145,0.292517,0.000000,0.527660,0.396476,1.2,0.352941,1,1.000000,1.200000
1168,2,2002,Garret Anderson,ANA,0.423077,0.957265,0.859712,0.826990,0.711207,0.443396,...,0.478261,0.659864,0.000000,0.365957,0.418502,5.1,0.029412,0,1.000000,1.000000
866,2,2003,Garret Anderson,ANA,0.461538,0.965812,0.859712,0.818339,0.737069,0.500000,...,0.507246,0.523810,0.000000,0.480851,0.506608,0.8,0.029412,1,1.000000,1.197183
2573,2,2004,Garret Anderson,ANA,0.500000,0.564103,0.507194,0.475779,0.443966,0.400943,...,0.608696,0.448980,0.000000,0.531915,0.585903,-0.2,0.029412,2,-0.661143,0.494118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,23667,2021,Wander Franco,TBR,0.038462,0.205128,0.217626,0.186851,0.219828,0.179245,...,0.608696,0.394558,0.409015,0.391489,0.352423,1.2,0.911765,0,1.000000,1.000000
5863,24618,2021,Ryan Jeffers,MIN,0.192308,0.333333,0.192446,0.160900,0.099138,0.070755,...,0.347826,0.619048,0.265442,0.514894,0.788546,1.0,0.558824,0,1.000000,1.000000
7009,24655,2021,Owen Miller,CLE,0.192308,0.119658,0.055755,0.003460,0.038793,0.066038,...,0.681159,0.394558,0.230384,0.548936,0.700441,0.6,0.264706,0,1.000000,1.000000
4880,26197,2021,Andrew Vaughn,CHW,0.153846,0.692308,0.462230,0.465398,0.293103,0.226415,...,0.507246,0.530612,0.535893,0.570213,0.651982,0.5,0.205882,0,1.000000,1.000000


In [69]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [70]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [71]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [72]:
predictions = backtest(batting, rr, new_predictors)

In [73]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 

2.6709555619496905

In [74]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.725100
WAR             -1.788656
BABIP           -1.534478
Soft%+          -1.256893
SLG+            -1.227397
SwStr%          -1.047389
BU              -0.975975
PH              -0.738206
SO              -0.707326
Z-Contact%      -0.695398
war_diff        -0.586509
wGDP            -0.477138
Pull%+          -0.231475
LD+%            -0.223573
CB%             -0.214705
war_corr        -0.122821
player_season    0.000056
IFH%             0.380017
Oppo%            0.660879
Spd              0.717269
SB               1.022885
IBB              1.757543
Hard%+           2.256642
war_season       3.436497
dtype: float64

In [75]:
diff = predictions["actual"] - predictions["prediction"]
diff

5000   -0.312362
1923    0.923673
3108   -0.548822
5782   -0.306251
1108    2.689615
          ...   
1913   -1.453435
5863   -0.693463
7009   -0.576410
4880   -0.866456
6604    1.556792
Length: 4115, dtype: float64

In [76]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [77]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [78]:
merged

Unnamed: 0,actual,prediction,IDfg,Season,Name,Team,Age,G,AB,PA,...,Events,CStr%,CSW%,Next_WAR,team_code,player_season,war_corr,war_diff,war_season,diff
5000,1.2,1.512362,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,...,0.000000,0.527660,0.396476,1.2,0.352941,1,1.000000,1.200000,0.998355,0.312362
1923,1.4,0.476327,2,2007,Garret Anderson,LAA,0.615385,0.529915,0.462230,0.432526,...,0.000000,0.442553,0.480176,1.4,0.441176,5,-0.692192,1.371429,0.887427,0.923673
3108,-0.1,0.448822,10,2007,David Eckstein,STL,0.500000,0.606838,0.492806,0.491349,...,0.000000,0.676596,0.436123,-0.1,0.852941,5,-0.694330,0.836735,0.758010,0.548822
5782,0.6,0.906251,11,2007,Darin Erstad,CHW,0.538462,0.350427,0.269784,0.254325,...,0.000000,0.765957,0.691630,0.6,0.205882,4,-0.828562,0.803922,0.758010,0.306251
1108,4.8,2.110385,15,2007,Troy Glaus,TOR,0.423077,0.589744,0.404676,0.442907,...,0.000000,0.634043,0.704846,4.8,0.970588,5,0.231396,0.897059,1.127772,2.689615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1.2,2.653435,23667,2021,Wander Franco,TBR,0.038462,0.205128,0.217626,0.186851,...,0.409015,0.391489,0.352423,1.2,0.911765,0,1.000000,1.000000,1.053432,1.453435
5863,1.0,1.693463,24618,2021,Ryan Jeffers,MIN,0.192308,0.333333,0.192446,0.160900,...,0.265442,0.514894,0.788546,1.0,0.558824,0,1.000000,1.000000,0.744667,0.693463
7009,0.6,1.176410,24655,2021,Owen Miller,CLE,0.192308,0.119658,0.055755,0.003460,...,0.230384,0.548936,0.700441,0.6,0.264706,0,1.000000,1.000000,0.435903,0.576410
4880,0.5,1.366456,26197,2021,Andrew Vaughn,CHW,0.153846,0.692308,0.462230,0.465398,...,0.535893,0.570213,0.651982,0.5,0.205882,0,1.000000,1.000000,0.563041,0.866456


In [82]:
merged[["IDfg", "Season", "Team", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"], ascending=False).head(50)

Unnamed: 0,IDfg,Season,Team,Name,WAR,Next_WAR,diff
2519,11579,2014,WSN,Bryce Harper,0.310559,9.3,7.523628
873,9166,2010,SFG,Buster Posey,0.459627,10.1,6.653248
3825,1875,2009,TEX,Josh Hamilton,0.291925,8.4,6.538779
3249,5631,2010,LAD,Matt Kemp,0.21118,8.3,6.356862
3165,4810,2007,ATL,Brian McCann,0.304348,8.6,6.337638
1232,7435,2008,TBR,Ben Zobrist,0.304348,8.7,6.272917
1713,9272,2012,BAL,Chris Davis,0.322981,7.1,5.978018
451,15640,2021,NYY,Aaron Judge,0.552795,9.7,5.868964
5117,1213,2009,- - -,Aubrey Huff,0.080745,5.7,5.807724
2512,4727,2009,BOS,Jacoby Ellsbury,0.341615,9.5,5.684369


Doing a quick google search of the players at the top, it is evident that each of those players were injured during those seasons, so adding information regarding that is a valable next step