In [2]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [3]:
#First and last seasons (both included) of batting data to download
START, END = 2010, 2022

In [5]:
#Download batting stats for the START through END seasons; qual=200 keeps only
#batters with at least 200 plate appearances
batting = batting_stats(START, END, qual=200)

In [None]:
batting.to_csv("batting.csv")

In [6]:
#Group the rows by player (IDfg) and keep only players that appear in more
#than one row, dropping anyone with a single season in the data
batting = (
    batting
    .groupby("IDfg", group_keys = False)
    .filter(lambda seasons: len(seasons) > 1)
)

In [7]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
1,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246,0.609,404,0.169,0.287,,,,11.3
4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217,0.500,434,0.220,0.270,,,,10.4
33,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,0,,0,0.200,0.266,,,,10.2
75,10155,2012,Mike Trout,LAA,20,139,559,639,182,117,...,,0,,0,0.221,0.293,,,,10.1
86,9166,2012,Buster Posey,SFG,25,148,530,610,178,114,...,,0,,0,0.190,0.251,,,,10.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4364,3448,2019,Jeff Mathis,TEX,36,88,228,244,36,25,...,105.5,37,0.261,142,0.155,0.322,,,,-2.1
1870,2113,2010,Ryan Doumit,PIT,29,124,406,456,102,66,...,,0,,0,0.168,0.258,,,,-2.2
3930,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0,,0,0.147,0.258,,,,-2.4
4306,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113,0.401,282,0.174,0.316,,,,-3.1


In [8]:
def next_year(player):
    """Add a ``Next_WAR`` column holding the following row's WAR.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows for one player; must contain ``Season`` and ``WAR`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``Season`` with ``Next_WAR`` set to the WAR of
        the next row (NaN for the last row). The input frame is not modified.
    """
    #Sort chronologically first so shifting by -1 lines up with the next season
    return (
        player
        .sort_values("Season")
        .assign(Next_WAR = lambda seasons: seasons["WAR"].shift(-1))
    )

batting = batting.groupby("IDfg", group_keys = False).apply(next_year)

In [9]:
#The shift puts each season's WAR and the following season's WAR (Next_WAR) on the same row
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
3188,Adam Kennedy,2010,0.2,0.4
3842,Adam Kennedy,2011,0.4,0.4
2615,Adam Kennedy,2012,0.4,
3645,Jose Molina,2012,3.6,2.1
4055,Jose Molina,2013,2.1,0.4
...,...,...,...,...
3571,Owen Miller,2022,0.7,
2816,Andrew Vaughn,2021,-0.3,-0.5
1840,Andrew Vaughn,2022,-0.5,
3992,Ha-seong Kim,2021,0.5,3.7


In [10]:
#Some columns are missing data
null_sum = batting.isnull().sum()
null_sum

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         4088
xSLG        4088
xwOBA       4088
L-WAR          0
Next_WAR     831
Length: 321, dtype: int64

In [11]:
#List of complete columns
complete = list(batting.columns[null_sum == 0])

In [12]:
#Creating a copy batting that only has complete columns and the next season WAR
batting = batting[complete + ["Next_WAR"]].copy()

In [13]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Soft%+,Med%+,Hard%+,Barrels,HardHit,Events,CStr%,CSW%,L-WAR,Next_WAR
3188,19,2010,Adam Kennedy,WSN,34,135,342,389,85,65,...,74,125,74,0,0,0,0.187,0.232,0.2,0.4
3842,19,2011,Adam Kennedy,SEA,35,114,380,409,89,58,...,100,118,61,0,0,0,0.164,0.241,0.4,0.4
2615,19,2012,Adam Kennedy,LAD,36,86,168,201,44,33,...,95,120,65,0,0,0,0.173,0.244,0.4,
3645,25,2012,Jose Molina,TBR,37,102,251,274,56,39,...,112,109,76,0,0,0,0.144,0.247,3.6,2.1
4055,25,2013,Jose Molina,TBR,38,99,283,313,66,50,...,135,107,69,0,0,0,0.154,0.266,2.1,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3571,24655,2022,Owen Miller,CLE,25,130,424,472,103,70,...,131,100,83,12,106,340,0.189,0.266,0.1,
2816,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,84,99,110,35,149,321,0.185,0.285,0.2,-0.5
1840,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,92,...,94,100,104,33,203,419,0.201,0.291,0.4,
3992,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,137,96,88,9,64,201,0.216,0.303,0.6,3.7


In [14]:
#Finding which columns are strings
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [15]:
#A column that assigns a dollar value to players
del batting["Dol"]
#A column that gives what age a player is throughout the season
del batting["Age Rng"]

In [16]:
#Turning team names into a number
batting["team_num"] = batting["Team"].astype("category").cat.codes

In [17]:
batting["team_num"]

3188    31
3842    25
2615    15
3645    28
4055    28
        ..
3571     8
2816     6
1840     6
3992    24
2495    24
Name: team_num, Length: 4088, dtype: int8

In [18]:
#Keeping a full copy before dropping the rows without Next_WAR, in case they are needed later
batting_full = batting.copy()
batting = batting.dropna().copy()

In [19]:
#Setting up a ridge regression model and a selector that picks the most useful columns for it
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

#Ridge regression with regularization strength alpha = 1
rr = Ridge(alpha = 1)
#3-split time-series cross-validation used by the selector to score candidate features
#NOTE(review): TimeSeriesSplit splits on row order, and batting appears to be ordered
#by player (IDfg) rather than by Season at this point - confirm this is intended
split = TimeSeriesSplit(n_splits = 3)
#Forward selection: adds features one at a time until it has 20
sfs = SequentialFeatureSelector(rr, n_features_to_select = 20, direction = "forward", cv = split, n_jobs = 4)

In [20]:
#Must remove certain columns for sfs to work
removed_col = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_col = batting.columns[~batting.columns.isin(removed_col)]

In [21]:
#Scaling data using min-max scaler so every selected column ranges from 0 to 1
#(min-max scaling does not give mean 0 / standard dev 1)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
#NOTE(review): the scaler is fit on every season at once, so the column ranges
#used for earlier seasons also reflect later seasons
#Assign whole columns instead of writing through .loc[:, ...]: integer columns
#are then replaced by float columns rather than relying on an in-place upcast,
#which newer pandas versions deprecate
batting[selected_col] = scaler.fit_transform(batting[selected_col])

In [22]:
#Visual of changes
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Med%+,Hard%+,Barrels,HardHit,Events,CStr%,CSW%,L-WAR,Next_WAR,team_num
count,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,...,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0,3257.0
mean,8014.992631,2015.186368,0.392773,0.637063,0.489954,0.480148,0.412486,0.360741,0.380067,0.140313,...,0.510045,0.4922,0.141552,0.237496,0.296109,0.458887,0.565621,0.376848,1.773258,0.484951
std,5080.857069,3.351623,0.159924,0.269312,0.259591,0.275015,0.213961,0.178606,0.166273,0.144641,...,0.136322,0.137964,0.180934,0.252745,0.303189,0.146854,0.119093,0.139633,1.957282,0.310308
min,19.0,2010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.9,0.0
25%,3797.0,2012.0,0.26087,0.431034,0.263359,0.241877,0.230769,0.216049,0.241379,0.0625,...,0.42029,0.40146,0.0,0.0,0.0,0.364055,0.488987,0.274074,0.4,0.225806
50%,7435.0,2015.0,0.391304,0.698276,0.507634,0.501805,0.410256,0.351852,0.362069,0.125,...,0.507246,0.50365,0.059524,0.20073,0.262104,0.456221,0.563877,0.355556,1.5,0.483871
75%,11737.0,2018.0,0.478261,0.87069,0.715649,0.722022,0.584615,0.487654,0.5,0.1875,...,0.594203,0.583942,0.238095,0.441606,0.569282,0.557604,0.647577,0.451852,2.8,0.741935
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.6,1.0


In [23]:
#Using selected columns and Next_WAR
sfs.fit(batting[selected_col], batting["Next_WAR"])

In [24]:
#Getting the columns wanted as predictors
predictors = list(selected_col[sfs.get_support()])

In [27]:
#Generate predictions
def backtest(data, model, predictors, start = 5, step = 1):
    """Walk forward through the seasons, predicting each one from the earlier ones.

    For every season from position ``start`` onward (in sorted order), the
    model is fit on all rows from strictly earlier seasons and then used to
    predict ``Next_WAR`` for the rows of that season.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Season`` column, a ``Next_WAR`` column and every
        column named in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit on
        every iteration, so it is left fitted on the last training window.
    predictors : list
        Column names used as model inputs.
    start : int, default 5
        Position (in the sorted list of seasons) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns ``Actual`` and ``Prediction``, indexed like the predicted rows.

    Raises
    ------
    ValueError
        If ``data`` has no season at position ``start`` or later, so there is
        nothing to predict.
    """
    preds = []
    #Sort the unique seasons of the frame that was passed in (not a global frame)
    years = sorted(data["Season"].unique())

    #Skip the first `start` seasons so every fit has earlier seasons to train on
    for i in range(start, len(years), step):
        curr_year = years[i]
        #Training set is every season before curr_year
        train = data[data["Season"] < curr_year]
        #Test set is the curr_year season itself
        test = data[data["Season"] == curr_year]
        model.fit(train[predictors], train["Next_WAR"])

        #Reuse the test index so predictions line up with the actual values
        predics = pd.Series(model.predict(test[predictors]), index = test.index)
        #Combine actual values and predictions as two separate columns
        combine = pd.concat([test["Next_WAR"], predics], axis = 1)
        combine.columns = ["Actual", "Prediction"]
        preds.append(combine)

    if not preds:
        raise ValueError(
            f"Nothing to backtest: data has {len(years)} season(s) but start = {start}"
        )

    return pd.concat(preds)

In [28]:
predictions = backtest(batting, rr, predictors)

In [29]:
#List of all the Actual WAR vs prediction
predictions

Unnamed: 0,Actual,Prediction
3587,1.0,-0.303431
2172,-0.5,0.541078
1050,2.3,0.975361
1389,5.5,2.668923
306,4.5,1.705381
...,...,...
999,2.3,3.374998
3484,0.9,2.400568
4297,0.7,1.864326
2816,-0.5,2.324399


In [30]:
#Checking effectiveness of prediction
from sklearn.metrics import mean_squared_error
#Subtract prediction from actual value and square the difference, then find average 
mean_squared_error(predictions["Actual"], predictions["Prediction"])

2.5988343620226924

In [31]:
batting["Next_WAR"].describe()

count    3257.000000
mean        1.773258
std         1.957282
min        -2.900000
25%         0.400000
50%         1.500000
75%         2.800000
max        11.600000
Name: Next_WAR, dtype: float64

In [32]:
#Root mean squared error, computed from the predictions rather than a pasted-in
#MSE value so it stays correct when the notebook is re-run
mean_squared_error(predictions["Actual"], predictions["Prediction"]) ** .5

1.6120900601463592

In [33]:
#RMSE (about 1.61) is lower than the std of Next_WAR (about 1.96): not by much, but the model still does better than always guessing the mean

In [34]:
#To improve the model, going to add features showing whether a player's WAR improved or got worse from the previous year instead of using just one year
def history(df):
    """Add career-trend features to one player's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one player; must contain ``Season`` and ``WAR`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``Season`` with three added columns:

        * ``Player_season`` - 0-based position of the row within the sorted frame.
        * ``War_corr`` - expanding correlation between ``Player_season`` and
          ``WAR`` up to and including each row; 1 where it is undefined.
        * ``War_diff`` - ratio of this row's WAR to the previous row's WAR
          (a ratio despite the name); 1 where there is no previous row or the
          ratio is undefined or infinite.
    """
    df = df.sort_values("Season")
    df["Player_season"] = range(0, df.shape[0])
    #Expanding pairwise correlation; keep only the Player_season-vs-WAR entries
    war_corr = df[["Player_season", "WAR"]].expanding().corr().loc[(slice(None), "Player_season"), "WAR"]
    #Assign positionally (the result carries a MultiIndex) and fill through
    #plain assignment instead of an inplace fillna on a column selection
    df["War_corr"] = war_corr.fillna(1).to_numpy()
    #Ratio of current WAR to the previous season's WAR
    war_ratio = df["WAR"] / df["WAR"].shift(1)
    #Replace missing and infinite ratios (either sign) with 1
    df["War_diff"] = war_ratio.replace([np.inf, -np.inf], np.nan).fillna(1)

    return df

#Split data into groups by player and calling history on them
batting = batting.groupby("IDfg", group_keys = False).apply(history)

In [35]:
#Find averages across a season and compare them to how a player did
def averages(df):
    """Return each row's WAR divided by the mean WAR of all rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``WAR`` column.

    Returns
    -------
    pandas.Series
        ``WAR`` relative to the mean of ``df``, with the same index as ``df``.
    """
    war = df["WAR"]
    return war / war.mean()

In [36]:
batting["War_season"] = batting.groupby("Season", group_keys = False).apply(averages)

In [37]:
new_predictors = predictors + ["Player_season", "War_corr", "War_season", "War_diff"]

In [38]:
predictions = backtest(batting, rr, new_predictors)

In [39]:
mean_squared_error(predictions["Actual"], predictions["Prediction"])

2.4538450235929603

In [40]:
#Error has improved (MSE dropped from about 2.60 to about 2.45), checking how much each predictor is affecting the model
pd.Series(rr.coef_, index = new_predictors).sort_values()

Age             -2.752223
BABIP+          -1.721092
ISO             -1.151048
PH              -0.913079
WAR             -0.861941
Soft%+          -0.844229
SO              -0.740822
Pull%           -0.570639
War_diff        -0.534881
SH              -0.533704
Clutch          -0.474113
wCT/C           -0.416190
War_corr        -0.184279
Player_season    0.002374
vCU (pi)         0.145650
SI-Z (pi)        0.374782
IFH              0.380583
BB%+             0.413043
SL-X (pi)        0.474321
CTv              0.626940
SI% (sc)         0.745916
Spd              0.899027
Hard%+           2.056342
War_season       2.937277
dtype: float64

In [41]:
diff = predictions["Actual"] - predictions["Prediction"]

In [42]:
merged = predictions.merge(batting, left_index = True, right_index = True)

In [44]:
merged["Diff"] = (predictions["Actual"] - predictions["Prediction"]).abs()

In [45]:
#Data with all calculated values
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "Diff"]].sort_values(["Diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,Diff
2055,13338,2021,Omar Narvaez,0.428571,1.1,0.000419
3273,9892,2018,Jay Bruce,0.187970,0.8,0.000944
866,13329,2016,Jake Lamb,0.496241,2.8,0.001622
1423,9927,2017,Brett Gardner,0.526316,2.2,0.002170
1132,12979,2021,Javier Baez,0.518797,1.9,0.002710
...,...,...,...,...,...,...
4030,5933,2015,Jean Segura,0.225564,6.2,5.014011
4140,19339,2019,Nicky Lopez,0.180451,6.0,5.379635
3726,5343,2019,Brandon Crawford,0.248120,6.3,5.586637
1325,13611,2017,Mookie Betts,0.624060,10.4,5.632809


In [46]:
#Aaron Judge absurd breakout season

In [50]:
#On average, how close was the model
average = merged["Diff"].mean()
average

1.2249337185209337