### This a WAR predictor using machine learning and python using the pybaseball library by Kacie Motley

In [5]:
#First we import the packages we plan on working with
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [2]:
# We add the range in years that we'll be using to compile our data
Start = 2002
End = 2022

In [3]:
#We add our batting stats function and our 'qual' which is the minimum plate appearances we want a batter to have
batting = batting_stats(Start, End, qual=200)

In [4]:
# running to_csv puts this information in a csv file
batting.to_csv("battingw.csv")

In [6]:
#we'll then remove players who only have a single season played due to the fact we can't make a proper prediction with only one season
# The groupby command splits our data into groups based on player id
# the filter will help us remove any group who has less than two seasons played
batting = batting.groupby("IDfg" , group_keys=False).filter(lambda x: x.shape[0] > 1)

In [7]:
# we'll check our data by calling it
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,0.0,,0,0.166,0.252,,,,-2.4
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-3.1
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


In [9]:
#we used 'fillana' to replace our NaN values with 0's instead for a cleaner look
batting.fillna(0)

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,0.0,0.0,0.000,0,0.127,0.191,0.0,0.0,0.0,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,0.0,0.0,0.000,0,0.124,0.164,0.0,0.0,0.0,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,0.0,0.0,0.0,11.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,0.0,0.0,0.0,10.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,0.0,0.0,0.000,0,0.135,0.223,0.0,0.0,0.0,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,0.0,0.0,0.000,0,0.166,0.252,0.0,0.0,0.0,-2.4
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,0.0,0.0,0.0,-3.1
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,0.0,0.0,0.000,0,0.169,0.295,0.0,0.0,0.0,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,0.0,0.0,0.000,0,0.130,0.187,0.0,0.0,0.0,-2.9


### we'll now split our data up by player and filter our WAR as our target


In [14]:
#Each player is split into a groups based on player id then the war is computed for each season
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player ["WAR"].shift(-1)
    return player
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [16]:
#We'll now set the parameters that will be output
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5562,Alfredo Amezaga,2006,1.1,2.0
5006,Alfredo Amezaga,2007,2.0,1.2
5252,Alfredo Amezaga,2008,1.2,
1169,Garret Anderson,2002,3.7,5.1
864,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
6002,Owen Miller,2022,0.6,
4881,Andrew Vaughn,2021,-0.3,-0.5
3377,Andrew Vaughn,2022,-0.5,
6620,Ha-seong Kim,2021,0.5,3.7


In [25]:
#we'll use 'null_count' to better our computing accuracy and give us an idea on how many missing values we have
null_count = batting.isnull().sum()

In [26]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6754
xSLG        6754
xwOBA       6754
L-WAR          0
Next_WAR    1179
Length: 321, dtype: int64

In [27]:
#we'll now use complete columns so that our data won't run missing values for a more accurate output
# we specify we only want columns that have null counts of 0 and convert that into a list
complete_cols = list(batting.columns[null_count ==0])

In [29]:
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'UBR',
 'Age Rng',
 'Off',
 'Lg',
 'wGDP',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+',
 'BB%+',
 'K

In [31]:
# now we'll take the batting df and select all the complete columns and the next war column
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [32]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,3.7,5.1
864,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,5.1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,70,...,111,97,131,100,83,340,0.188,0.266,-0.1,
4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,104,116,84,99,110,321,0.185,0.285,0.2,-0.5
3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,92,...,106,111,94,100,104,419,0.201,0.291,0.4,
6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,99,59,137,96,88,201,0.216,0.303,0.6,3.7


In [37]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 133, dtype: object

In [39]:
#now we'll identify any strings in our data
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [40]:
#we'll analyze this column by calling it
batting["Dol"]

5562      $5.5
5006     $11.2
5252      $7.2
1169     $14.6
864      $22.0
         ...  
6002      $4.9
4881    ($2.8)
3377    ($4.1)
6620      $3.9
4396     $29.3
Name: Dol, Length: 6754, dtype: object

In [41]:
#we'll delete the column because we don't need dollar amounts assigned to players
del batting["Dol"]

In [42]:
#We'll  repeat the same process for 'Age Rng'
del batting ["Age Rng"]

### Now a number will be assigned to each team for the ML model

In [47]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [53]:
#now we'll make a copy of the data so we can add it into another variable and place anywhere where 'Next_War' is missing because the model can't be trained without it
batting_full = batting.copy()
batting = batting.dropna()

### Now we'll create a feature selector so the model can filter information 

In [54]:
#The timeseries split will split our data into parts and make predictions for each part
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=20, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [55]:
#we set up the model to make sure it only pulls information based on pratical predictions, to make sure that it doesn't take data from the future to predict what a player did in the past
# we dont want to paste our target in so we'll remove'Next_War' and any other column that might throw off our prediction
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [56]:
#next we'll scale our data so the mean is 0 and the standard deviation is 1 since this a linear regression model
#The minmax scaler makes sure all our values are between 0 and 1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])

In [58]:
#now we'll review our dataframe and use the .describe function to review the data in a more summarized manner
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,0.103459,...,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,0.334663,1.794368,0.474128
std,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,0.105891,...,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,0.120013,1.997233,0.305105
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.4,0.0
25%,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,0.043478,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.248447,0.3,0.205882
50%,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,0.086957,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.31677,1.5,0.470588
75%,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,0.130435,...,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,0.403727,2.9,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [59]:
#we'll now fit the selector to our data and it will pick the best 20 predictors for this regression model
sfs.fit(batting[selected_columns], batting["Next_WAR"])


SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=8)

In [63]:
#you can extract the list of predictors from the Sequential Feature Selector by doing the followig
#we'll store the predictors in our list named predictors
predictors = list(selected_columns[sfs.get_support()])

In [66]:
#we'll review our list by calling it
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BABIP',
 'IFH%',
 'WAR',
 'Spd',
 'PH',
 'CB%',
 'O-Contact%',
 'wGDP',
 'Oppo%',
 'OBP+',
 'SLG+',
 'Pull%+',
 'Soft%+',
 'Hard%+',
 'L-WAR']

In [71]:
#we'll use 'backtest' to generate our predictions
#cross validation splits the data up but still assess the whole
# we only want to use past data to predict future data
#we'll find all the unique seasons from our df and sort them in order from 2002-2021 and assign it to years
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [72]:
#we'll now paste in our parameters and begin backtesting
predictions = backtest(batting, rr, predictors)

In [74]:
#The actual is the current WAR and the prediction is predicted next season WAR
predictions

Unnamed: 0,actual,prediction
5006,1.2,1.559413
1925,1.4,0.791633
3102,-0.1,0.569487
5797,0.6,0.908732
1109,4.8,2.193959
...,...,...
1914,2.4,2.767596
5875,0.9,1.940524
7032,0.6,1.539736
4881,-0.5,1.897370


In [75]:
#we'll use a summary statistic to create an error metric to see how many errors our model has,the columns will be compared to each other and square the difference

from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.801466726541556

### Now lets improve the accuracy

In [76]:
#we'll feed the algorithm information on how the player did in more seasons than the current to make our predictions more robust and personalized
#we'll add a season predictor and war coorealtion metric 
#we'll use fillna to add 0 to any missing seasons
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    df["war_corr"].fillna(0, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [77]:
# this function will help us find and compare avergaes across a whole season 
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [78]:
#This column will help us create one group per season and find us an avergae on how each player did in a season
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [79]:
#now we'll create a list for our predictors + our player seasons
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [80]:
#Now we'll run our backtest again with our new parameters added 
predictions = backtest(batting, rr, new_predictors)

In [81]:
#We'll run our MSE to see if our accuracy has improved 
mean_squared_error(predictions["actual"], predictions["prediction"]) 

2.717210109968

### As you can see our model is doing slightly better

### Now we'll diagnose the issues with the model

In [85]:
#now we'll review how much each predictor is impacting the model for a deeper insight
#the larger the coefficient means the more the model is taking that specific predictor into account
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.587292
WAR             -1.896205
BABIP           -1.850395
SLG+            -1.454811
Soft%+          -1.275969
BU              -0.954210
PH              -0.709638
SO              -0.651198
war_diff        -0.588742
wGDP            -0.441893
CB%             -0.329824
Pull%+          -0.192495
war_corr        -0.091225
player_season    0.000762
O-Contact%       0.257043
L-WAR            0.303302
IFH%             0.400803
OBP+             0.474643
Oppo%            0.703791
Spd              0.749415
SB               1.053500
IBB              1.689358
Hard%+           2.368925
war_season       3.379910
dtype: float64

In [86]:
#we can look at our actual values vs our predicitions
diff = predictions["actual"] - predictions["prediction"] 


In [89]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [91]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [92]:
#we can filter these columns and help see the difference and which players are being miscatergorized
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
6050,1638,2007,Miguel Olivo,0.254658,0.4,0.001826
6337,4712,2011,Ben Revere,0.304348,2.1,0.001852
5447,11846,2016,Leonys Martin,0.422360,1.9,0.002020
4859,4712,2013,Ben Revere,0.267081,1.8,0.003045
166,1274,2008,Alex Rodriguez,0.571429,4.1,0.003065
...,...,...,...,...,...,...
3823,1875,2009,Josh Hamilton,0.291925,8.4,6.361302
3161,4810,2007,Brian McCann,0.304348,8.6,6.373673
871,9166,2010,Buster Posey,0.459627,10.1,6.581365
2516,11579,2014,Bryce Harper,0.310559,9.3,7.454806
