In [75]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [77]:
START = 2002
END = 2022

In [79]:
#Qual stands for how many at bats the batter should at least have
batting = batting_stats(START, END, qual=200)

In [80]:
batting = pd.read_csv("batting.csv") 

In [81]:
#Grouping each individual players' stats and remove groups that have only one season of data

batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [82]:
#Trying to predict the Wins Above Replacement (WAR) of the players
# What is WAR? 
# WAR (Wins Above Replacement) in baseball measures a player's total value to their team by estimating 
# how many more wins they contribute compared to a replacement-level player.

In [83]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player
    
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [84]:
# Pulling the WAR value one to the back to show in the variable "Next_WAR"
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
3944,Alfredo Amezaga,2006,1.1,2.0
2609,Alfredo Amezaga,2007,2.0,1.2
3787,Alfredo Amezaga,2008,1.2,
1020,Garret Anderson,2002,3.7,5.1
425,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
4665,Owen Miller,2022,0.7,
6108,Andrew Vaughn,2021,-0.2,-0.5
6449,Andrew Vaughn,2022,-0.5,
5238,Ha-seong Kim,2021,0.4,3.7


In [85]:
# Cleaning the data for AI to impute all the values

null_count = batting.isnull().sum()

In [93]:
null_count

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
xBA           6754
xSLG          6754
xwOBA         6754
L-WAR            0
Next_WAR      1179
Length: 322, dtype: int64

In [95]:
complete_cols = list(batting.columns[null_count == 0])

In [99]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [101]:
batting

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
3944,5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
2609,5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
3787,5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,...,101,101,123,111,64,0,0.178,0.244,1.2,
1020,1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,...,91,80,65,97,129,0,0.137,0.232,3.7,5.1
425,864,2,2003,Garret Anderson,ANA,31,159,638,673,201,...,101,80,90,99,109,0,0.164,0.252,5.1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4665,6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,...,111,97,131,100,83,340,0.188,0.266,0.7,
6108,4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,...,104,116,84,99,110,321,0.185,0.285,-0.4,-0.5
6449,3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,...,106,111,94,100,104,419,0.201,0.291,-0.5,
5238,6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,...,99,59,137,96,88,201,0.216,0.303,0.5,3.7


In [103]:
batting.dtypes

Unnamed: 0      int64
IDfg            int64
Season          int64
Name           object
Team           object
               ...   
Events          int64
CStr%         float64
CSW%          float64
L-WAR         float64
Next_WAR      float64
Length: 134, dtype: object

In [105]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [107]:
batting["Dol"]

3944      $5.5
2609     $11.2
3787      $7.2
1020     $14.6
425      $22.0
         ...  
4665      $5.5
6108    ($1.6)
6449    ($4.1)
5238      $3.0
1071     $29.2
Name: Dol, Length: 6754, dtype: object

In [109]:
# We don't need this value as it's for fangraph to 

del batting["Dol"]

In [111]:
batting["Age Rng"]

3944    28 - 28
2609    29 - 29
3787    30 - 30
1020    30 - 30
425     31 - 31
         ...   
4665    25 - 25
6108    23 - 23
6449    24 - 24
5238    25 - 25
1071    26 - 26
Name: Age Rng, Length: 6754, dtype: object

In [113]:
# Don't need this as it's useless information for the prediction
del batting["Age Rng"]

In [117]:
# Notice the team name is an object. We can change the team to corresponding number. 
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [135]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [137]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Setting this higherit reduces overfitting 
rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [155]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season", "Unnamed: 0"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [141]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [157]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,0.103459,...,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,0.322028,1.792951,0.474128
std,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,0.105891,...,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,0.122149,1.981057,0.305105
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.1,0.0
25%,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,0.043478,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.234177,0.4,0.205882
50%,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,0.086957,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.303797,1.5,0.470588
75%,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,0.130435,...,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,0.392405,2.9,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [159]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [165]:
predictors = list(selected_columns[sfs.get_support()])

In [177]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [179]:
predictions = backtest(batting, rr, predictors)

In [181]:
predictions

Unnamed: 0,actual,prediction
2609,1.2,1.405835
3390,1.4,0.716105
4581,-0.1,0.457908
4668,0.6,0.979155
1756,4.8,2.214873
...,...,...
2063,2.2,2.751896
3796,0.8,2.084249
6696,0.7,1.584162
6108,-0.5,1.819452


In [185]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.73609694226162

In [187]:
batting["Next_WAR"].describe()

count    5575.000000
mean        1.792951
std         1.981057
min        -3.100000
25%         0.400000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [189]:
2.73609694226162 ** .5

1.6541151538697723

In [243]:
def player_history(df):
    df = df.sort_values("Season")
    
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
    df["war_corr"].fillna(1, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [244]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [245]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [246]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [247]:
predictions = backtest(batting, rr, new_predictors)

In [253]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.6788090890767275

In [257]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.712820
BABIP           -1.939542
WAR             -1.843622
SLG+            -1.384088
Soft%+          -1.323119
BU              -1.127176
SO              -0.898473
PH              -0.744796
WPA             -0.555514
CH%             -0.288723
wCH             -0.285498
war_diff        -0.284176
CB%             -0.275700
Pull%+          -0.217444
war_corr        -0.137852
player_season    0.000095
IFH              0.633172
Oppo%            0.694981
Spd              0.768796
OBP+             0.827846
SB               0.963999
IBB              2.062798
Hard%+           2.442550
war_season       3.191648
dtype: float64

In [259]:
diff = predictions["actual"] - predictions["prediction"]

In [263]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [265]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [269]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
4487,4403,2013,Erik Kratz,0.246835,1.1,0.001375
6435,17696,2021,Kevin Newman,0.170886,1.2,0.001875
622,15172,2019,Tim Anderson,0.481013,2.3,0.002661
2105,1286,2008,Michael Young,0.348101,2.6,0.003986
4748,5887,2013,John Jaso,0.234177,0.6,0.004329
...,...,...,...,...,...,...
3595,1875,2009,Josh Hamilton,0.278481,8.4,6.457327
903,9166,2010,Buster Posey,0.443038,9.8,6.526769
5754,5631,2010,Matt Kemp,0.196203,8.3,6.526948
316,15640,2021,Aaron Judge,0.544304,11.1,7.313785
