In [2]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [3]:
START = 2015
END = 2023
stats_raw = batting_stats(START, END, qual=300)
stats_raw.to_csv("batting_stats.csv")

In [4]:
stats_raw = stats_raw.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [5]:
stats_raw

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
1,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246,0.609,404,0.169,0.287,,,,11.3
2,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217,0.500,434,0.220,0.270,,,,10.4
3,10155,2018,Mike Trout,LAA,26,140,471,608,147,80,...,118.0,162,0.460,352,0.201,0.261,,,,9.6
0,11579,2015,Bryce Harper,WSN,22,153,521,654,172,91,...,116.0,188,0.477,394,0.118,0.226,,,,9.3
33,10155,2015,Mike Trout,LAA,23,159,575,682,172,93,...,117.7,205,0.486,422,0.207,0.282,,,,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943,1177,2017,Albert Pujols,LAA,37,149,593,636,143,103,...,112.2,197,0.391,504,0.180,0.268,,,,-2.0
2045,4613,2016,Prince Fielder,TEX,32,89,326,370,69,45,...,113.0,85,0.317,268,0.161,0.266,,,,-1.8
1628,10815,2023,Jurickson Profar,- - -,30,125,459,521,111,73,...,108.8,119,0.317,375,0.151,0.236,,,,-0.9
1939,393,2015,Victor Martinez,DET,36,120,440,485,108,77,...,108.9,131,0.332,395,0.163,0.223,,,,-2.0


In [6]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_AVG"] = player["AVG"].shift(-1)
    return player

stats_raw = stats_raw.groupby("IDfg", group_keys=False).apply(next_season)

In [7]:
stats_raw[["Name", "Season", "AVG", "Next_AVG"]]

Unnamed: 0,Name,Season,AVG,Next_AVG
1939,Victor Martinez,2015,0.245,0.289
505,Victor Martinez,2016,0.289,0.255
1660,Victor Martinez,2017,0.255,0.251
2000,Victor Martinez,2018,0.251,
595,Carlos Beltran,2015,0.276,0.295
...,...,...,...,...
1075,Spencer Torkelson,2023,0.233,
1408,Ha-seong Kim,2022,0.251,0.260
959,Ha-seong Kim,2023,0.260,
859,Seiya Suzuki,2022,0.262,0.285


In [8]:
NaN_count = stats_raw.isnull().sum()

In [9]:
NaN_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         2012
xSLG        2012
xwOBA       2012
L-WAR          0
Next_AVG     506
Length: 321, dtype: int64

In [12]:
complete_cols = list(stats_raw.columns[NaN_count == 0])

In [13]:
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CT%',
 'CTv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCT',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCT/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'FA% (sc)',
 'FC% (sc)',
 'SI% (sc)',
 'SL% (sc)',
 'CU% (sc)',
 'KC% (sc)',
 'CH% (sc)',
 'vFA (sc)',
 'vFC (sc)'

In [14]:
stats_complete = stats_raw[complete_cols + ["Next_AVG"]].copy()

In [15]:
stats_complete

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR,Next_AVG
1939,393,2015,Victor Martinez,DET,36,120,440,485,108,77,...,12,0.030,108.9,131,0.332,395,0.163,0.223,-2.0,0.289
505,393,2016,Victor Martinez,DET,37,154,553,610,160,111,...,34,0.073,110.2,194,0.416,466,0.158,0.231,1.1,0.255
1660,393,2017,Victor Martinez,DET,38,107,392,435,100,74,...,16,0.048,107.1,114,0.344,331,0.157,0.226,-1.1,0.251
2000,393,2018,Victor Martinez,DET,39,133,467,508,117,87,...,18,0.042,107.6,129,0.304,425,0.148,0.212,-1.7,
595,589,2015,Carlos Beltran,NYY,38,133,478,531,132,78,...,17,0.043,112.8,167,0.419,399,0.179,0.253,1.8,0.295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,27465,2023,Spencer Torkelson,DET,23,159,606,684,141,75,...,62,0.141,112.7,222,0.505,440,0.166,0.269,1.2,
1408,27506,2022,Ha-seong Kim,SDP,26,150,517,582,130,87,...,18,0.043,109.4,137,0.323,424,0.205,0.276,4.0,0.260
959,27506,2023,Ha-seong Kim,SDP,27,152,538,626,140,100,...,18,0.043,108.5,111,0.262,424,0.233,0.293,4.1,
859,30116,2022,Seiya Suzuki,CHC,27,111,397,446,104,66,...,32,0.110,111.3,117,0.403,290,0.232,0.312,2.0,0.285


In [16]:
stats_complete.dtypes[stats_complete.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [17]:
del stats_complete["Dol"]
del stats_complete["Age Rng"]

In [39]:
stats_complete["team_code"] = stats_complete["Team"].astype('category').cat.codes

In [40]:
stats_full = stats_complete.copy()
stats_complete = stats_complete.dropna().copy()

In [34]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [44]:
removed_columns = ["Next_AVG", "Name", "Team", "IDfg", "Season"]
selected_columns = stats_complete.columns[~stats_complete.columns.isin(removed_columns)]

In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
stats_complete.loc[:, selected_columns] = scaler.fit_transform(stats_complete[selected_columns])

In [46]:
stats_complete.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR,Next_AVG,team_code
count,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,...,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0
mean,11046.478752,2018.169987,0.404232,0.67221,0.500227,0.502442,0.447067,0.3693,0.375429,0.157548,...,0.274011,0.498828,0.440715,0.554331,0.470363,0.441486,0.504448,0.380962,0.258234,14.440239
std,5263.752483,2.371268,0.161453,0.237909,0.238721,0.255904,0.202994,0.167714,0.160445,0.149203,...,0.15657,0.143726,0.178691,0.147081,0.209925,0.154984,0.148826,0.132413,0.030033,9.192483
min,393.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0
25%,6609.0,2016.0,0.272727,0.5,0.306604,0.293722,0.284639,0.243103,0.245283,0.066667,...,0.156489,0.405405,0.29918,0.465649,0.297483,0.340102,0.403409,0.286458,0.239,6.0
50%,11339.0,2018.0,0.409091,0.728261,0.525943,0.524664,0.439759,0.358621,0.358491,0.133333,...,0.259542,0.495495,0.434426,0.562977,0.477117,0.441624,0.505682,0.368056,0.258,14.0
75%,14551.0,2021.0,0.5,0.880435,0.700472,0.719731,0.60241,0.482759,0.490566,0.2,...,0.366412,0.594595,0.572746,0.652672,0.631579,0.543147,0.607955,0.458333,0.277,22.0
max,30116.0,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.354,30.0


In [48]:
sfs.fit(stats_complete[selected_columns], stats_complete["Next_AVG"])

In [51]:
features = list(selected_columns[sfs.get_support()])

In [57]:
def backtest(data, model, features, start=4, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[features], train["Next_AVG"])
        
        preds = model.predict(test[features])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_AVG"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [58]:
predictions = backtest(stats_complete, rr, features)

In [59]:
predictions

Unnamed: 0,actual,prediction
1536,0.270,0.244498
1260,0.256,0.264712
776,0.259,0.254301
153,0.214,0.260811
28,0.265,0.256514
...,...,...
454,0.277,0.251188
1901,0.280,0.258758
2078,0.233,0.245842
1408,0.260,0.246200


In [60]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

0.0006576429732129415

In [61]:
stats_complete["Next_AVG"].describe()

count    1506.000000
mean        0.258234
std         0.030033
min         0.160000
25%         0.239000
50%         0.258000
75%         0.277000
max         0.354000
Name: Next_AVG, dtype: float64

In [62]:
0.0006576429732129415 ** .5

0.025644550555877197

In [64]:
def player_history(df):
    df = df.sort_values("Season")
    
    df["player_season"] = range(0, df.shape[0])
    df["avg_corr"] = list(df[["player_season", "AVG"]].expanding().corr().loc[(slice(None), "player_season"),"AVG"])
    df["avg_corr"].fillna(1, inplace=True)
    
    df["avg_diff"] = df["AVG"] / df["AVG"].shift(1)
    df["avg_diff"].fillna(1, inplace=True)
    
    df["avg_diff"][df["avg_diff"] == np.inf] = 1
    
    return df

stats_complete = stats_complete.groupby("IDfg", group_keys=False).apply(player_history)

In [65]:
def group_averages(df):
    return df["AVG"] / df["AVG"].mean()

In [68]:
stats_complete["avg_season"] = stats_complete.groupby("Season", group_keys=False).apply(group_averages)

In [69]:
new_features = features + ["player_season", "avg_corr", "avg_season", "avg_diff"]

In [71]:
predictions = backtest(stats_complete, rr, new_features)

In [74]:
mean_squared_error(predictions["actual"], predictions["prediction"])

0.0006838544017170163

In [76]:
pd.Series(rr.coef_, index=new_features).sort_values()

SO                -0.026964
Pull%             -0.023203
Age               -0.023159
SI% (sc)          -0.021524
RAR               -0.016971
FS% (pi)          -0.015073
avg_diff          -0.009142
avg_corr          -0.002554
player_season     -0.000012
AVG                0.004011
wFA (sc)           0.005075
Z-Contact% (pi)    0.005236
Spd                0.011788
GB%                0.012502
avg_season         0.014280
Z-Swing% (pi)      0.015489
Med%               0.016396
FC-X (sc)          0.019690
SI% (pi)           0.020324
BABIP              0.022382
Hard%+             0.025620
HardHit            0.029469
Barrels            0.033560
O-Contact%         0.043775
dtype: float64

In [77]:
diff = predictions["actual"] - predictions["prediction"]

merged = predictions.merge(stats_complete, left_index=True, right_index=True)

merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

merged[["IDfg", "Season", "Name", "AVG", "Next_AVG", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,AVG,Next_AVG,diff
1278,17992,2019,David Fletcher,0.691489,0.262,0.000086
1726,10243,2021,Randal Grichuk,0.430851,0.259,0.000182
1497,8709,2022,Elvis Andrus,0.473404,0.251,0.000183
730,11281,2019,Whit Merrifield,0.755319,0.277,0.000222
1827,18577,2022,Ke'Bryan Hayes,0.446809,0.271,0.000245
...,...,...,...,...,...,...
49,13590,2021,Jesse Winker,0.771277,0.219,0.068082
1395,12984,2019,Jackie Bradley Jr.,0.345745,0.163,0.074742
1661,19844,2021,Abraham Toro,0.420213,0.185,0.078346
32,15998,2019,Cody Bellinger,0.771277,0.165,0.108131
