# Final attempt at a more conventional machine learning approach

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Loading combined dataset
df = pd.read_csv('../../data/raw/MLB_player_stats_2015_2024.csv')

# dropping columns that will not be needed.
df.drop(columns=['Name.1', 'Team.1', 'NameASCII', 'PlayerId'], inplace=True)

# Make sure it's sorted properly and resetting the index for the new order
df = df.sort_values(by=['MLBAMID', 'Season']).reset_index(drop=True)
df.columns

Index(['Unnamed: 0', 'Name', 'Team', 'Season', 'Age', 'G', 'AB', 'PA', 'H',
       '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF',
       'SH', 'GDP', 'SB', 'CS', 'AVG', 'IFH', 'BUH', 'BB%', 'K%', 'BB/K',
       'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%',
       'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'WAR',
       'Spd', 'wRC+', 'BsR', 'Def', 'wSB', 'UBR', 'Off', 'wGDP', 'Pull%',
       'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrels',
       'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events', 'xBA', 'xSLG',
       'xwOBA', 'XBR', 'O-Swing% (pi)', 'Z-Swing% (pi)', 'Swing% (pi)',
       'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)',
       'Pace (pi)', 'MLBAMID'],
      dtype='object')

## 2024 HRs predicted by 2021, 2022, 2023 and average stats
For this approach, I am going to stick with basic statistics at first an may add in a few statcast features like barrels or hardhit%. I am going to build a new dataset for predicting future HRs using a 3 year lagging stats and an average of those stats over the 3 years as my predictors. This will hopefully keep the models computational cost to something a little more handleable for my laptop.

stats to lag: 'HR', 'Age' 'AB' 'H', '2B', '3B', 'BB', 'SO', 'RBI', 'R', 'SLG' 

In [17]:
stats_to_lag = ['Name', 'Season', 'HR', 'AB', 'H', '2B', '3B', 'BB', 'SO', 'RBI', 'R', 'SLG', 'Age' ]

for lag in range(1, 4):
    lagged = df.groupby("MLBAMID")[stats_to_lag].shift(lag)
    lagged.columns = [f"{col}_lag{lag}" for col in lagged.columns]
    df = pd.concat([df, lagged], axis=1)

df.head

<bound method NDFrame.head of       Unnamed: 0               Name   Team  Season  Age    G   AB   PA    H  \
0             88      Bartolo Colon    NYM    2015   42   33   58   64    8   
1           1048      Bartolo Colon    NYM    2016   43   34   60   65    5   
2           2027      Bartolo Colon  - - -    2017   44   28   19   20    0   
3           2977      Bartolo Colon    TEX    2018   45   28    4    4    0   
4            614     LaTroy Hawkins  - - -    2015   42   42    1    1    0   
...          ...                ...    ...     ...  ...  ...  ...  ...  ...   
8495        8317  Nacho Alvarez Jr.    ATL    2024   21    8   30   32    3   
8496        8118       Jacob Wilson    OAK    2024   22   28   92  103   23   
8497        7616   Masataka Yoshida    BOS    2023   29  140  537  580  155   
8498        8275   Masataka Yoshida    BOS    2024   30  108  378  421  106   
8499        8207       Jung Hoo Lee    SFG    2024   25   37  145  158   38   

       1B  ...  AB_la

In [18]:
# Drop rows where any of the lagged stats (lag1, lag2, lag3) are NaN
df_model = df.dropna(subset=[f"{col}_lag{lag}" for lag in range(1, 4) for col in stats_to_lag])

# Check the result to ensure everything looks good
df_model.columns

Index(['Unnamed: 0', 'Name', 'Team', 'Season', 'Age', 'G', 'AB', 'PA', 'H',
       '1B',
       ...
       'AB_lag3', 'H_lag3', '2B_lag3', '3B_lag3', 'BB_lag3', 'SO_lag3',
       'RBI_lag3', 'R_lag3', 'SLG_lag3', 'Age_lag3'],
      dtype='object', length=122)

In [19]:
# 'Name','HR', 'AB', 'H', '2B', '3B', 'BB', 'SO', 'RBI', 'R', 'SLG'
df = df.drop(columns=['Unnamed: 0', 'Team', 'G', 'AB', 'PA', 'H',
       '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF',
       'SH', 'GDP', 'SB', 'CS', 'AVG', 'IFH', 'BUH', 'BB%', 'K%', 'BB/K',
       'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%',
       'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'WAR',
       'Spd', 'wRC+', 'BsR', 'Def', 'wSB', 'UBR', 'Off', 'wGDP', 'Pull%',
       'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrels',
       'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events', 'xBA', 'xSLG',
       'xwOBA', 'XBR', 'O-Swing% (pi)', 'Z-Swing% (pi)', 'Swing% (pi)',
       'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)',
       'Pace (pi)'])

df.columns

Index(['Name', 'Season', 'Age', 'MLBAMID', 'Name_lag1', 'Season_lag1',
       'HR_lag1', 'AB_lag1', 'H_lag1', '2B_lag1', '3B_lag1', 'BB_lag1',
       'SO_lag1', 'RBI_lag1', 'R_lag1', 'SLG_lag1', 'Age_lag1', 'Name_lag2',
       'Season_lag2', 'HR_lag2', 'AB_lag2', 'H_lag2', '2B_lag2', '3B_lag2',
       'BB_lag2', 'SO_lag2', 'RBI_lag2', 'R_lag2', 'SLG_lag2', 'Age_lag2',
       'Name_lag3', 'Season_lag3', 'HR_lag3', 'AB_lag3', 'H_lag3', '2B_lag3',
       '3B_lag3', 'BB_lag3', 'SO_lag3', 'RBI_lag3', 'R_lag3', 'SLG_lag3',
       'Age_lag3'],
      dtype='object')

In [20]:
df_model = df_model.drop(columns=['Unnamed: 0', 'Team', 'G', 'AB', 'PA', 'H',
       '1B', '2B', '3B', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF',
       'SH', 'GDP', 'SB', 'CS', 'AVG', 'IFH', 'BUH', 'BB%', 'K%', 'BB/K',
       'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%',
       'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'WAR',
       'Spd', 'wRC+', 'BsR', 'Def', 'wSB', 'UBR', 'Off', 'wGDP', 'Pull%',
       'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrels',
       'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events', 'xBA', 'xSLG',
       'xwOBA', 'XBR', 'O-Swing% (pi)', 'Z-Swing% (pi)', 'Swing% (pi)',
       'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)',
       'Pace (pi)'])

df_model.head

<bound method NDFrame.head of                Name  Season  Age  HR  MLBAMID      Name_lag1  Season_lag1  \
3     Bartolo Colon    2018   45   0   112526  Bartolo Colon       2017.0   
14    Adrian Beltré    2018   39  15   134181  Adrian Beltré       2017.0   
36      CC Sabathia    2019   38   0   282332    CC Sabathia       2017.0   
51    Ichiro Suzuki    2018   44   0   400085  Ichiro Suzuki       2017.0   
52    Ichiro Suzuki    2019   45   0   400085  Ichiro Suzuki       2018.0   
...             ...     ...  ...  ..      ...            ...          ...   
8280   Jarren Duran    2024   27  21   680776   Jarren Duran       2023.0   
8284   Ryan Jeffers    2023   26  14   680777   Ryan Jeffers       2022.0   
8285   Ryan Jeffers    2024   27  21   680777   Ryan Jeffers       2023.0   
8295    Owen Miller    2024   27   0   680911    Owen Miller       2023.0   
8383  Andrew Vaughn    2024   26  19   683734  Andrew Vaughn       2023.0   

      HR_lag1  AB_lag1  H_lag1  ...  AB_lag3 

In [21]:
print(df_model.columns)

Index(['Name', 'Season', 'Age', 'HR', 'MLBAMID', 'Name_lag1', 'Season_lag1',
       'HR_lag1', 'AB_lag1', 'H_lag1', '2B_lag1', '3B_lag1', 'BB_lag1',
       'SO_lag1', 'RBI_lag1', 'R_lag1', 'SLG_lag1', 'Age_lag1', 'Name_lag2',
       'Season_lag2', 'HR_lag2', 'AB_lag2', 'H_lag2', '2B_lag2', '3B_lag2',
       'BB_lag2', 'SO_lag2', 'RBI_lag2', 'R_lag2', 'SLG_lag2', 'Age_lag2',
       'Name_lag3', 'Season_lag3', 'HR_lag3', 'AB_lag3', 'H_lag3', '2B_lag3',
       '3B_lag3', 'BB_lag3', 'SO_lag3', 'RBI_lag3', 'R_lag3', 'SLG_lag3',
       'Age_lag3'],
      dtype='object')


In [22]:
df_model = df_model.drop(columns=['Name_lag1', 'Season_lag1', 'Name_lag2',
       'Season_lag2',  'Name_lag3', 'Season_lag3'])

print(df_model.columns)

Index(['Name', 'Season', 'Age', 'HR', 'MLBAMID', 'HR_lag1', 'AB_lag1',
       'H_lag1', '2B_lag1', '3B_lag1', 'BB_lag1', 'SO_lag1', 'RBI_lag1',
       'R_lag1', 'SLG_lag1', 'Age_lag1', 'HR_lag2', 'AB_lag2', 'H_lag2',
       '2B_lag2', '3B_lag2', 'BB_lag2', 'SO_lag2', 'RBI_lag2', 'R_lag2',
       'SLG_lag2', 'Age_lag2', 'HR_lag3', 'AB_lag3', 'H_lag3', '2B_lag3',
       '3B_lag3', 'BB_lag3', 'SO_lag3', 'RBI_lag3', 'R_lag3', 'SLG_lag3',
       'Age_lag3'],
      dtype='object')


In [23]:
df_model

Unnamed: 0,Name,Season,Age,HR,MLBAMID,HR_lag1,AB_lag1,H_lag1,2B_lag1,3B_lag1,...,AB_lag3,H_lag3,2B_lag3,3B_lag3,BB_lag3,SO_lag3,RBI_lag3,R_lag3,SLG_lag3,Age_lag3
3,Bartolo Colon,2018,45,0,112526,0.0,19.0,0.0,0.0,0.0,...,58.0,8.0,1.0,0.0,0.0,24.0,4.0,2.0,0.155172,42.0
14,Adrian Beltré,2018,39,15,134181,17.0,340.0,106.0,22.0,1.0,...,567.0,163.0,32.0,4.0,41.0,65.0,83.0,83.0,0.453263,36.0
36,CC Sabathia,2019,38,0,282332,0.0,3.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,34.0
51,Ichiro Suzuki,2018,44,0,400085,3.0,196.0,50.0,6.0,0.0,...,398.0,91.0,5.0,6.0,31.0,51.0,21.0,45.0,0.278894,41.0
52,Ichiro Suzuki,2019,45,0,400085,0.0,44.0,9.0,0.0,0.0,...,327.0,95.0,15.0,5.0,30.0,42.0,22.0,48.0,0.376147,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8280,Jarren Duran,2024,27,21,680776,8.0,332.0,98.0,34.0,2.0,...,107.0,23.0,3.0,2.0,4.0,40.0,10.0,17.0,0.336449,24.0
8284,Ryan Jeffers,2023,26,14,680777,7.0,212.0,44.0,10.0,1.0,...,55.0,15.0,0.0,0.0,5.0,19.0,7.0,5.0,0.436364,23.0
8285,Ryan Jeffers,2024,27,21,680777,14.0,286.0,79.0,15.0,2.0,...,267.0,53.0,10.0,1.0,22.0,108.0,35.0,28.0,0.400749,24.0
8295,Owen Miller,2024,27,0,680911,5.0,291.0,76.0,17.0,0.0,...,191.0,39.0,8.0,0.0,9.0,54.0,18.0,17.0,0.308901,24.0


In [24]:
df_model.dropna(inplace=True, ignore_index=True)

## split datasets for training and testing

In [36]:
# preparing the training/testing splits
df_training = df_model[df_model['Season'] != 2024].reset_index(drop=True)
df_testing = df_model[df_model['Season'] == 2024].reset_index(drop=True)

# preparing the training/testing splits
X_train = df_training.drop(columns=['HR', 'Season', 'Name', 'MLBAMID'])
y_train = df_training['HR']

# Store 'Name' and 'MLBAMID' separately to match back after prediction
df_identifiers = df_testing[['Name', 'MLBAMID']]
X_test = df_testing.drop(columns=['HR', 'Season', 'Name', 'MLBAMID'])
y_test = df_testing['HR']


X_test

Unnamed: 0,Age,HR_lag1,AB_lag1,H_lag1,2B_lag1,3B_lag1,BB_lag1,SO_lag1,RBI_lag1,R_lag1,...,AB_lag3,H_lag3,2B_lag3,3B_lag3,BB_lag3,SO_lag3,RBI_lag3,R_lag3,SLG_lag3,Age_lag3
0,36,7.0,394.0,102.0,25.0,1.0,20.0,72.0,55.0,47.0,...,487.0,126.0,30.0,8.0,46.0,92.0,63.0,57.0,0.402464,33.0
1,37,8.0,359.0,100.0,24.0,5.0,39.0,55.0,40.0,57.0,...,514.0,139.0,25.0,4.0,54.0,91.0,78.0,76.0,0.410506,34.0
2,37,15.0,362.0,69.0,12.0,0.0,30.0,139.0,36.0,33.0,...,373.0,64.0,10.0,1.0,47.0,127.0,36.0,40.0,0.300268,34.0
3,36,5.0,394.0,111.0,26.0,1.0,40.0,100.0,38.0,43.0,...,307.0,86.0,17.0,0.0,25.0,58.0,31.0,35.0,0.403909,33.0
4,37,12.0,390.0,100.0,19.0,0.0,75.0,100.0,43.0,55.0,...,482.0,107.0,24.0,1.0,81.0,132.0,80.0,78.0,0.443983,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,29,22.0,403.0,110.0,17.0,2.0,40.0,117.0,70.0,59.0,...,284.0,73.0,12.0,0.0,25.0,104.0,50.0,47.0,0.447183,26.0
336,27,8.0,332.0,98.0,34.0,2.0,24.0,90.0,40.0,46.0,...,107.0,23.0,3.0,2.0,4.0,40.0,10.0,17.0,0.336449,24.0
337,27,14.0,286.0,79.0,15.0,2.0,33.0,93.0,43.0,46.0,...,267.0,53.0,10.0,1.0,22.0,108.0,35.0,28.0,0.400749,24.0
338,27,5.0,291.0,76.0,17.0,0.0,17.0,61.0,27.0,29.0,...,191.0,39.0,8.0,0.0,9.0,54.0,18.0,17.0,0.308901,24.0


## Training Lasso Model

In [37]:
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)

print("Lasso R² Train:", lasso.score(X_train, y_train))
print("Lasso R² Test:", lasso.score(X_test, y_test))


Lasso R² Train: 0.4556024081269585
Lasso R² Test: 0.6015340201065312


In [43]:

y_pred = lasso.predict(X_test)  

# Create a dataframe with player identifiers and predicted HRs
df_predictions = pd.DataFrame({
    'Name': df_identifiers['Name'],
    'MLBAMID': df_identifiers['MLBAMID'],
    'Predicted_HR': y_pred,
    'Actual_HR' : y_test
})

df_actual = pd.DataFrame({
    'Name': df_identifiers['Name'],
    'MLBAMID': df_identifiers['MLBAMID'],
    'Actual_HR' : y_test
})

df_predictions.sort_values(by='Predicted_HR', ascending=False, inplace=True)
df_actual.sort_values(by='Actual_HR', ascending=False, inplace=True)

# Now you have a dataframe with player names, MLBAMID, and their predicted HRs
print(df_predictions.head(10))
print(df_actual.head(10))

                      Name  MLBAMID  Predicted_HR  Actual_HR
69             Aaron Judge   592450     34.461169         58
139             Matt Olson   621566     31.621728         29
150            Pete Alonso   624413     31.546481         34
233          Shohei Ohtani   660271     31.424315         54
221         Kyle Schwarber   656941     30.281586         38
285              Juan Soto   665742     28.511656         41
100           Mookie Betts   605141     28.490115         19
244           Austin Riley   663586     28.119062         19
322         Yordan Alvarez   670541     28.118212         35
283  Vladimir Guerrero Jr.   665489     26.154847         30
                  Name  MLBAMID  Actual_HR
69         Aaron Judge   592450         58
233      Shohei Ohtani   660271         54
149  Anthony Santander   623993         44
285          Juan Soto   665742         41
301       Brent Rooker   667670         39
26       Marcell Ozuna   542303         39
114       José Ramírez   608