In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from ISLP.models import ModelSpec as MS


# Notebook to train models and compare them.

The features for the first 2 models were selected by running VIF calculations for each feature and then filtering the feature lists down to VIF < 10, VIF < 5 and VIF < 1. Filtering to a VIF less than 1 left no features at all, which is expected because a VIF can never be smaller than 1.

## Model 1 (Features filtered by VIF < 10)

In [2]:
# Load the Model 1 dataset and discard every row that has a missing value.
players_df_1 = pd.read_csv(
    '../../data/processed/lr_model1_training_testing_dataset.csv'
).dropna(ignore_index=True)

# Hold out the 2024 season for testing; all remaining seasons form the training set.
testing_df_1 = players_df_1.loc[players_df_1['Season'].eq(2024)].reset_index(drop=True)
training_df_1 = players_df_1.loc[players_df_1['Season'].ne(2024)].reset_index(drop=True)

# Targets: home runs.
y_train_1 = training_df_1['HR']
y_test_1 = testing_df_1['HR']

# Predictors: everything except the target and the 'Season' split key.
X_train_1 = training_df_1.drop(columns=['HR', 'Season'])
X_test_1 = testing_df_1.drop(columns=['HR', 'Season'])

# sm.OLS does not add a constant by itself, so prepend an explicit intercept column.
X_train_1.insert(loc=0, column='intercept', value=1)
X_test_1.insert(loc=0, column='intercept', value=1)

# Fit ordinary least squares on the training seasons and display the fit summary.
sm_model_1 = sm.OLS(endog=y_train_1, exog=X_train_1).fit()
sm_model_1.summary()

0,1,2,3
Dep. Variable:,HR,R-squared:,0.724
Model:,OLS,Adj. R-squared:,0.723
Method:,Least Squares,F-statistic:,685.0
Date:,"Mon, 14 Apr 2025",Prob (F-statistic):,0.0
Time:,16:14:17,Log-Likelihood:,-20543.0
No. Observations:,6826,AIC:,41140.0
Df Residuals:,6799,BIC:,41320.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-5.5711,2.115,-2.634,0.008,-9.717,-1.425
season_factor,2.2822,0.398,5.740,0.000,1.503,3.062
tmFactor,7.3054,0.461,15.839,0.000,6.401,8.210
Age,0.0551,0.828,0.067,0.947,-1.569,1.679
3B,0.4678,0.052,8.938,0.000,0.365,0.570
IBB,1.0766,0.032,33.347,0.000,1.013,1.140
GDP,0.5978,0.018,32.473,0.000,0.562,0.634
IFH,0.3557,0.019,18.670,0.000,0.318,0.393
BUH,-0.3563,0.058,-6.103,0.000,-0.471,-0.242

0,1,2,3
Omnibus:,1075.242,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9598.681
Skew:,0.482,Prob(JB):,0.0
Kurtosis:,8.729,Cond. No.,5210.0


There are still some high P values, even after filtering the features down to VIF < 10. Also, as noted at the bottom of the summary, the condition number is large, indicating strong multicollinearity. This makes sense, as we are including features that still had VIF values between 5 and 10, which can be problematic.

In [None]:
# Run the fitted Model 1 on the held-out test predictors.
pred_results_1 = sm_model_1.get_prediction(exog=X_test_1)

# Tabulate the predictions with their 95% intervals (alpha=0.05).
pred_summary_1 = pred_results_1.summary_frame(alpha=0.05)

# Put the observed home runs in the first column; .values bypasses index alignment.
pred_summary_1.insert(loc=0, column='HR_actual', value=y_test_1.values)

# Second column: error defined as actual minus predicted mean.
pred_summary_1.insert(
    loc=1,
    column='HR_error',
    value=pred_summary_1['HR_actual'].sub(pred_summary_1['mean']),
)

pred_summary_1

Unnamed: 0,HR_actual,HR_error,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,1,-2.437439,3.437439,0.182130,3.080407,3.794471,-6.206708,13.081585
1,58,5.135658,52.864342,0.633557,51.622372,54.106312,43.147111,62.581573
2,2,-1.673944,3.673944,0.171755,3.337252,4.010637,-5.969471,13.317359
3,6,-2.266625,8.266625,0.199407,7.875724,8.657525,-1.378835,17.912085
4,0,-2.152840,2.152840,0.236404,1.689414,2.616267,-7.495831,11.801511
...,...,...,...,...,...,...,...,...
637,4,-1.291186,5.291186,0.287722,4.727161,5.855212,-4.362840,14.945212
638,23,6.843215,16.156785,0.234445,15.697201,16.616370,6.508298,25.805273
639,0,0.344720,-0.344720,0.250866,-0.836496,0.147056,-9.994795,9.305354
640,17,4.853758,12.146242,0.201655,11.750935,12.541548,2.500602,21.791881


## Model 2

Feature list filtered down to features with VIF < 5.

In [11]:
# Load the Model 2 dataset and discard every row that has a missing value.
players_df_2 = pd.read_csv(
    '../../data/processed/lr_model2_training_testing_dataset.csv'
).dropna(ignore_index=True)

# Hold out the 2024 season for testing; all remaining seasons form the training set.
testing_df_2 = players_df_2.loc[players_df_2['Season'].eq(2024)].reset_index(drop=True)
training_df_2 = players_df_2.loc[players_df_2['Season'].ne(2024)].reset_index(drop=True)

# Targets: home runs.
y_train_2 = training_df_2['HR']
y_test_2 = testing_df_2['HR']

# Predictors: everything except the target and the 'Season' split key.
X_train_2 = training_df_2.drop(columns=['HR', 'Season'])
X_test_2 = testing_df_2.drop(columns=['HR', 'Season'])

# sm.OLS does not add a constant by itself, so prepend an explicit intercept column.
X_train_2.insert(loc=0, column='intercept', value=1)
X_test_2.insert(loc=0, column='intercept', value=1)

# Fit ordinary least squares on the training seasons and display the fit summary.
sm_model_2 = sm.OLS(endog=y_train_2, exog=X_train_2).fit()
sm_model_2.summary()

0,1,2,3
Dep. Variable:,HR,R-squared:,0.694
Model:,OLS,Adj. R-squared:,0.693
Method:,Least Squares,F-statistic:,897.0
Date:,"Mon, 14 Apr 2025",Prob (F-statistic):,0.0
Time:,16:45:16,Log-Likelihood:,-21778.0
No. Observations:,7139,AIC:,43590.0
Df Residuals:,7120,BIC:,43730.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-8.6445,1.677,-5.155,0.000,-11.932,-5.357
season_factor,2.2153,0.405,5.467,0.000,1.421,3.010
tmFactor,7.1845,0.468,15.347,0.000,6.267,8.102
Age,-0.7877,0.840,-0.937,0.349,-2.435,0.860
3B,0.7713,0.051,14.996,0.000,0.670,0.872
IBB,1.1434,0.033,34.874,0.000,1.079,1.208
GDP,0.7524,0.016,47.409,0.000,0.721,0.783
BUH,-0.2433,0.060,-4.082,0.000,-0.360,-0.126
GB/FB,-0.3689,0.037,-9.941,0.000,-0.442,-0.296

0,1,2,3
Omnibus:,1406.534,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8174.179
Skew:,0.818,Prob(JB):,0.0
Kurtosis:,7.98,Cond. No.,4010.0


In [15]:
# Run the fitted Model 2 on the held-out test predictors.
pred_results_2 = sm_model_2.get_prediction(exog=X_test_2)

# Tabulate the predictions with their 95% intervals (alpha=0.05).
pred_summary_2 = pred_results_2.summary_frame(alpha=0.05)

# Put the observed home runs in the first column; .values bypasses index alignment.
pred_summary_2.insert(loc=0, column='HR_actual', value=y_test_2.values)

# Second column: error defined as actual minus predicted mean.
pred_summary_2.insert(
    loc=1,
    column='HR_error',
    value=pred_summary_2['HR_actual'].sub(pred_summary_2['mean']),
)

pred_summary_2

Unnamed: 0,HR_actual,HR_error,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,1,-2.319476,3.319476,0.173537,2.979293,3.659660,-6.721530,13.360483
1,58,7.224567,50.775433,0.618751,49.562497,51.988368,40.667154,60.883712
2,2,-2.040904,4.040904,0.155120,3.736824,4.344985,-5.998944,14.080753
3,6,-1.931084,7.931084,0.170158,7.597525,8.264644,-2.109700,17.971869
4,0,-1.226782,1.226782,0.177841,0.878160,1.575403,-8.814515,11.268078
...,...,...,...,...,...,...,...,...
640,4,-2.892394,6.892394,0.268290,6.366466,7.418322,-3.156620,16.941409
641,23,6.775416,16.224584,0.230941,15.771871,16.677296,6.179135,26.270032
642,0,-0.147998,0.147998,0.226278,-0.295575,0.591571,-9.897043,10.193039
643,17,6.006882,10.993118,0.185763,10.628968,11.357269,0.951271,21.034966


## Model 3

This is the start of the models where the candidate features were chosen by looking at scatter plots of each feature against home runs. VIF values were then calculated and features were removed one at a time: each time a feature was removed, the VIF values were recalculated and the removal step was repeated. Additionally, some extra feature engineering was done for the remaining models to include a combination of RBIs and games played.



In [16]:
# Load the manually-selected-feature dataset and discard every row that has a missing value.
players_df_3 = pd.read_csv(
    '../../data/processed/lr_model_manual_feature_training_testing_dataset.csv'
).dropna(ignore_index=True)

# Hold out the 2024 season for testing; all remaining seasons form the training set.
testing_df_3 = players_df_3.loc[players_df_3['Season'].eq(2024)].reset_index(drop=True)
training_df_3 = players_df_3.loc[players_df_3['Season'].ne(2024)].reset_index(drop=True)

# Targets: home runs.
y_train_3 = training_df_3['HR']
y_test_3 = testing_df_3['HR']

# Predictors: everything except the target and the 'Season' split key.
X_train_3 = training_df_3.drop(columns=['HR', 'Season'])
X_test_3 = testing_df_3.drop(columns=['HR', 'Season'])

# sm.OLS does not add a constant by itself, so prepend an explicit intercept column.
X_train_3.insert(loc=0, column='intercept', value=1)
X_test_3.insert(loc=0, column='intercept', value=1)

# Fit ordinary least squares on the training seasons and display the fit summary.
sm_model_3 = sm.OLS(endog=y_train_3, exog=X_train_3).fit()
sm_model_3.summary()

0,1,2,3
Dep. Variable:,HR,R-squared:,0.812
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,5312.0
Date:,"Mon, 14 Apr 2025",Prob (F-statistic):,0.0
Time:,16:59:03,Log-Likelihood:,-20724.0
No. Observations:,7404,AIC:,41460.0
Df Residuals:,7397,BIC:,41510.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-2.4489,0.108,-22.634,0.000,-2.661,-2.237
G,0.0092,0.002,3.848,0.000,0.005,0.014
RBI/G,8.6854,0.312,27.797,0.000,8.073,9.298
BB,0.1375,0.005,30.072,0.000,0.129,0.146
SO,0.0730,0.003,27.175,0.000,0.068,0.078
Barrel%,16.0521,1.093,14.685,0.000,13.909,18.195
HardHit%,-1.6095,0.314,-5.122,0.000,-2.225,-0.994

0,1,2,3
Omnibus:,1274.311,Durbin-Watson:,1.968
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6353.478
Skew:,0.744,Prob(JB):,0.0
Kurtosis:,7.287,Cond. No.,2500.0


In [17]:
# Run the fitted Model 3 on the held-out test predictors.
pred_results_3 = sm_model_3.get_prediction(exog=X_test_3)

# Tabulate the predictions with their 95% intervals (alpha=0.05).
pred_summary_3 = pred_results_3.summary_frame(alpha=0.05)

# Put the observed home runs in the first column; .values bypasses index alignment.
pred_summary_3.insert(loc=0, column='HR_actual', value=y_test_3.values)

# Second column: error defined as actual minus predicted mean.
pred_summary_3.insert(
    loc=1,
    column='HR_error',
    value=pred_summary_3['HR_actual'].sub(pred_summary_3['mean']),
)

pred_summary_3

Unnamed: 0,HR_actual,HR_error,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,1,-2.008615,3.008615,0.075824,2.859978,3.157252,-4.789455,10.806684
1,58,16.987266,41.012734,0.362526,40.302080,41.723387,33.183761,48.841707
2,2,-0.533915,2.533915,0.060664,2.414997,2.652833,-5.263644,10.331475
3,6,-2.315708,8.315708,0.066982,8.184404,8.447011,0.517949,16.113466
4,0,1.174198,-1.174198,0.097546,-1.365416,-0.982981,-8.973196,6.624799
...,...,...,...,...,...,...,...,...
642,4,-4.835737,8.835737,0.105016,8.629876,9.041597,1.036367,16.635107
643,23,3.411699,19.588301,0.129158,19.335114,19.841488,11.787538,27.389064
644,0,0.115582,-0.115582,0.076892,-0.266312,0.035149,-7.913691,7.682528
645,17,-4.666014,21.666014,0.245658,21.184454,22.147574,13.854504,29.477524


## Model 4

