In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the cleaned batting table and discard the 'Unnamed: 0' column
# (presumably an index saved into the CSV — verify against the export step).
df = pd.read_csv('../data/clean/bat.csv').drop(columns='Unnamed: 0')
df.head(3)

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,BA
0,aardsda01,2015,ATL,NL,33,1,0,0,0,0,...,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,abadfe01,2015,OAK,AL,62,0,0,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,abreujo02,2015,CHA,AL,154,613,88,178,34,3,...,0.0,0.0,39,140.0,11.0,15.0,0.0,1.0,16.0,0.29


In [24]:
# (rows, columns) of the loaded frame
df.shape

(8927, 22)

## Create X and y variables for Batting Average

In [3]:
# Target is batting average; the features are every other column except the
# player/year/team/league identifiers.
y = df['BA']
X = df.drop(columns=['BA', 'playerID', 'yearID', 'teamID', 'lgID'])
X.head()

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,33,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
1,62,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,154,613,88,178,34,3,30,101.0,0.0,0.0,39,140.0,11.0,15.0,0.0,1.0,16.0
3,11,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,85,186,22,40,8,1,6,19.0,2.0,2.0,14,38.0,0.0,1.0,3.0,3.0,3.0


## Train Test Split

In [4]:
# NOTE(review): train_size = 0.3 fits on only 30% of the rows and holds out the
# remaining 70% for testing. If the usual 70/30 split was intended, this should
# be test_size = 0.3 — confirm before relying on the scores below.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.3, random_state=42)

## Standard Scaler

In [5]:
# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits so the test rows never inform the fit.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [6]:
# Project the scaled features onto principal components. No n_components is
# passed, so no explicit limit on the number of components is requested here.
# The projection is fitted on the training split and reused for the test split.
pca = PCA(random_state=42)
X_train_p = pca.fit_transform(X_train_sc)
X_test_p = pca.transform(X_test_sc)

In [7]:
def reg_metrics(x, y, model):
    """Print the score, MSE and RMSE of an already-fitted regressor.

    Parameters
    ----------
    x : array-like
        Feature matrix handed to ``model.predict`` and ``model.score``.
    y : array-like
        True target values aligned with the rows of ``x``.
    model : estimator
        Fitted object exposing ``predict(x)`` and ``score(x, y)``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    y_pred = model.predict(x)

    mse = metrics.mean_squared_error(y, y_pred)
    # Take the square root directly instead of passing ``squared=False``:
    # that keyword is deprecated/removed in newer scikit-learn releases,
    # while this form gives the same value on every version.
    rmse = np.sqrt(mse)
    score = model.score(x, y)

    print(model)
    print("R2: ", score)
    print("MSE: ", mse)
    print("RMSE: ", rmse)

In [8]:
def fit_models(x, y, model):
    """Train ``model`` on ``(x, y)`` and report its metrics on that same data.

    The estimator is fitted in place, so the caller's object is the trained
    model afterwards. Metrics are printed by ``reg_metrics``; nothing is
    returned.
    """
    model.fit(x, y)
    reg_metrics(x=x, y=y, model=model)

#### Linear Regression

In [9]:
# Baseline: LinearRegression with all default settings
lr = LinearRegression()

#### RidgeCV

In [10]:
# Candidate ridge penalties: 220 log-spaced values from 1e0 to 1e5
r_alphas = np.logspace(start=0, stop=5, num=220)

# 5-fold cross-validated ridge, with alpha selected on R2
ridge_cv = RidgeCV(alphas=r_alphas, cv=5, scoring='r2')

#### LassoCV

In [11]:
# Lasso penalties to search: 100 log-spaced values between 1e-3 and 1e1
l_alphas = np.logspace(start=-3, stop=1, num=100)

# 5-fold cross-validation over that grid; the raised iteration cap gives the
# solver more room to converge, and n_jobs=-1 runs the folds in parallel.
lasso_cv = LassoCV(
    alphas=l_alphas,
    cv=5,
    max_iter=50_000,
    n_jobs=-1,
)

#### ElasticNetCV

In [12]:
# Elastic-net penalties to search: 100 evenly spaced values in [0.01, 1]
e_alphas = np.linspace(start=0.01, stop=1, num=100)

# l1_ratio mixes the two penalties: 0 is a pure L2 (ridge-style) penalty and
# 1 is a pure L1 (lasso) penalty, so 0.05 leans heavily towards L2.
enet_ratio = 0.05

# 5-fold cross-validated elastic net over the alpha grid
enet_cv = ElasticNetCV(
    alphas=e_alphas,
    l1_ratio=enet_ratio,
    cv=5,
    max_iter=3000,
    n_jobs=-1,
)

#### SGD

In [13]:
# Seeded so repeated runs produce the same fit; 42 matches the seed already
# used for the train/test split and PCA in this notebook.
sgd = SGDRegressor(random_state=42)

#### KNeighbors

In [14]:
# k-nearest-neighbours regressor with library defaults
knn = KNeighborsRegressor()

#### AdaBoost

In [15]:
# Seeded so repeated runs produce the same ensemble; 42 matches the seed
# already used for the train/test split and PCA in this notebook.
ad = AdaBoostRegressor(random_state=42)

#### GradientBoost

In [16]:
# Seeded so repeated runs produce the same ensemble; 42 matches the seed
# already used for the train/test split and PCA in this notebook.
gb = GradientBoostingRegressor(random_state=42)

#### HistGradientBoosting

In [17]:
# Seeded for reproducibility, consistent with the other seeded steps (42)
# in this notebook.
hgb = HistGradientBoostingRegressor(random_state=42)

#### RandomForest

In [18]:
# Seeded so the bootstrap samples and feature draws — and therefore the
# reported scores — are the same on every run; 42 matches the seed already
# used for the train/test split and PCA in this notebook.
rf = RandomForestRegressor(random_state=42)

#### SVR

In [19]:
# Support vector regressor with library defaults
svr = SVR()

#### DecisionTree

In [20]:
# Seeded so the tree is built identically on every run; 42 matches the seed
# already used for the train/test split and PCA in this notebook.
dt = DecisionTreeRegressor(random_state=42)

In [21]:
# Every candidate regressor defined above, in reporting order
models = [
    lr, ridge_cv, lasso_cv, enet_cv,
    sgd, knn, ad, gb, hgb, rf, svr, dt,
]

# Fit each one on the PCA-transformed training data and print its training metrics
for model in models:
    fit_models(X_train_p, y_train, model)

LinearRegression()
R2:  0.3111649149805671
MSE:  0.013892357951152795
RMSE:  0.11786584726354278
RidgeCV(alphas=array([1.00000000e+00, 1.05397680e+00, 1.11086709e+00, 1.17082814e+00,
       1.23402569e+00, 1.30063444e+00, 1.37083853e+00, 1.44483200e+00,
       1.52281940e+00, 1.60501632e+00, 1.69164996e+00, 1.78295980e+00,
       1.87919826e+00, 1.98063137e+00, 2.08753951e+00, 2.20021820e+00,
       2.31897894e+00, 2.44414999e+00, 2.57607738e+00, 2.71512579e+00,
       2.86167958e+00, 3.01614388e+0...
       2.98459801e+04, 3.14569705e+04, 3.31549170e+04, 3.49445132e+04,
       3.68307062e+04, 3.88187097e+04, 4.09140193e+04, 4.31224271e+04,
       4.54500376e+04, 4.79032850e+04, 5.04889509e+04, 5.32141828e+04,
       5.60865140e+04, 5.91138844e+04, 6.23046625e+04, 6.56676687e+04,
       6.92121991e+04, 7.29480519e+04, 7.68855541e+04, 8.10355901e+04,
       8.54096317e+04, 9.00197701e+04, 9.48787490e+04, 1.00000000e+05]),
        cv=5, scoring='r2')
R2:  0.31097243207306924
MSE:  0.0138

### Test Scores for the Fitted Models

In [22]:
# Score every fitted model on the PCA-transformed test split
for model in models:
    reg_metrics(X_test_p, y_test, model)

LinearRegression()
R2:  0.3425130880394013
MSE:  0.012637098693539474
RMSE:  0.112414850858503
RidgeCV(alphas=array([1.00000000e+00, 1.05397680e+00, 1.11086709e+00, 1.17082814e+00,
       1.23402569e+00, 1.30063444e+00, 1.37083853e+00, 1.44483200e+00,
       1.52281940e+00, 1.60501632e+00, 1.69164996e+00, 1.78295980e+00,
       1.87919826e+00, 1.98063137e+00, 2.08753951e+00, 2.20021820e+00,
       2.31897894e+00, 2.44414999e+00, 2.57607738e+00, 2.71512579e+00,
       2.86167958e+00, 3.01614388e+0...
       2.98459801e+04, 3.14569705e+04, 3.31549170e+04, 3.49445132e+04,
       3.68307062e+04, 3.88187097e+04, 4.09140193e+04, 4.31224271e+04,
       4.54500376e+04, 4.79032850e+04, 5.04889509e+04, 5.32141828e+04,
       5.60865140e+04, 5.91138844e+04, 6.23046625e+04, 6.56676687e+04,
       6.92121991e+04, 7.29480519e+04, 7.68855541e+04, 8.10355901e+04,
       8.54096317e+04, 9.00197701e+04, 9.48787490e+04, 1.00000000e+05]),
        cv=5, scoring='r2')
R2:  0.34221197292108463
MSE:  0.012642

Random Forest Regressor is the best model, with a test R2 of 0.996 and a train R2 of 0.999. The small gap between the train and test scores indicates a good balance between bias and variance. The test RMSE is 0.008, meaning that predictions of batting average are typically off by about 0.008.

## Create X and y variables for Strike Outs

In [76]:
# Target is now strikeouts; the features are every other column except the
# player/year/team/league identifiers.
y = df['SO']
X = df.drop(columns=['SO', 'playerID', 'yearID', 'teamID', 'lgID'])
X.head()

Unnamed: 0,AB,R,H,HR,RBI,BB,BA
0,0,0,0,0,0.0,0,0.0
1,1,0,0,0,0.0,0,0.0
2,573,88,146,20,78.0,87,0.255
3,193,16,45,1,13.0,4,0.233
4,0,0,0,0,0.0,0,0.0


In [77]:
# NOTE(review): train_size = 0.3 fits on only 30% of the rows and holds out the
# remaining 70% for testing. If the usual 70/30 split was intended, this should
# be test_size = 0.3 — confirm before relying on the scores below.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.3, random_state=42)

In [78]:
# Fresh scaler for the strikeout features: statistics come from the training
# rows only and are then applied unchanged to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [91]:
# These are the same estimator objects used for batting average; fitting them
# again replaces what each one learned with a strikeout fit.
models = [
    lr, ridge_cv, lasso_cv, enet_cv,
    sgd, knn, ad, gb, hgb, rf, svr, dt,
]

# Fit on the scaled features directly (no PCA step here) and print training metrics
for model in models:
    fit_models(X_train_sc, y_train, model)

LinearRegression()
R2:  0.9246755450881237
MSE:  109.87387174256283
RMSE:  10.4820738283301
RidgeCV(alphas=array([1.00000000e+00, 1.05397680e+00, 1.11086709e+00, 1.17082814e+00,
       1.23402569e+00, 1.30063444e+00, 1.37083853e+00, 1.44483200e+00,
       1.52281940e+00, 1.60501632e+00, 1.69164996e+00, 1.78295980e+00,
       1.87919826e+00, 1.98063137e+00, 2.08753951e+00, 2.20021820e+00,
       2.31897894e+00, 2.44414999e+00, 2.57607738e+00, 2.71512579e+00,
       2.86167958e+00, 3.01614388e+0...
       2.98459801e+04, 3.14569705e+04, 3.31549170e+04, 3.49445132e+04,
       3.68307062e+04, 3.88187097e+04, 4.09140193e+04, 4.31224271e+04,
       4.54500376e+04, 4.79032850e+04, 5.04889509e+04, 5.32141828e+04,
       5.60865140e+04, 5.91138844e+04, 6.23046625e+04, 6.56676687e+04,
       6.92121991e+04, 7.29480519e+04, 7.68855541e+04, 8.10355901e+04,
       8.54096317e+04, 9.00197701e+04, 9.48787490e+04, 1.00000000e+05]),
        cv=5, scoring='r2')
R2:  0.9246484732540833
MSE:  109.91336074

In [92]:
# Models with R2 above 0.90 (sgd and svr are left out)
best_models = [
    lr, ridge_cv, lasso_cv, enet_cv,
    knn, ad, gb, hgb, rf, dt,
]

# Report how each of them scores on the scaled test split
for model in best_models:
    reg_metrics(X_test_sc, y_test, model)

LinearRegression()
R2:  0.9260277998688496
MSE:  107.5311995018297
RMSE:  10.36972514109365
RidgeCV(alphas=array([1.00000000e+00, 1.05397680e+00, 1.11086709e+00, 1.17082814e+00,
       1.23402569e+00, 1.30063444e+00, 1.37083853e+00, 1.44483200e+00,
       1.52281940e+00, 1.60501632e+00, 1.69164996e+00, 1.78295980e+00,
       1.87919826e+00, 1.98063137e+00, 2.08753951e+00, 2.20021820e+00,
       2.31897894e+00, 2.44414999e+00, 2.57607738e+00, 2.71512579e+00,
       2.86167958e+00, 3.01614388e+0...
       2.98459801e+04, 3.14569705e+04, 3.31549170e+04, 3.49445132e+04,
       3.68307062e+04, 3.88187097e+04, 4.09140193e+04, 4.31224271e+04,
       4.54500376e+04, 4.79032850e+04, 5.04889509e+04, 5.32141828e+04,
       5.60865140e+04, 5.91138844e+04, 6.23046625e+04, 6.56676687e+04,
       6.92121991e+04, 7.29480519e+04, 7.68855541e+04, 8.10355901e+04,
       8.54096317e+04, 9.00197701e+04, 9.48787490e+04, 1.00000000e+05]),
        cv=5, scoring='r2')
R2:  0.9261053041827414
MSE:  107.41853377