In [181]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import cross_val_score
pd.set_option('display.float_format', '{:.4f}'.format)
np.set_printoptions(suppress=True, precision=4)

In [182]:
players = pd.read_csv("hitters.csv")
players = players.dropna()
players.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


Part I: Different Model Specs

A: Regression without Regularization

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression

In [183]:
X = players.drop("Salary", axis=1)
y = players[['Salary']]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25)

In [184]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

OLSA1 = Pipeline(
  [("preprocessing", ct),
  ("OLS", LinearRegression())]
)
ct.fit_transform(X)

Unnamed: 0,dummify__League_A,dummify__League_N,dummify__Division_E,dummify__Division_W,dummify__NewLeague_A,dummify__NewLeague_N,standardize__AtBat,standardize__Hits,standardize__HmRun,standardize__Runs,...,standardize__Years,standardize__CAtBat,standardize__CHits,standardize__CHmRun,standardize__CRuns,standardize__CRBI,standardize__CWalks,standardize__PutOuts,standardize__Assists,standardize__Errors
1,0.0000,1.0000,0.0000,1.0000,0.0000,1.0000,-0.6029,-0.5957,-0.5286,-1.2061,...,1.3979,0.3468,0.1744,-0.0029,-0.1217,0.2590,0.4353,1.2215,-0.5232,0.2134
2,1.0000,0.0000,0.0000,1.0000,1.0000,0.0000,0.5125,0.4923,0.7300,0.4415,...,-0.9012,-0.4529,-0.4099,-0.0761,-0.4151,-0.1996,0.0104,2.1091,-0.2539,0.8200
3,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000,0.6282,0.7365,0.9588,0.4023,...,0.7709,1.3016,1.3182,1.8986,1.4121,1.5727,0.3557,-0.3247,-0.7442,-0.8482
4,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000,-0.5621,-0.4625,-0.1853,-0.6177,...,-1.1102,-0.9909,-0.9602,-0.6977,-0.9475,-0.8812,-0.8623,1.8407,-0.5439,-0.6966
5,1.0000,0.0000,0.0000,1.0000,1.0000,0.0000,1.2947,1.3582,-0.8718,0.7553,...,0.7709,0.7670,0.6350,-0.6124,0.4228,0.0173,-0.2514,-0.0312,2.0872,2.4881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000,0.6350,0.4257,-0.5286,0.4023,...,-0.4832,0.0199,0.1295,-0.4539,0.0538,-0.0602,-0.4639,0.1227,-0.7580,-0.8482
318,1.0000,0.0000,1.0000,0.0000,1.0000,0.0000,0.6010,0.6255,-0.7574,0.8338,...,0.9799,1.2503,1.2193,-0.3686,1.6208,0.3736,2.3325,0.0798,1.8110,1.7299
319,1.0000,0.0000,0.0000,1.0000,1.0000,0.0000,0.4853,0.4034,-0.9862,0.2454,...,-0.2742,-0.4196,-0.4470,-0.7586,-0.4363,-0.7356,-0.4336,-0.9081,-0.0398,-0.2416
320,1.0000,0.0000,1.0000,0.0000,1.0000,0.0000,1.1519,0.8031,-0.2997,1.1869,...,0.1438,0.2368,0.2084,0.3384,0.3291,0.2776,0.2722,3.6624,0.0845,0.5167


2. Fit this pipeline to the full dataset, and interpret a few of the most important coefficients.



In [185]:
fittedA1 = OLSA1.fit(X,y)

In [186]:
fittedA1.named_steps['OLS'].coef_

array([[ -31.2997,   31.2997,   58.4246,  -58.4246,   12.3812,  -12.3812,
        -291.0946,  337.8305,   37.8538,  -60.5725,  -26.995 ,  135.0739,
         -16.6934, -391.0387,   86.6876,  -14.1817,  480.7471,  260.6899,
        -213.8923,   78.7613,   53.7325,  -22.1609]])

3. Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [187]:
cvA1 = cross_val_score(fittedA1, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(cvA1).mean()

121136.31031816883

B: Ridge Regression

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

In [188]:
from sklearn.model_selection import GridSearchCV
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ridgeB1 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge())]
)

2. Use cross-validation to tune the 
 hyperparameter.

In [189]:
lambdas = {'Ridge__alpha': [.001,.01,.1,1,10,100,1000]}

gscv1 = GridSearchCV(ridgeB1, lambdas, cv = 5, scoring='r2')

In [190]:
gscvfit = gscv1.fit(X,y)

In [191]:
pd.DataFrame(data = {"lambdas": [.001,.01,.1,1,10,100,1000], "scores": gscvfit.cv_results_['mean_test_score']})

Unnamed: 0,lambdas,scores
0,0.001,0.3436
1,0.01,0.3441
2,0.1,0.3477
3,1.0,0.3558
4,10.0,0.3683
5,100.0,0.385
6,1000.0,0.3329


3. Fit the pipeline with your chosen lambda
 to the full dataset, and interpret a few of the most important coefficients.

In [192]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ridgeB3 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge(100))]
)

In [193]:
ridgeB3fit = ridgeB3.fit(X,y)
ridgeB3fit.named_steps["Ridge"].coef_

array([[-11.0518,  11.0518,  38.0232, -38.0232,  -4.0916,   4.0916,
         -0.5674,  49.6124,  -1.4642,  29.3433,  22.958 ,  41.3846,
         -2.7083,  24.7058,  44.5343,  38.6853,  45.5076,  47.1456,
          4.0364,  56.8815,   7.4572, -13.3824]])

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [194]:
cvB3 = cross_val_score(ridgeB3fit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(cvB3).mean()

120716.43558937623

C. Lasso Regression

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

In [195]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

lassoC1 = Pipeline(
  [("preprocessing", ct),
  ("Lasso", Lasso())]
)

2. Use cross-validation to tune the 
 hyperparameter.

In [196]:
lasso = {'Lasso__alpha': [.001,.01,.1,1,10,100,1000]}

gscvLassoC = GridSearchCV(lassoC1, lasso, cv = 5, scoring='r2')

In [197]:
gscvLassoCfit = gscvLassoC.fit(X,y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [198]:
pd.DataFrame(data = {"lasso_alpha": [.001,.01,.1,1,10,100,1000], "scores": gscvLassoCfit.cv_results_['mean_test_score']})

Unnamed: 0,lasso_alpha,scores
0,0.001,0.3442
1,0.01,0.3444
2,0.1,0.346
3,1.0,0.3542
4,10.0,0.3695
5,100.0,0.2984
6,1000.0,-0.0432


3. Fit the pipeline with your chosen 
 to the full dataset, and interpret a few of the most important coefficients.

In [199]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

lassoC1 = Pipeline(
  [("preprocessing", ct),
  ("Lasso", Lasso(10))]
)

In [200]:
C1fit = lassoC1.fit(X,y)
C1fit.named_steps['Lasso'].coef_

array([ -0.    ,   0.    ,  95.4132,  -0.    ,  -0.    ,   0.    ,
        -0.    ,  88.7416,   0.    ,   0.    ,   0.    ,  49.9028,
        -0.    ,   0.    ,   0.    ,   0.    ,  72.2275, 134.032 ,
        -0.    ,  66.737 ,   0.    ,  -4.1583])

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [201]:
cvC3 = cross_val_score(C1fit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(cvB3).mean()

120716.43558937623

D. Elastic Net

1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

In [202]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

EN_D = Pipeline(
  [("preprocessing", ct),
  ("Elastic_Net", ElasticNet())]
)
ct.fit_transform(X)

Unnamed: 0,dummify__League_A,dummify__League_N,dummify__Division_E,dummify__Division_W,dummify__NewLeague_A,dummify__NewLeague_N,standardize__AtBat,standardize__Hits,standardize__HmRun,standardize__Runs,...,standardize__Years,standardize__CAtBat,standardize__CHits,standardize__CHmRun,standardize__CRuns,standardize__CRBI,standardize__CWalks,standardize__PutOuts,standardize__Assists,standardize__Errors
1,0.0000,1.0000,0.0000,1.0000,0.0000,1.0000,-0.6029,-0.5957,-0.5286,-1.2061,...,1.3979,0.3468,0.1744,-0.0029,-0.1217,0.2590,0.4353,1.2215,-0.5232,0.2134
2,1.0000,0.0000,0.0000,1.0000,1.0000,0.0000,0.5125,0.4923,0.7300,0.4415,...,-0.9012,-0.4529,-0.4099,-0.0761,-0.4151,-0.1996,0.0104,2.1091,-0.2539,0.8200
3,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000,0.6282,0.7365,0.9588,0.4023,...,0.7709,1.3016,1.3182,1.8986,1.4121,1.5727,0.3557,-0.3247,-0.7442,-0.8482
4,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000,-0.5621,-0.4625,-0.1853,-0.6177,...,-1.1102,-0.9909,-0.9602,-0.6977,-0.9475,-0.8812,-0.8623,1.8407,-0.5439,-0.6966
5,1.0000,0.0000,0.0000,1.0000,1.0000,0.0000,1.2947,1.3582,-0.8718,0.7553,...,0.7709,0.7670,0.6350,-0.6124,0.4228,0.0173,-0.2514,-0.0312,2.0872,2.4881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000,0.6350,0.4257,-0.5286,0.4023,...,-0.4832,0.0199,0.1295,-0.4539,0.0538,-0.0602,-0.4639,0.1227,-0.7580,-0.8482
318,1.0000,0.0000,1.0000,0.0000,1.0000,0.0000,0.6010,0.6255,-0.7574,0.8338,...,0.9799,1.2503,1.2193,-0.3686,1.6208,0.3736,2.3325,0.0798,1.8110,1.7299
319,1.0000,0.0000,0.0000,1.0000,1.0000,0.0000,0.4853,0.4034,-0.9862,0.2454,...,-0.2742,-0.4196,-0.4470,-0.7586,-0.4363,-0.7356,-0.4336,-0.9081,-0.0398,-0.2416
320,1.0000,0.0000,1.0000,0.0000,1.0000,0.0000,1.1519,0.8031,-0.2997,1.1869,...,0.1438,0.2368,0.2084,0.3384,0.3291,0.2776,0.2722,3.6624,0.0845,0.5167


2. Use cross-validation to tune the 
 and 
 hyperparameters.



In [203]:
EN = {'Elastic_Net__alpha': [.001,.01,.1,1,10,100,1000], "Elastic_Net__l1_ratio": [.001,.01,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]}

gscvEN = GridSearchCV(EN_D, EN, cv = 5, scoring='r2')

In [204]:
gscvEnfit = gscvEN.fit(X,y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [205]:
gscvEnfit.best_params_

{'Elastic_Net__alpha': 1, 'Elastic_Net__l1_ratio': 0.2}

3. Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.

In [206]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

EN_D = Pipeline(
  [("preprocessing", ct),
  ("Elastic_Net", ElasticNet(alpha = 1, l1_ratio=.2))]
)

In [207]:
EnDfit = EN_D.fit(X,y)
EnDfit.named_steps['Elastic_Net'].coef_

array([ -7.2486,   7.2486,  26.104 , -26.104 ,  -4.1165,   4.1165,
        12.1794,  37.4507,   5.61  ,  27.011 ,  22.9832,  34.5686,
         7.6342,  25.8674,  36.1248,  32.3124,  36.9431,  37.8805,
        15.3558,  44.8351,   3.8366,  -8.0362])

4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [208]:
END3 = cross_val_score(EnDfit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(END3).mean()

121500.81646251371

Part II: Variable Selection

In [209]:
Features = pd.DataFrame({'Feature': ct.fit_transform(X).columns,
                         "OLS_coefs": abs(fittedA1.named_steps['OLS'].coef_)[0],
                         "Ridge_coefs": abs(ridgeB3fit.named_steps["Ridge"].coef_)[0],
                         "Lasso_coefs": abs(C1fit.named_steps['Lasso'].coef_), 
                         "EN_coefs": abs(EnDfit.named_steps['Elastic_Net'].coef_)
                        })

In [210]:
Features

Unnamed: 0,Feature,OLS_coefs,Ridge_coefs,Lasso_coefs,EN_coefs
0,dummify__League_A,31.2997,11.0518,0.0,7.2486
1,dummify__League_N,31.2997,11.0518,0.0,7.2486
2,dummify__Division_E,58.4246,38.0232,95.4132,26.104
3,dummify__Division_W,58.4246,38.0232,0.0,26.104
4,dummify__NewLeague_A,12.3812,4.0916,0.0,4.1165
5,dummify__NewLeague_N,12.3812,4.0916,0.0,4.1165
6,standardize__AtBat,291.0946,0.5674,0.0,12.1794
7,standardize__Hits,337.8305,49.6124,88.7416,37.4507
8,standardize__HmRun,37.8538,1.4642,0.0,5.61
9,standardize__Runs,60.5725,29.3433,0.0,27.011


The numeric variable that is the most important is Career RBIs as it has the highest magnitude Lasso coefficient and the second highest Elastic Net coefficient.

The five numeric variables that are the most important are, Career Runs, Hits, Putouts, Career RBIs, and walks.

The categorical variable that is the most important is the Division variable.

OLS:

In [211]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ['CRBI'])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

OLSnum1 = Pipeline(
  [("preprocessing", ct),
  ("OLS", LinearRegression())]
)
OLSnum1fit = OLSnum1.fit(X,y)
OLSnum11 = cross_val_score(OLSnum1fit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(OLSnum11).mean()

142142.865462403

In [212]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

OLSnum5 = Pipeline(
  [("preprocessing", ct),
  ("OLS", LinearRegression())]
)
OLSnum5fit = OLSnum5.fit(X,y)
OLSnum55 = cross_val_score(OLSnum5fit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(OLSnum55).mean()

121332.85377811702

In [234]:
ct1 = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    ["Division"]),
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])   
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct2 = ColumnTransformer(
  [
    ("keep", FunctionTransformer(lambda x: x), ['standardize__CRBI', 'standardize__Hits', 'standardize__CRuns', 'standardize__PutOuts', 'standardize__Walks']),
      ("Interaction", PolynomialFeatures(interaction_only=True), ["standardize__CRBI", "dummify__Division_E"]),
    ("Interaction2", PolynomialFeatures(interaction_only=True), ["standardize__Hits", "dummify__Division_E"]),
      ("Interaction3", PolynomialFeatures(interaction_only=True), ["standardize__CRuns", "dummify__Division_E"]),
      ("Interaction4", PolynomialFeatures(interaction_only=True), ["standardize__PutOuts", "dummify__Division_E"]),
      ("Interaction5", PolynomialFeatures(interaction_only=True), ["standardize__Walks", "dummify__Division_E"])
    
  ],
  remainder = "drop"
).set_output(transform = "pandas")


OLSint = Pipeline(
  [("preprocessing", ct1),
   ("preprocessing2", ct2),
  ("OLS", LinearRegression())]
)



In [235]:
OLSintfit = OLSint.fit(X,y)
OLSint3 = cross_val_score(OLSintfit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(OLSint3).mean()

130401.97016858228

Ridge Regression:

In [223]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ["CRBI"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ridgenum1 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge())]
)

In [224]:
lambdas = {'Ridge__alpha': [.001,.01,.1,1,10,100,1000]}

gscvRR1 = GridSearchCV(ridgenum1, lambdas, cv = 5, scoring='r2')

In [225]:
gscvRR1fit = gscvRR1.fit(X,y)

pd.DataFrame(data = {"lambdas": [.001,.01,.1,1,10,100,1000], "scores": gscvRR1fit.cv_results_['mean_test_score']})

Unnamed: 0,lambdas,scores
0,0.001,0.2814
1,0.01,0.2814
2,0.1,0.2814
3,1.0,0.2817
4,10.0,0.283
5,100.0,0.2592
6,1000.0,0.0674


In [226]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ["CRBI"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ridgenum1 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge(10))]
)

In [227]:
RRfit = ridgenum1.fit(X,y)
RR1 = cross_val_score(RRfit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(RR1).mean()

142096.24206102983

In [229]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ridgenum5 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge())]
)

In [230]:
lambdas = {'Ridge__alpha': [.001,.01,.1,1,10,100,1000]}

gscvRR5 = GridSearchCV(ridgenum5, lambdas, cv = 5, scoring='r2')

In [231]:
gscvRR5fit = gscvRR5.fit(X,y)

pd.DataFrame(data = {"lambdas": [.001,.01,.1,1,10,100,1000], "scores": gscvRR5fit.cv_results_['mean_test_score']})

Unnamed: 0,lambdas,scores
0,0.001,0.3772
1,0.01,0.3772
2,0.1,0.3774
3,1.0,0.3786
4,10.0,0.3859
5,100.0,0.4001
6,1000.0,0.2321


In [232]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

RR5 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge(100))]
)

In [233]:
RR5fit = RR5.fit(X,y)
RR55 = cross_val_score(RR5fit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(RR55).mean()

119398.40011345323

In [236]:
ct1 = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    ["Division"]),
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])   
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct2 = ColumnTransformer(
  [
    ("keep", FunctionTransformer(lambda x: x), ['standardize__CRBI', 'standardize__Hits', 'standardize__CRuns', 'standardize__PutOuts', 'standardize__Walks']),
      ("Interaction", PolynomialFeatures(interaction_only=True), ["standardize__CRBI", "dummify__Division_E"]),
    ("Interaction2", PolynomialFeatures(interaction_only=True), ["standardize__Hits", "dummify__Division_E"]),
      ("Interaction3", PolynomialFeatures(interaction_only=True), ["standardize__CRuns", "dummify__Division_E"]),
      ("Interaction4", PolynomialFeatures(interaction_only=True), ["standardize__PutOuts", "dummify__Division_E"]),
      ("Interaction5", PolynomialFeatures(interaction_only=True), ["standardize__Walks", "dummify__Division_E"])
    
  ],
  remainder = "drop"
).set_output(transform = "pandas")


Ridgeint = Pipeline(
  [("preprocessing", ct1),
   ("preprocessing2", ct2),
  ("Ridge", Ridge())]
)



In [237]:
lambdas = {'Ridge__alpha': [.001,.01,.1,1,10,100,1000]}

gscvRidgeint = GridSearchCV(ridgenum5, lambdas, cv = 5, scoring='r2')

In [238]:
gscvRRintfit = gscvRidgeint.fit(X,y)

pd.DataFrame(data = {"lambdas": [.001,.01,.1,1,10,100,1000], "scores": gscvRRintfit.cv_results_['mean_test_score']})

Unnamed: 0,lambdas,scores
0,0.001,0.3772
1,0.01,0.3772
2,0.1,0.3774
3,1.0,0.3786
4,10.0,0.3859
5,100.0,0.4001
6,1000.0,0.2321


In [239]:
ct1 = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    ["Division"]),
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])   
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct2 = ColumnTransformer(
  [
    ("keep", FunctionTransformer(lambda x: x), ['standardize__CRBI', 'standardize__Hits', 'standardize__CRuns', 'standardize__PutOuts', 'standardize__Walks']),
      ("Interaction", PolynomialFeatures(interaction_only=True), ["standardize__CRBI", "dummify__Division_E"]),
    ("Interaction2", PolynomialFeatures(interaction_only=True), ["standardize__Hits", "dummify__Division_E"]),
      ("Interaction3", PolynomialFeatures(interaction_only=True), ["standardize__CRuns", "dummify__Division_E"]),
      ("Interaction4", PolynomialFeatures(interaction_only=True), ["standardize__PutOuts", "dummify__Division_E"]),
      ("Interaction5", PolynomialFeatures(interaction_only=True), ["standardize__Walks", "dummify__Division_E"])
    
  ],
  remainder = "drop"
).set_output(transform = "pandas")


Ridgeint1 = Pipeline(
  [("preprocessing", ct1),
   ("preprocessing2", ct2),
  ("Ridge", Ridge(100))]
)



In [240]:
RRintfit1 = Ridgeint1.fit(X,y)
RRint1 = cross_val_score(RRintfit1, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(RRint1).mean()

114045.37001739668

Lasso Regression:

In [241]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ['CRBI'])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

Lassonum1 = Pipeline(
  [("preprocessing", ct),
  ("OLS", LinearRegression())]
)
Lassonum1fit = Lassonum1.fit(X,y)
Lasso11 = cross_val_score(Lassonum1fit, X, y, cv=5, scoring = 'neg_mean_squared_error')
abs(Lasso11).mean()

142142.865462403

In [None]:
ct = ColumnTransformer(
  [
    ("standardize",
    StandardScaler(),
    ['CRBI', 'Hits', 'CRuns', 'PutOuts', 'Walks'])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ridgenum5 = Pipeline(
  [("preprocessing", ct),
  ("Ridge", Ridge())]
)