In [10]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [11]:
df = pd.read_csv("Hitters.csv")


In [12]:
print("Initial shape:", df.shape)
print(df.isnull().sum())

# Drop rows with missing Salary (only variable with NAs)
df = df.dropna(subset=["Salary"])
print("After drop:", df.shape)

# Reset index
df = df.reset_index(drop=True)

Initial shape: (322, 20)
AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64
After drop: (263, 20)


In [13]:
X = df.drop(columns=["Salary"])
y = df["Salary"]

In [14]:

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=np.number).columns.tolist()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [16]:
# Scale numeric features, one-hot encode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first"), cat_cols)
    ],
    remainder="passthrough"  # keep any unexpected columns
)


In [17]:
def cv_mse(pipeline, Xtr, ytr, cv=5):
    scores = cross_val_score(pipeline, Xtr, ytr, cv=KFold(cv, shuffle=True, random_state=42),
                             scoring="neg_mean_squared_error", n_jobs=-1)
    return -scores.mean(), scores.std()

def evaluate_full(model, X_train, X_test, y_train, y_test, name="model"):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Model": name,
        "R2_test": r2_score(y_test, y_pred),
        "RMSE_test": mean_squared_error(y_test, y_pred),
        "MAE_test": mean_absolute_error(y_test, y_pred)
    }

# Part I: Different Model Specs


### A. Regression without regularization
Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression

Fit this pipeline to the full dataset, and interpret a few of the most important coefficients.

Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [18]:
ols_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

ols_cv_mse_mean, ols_cv_mse_sd = cv_mse(ols_pipe, X_train, y_train, cv=5)
ols_results = evaluate_full(ols_pipe, X_train, X_test, y_train, y_test, "OLS")
print(f"[OLS] CV MSE: {ols_cv_mse_mean} (± {ols_cv_mse_sd})")
print(ols_results)

[OLS] CV MSE: 148524.4273448546 (± 88857.66290489696)
{'Model': 'OLS', 'R2_test': 0.4110795465741335, 'RMSE_test': 127487.05257542129, 'MAE_test': 216.23960331813112}


In [19]:
ols_pipe.fit(X_train, y_train)
# names after preprocessing
num_names = num_cols
cat_names = []
if len(cat_cols) > 0:
    cat_names = list(
        ols_pipe.named_steps["preprocessor"]
        .named_transformers_["cat"]
        .get_feature_names_out(cat_cols)
    )

In [20]:
feat_names_all = num_names + cat_names + (X_train.columns.difference(num_cols + cat_cols).tolist() if False else [])
coefs_ols = ols_pipe.named_steps["model"].coef_
coef_df_ols = pd.DataFrame({"Feature": feat_names_all, "Coefficient": coefs_ols})
print("\nTop OLS coefficients by |magnitude|:")
print(coef_df_ols.reindex(coef_df_ols.Coefficient.abs().sort_values(ascending=False).index).head(10))



Top OLS coefficients by |magnitude|:
       Feature  Coefficient
10       CRuns   602.973835
1         Hits   379.226590
7       CAtBat  -318.631805
0        AtBat  -252.645370
11        CRBI   238.251038
12      CWalks  -179.250015
8        CHits  -136.216934
5        Walks   113.427805
16    League_N   113.414763
17  Division_W  -100.028262


### B. Ridge regression
Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

Use cross-validation to tune the 
 hyperparameter.

Fit the pipeline with your chosen 
 to the full dataset, and interpret a few of the most important coefficients.

Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [26]:
ridge_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Ridge(random_state=42))
])

ridge_grid = {"model__alpha": np.logspace(-3, 3, 25)}
ridge_cv = GridSearchCV(
    ridge_pipe, ridge_grid,
    cv=KFold(5, shuffle=True, random_state=42),
    scoring="neg_mean_squared_error", n_jobs=-1
)

In [27]:
ridge_cv.fit(X_train, y_train)
ridge_best = ridge_cv.best_estimator_
ridge_cv_mse = -ridge_cv.best_score_
ridge_results = evaluate_full(ridge_best, X_train, X_test, y_train, y_test,
                              f"Ridge (alpha={ridge_cv.best_params_['model__alpha']})")

In [33]:
print(f"\n[Ridge] best alpha = {ridge_cv.best_params_['model__alpha']}")
print(f"[Ridge] CV MSE: {ridge_cv_mse}")
print(ridge_results)


[Ridge] best alpha = 177.82794100389228
[Ridge] CV MSE: 116116.53242454273
{'Model': 'Ridge (alpha=177.82794100389228)', 'R2_test': 0.29735824210969497, 'RMSE_test': 152104.96801181967, 'MAE_test': 235.1071060065175}


### C. Lasso Regression
Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

Use cross-validation to tune the 
 hyperparameter.

Fit the pipeline with your chosen 
 to the full dataset, and interpret a few of the most important coefficients.

Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [38]:
lasso_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Lasso(max_iter=10000, random_state=42))
])

lasso_grid = {"model__alpha": np.logspace(-3, 1, 25)}
lasso_cv = GridSearchCV(
    lasso_pipe, lasso_grid,
    cv=KFold(5, shuffle=True, random_state=42),
    scoring="neg_mean_squared_error", n_jobs=-1
)

In [39]:
lasso_cv.fit(X_train, y_train)
lasso_best = lasso_cv.best_estimator_
lasso_cv_mse = -lasso_cv.best_score_
lasso_results = evaluate_full(lasso_best, X_train, X_test, y_train, y_test,
                              f"LASSO (alpha={lasso_cv.best_params_['model__alpha']})")

In [40]:
print(f"\n[LASSO] best alpha = {lasso_cv.best_params_['model__alpha']}")
print(f"[LASSO] CV MSE: {lasso_cv_mse}")
print(lasso_results)


[LASSO] best alpha = 10.0
[LASSO] CV MSE: 124277.08593290819
{'Model': 'LASSO (alpha=10.0)', 'R2_test': 0.3264794378430288, 'RMSE_test': 145800.93256880206, 'MAE_test': 227.4795114246595}


### D. Elastic Net
Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression

Use cross-validation to tune the 
 and 
 hyperparameters.

Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.

Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [44]:
enet_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", ElasticNet(max_iter=10000, random_state=42))
])

enet_grid = {
    "model__alpha": np.logspace(-3, 1, 12),
    "model__l1_ratio": np.linspace(0.1, 0.9, 9)
}
enet_cv = GridSearchCV(
    enet_pipe, enet_grid,
    cv=KFold(5, shuffle=True, random_state=42),
    scoring="neg_mean_squared_error", n_jobs=-1
)

In [45]:
enet_cv.fit(X_train, y_train)
enet_best = enet_cv.best_estimator_
enet_cv_mse = -enet_cv.best_score_
enet_results = evaluate_full(enet_best, X_train, X_test, y_train, y_test,
                             f"ElasticNet (alpha={enet_cv.best_params_['model__alpha']}, l1={enet_cv.best_params_['model__l1_ratio']})")

In [50]:
print(f"\n[ElasticNet] best params = {enet_cv.best_params_}")
print(f"[ElasticNet] CV MSE: {enet_cv_mse}")
print(enet_results)


[ElasticNet] best params = {'model__alpha': np.float64(1.873817422860385), 'model__l1_ratio': np.float64(0.4)}
[ElasticNet] CV MSE: 116214.42321967045
{'Model': 'ElasticNet (alpha=1.873817422860385, l1=0.4)', 'R2_test': 0.29578743958229825, 'RMSE_test': 152445.0088156853, 'MAE_test': 235.97705384223224}


In [51]:
# Collect comparison
model_comp = pd.DataFrame([ols_results, ridge_results, lasso_results, enet_results])
model_comp["CV_MSE"] = [ols_cv_mse_mean, ridge_cv_mse, lasso_cv_mse, enet_cv_mse]
print("\nModel comparison:")
print(model_comp.sort_values("CV_MSE"))


Model comparison:
                                          Model   R2_test      RMSE_test  \
1              Ridge (alpha=177.82794100389228)  0.297358  152104.968012   
3  ElasticNet (alpha=1.873817422860385, l1=0.4)  0.295787  152445.008816   
2                            LASSO (alpha=10.0)  0.326479  145800.932569   
0                                           OLS  0.411080  127487.052575   

     MAE_test         CV_MSE  
1  235.107106  116116.532425  
3  235.977054  116214.423220  
2  227.479511  124277.085933  
0  216.239603  148524.427345  


# Part II. Variable Selection
Based on the above results, decide on:

Which numeric variable is most important.

Which five numeric variables are most important

Which categorical variable is most important

For each of the four model specifications, compare the following possible feature sets:

Using only the one best numeric variable.

Using only the five best variables.

Using the five best numeric variables and their interactions with the one best categorical variable.

Report which combination of features and model performed best, based on the validation metric of MSE.

(Note: 
 and 
 must be re-tuned for each feature set.)

In [52]:
# code

# Part III. Discussion


### A. Ridge


### B. LASSO


### C. Elastic Net


# Part IV: Final Model
Fit your final best pipeline on the full dataset, and summarize your results in a few short sentences and a plot.

In [53]:
# code