In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Load the data
# Abalone dataset, read straight from the course repository over HTTPS.
url = "https://raw.githubusercontent.com/MaRo406/eds-232-machine-learning/main/data/abalone-data.csv"
abdat = pd.read_csv(url)

# Explore the data
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called bare; wrapping it in print() would emit a stray "None" line.
abdat.info()
print(abdat.head())


Data Splitting
Question 1: Split the data into training and test sets using a 70/30 split.

In [None]:
# Outcome: the "Rings" column
y = abdat["Rings"]

# Predictors: every remaining column, with categorical variables expanded
# into dummy indicator columns (first level dropped)
X = pd.get_dummies(abdat.drop(columns="Rings"), drop_first=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


Ridge Regression
Question 2 & 3: Fit a ridge regression model and visualize how coefficients change with lambda.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

# Standardize the predictors. The scaler is fitted on the training split only
# and then reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate penalties (scikit-learn's `alpha` plays the role of lambda)
alphas = np.logspace(-4, 4, 100)

# Pick alpha by 10-fold cross-validation.
# NOTE: RidgeCV raises ValueError when stored per-sample CV values are
# requested together with an explicit `cv`, so they are not requested here.
# Scoring is set to negative MSE so the selected alpha minimizes CV MSE.
ridge = RidgeCV(alphas=alphas, cv=10, scoring="neg_mean_squared_error")
ridge.fit(X_train_scaled, y_train)

# Coefficient path: refit a plain Ridge at every candidate alpha and collect
# the coefficients. Result has one row per alpha and one column per predictor.
ridge_coef_path = np.array(
    [Ridge(alpha=a).fit(X_train_scaled, y_train).coef_ for a in alphas]
)

# Coefficients plot: one line per predictor, showing shrinkage as lambda grows
plt.figure(figsize=(10, 6))
for feature_name, coef_curve in zip(X_train.columns, ridge_coef_path.T):
    plt.plot(alphas, coef_curve, label=feature_name)
plt.axvline(ridge.alpha_, color="k", linestyle="--", label="CV-selected lambda")
plt.xscale("log")
plt.xlabel("Lambda (Alpha)")
plt.ylabel("Standardized Coefficient")
plt.title("Ridge Regression Coefficients vs. Lambda")
plt.legend()
plt.show()


Lasso Regression and Cross Validation
Question 4 & 5: Fit lasso regression using LassoCV and interpret the results.

In [None]:
from sklearn.linear_model import LassoCV

# Tune the lasso penalty with 10-fold cross-validation over the `alphas` grid
lasso = LassoCV(alphas=alphas, cv=10, random_state=42).fit(X_train_scaled, y_train)

# Average the per-fold errors so there is one CV MSE per candidate penalty
lasso_log_alphas = np.log10(lasso.alphas_)
lasso_mean_cv_mse = lasso.mse_path_.mean(axis=1)

# Draw the CV error curve against log10(lambda)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(lasso_log_alphas, lasso_mean_cv_mse, label="CV Error")
ax.set_xlabel("Log(Lambda)")
ax.set_ylabel("Cross-Validated MSE")
ax.set_title("Lasso Regression CV Results")
ax.legend()
plt.show()


Tuning and Model Comparison
Question 6 & 7: Find the minimum MSE and associated lambda for both Ridge and Lasso.

In [None]:
# Ridge results
# Report the mean 10-fold CV MSE at the alpha that RidgeCV selected. This is
# computed directly with cross_val_score because RidgeCV does not keep
# per-alpha CV errors when an explicit `cv` is used. Taking .min() over raw
# per-sample or per-fold errors would report the single luckiest value, not
# the cross-validated MSE.
ridge_best_alpha = ridge.alpha_
ridge_min_mse = -cross_val_score(
    Ridge(alpha=ridge_best_alpha),
    X_train_scaled,
    y_train,
    cv=10,
    scoring="neg_mean_squared_error",
).mean()

# Lasso results
# mse_path_ holds one row per alpha and one column per fold: average across
# folds first, then take the minimum of that curve. LassoCV's alpha_ is the
# penalty attaining this minimum.
lasso_min_mse = lasso.mse_path_.mean(axis=1).min()
lasso_best_alpha = lasso.alpha_

print(f"Ridge - Minimum MSE: {ridge_min_mse}, Best Alpha: {ridge_best_alpha}")
print(f"Lasso - Minimum MSE: {lasso_min_mse}, Best Alpha: {lasso_best_alpha}")


Question 8: Use the "one-standard-error" rule and find the number of predictors in the Lasso model.

In [None]:
# One-standard-error rule for Lasso
# Choose the largest (most regularized) alpha whose mean CV MSE is within one
# standard error of the minimum mean CV MSE.
lasso_mean_mse = lasso.mse_path_.mean(axis=1)  # one value per alpha
lasso_n_folds = lasso.mse_path_.shape[1]
# Standard error of the fold-averaged MSE: sample std across folds / sqrt(k)
lasso_se_mse = lasso.mse_path_.std(axis=1, ddof=1) / np.sqrt(lasso_n_folds)

# Threshold uses the standard error at the minimizing alpha only
lasso_min_idx = lasso_mean_mse.argmin()
lasso_1se_threshold = lasso_mean_mse[lasso_min_idx] + lasso_se_mse[lasso_min_idx]

# Take the max explicitly rather than relying on the ordering of alphas_
lasso_best_alpha_1se = lasso.alphas_[lasso_mean_mse <= lasso_1se_threshold].max()

lasso_1se_model = Lasso(alpha=lasso_best_alpha_1se)
lasso_1se_model.fit(X_train_scaled, y_train)

print(f"Lasso 1-SE Rule Alpha: {lasso_best_alpha_1se}")
print(f"Number of Predictors in Lasso (1-SE): {np.sum(lasso_1se_model.coef_ != 0)}")


In [None]:
Question 9: Compare the performance of your Ridge and Lasso models.