In this exercise, we will predict the number of applications received
using the other variables in the College data set.

(a) Split the data set into a training set and a test set.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the College dataset
data = pd.read_csv("College.csv")

# Remove the 'Unnamed: 0' column (college names) and encode 'Private' as 0/1
data = data.drop(columns=["Unnamed: 0"])
data["Private"] = (data["Private"] == "Yes").astype(int)

# Split the dataset into features (X) and target (y)
X = data.drop(columns=["Apps"])  # Independent variables
y = data["Apps"]  # Dependent variable (number of applications received)

# Split into training (70%) and test (30%) sets BEFORE any preprocessing, so
# that no statistic computed from the test rows can reach the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features: the scaler learns its mean/std from the training
# rows only, and the same fitted transformation is then applied to the test
# rows. Fitting on the full data set would leak test information.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# All rows expressed on the training-set scale, kept under the original name
# in case a later cell refers to it.
X_scaled = scaler.transform(X)

# Output the shapes of the datasets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Training set size: 543 samples
Test set size: 234 samples


(b) Fit a linear model using least squares on the training set, and
report the test error obtained.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step (b): ordinary least squares fit on the training set.
# `fit` returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out test set: predict first, then compare the
# predictions with the observed targets via the mean squared error.
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_linear)

print(f"Linear Regression Test Error (MSE): {linear_mse}")

Linear Regression Test Error (MSE): 1931803.1942070152


(c) Fit a ridge regression model on the training set, with λ chosen
by cross-validation. Report the test error obtained.

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

# Step (c): ridge regression, with the penalty λ (called alpha in scikit-learn)
# picked by RidgeCV from the candidate values below using cv=5.
# `fit` returns the estimator itself, so construction and training are chained.
ridge_model = RidgeCV(cv=5, alphas=[0.1, 1.0, 10.0, 100.0]).fit(X_train, y_train)

# Evaluate on the held-out test set: predict first, then compare the
# predictions with the observed targets via the mean squared error.
y_pred_ridge = ridge_model.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)

# Report the selected penalty and the resulting test error
print(f"Best λ (alpha) chosen by cross-validation: {ridge_model.alpha_}")
print(f"Ridge Regression Test Error (MSE): {ridge_mse}")

Best λ (alpha) chosen by cross-validation: 1.0
Ridge Regression Test Error (MSE): 1901722.50404795


(d) Fit a lasso model on the training set, with λ chosen by
cross-validation. Report the test error obtained, along with the
number of non-zero coefficient estimates.

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Step (d): Fit a lasso regression model with cross-validation
# Let LassoCV derive its own λ (alpha) path from the training data rather than
# using a hand-picked grid. The previous grid, np.logspace(-3, 1, 100), only
# covered penalties that are tiny relative to a target measured in thousands of
# applications: the selected value was the smallest one on the grid, so the
# search never bracketed the optimum and the fit was effectively unpenalised.
lasso_model = LassoCV(
    cv=5, random_state=42, max_iter=10000
)  # max_iter raised above the default as headroom for the smallest alphas
lasso_model.fit(X_train, y_train)  # Train the model on the training set

# Predict on the test set
y_pred_lasso = lasso_model.predict(X_test)

# Calculate the test error (Mean Squared Error)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)

# Count the number of non-zero coefficients
non_zero_coefficients = np.count_nonzero(lasso_model.coef_)

# Output the results
print(f"Best λ (alpha) chosen by cross-validation: {lasso_model.alpha_}")
print(f"Lasso Regression Test Error (MSE): {lasso_mse}")
print(f"Number of non-zero coefficients in Lasso model: {non_zero_coefficients}")

# A selection on the edge of the searched path means cross-validation could
# not bracket the optimum; surface that instead of reporting it silently.
if np.isclose(lasso_model.alpha_, lasso_model.alphas_.min()) or np.isclose(
    lasso_model.alpha_, lasso_model.alphas_.max()
):
    print("Note: the chosen λ lies on the boundary of the searched alpha path.")

Best λ (alpha) chosen by cross-validation: 0.001
Lasso Regression Test Error (MSE): 1931801.797062018
Number of non-zero coefficients in Lasso model: 17
