In [2]:
!pip install ISLP

Collecting ISLP
  Downloading ISLP-0.4.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lifelines (from ISLP)
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pygam (from ISLP)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pytorch-lightning (from ISLP)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics (from ISLP)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting autograd-gamma>=0.3 (from lifelines->ISLP)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines->ISLP)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting scipy>=0.9 (from ISLP)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.0 MB/s[0m eta [36m

# 9. In this exercise, we will predict the number of applications received using the other variables in the College data set.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error

(a) Split the data set into a training set and a test set.

In [3]:
from ISLP import load_data
college = load_data("College")
X = college.drop("Apps", axis=1)
y = college["Apps"]
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

(b) Fit a linear model using least squares on the training set, and report the test error obtained.

In [4]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
print("Linear Model Test MSE:", mse_linear)

Linear Model Test MSE: 1659682.1719133756


(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

In [5]:
ridge_model = RidgeCV(alphas=np.logspace(-6, 6, 13), cv=5)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print("Ridge Model Test MSE:", mse_ridge)
print("Best Lambda for Ridge:", ridge_model.alpha_)

Ridge Model Test MSE: 1642456.265965162
Best Lambda for Ridge: 10.0


(d) Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coefficient estimates.

In [6]:
lasso_model = LassoCV(cv=5)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("Lasso Model Test MSE:", mse_lasso)
print("Best Lambda for Lasso:", lasso_model.alpha_)
print("Number of Non-zero Coefficients (Lasso):", np.sum(lasso_model.coef_ != 0))

Lasso Model Test MSE: 1914912.3699589907
Best Lambda for Lasso: 56401.126011064014
Number of Non-zero Coefficients (Lasso): 5


(e) Fit a PCR model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

In [7]:
# (e) 主成分回歸 (PCR) 模型，使用交叉驗證選擇最佳成分數 M
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [8]:
# 使用不同的主成分數進行交叉驗證選擇最佳 M
mse_pcr = []
for m in range(1, X_train.shape[1] + 1):
    linear_model.fit(X_train_pca[:, :m], y_train)
    y_pred = linear_model.predict(X_test_pca[:, :m])
    mse_pcr.append(mean_squared_error(y_test, y_pred))

best_m_pcr = np.argmin(mse_pcr) + 1
print("PCR Model Test MSE:", mse_pcr[best_m_pcr - 1])
print("Best Number of Components for PCR:", best_m_pcr)

PCR Model Test MSE: 1598862.8989769155
Best Number of Components for PCR: 16


(f) Fit a PLS model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

In [9]:
# (f) 偏最小二乘回歸 (PLS) 模型，使用交叉驗證選擇最佳成分數 M
pls_mse = []
for m in range(1, X_train.shape[1] + 1):
    pls_model = PLSRegression(n_components=m)
    pls_model.fit(X_train, y_train)
    y_pred_pls = pls_model.predict(X_test)
    mse_pls = mean_squared_error(y_test, y_pred_pls)
    pls_mse.append(mse_pls)

best_m_pls = np.argmin(pls_mse) + 1
print("PLS Model Test MSE:", pls_mse[best_m_pls - 1])
print("Best Number of Components for PLS:", best_m_pls)

PLS Model Test MSE: 1659682.1719133682
Best Number of Components for PLS: 17


(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

In [10]:
print("\nModel Comparison:")
print("Linear Model Test MSE:", mse_linear)
print("Ridge Model Test MSE:", mse_ridge, "with Best Lambda:", ridge_model.alpha_)
print("Lasso Model Test MSE:", mse_lasso, "with Best Lambda:", lasso_model.alpha_)
print("PCR Model Test MSE:", mse_pcr[best_m_pcr - 1], "with Best M:", best_m_pcr)
print("PLS Model Test MSE:", pls_mse[best_m_pls - 1], "with Best M:", best_m_pls)


Model Comparison:
Linear Model Test MSE: 1659682.1719133756
Ridge Model Test MSE: 1642456.265965162 with Best Lambda: 10.0
Lasso Model Test MSE: 1914912.3699589907 with Best Lambda: 56401.126011064014
PCR Model Test MSE: 1598862.8989769155 with Best M: 16
PLS Model Test MSE: 1659682.1719133682 with Best M: 17


The ridge model achieves the lowest test error, with the lasso model following closely. Both models outperform PCR and PLS. However, the test error differences across all five methods are minimal.