In [9]:
###(Q4)
###(A)
!pip install ISLP

import pandas as pd
from ISLP import load_data

from sklearn.model_selection import train_test_split
college = load_data("College")
X = college.drop("Apps", axis=1)
y = college["Apps"]
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)



In [10]:
###(b)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 線性迴歸模型
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# 預測與計算測試誤差
y_pred_linear = linear_model.predict(X_test)
test_error_linear = mean_squared_error(y_test, y_pred_linear)

print("Test error (Linear Regression):", test_error_linear)

Test error (Linear Regression): 1659682.1719133756


In [11]:
###(c)

from sklearn.linear_model import RidgeCV

# 嶺迴歸模型，使用交叉驗證選擇最佳的 λ
ridge_model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
ridge_model.fit(X_train, y_train)

# 預測與計算測試誤差
y_pred_ridge = ridge_model.predict(X_test)
test_error_ridge = mean_squared_error(y_test, y_pred_ridge)

print("Test error (Ridge Regression):", test_error_ridge)
print("Best lambda (Ridge):", ridge_model.alpha_)


Test error (Ridge Regression): 1642456.265965162
Best lambda (Ridge): 10.0


In [12]:
###(d)

from sklearn.linear_model import LassoCV

# Lasso 模型，使用交叉驗證選擇最佳的 λ
lasso_model = LassoCV(cv=5)
lasso_model.fit(X_train, y_train)

# 預測與計算測試誤差
y_pred_lasso = lasso_model.predict(X_test)
test_error_lasso = mean_squared_error(y_test, y_pred_lasso)

# 非零係數的數量
num_nonzero_coeffs = sum(lasso_model.coef_ != 0)

print("Test error (Lasso):", test_error_lasso)
print("Best lambda (Lasso):", lasso_model.alpha_)
print("Number of non-zero coefficients (Lasso):", num_nonzero_coeffs)


Test error (Lasso): 1914912.3699589907
Best lambda (Lasso): 56401.126011064014
Number of non-zero coefficients (Lasso): 5


In [13]:
###(e)

from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

# 使用交叉驗證選擇最佳的主成分數量 M
mse_scores = []
for m in range(1, X_train.shape[1] + 1):
    pca = PCA(n_components=m)
    pcr_model = make_pipeline(pca, LinearRegression())
    mse = -np.mean(cross_val_score(pcr_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    mse_scores.append(mse)

# 找到 M 的最佳數量
best_m = np.argmin(mse_scores) + 1

# 使用最佳的 M 值重新擬合 PCR
pca = PCA(n_components=best_m)
pcr_model = make_pipeline(pca, LinearRegression())
pcr_model.fit(X_train, y_train)

# 預測與計算測試誤差
y_pred_pcr = pcr_model.predict(X_test)
test_error_pcr = mean_squared_error(y_test, y_pred_pcr)

print("Test error (PCR):", test_error_pcr)
print("Best M (PCR):", best_m)


Test error (PCR): 1659682.171913361
Best M (PCR): 17


In [14]:
###(f)
from sklearn.cross_decomposition import PLSRegression

# 使用交叉驗證選擇最佳的 M 值
pls_mse_scores = []
for m in range(1, X_train.shape[1] + 1):
    pls_model = PLSRegression(n_components=m)
    mse = -np.mean(cross_val_score(pls_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    pls_mse_scores.append(mse)

# 找到最佳 M 值
best_m_pls = np.argmin(pls_mse_scores) + 1

# 使用最佳 M 值重新擬合 PLS
pls_model = PLSRegression(n_components=best_m_pls)
pls_model.fit(X_train, y_train)

# 預測與計算測試誤差
y_pred_pls = pls_model.predict(X_test)
test_error_pls = mean_squared_error(y_test, y_pred_pls)

print("Test error (PLS):", test_error_pls)
print("Best M (PLS):", best_m_pls)


Test error (PLS): 1665183.5856380877
Best M (PLS): 9


In [None]:
###(g)
###模型選擇建議：如果測試誤差差距較小，可以優先選擇較簡單的模型（例如 Lasso），使模型更具解釋性。
###整體來看，這些方法在誤差上的表現相差不大。
###可以根據需求選擇 Ridge、PCR 或 PLS 來平衡模型的穩定性和簡單性。
###如果目標是強調模型解釋性或簡單性，可以優先考慮 Lasso，但在這個例子中 Lasso 並未帶來效果提升。