# HW1111Q4 CH06Q09

### 9. In this exercise, we will predict the number of applications received using the other variables in the College data set.

##### (a) Split the data set into a training set and a test set.

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from ISLP import load_data

# Load the College data; the response is the number of applications (Apps).
college = load_data("College")
y = college["Apps"]

# Everything except Apps is a predictor. Categorical columns are one-hot
# encoded, dropping the first level of each so the dummies are not redundant.
X = pd.get_dummies(college.drop(columns="Apps"), drop_first=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

##### (b) Fit a linear model using least squares on the training set, and report the test error obtained.

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ordinary least squares on all predictors; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out rows only.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Test Error (Mean Squared Error): {mse}")

Test Error (Mean Squared Error): 1659682.171913388


##### (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

In [3]:
from sklearn.linear_model import RidgeCV

# Ridge regression with the penalty λ (called `alpha` in scikit-learn) picked
# from a small candidate grid. With `cv` left at its default, RidgeCV uses its
# built-in efficient leave-one-out cross-validation.
#
# `store_cv_values=True` was dropped: the stored values were never read, and the
# keyword was deprecated in scikit-learn 1.5 and removed in 1.7, where passing
# it raises a TypeError. Removing it does not change the fitted model.
#
# NOTE(review): the predictors are not standardized before fitting, and ridge
# is sensitive to feature scale; the grid is also coarse (four values).
# Consider a StandardScaler pipeline and a finer alpha grid — TODO confirm.
ridge_model = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0])
ridge_model.fit(X_train, y_train)

# Test error of the model refit with the selected alpha.
y_pred_ridge = ridge_model.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print(f"Ridge Test Error (Mean Squared Error): {mse_ridge}")
print(f"Best λ (alpha) chosen by cross-validation: {ridge_model.alpha_}")

Ridge Test Error (Mean Squared Error): 1657434.7315285418
Best λ (alpha) chosen by cross-validation: 1.0


##### (d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

In [4]:
from sklearn.linear_model import LassoCV

# Lasso with λ selected by 5-fold cross-validation over a fixed candidate grid;
# fit() returns the fitted estimator, so construction and fitting are chained.
lasso_model = LassoCV(alphas=[0.1, 1.0, 10.0, 100.0], cv=5).fit(X_train, y_train)

# Held-out performance of the model fitted with the selected λ.
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# Report the test error, the chosen penalty, and how many coefficients the
# lasso left different from zero.
print(f"Lasso Test Error (Mean Squared Error): {mse_lasso}")
print(f"Best λ chosen by cross-validation: {lasso_model.alpha_}")
print(f"Number of non-zero coefficients: {sum(lasso_model.coef_ != 0)}")

Lasso Test Error (Mean Squared Error): 1657794.6179723172
Best λ chosen by cross-validation: 1.0
Number of non-zero coefficients: 17


##### (e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

In [5]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Principal components regression: PCA followed by least squares, with the
# number of components M chosen by 5-fold cross-validated MSE on the training set.
#
# The per-candidate score is held in `cv_mse` rather than `mse`, so this loop no
# longer overwrites the OLS test error stored in `mse` by the least-squares cell.
#
# NOTE(review): PCA is applied to the raw, unscaled predictors, so components
# are dominated by the largest-variance columns; consider adding a
# StandardScaler step to the pipeline — TODO confirm.
best_m = 0
best_mse = float('inf')
for m in range(1, X_train.shape[1] + 1):
    pcr_model = make_pipeline(PCA(n_components=m), LinearRegression())
    # cross_val_score returns negated MSE per fold; flip the sign and average.
    cv_mse = -cross_val_score(
        pcr_model, X_train, y_train, cv=5, scoring="neg_mean_squared_error"
    ).mean()
    # Strict '<' keeps the smallest M among ties.
    if cv_mse < best_mse:
        best_mse = cv_mse
        best_m = m

# Refit on the full training set with the selected number of components.
pcr_model = make_pipeline(PCA(n_components=best_m), LinearRegression())
pcr_model.fit(X_train, y_train)

y_pred_pcr = pcr_model.predict(X_test)

mse_pcr = mean_squared_error(y_test, y_pred_pcr)

print(f"PCR Test Error (Mean Squared Error): {mse_pcr}")
print(f"Best M (number of principal components) chosen by cross-validation: {best_m}")

PCR Test Error (Mean Squared Error): 1659682.171913392
Best M (number of principal components) chosen by cross-validation: 17


##### (f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

In [6]:
from sklearn.cross_decomposition import PLSRegression

# Partial least squares with the number of components M chosen by 5-fold
# cross-validated MSE on the training set.
#
# The per-candidate score is held in `cv_mse_pls` rather than `mse`, so this
# loop no longer overwrites the OLS test error stored in `mse` by the
# least-squares cell.
best_m_pls = 0
best_mse_pls = float('inf')
for m in range(1, X_train.shape[1] + 1):
    pls_model = PLSRegression(n_components=m)
    # cross_val_score returns negated MSE per fold; flip the sign and average.
    cv_mse_pls = -cross_val_score(
        pls_model, X_train, y_train, cv=5, scoring="neg_mean_squared_error"
    ).mean()
    # Strict '<' keeps the smallest M among ties.
    if cv_mse_pls < best_mse_pls:
        best_mse_pls = cv_mse_pls
        best_m_pls = m

# Refit on the full training set with the selected number of components.
pls_model = PLSRegression(n_components=best_m_pls)
pls_model.fit(X_train, y_train)

y_pred_pls = pls_model.predict(X_test)

mse_pls = mean_squared_error(y_test, y_pred_pls)

print(f"PLS Test Error (Mean Squared Error): {mse_pls}")
print(f"Best M (number of components) chosen by cross-validation: {best_m_pls}")

PLS Test Error (Mean Squared Error): 1665183.585638088
Best M (number of components) chosen by cross-validation: 9


##### (g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

In [7]:
# Written answer to part (g), emitted one sentence per line.
print(
    "The ridge model provides the lowest test error, followed by lasso. The results from these two models are better than those from PCR and PLS.",
    "The difference in test error between the five methods is not significant.",
    sep="\n",
)

The ridge model provides the lowest test error, followed by lasso. The results from these two models are better than those from PCR and PLS.
The difference in test error between the five methods is not significant.
