# 9. In this exercise, we will predict the number of applications received using the other variables in the College data set.

## (a) Split the data set into a training set and a test set.

In [None]:
import pandas as pd
import numpy as np
from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

#Load the College dataset from ISLP package
df = load_data('College')

#Convert categorical variables to dummy variables
X = pd.get_dummies(df.drop(columns=['Apps']), drop_first=True)
y = df['Apps']

#Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## (b) Fit a linear model using least squares on the training set, and report the test error obtained.

In [None]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred_lm = lm.predict(X_test)
test_error_lm = mean_squared_error(y_test, y_pred_lm)
print("Test Error (Linear Regression):", sqrt(test_error_lm))


Test Error (Linear Regression): 1389.8932312257036


## (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

In [9]:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, 13), scoring='neg_mean_squared_error', cv=10)
ridge_cv.fit(X_train, y_train)
y_pred_ridge = ridge_cv.predict(X_test)
test_error_ridge = mean_squared_error(y_test, y_pred_ridge)
print("Test Error (Ridge Regression):", sqrt(test_error_ridge))

Test Error (Ridge Regression): 1388.0510461162175


## (d) Fit a lasso model on the training set, with chosen by cross validation. Report the test error obtained, along with the number of non-zero coefficient estimates

In [10]:
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=np.logspace(-6, 6, 13), cv=10, random_state=42)
lasso_cv.fit(X_train, y_train)
y_pred_lasso = lasso_cv.predict(X_test)
test_error_lasso = mean_squared_error(y_test, y_pred_lasso)
non_zero_coefs = np.sum(lasso_cv.coef_ != 0)
print("Test Error (Lasso Regression):", sqrt(test_error_lasso))
print("Number of non-zero coefficients (Lasso):", non_zero_coefs)


Test Error (Lasso Regression): 1389.6937633335688
Number of non-zero coefficients (Lasso): 17


## (e) Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.

In [None]:
from sklearn.decomposition import PCA

#Using PCA for dimensionality reduction, followed by linear regression
mse_pcr = []
for i in range(1, X_train.shape[1] + 1):
    pca = PCA(n_components=i)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    lm_pca = LinearRegression()
    lm_pca.fit(X_train_pca, y_train)
    y_pred_pca = lm_pca.predict(X_test_pca)
    mse_pcr.append(mean_squared_error(y_test, y_pred_pca))

#Choose the optimal number of components based on minimum MSE
optimal_M_pcr = np.argmin(mse_pcr) + 1
print("Optimal number of components (PCR):", optimal_M_pcr)
print("Test Error (PCR):", sqrt(mse_pcr[optimal_M_pcr - 1]))

Optimal number of components (PCR): 16
Test Error (PCR): 1388.7859506033444


##  (f) Fit a PLS model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation

In [None]:
from sklearn.cross_decomposition import PLSRegression

mse_pls = []
for i in range(1, X_train.shape[1] + 1):
    pls = PLSRegression(n_components=i)
    pls.fit(X_train, y_train)
    y_pred_pls = pls.predict(X_test)
    mse_pls.append(mean_squared_error(y_test, y_pred_pls))

#Choose the optimal number of components based on minimum MSE
optimal_M_pls = np.argmin(mse_pls) + 1
print("Optimal number of components (PLS):", optimal_M_pls)
print("Test Error (PLS):", sqrt(mse_pls[optimal_M_pls - 1]))

Optimal number of components (PLS): 11
Test Error (PLS): 1381.4172155500146


##  (g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches

In [13]:
print("Linear Regression:", sqrt(test_error_lm))
print("Ridge Regression:", sqrt(test_error_ridge))
print("Lasso Regression:", sqrt(test_error_lasso))
print("PCR (optimal M):", sqrt(mse_pcr[optimal_M_pcr - 1]))
print("PLS (optimal M):", sqrt(mse_pls[optimal_M_pls - 1]))

Linear Regression: 1389.8932312257036
Ridge Regression: 1388.0510461162175
Lasso Regression: 1389.6937633335688
PCR (optimal M): 1388.7859506033444
PLS (optimal M): 1381.4172155500146


While PLS has the lowest test error, the difference is minimal. 