# Ch06_Q9
In this exercise, we will predict the number of applications received
using the other variables in the College data set.

In [1]:
!pip install ISLP

Collecting ISLP
  Downloading ISLP-0.4.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lifelines (from ISLP)
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pygam (from ISLP)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pytorch-lightning (from ISLP)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics (from ISLP)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting autograd-gamma>=0.3 (from lifelines->ISLP)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines->ISLP)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting scipy>=0.9 (from ISLP)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.2 MB/s[0m eta [36m

## (a) Split the data set into a training set and a test set.

In [8]:
import pandas as pd
from ISLP import load_data
from sklearn.model_selection import train_test_split

# Load the College dataset
college = load_data('College')

# Define the features (X) and target variable (y)
X = college.drop("Apps", axis=1)
y = college["Apps"]
X = pd.get_dummies(X, drop_first=True)
# Split the dataset into a training set and a test set (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## (b) Fit a linear model using least squares on the training set, and report the test error obtained.

In [9]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error (MSE) for the test set
mse = mean_squared_error(y_test, y_pred)

# Report the test error (MSE) and R-squared value
print(f"Mean Squared Error (MSE) on the test set: {mse}")

Mean Squared Error (MSE) on the test set: 1492443.379039042


## (c) Fit a ridge regression model on the training set, with  λ chosen by cross-validation. Report the test error obtained.

In [11]:
from sklearn.linear_model import RidgeCV

# Handle categorical variables (if any) using Label Encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to all columns that contain categorical data
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

# Initialize the Ridge Regression model with cross-validation
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0], store_cv_values=True)

# Fit the model on the training data
ridge.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ridge.predict(X_test)

# Calculate the Mean Squared Error (MSE) for the test set
mse = mean_squared_error(y_test, y_pred)

# Report the test error (MSE) and R-squared value
print(f"Mean Squared Error (MSE) on the test set: {mse}")
print(f"Best alpha (λ) chosen by cross-validation: {ridge.alpha_}")

Mean Squared Error (MSE) on the test set: 1478569.58034882
Best alpha (λ) chosen by cross-validation: 10.0




## (d) Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coefficient estimates.

In [12]:
from sklearn.linear_model import LassoCV

# Initialize the Lasso model with cross-validation
lasso = LassoCV(alphas=[0.1, 1.0, 10.0, 100.0], cv=5)

# Fit the model on the training data
lasso.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lasso.predict(X_test)

# Calculate the Mean Squared Error (MSE) for the test set
mse = mean_squared_error(y_test, y_pred)
non_zero_coeffs = (lasso.coef_ != 0).sum()

# Report the test error (MSE),and number of non-zero coefficients
print(f"Mean Squared Error (MSE) on the test set: {mse}")
print(f"Best alpha (λ) chosen by cross-validation: {lasso.alpha_}")
print(f"Number of non-zero coefficients: {non_zero_coeffs}")

Mean Squared Error (MSE) on the test set: 1477248.9589983297
Best alpha (λ) chosen by cross-validation: 10.0
Number of non-zero coefficients: 17


## (e) Fit a PCR model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

In [14]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

# Initialize PCA and Linear Regression
pca = PCA()
lr = LinearRegression()

# List to store cross-validation MSE values for each number of components
mse_values = []

# Perform cross-validation for each number of principal components (M)
for M in range(1, X_train.shape[1] + 1):
    # Apply PCA with M components
    pca.set_params(n_components=M)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Fit a linear regression model on the transformed data
    lr.fit(X_train_pca, y_train)

    # Predict and calculate the MSE
    y_pred = lr.predict(X_test_pca)
    mse = mean_squared_error(y_test, y_pred)
    mse_values.append(mse)

# Find the value of M that minimizes the MSE
best_M = np.argmin(mse_values) + 1  # Add 1 because M starts from 1
best_mse = mse_values[best_M - 1]

# Report the test error (MSE) and the value of M selected by cross-validation
print(f"Best number of principal components (M) selected by cross-validation: {best_M}")
print(f"Mean Squared Error (MSE) on the test set with {best_M} components: {best_mse}")

Best number of principal components (M) selected by cross-validation: 16
Mean Squared Error (MSE) on the test set with 16 components: 1442579.6185429145


## (f) Fit a PLS model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

In [15]:
from sklearn.cross_decomposition import PLSRegression

# Initialize the PLS model and list to store the MSE for each number of components (M)
mse_values = []

# Perform cross-validation for each number of components (M)
for M in range(1, X_train.shape[1] + 1):
    pls = PLSRegression(n_components=M)
    pls.fit(X_train, y_train)

    # Make predictions and calculate the Mean Squared Error (MSE)
    y_pred = pls.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_values.append(mse)

# Find the value of M that minimizes the MSE
best_M = np.argmin(mse_values) + 1  # Add 1 because M starts from 1
best_mse = mse_values[best_M - 1]

# Report the test error (MSE) and the value of M selected by cross-validation
print(f"Best number of components (M) selected by cross-validation: {best_M}")
print(f"Mean Squared Error (MSE) on the test set with {best_M} components: {best_mse}")

Best number of components (M) selected by cross-validation: 7
Mean Squared Error (MSE) on the test set with 7 components: 1448566.342451739


## (g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

 Among these five approaches, the Lasso Regression model achieved the lowest test MSE, is the most effective model for predicting the number of applications.