In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the College data set
college = pd.read_csv('D:/Master Degree at NYCU/1. Slides and Homeworks/Spring_2024/1. FinTech/JN/college.csv')

# Split the data set into a training set and a test set
X = college.drop('Apps', axis=1)
X = X.select_dtypes(include=[np.number])
y = college['Apps']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Training set shape: ", X_train.shape)
print("Test set shape: ", X_test.shape)

Training set shape:  (621, 16)
Test set shape:  (156, 16)


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear model using least squares on the training set
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the test set
y_pred = model.predict(X_test)

# Report the test error
mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error: ", mse)

Test Mean Squared Error:  1182113.6667499843


In [23]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the hyperparameter space for λ
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}

# Perform grid search to choose the best λ
grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Report the best λ and the test error
best_alpha = grid_search.best_params_['alpha']
mse = grid_search.best_score_
print("Best λ: ", best_alpha)
print("Test Mean Squared Error: ", -mse)

Best λ:  0.1
Test Mean Squared Error:  1369077.8157046435


In [24]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the hyperparameter space for λ
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}

# Perform grid search to choose the best λ
grid_search = GridSearchCV(Lasso(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Report the best λ, the test error, and the number of non-zero coefficient estimates
best_alpha = grid_search.best_params_['alpha']
mse = grid_search.best_score_
print("Best λ: ", best_alpha)
print("Test Mean Squared Error: ", -mse)

# Get the coefficients of the best model
coefficients = grid_search.best_estimator_.coef_

# Count the number of non-zero coefficients
non_zero_coefficients = np.count_nonzero(coefficients)
print("Number of non-zero coefficient estimates: ", non_zero_coefficients)

Best λ:  10
Test Mean Squared Error:  1368673.728651584
Number of non-zero coefficient estimates:  16


In [25]:
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the hyperparameter space for M
param_grid = {'n_components': [1, 2, 3, 4, 5]}

# Perform grid search to choose the best M
grid_search = GridSearchCV(PCA(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train)

# Report the best M and the test error
best_n_components = grid_search.best_params_['n_components']
mse = grid_search.best_score_
print("Best M: ", best_n_components)
print("Test Mean Squared Error: ", -mse)

# Fit the best PCR model on the training set
pca = PCA(n_components=best_n_components)
pca.fit(X_train)

# Transform the training and test sets using the best PCR model
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Fit a linear model on the transformed training set
model = LinearRegression()
model.fit(X_train_pca, y_train)

# Predict the test set using the trained model
y_pred = model.predict(X_test_pca)

# Report the test error
mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error: ", mse)

Best M:  1
Test Mean Squared Error:  nan
Test Mean Squared Error:  9250580.274545342


Traceback (most recent call last):
  File "C:\Users\PINYKEWD\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 969, in _score
    scores = scorer(estimator, X_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "C:\Users\PINYKEWD\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 969, in _score
    scores = scorer(estimator, X_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "C:\Users\PINYKEWD\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 969, in _score
    scores = scorer(estimator, X_test, **score_params)
             ^^^^

In [26]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the hyperparameter space for M
param_grid = {'n_components': [1, 2, 3, 4, 5]}

# Perform grid search to choose the best M
grid_search = GridSearchCV(PLSRegression(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Report the best M and the test error
best_n_components = grid_search.best_params_['n_components']
mse = grid_search.best_score_
print("Best M: ", best_n_components)
print("Test Mean Squared Error: ", -mse)

# Fit the best PLS model on the training set
pls = PLSRegression(n_components=best_n_components)
pls.fit(X_train, y_train)

# Predict the test set using the trained model
y_pred = pls.predict(X_test)

# Report the test error
mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error: ", mse)

Best M:  5
Test Mean Squared Error:  1688319.2648955558
Test Mean Squared Error:  1050758.213672055


In [30]:
# Print the test errors for each approach
print("Linear Regression: ", mse)
print("Ridge Regression: ", mse_ridge)
print("Lasso Regression: ", mse_lasso)
print("Principal Component Regression (PCR): ", mse_pcr)
print("Partial Least Squares (PLS): ", mse_pls)

# Comment on the results
print("The test errors for the five approaches are:")
print("Linear Regression: ", mse)
print("Ridge Regression: ", mse_ridge)
print("Lasso Regression: ", mse_lasso)
print("Principal Component Regression (PCR): ", mse_pcr)
print("Partial Least Squares (PLS): ", mse_pls)

# Compare the test errors
if mse < mse_ridge and mse < mse_lasso and mse < mse_pcr and mse < mse_pls:
    print("Linear Regression has the lowest test error.")
elif mse_ridge < mse and mse_ridge < mse_lasso and mse_ridge < mse_pcr and mse_ridge < mse_pls:
    print("Ridge Regression has the lowest test error.")
elif mse_lasso < mse and mse_lasso < mse_ridge and mse_lasso < mse_pcr and mse_lasso < mse_pls:
    print("Lasso Regression has the lowest test error.")
elif mse_pcr < mse and mse_pcr < mse_ridge and mse_pcr < mse_lasso and mse_pcr < mse_pls:
    print("Principal Component Regression (PCR) has the lowest test error.")
elif mse_pls < mse and mse_pls < mse_ridge and mse_pls < mse_lasso and mse_pls < mse_pcr:
    print("Partial Least Squares (PLS) has the lowest test error.")
else:
    print("The test errors are similar.")

Linear Regression:  1050758.213672055


NameError: name 'mse_ridge' is not defined