In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# **Load Data Set**

In [76]:
# Load the data
df = pd.read_csv('../Data/data_5KNN.csv')

# Transform it into numpy array
data = df.to_numpy()

In [77]:
# Get the variables
X = data[:,1:]
y = data[:,0]

# Shape
[n, p] = np.shape(X)

# **OLS**

## **Training**

In [78]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler

# # Initialize scaler (standardization)
# scaler = StandardScaler()

# ols_model = LinearRegression()
# ols_model.fit(X, y)

# **Ridge Regression**

In [79]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## **Training**

In [80]:
# Define a range of lambda (alpha) values to test
lambda_values = np.logspace(-3, 3, 100)

# Initialize scaler (standardization)
scaler = StandardScaler()

# Define a pipeline that includes normalization and regression to indtroduce in the GridSearchCV
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('ridge', Ridge())             # Apply Ridge Regression
])

# We need a dictionary as an input for the parameters
param_lambda_values = {'ridge__alpha' : lambda_values}

# Perform GridSearchCV to find the best lambda (alpha in this case)
# Its basically a tool to tune/optimize a hyperparamter (lambda in this case)
# its neg_mse beacuse we want to MINIMIZE the score mse (gridsearch always maximizes the score)
ridge_cv = GridSearchCV(estimator=ridge_pipeline, param_grid=param_lambda_values, scoring='neg_mean_squared_error', cv=5) # we obtain min RMSE with 5 folds
ridge_cv.fit(X, y)  # it is using the normalized data stablished in the pipeline

# Get the best lambda
best_lambda_ridge = ridge_cv.best_params_['ridge__alpha']
# Get the best RMSE
ridge_rmse = np.sqrt(-ridge_cv.best_score_)

# Apply one standard error rule
cv_results = ridge_cv.cv_results_
mean_scores = -cv_results['mean_test_score']  # Convert back to MSE
std_scores = cv_results['std_test_score']

best_idx = ridge_cv.best_index_     # find the indx with the lowest mse
one_std_error = mean_scores[best_idx] + std_scores[best_idx]

# Find the simplest model (largest lambda) within one std error
candidate_indices = np.where(mean_scores <= one_std_error)[0]
best_lambda_ridge_ose = np.max(lambda_values[candidate_indices])

# Get RMSE for OSE lambda
ose_idx = np.where(lambda_values == best_lambda_ridge_ose)[0][0]
ridge_ose_rmse = np.sqrt(-ridge_cv.cv_results_['mean_test_score'][ose_idx])

# Parameters
print(f"Best lambda (original): {ridge_cv.best_params_['ridge__alpha']:.4f} ***** RMSE: {np.sqrt(-ridge_cv.best_score_):.4f}")
print(f"Best lambda (one std error): {best_lambda_ridge_ose:.4f} ***** RMSE: {ridge_ose_rmse:.4f}")

# Fit Ridge with the one std error lambda
best_ridge = Ridge(alpha=best_lambda_ridge_ose)
best_ridge.fit(scaler.fit_transform(X), y)

Best lambda (original): 5.7224 ***** RMSE: 35.9119
Best lambda (one std error): 35.1119 ***** RMSE: 40.0059


# **Lasso Regression**

In [81]:
from sklearn.linear_model import LassoLars

## **Training: LARS**

In [82]:
# Define a range of lambda (alpha) values to test
lambda_values = np.logspace(-3, 3, 100)

# Initialize scaler (standardization)
scaler = StandardScaler()

# Define a pipeline that includes normalization and regression to indtroduce in the GridSearchCV
lasso_lars_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('lasso_lars', LassoLars())             # Apply Lasso LARS Regression
])

# We need a dictionary as an input for the parameters
param_lambda_values = {'lasso_lars__alpha' : lambda_values}

# Perform GridSearchCV to find the best lambda (alpha in this case)
# Its basically a tool to tune/optimize a hyperparamter (lambda in this case)
# its neg_mse beacuse we want to MINIMIZE the score mse (gridsearch always maximizes the score)
lasso_lars_cv = GridSearchCV(estimator=lasso_lars_pipeline, param_grid=param_lambda_values, scoring='neg_mean_squared_error', cv=5) # we obtain min RMSE with 5 folds
lasso_lars_cv.fit(X, y)  # it is using the normalized data stablished in the pipeline

# Get the best lambda
best_lambda_lasso_lars = lasso_lars_cv.best_params_['lasso_lars__alpha']
# Get the best RMSE
lasso_lars_rmse = np.sqrt(-lasso_lars_cv.best_score_)

# Apply one standard error rule (OSE)
cv_results = lasso_lars_cv.cv_results_
mean_scores = -cv_results['mean_test_score']  # Convert back to MSE
std_scores = cv_results['std_test_score']

best_idx = lasso_lars_cv.best_index_     # find the indx with the lowest mse
one_std_error = mean_scores[best_idx] + std_scores[best_idx]

# Find the simplest model (largest lambda) within one std error
candidate_indices = np.where(mean_scores <= one_std_error)[0]
best_lambda_lasso_lars_ose = np.max(lambda_values[candidate_indices])

# Get RMSE for OSE lambda
ose_idx = np.where(lambda_values == best_lambda_lasso_lars_ose)[0][0]
lasso_lars_ose_rmse = np.sqrt(-lasso_lars_cv.cv_results_['mean_test_score'][ose_idx])

# Parameters
print(f"Best lambda Lasso LARS (original): {lasso_lars_cv.best_params_['lasso_lars__alpha']:.4f} ***** RMSE: {np.sqrt(-lasso_lars_cv.best_score_):.4f}")
print(f"Best lambda Lasso LARS (one std error): {best_lambda_lasso_lars_ose:.4f} ***** RMSE: {lasso_lars_ose_rmse:.4f}")

# Fit Lasso with the BEST lambda
best_lasso_lars = LassoLars(alpha=best_lambda_lasso_lars_ose)
best_lasso_lars.fit(scaler.fit_transform(X), y)  # Transform X before fitting

Best lambda Lasso LARS (original): 1.4175 ***** RMSE: 28.1015
Best lambda Lasso LARS (one std error): 5.7224 ***** RMSE: 31.3881


# 2-Layer

In [83]:
from sklearn.linear_model import Ridge, Lasso, LassoLars, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [95]:
# 1. Outer K-Fold
K_outer = 5
outer_cv = KFold(n_splits=K_outer)

# Define a range of alphas to try
alpha_grid = np.logspace(-3, 2, 100)
l1_ratios = np.linspace(0.1, 0.9, 9)   # Mix ratio between Lasso (L1) and Ridge (L2)

# Store the outer test RMSE
outer_test_rmse = []

# Track which model was selected in each fold
model_selections = []


# 2. Inner CV Loop
for train_idx, test_idx in outer_cv.split(X):
    # Split data into inner training and outer test sets
    X_train_outer, X_test_outer = X[train_idx], X[test_idx]
    y_train_outer, y_test_outer = y[train_idx], y[test_idx]

    # 2.1 Inner CV: RIDGE
    # Define a pipeline that includes normalization and regression to indtroduce in the GridSearchCV
    ridge_pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        ('ridge', Ridge())             # Apply Ridge Regression
    ])
    
    # We need a dictionary as an input for the parameters
    ridge_param_grid = {'ridge__alpha' : lambda_values}

    # Perform GridSearchCV to find the best lambda (alpha in this case)
    # Its basically a tool to tune/optimize a hyperparamter (lambda in this case)
    # its neg_mse beacuse we want to MINIMIZE the score mse (gridsearch always maximizes the score)
    ridge_cv = GridSearchCV(
        estimator=ridge_pipeline,
        param_grid=ridge_param_grid,
        scoring="neg_mean_squared_error",
        cv=KFold(n_splits=5)
    )
    ridge_cv.fit(X_train_outer, y_train_outer)  # it is using the normalized data stablished in the pipeline
    
    # Obtain the best parameters and the corresponding model
    best_lambda_ridge = ridge_cv.best_params_['ridge__alpha']

    # Obtain the best model, score (MSE) and RMSE
    best_ridge_model = ridge_cv.best_estimator_  # pipeline refit on entire outer-training set
    best_ridge_score = ridge_cv.best_score_      # negative MSE (the higher, the better)
    best_ridge_rmse   = np.sqrt(-best_ridge_score)         # turn negative MSE into positive MSE and obtain RMSE

    # 2.2 Inner CV: LASSO LARS
    # Define a pipeline that includes normalization and regression to indtroduce in the GridSearchCV
    lasso_lars_pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        ('lasso_lars', LassoLars())             # Apply Lasso LARS Regression
    ])
    
    # We need a dictionary as an input for the parameters
    lasso_lars_param_grid = {'lasso_lars__alpha' : lambda_values}

    # Perform GridSearchCV to find the best lambda (alpha in this case)
    lasso_lars_cv = GridSearchCV(
        estimator=lasso_lars_pipeline,
        param_grid=lasso_lars_param_grid,
        scoring="neg_mean_squared_error",
        cv=KFold(n_splits=5)
    )

    lasso_lars_cv.fit(X_train_outer, y_train_outer)

    # Obtain the best parameters and the corresponding model
    best_lambda_lasso_lars = lasso_lars_cv.best_params_['lasso_lars__alpha']

    # Obtain the best model, score (MSE) and RMSE
    best_lasso_model = lasso_lars_cv.best_estimator_
    best_lasso_score = lasso_lars_cv.best_score_
    best_lasso_rmse   = np.sqrt(-best_lasso_score)

    # 2.3 Inner CV: ELASTICNET
    # Define a pipeline that includes normalization and regression to introduce in GridSearchCV
    elasticnet_pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        ('elasticnet', ElasticNet(max_iter=20000))  # Apply ElasticNet with a high iteration limit
    ])

    # We need a dictionary as an input for the parameters
    elasticnet_param_grid = {
        'elasticnet__alpha': lambda_values,
        'elasticnet__l1_ratio': l1_ratios  # The mix ratio between Lasso and Ridge
    }

    # Perform GridSearchCV to find the best lambda (alpha in this case)
    elasticnet_cv = GridSearchCV(
        estimator=elasticnet_pipeline,
        param_grid=elasticnet_param_grid,
        scoring="neg_mean_squared_error",
        cv=KFold(n_splits=5),
        n_jobs=-1
    )
    elasticnet_cv.fit(X_train_outer, y_train_outer)

    # Obtain the best parameters and the corresponding model
    best_lambda_elasticnet = elasticnet_cv.best_params_['elasticnet__alpha']
    best_l1_ratio_elasticnet = elasticnet_cv.best_params_['elasticnet__l1_ratio']

    # Obtain the best model, score (MSE) and RMSE
    best_elasticnet_model = elasticnet_cv.best_estimator_
    best_elasticnet_score = elasticnet_cv.best_score_
    best_elasticnet_rmse   = np.sqrt(-best_elasticnet_score)

    # 2.4 Decide which model is better (Ridge, Lasso, or ElasticNet)
    model_comparison = {
        'Ridge': best_ridge_rmse,
        'Lasso': best_lasso_rmse,
        'ElasticNet': best_elasticnet_rmse
    }    
    # Find model with lowest RMSE
    best_model_name = min(model_comparison, key=model_comparison.get)
    chosen_model = {
        'Ridge': best_ridge_model,
        'Lasso': best_lasso_model,
        'ElasticNet': best_elasticnet_model
    }[best_model_name]

    # Track which model was selected
    model_selections.append(best_model_name)

    # 2.4 Evaluate on the Outer Test Fold
    y_pred_outer_test = chosen_model.predict(X_test_outer)
    rmse_outer_test = root_mean_squared_error(y_test_outer, y_pred_outer_test)
    outer_test_rmse.append(rmse_outer_test)

# 3. Final Estimate of Generalization Error (in terms of RMSE)
two_layer_cv_rmse = np.mean(outer_test_rmse)
print("2-layer CV estimate of RMSE =", two_layer_cv_rmse)
print("Model selection frequency:", {model: f"{model_selections.count(model)} / {K_outer}" for model in set(model_selections)})

2-layer CV estimate of RMSE = 28.101561564233513
Model selection frequency: {'Lasso': '5 / 5'}


We see that the 2-layer CV method chooses Lasso as the best model in EVERY outer iteration. That means that Ridge and Elastic Net are not even considered at all in the estimation of the generalization error (in RMSE terms). Consequently, we can conclude that Lasso is the best model in this problem, and makes sense that we obtain the same estimated value of the RMSE as the one obtained in the 1-layer CV method. 

Now we have 2 options: choose the RMSE of Lasso using the best lambda, or choose the RMSE of Lasso using the OSE lambda (which will give us a simpler model and reduce overfitting, improving generalization).