In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import RandomizedSearchCV

#  Load data from CSV
This function loads data from a CSV file. The last column is treated as the target variable `Y`,
and the remaining columns are treated as features `X`.

Parameters:
- `csv_path`: Path to the CSV file.

Returns:
- `X`: Feature matrix.
- `Y`: Target vector.

In [None]:
def load_data(csv_path):
    """Read a CSV file and split it into a feature matrix and a target vector.

    Parameters:
    - csv_path: Location of the CSV file; handed unchanged to `pd.read_csv`.

    Returns:
    - Tuple `(X, Y)`: `X` holds the values of every column except the last,
      `Y` holds the values of the last column.
    """
    frame = pd.read_csv(csv_path)
    feature_columns = frame.iloc[:, :-1]
    target_column = frame.iloc[:, -1]
    return feature_columns.values, target_column.values

# Generate synthetic data
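Generates synthetic data from a sparse linear model `Y = X @ beta + epsilon`, together with the pair `U = Y + gamma * W` and `V = Y - W / gamma` built from independent Gaussian noise `W`.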

In [None]:
def generate_data(n, p, sparsity=0.1, gamma=1.0, sigma=1.0, tau=1.0, random_state=None):
    """Simulate a sparse linear model and a noise-perturbed (U, V) pair.

    The design is standard normal, `beta` has `int(p * sparsity)` non-zero
    standard-normal entries, and `Y = X @ beta + epsilon`. An extra noise
    vector `W` is then used to build `U = Y + gamma * W` and
    `V = Y - W / gamma`.

    Parameters:
    - n: Number of observations.
    - p: Number of features.
    - sparsity: Fraction of coefficients that are non-zero.
    - gamma: Scale applied to `W` in `U` (and its inverse in `V`); must be non-zero.
    - sigma: Standard deviation of the regression noise `epsilon`.
    - tau: Standard deviation of the auxiliary noise `W`.
    - random_state: Optional seed (or `np.random.RandomState` instance) for
      reproducible draws. When None, the global NumPy random state is used,
      exactly as before this parameter existed.

    Returns:
    - Tuple `(X, Y, U, V, beta)`.
    """
    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Draw order is kept identical to the unseeded implementation so that
    # seeding the global state and passing the same seed here agree.
    X = rng.normal(0, 1, size=(n, p))
    beta = np.zeros(p)
    non_zero_indices = rng.choice(p, int(p * sparsity), replace=False)
    beta[non_zero_indices] = rng.normal(0, 1, len(non_zero_indices))
    epsilon = rng.normal(0, sigma, n)
    W = rng.normal(0, tau, n)
    Y = X @ beta + epsilon
    U = Y + gamma * W
    V = Y - (1 / gamma) * W
    return X, Y, U, V, beta

#  Variance estimation using RCV (refitted cross-validation)

In [None]:
def rcv_variance_estimation(X, Y, n_splits=2):
    """Estimate the noise variance by refitted cross-validation (RCV).

    The rows are cut into two parts. Variables are selected with `LassoCV`
    on one part, an ordinary least-squares model restricted to those
    variables (plus an intercept) is refitted on the *other* part, and the
    residual variance of that refit is computed with a degrees-of-freedom
    correction. The roles of the two parts are then swapped and the two
    estimates are averaged.

    Parameters:
    - X: Feature matrix with one row per observation.
    - Y: Target vector.
    - n_splits: Controls the cut point: the first part gets `n // n_splits`
      rows and the second part gets the remaining rows.

    Returns:
    - Average of the two refitted variance estimates.

    Raises:
    - ValueError: If a refit has no residual degrees of freedom, i.e. the
      number of selected variables plus the intercept is not smaller than
      the number of rows in the part used for refitting.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    n = X.shape[0]
    split_idx = n // n_splits
    parts = (
        (X[:split_idx], Y[:split_idx]),
        (X[split_idx:], Y[split_idx:]),
    )

    estimates = []
    for select_on, refit_on in ((0, 1), (1, 0)):
        X_select, Y_select = parts[select_on]
        X_refit, Y_refit = parts[refit_on]

        # Stage 1: variable selection on one part only.
        selected = np.flatnonzero(LassoCV(cv=5).fit(X_select, Y_select).coef_)

        # Stage 2: unpenalised refit on the other part. The intercept column
        # mirrors LassoCV's default of fitting an intercept.
        design = np.column_stack([np.ones(X_refit.shape[0]), X_refit[:, selected]])
        dof = X_refit.shape[0] - design.shape[1]
        if dof <= 0:
            raise ValueError(
                "Refit has no residual degrees of freedom: "
                f"{design.shape[1]} parameters for {X_refit.shape[0]} observations."
            )
        coef, *_ = np.linalg.lstsq(design, Y_refit, rcond=None)
        residual = Y_refit - design @ coef
        estimates.append(residual @ residual / dof)

    return np.mean(estimates)

#  Variance estimation using Fan's method

In [None]:
def fan_variance_estimation(X, Y, alpha=None, cv=5):
    """Estimate the noise variance from a LASSO fit.

    The estimate is the mean squared residual of the LASSO fit plus
    `alpha` times the L1 norm of the fitted coefficients.

    Parameters:
    - X: Feature matrix with one row per observation.
    - Y: Target vector.
    - alpha: LASSO regularization parameter. When None, it is selected by
      `LassoCV` with `cv` folds.
    - cv: Number of cross-validation folds used only when `alpha` is None.

    Returns:
    - The variance estimate `rss / n + alpha * ||beta||_1`.
    """
    n = X.shape[0]
    if alpha is None:
        alpha = LassoCV(cv=cv).fit(X, Y).alpha_
    lasso = Lasso(alpha=alpha).fit(X, Y)
    beta_lasso = lasso.coef_
    # Use predict() so the fitted intercept is included; `X @ coef_` alone
    # leaves the intercept in the residuals and inflates the RSS whenever
    # Y is not centred.
    residual = Y - lasso.predict(X)
    rss = np.sum(residual ** 2)
    l1_penalty = alpha * np.sum(np.abs(beta_lasso))
    sigma_squared = (rss / n) + l1_penalty
    return sigma_squared

#  Choosing lambda in Fan et al.'s Method 

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
import math

# 1. Theoretical Approach
def choose_lambda_theoretical(n, p, c=1.0):
    """Return the closed-form penalty level `c * log(p) / n`.

    Parameters:
    - n: Number of observations.
    - p: Number of features.
    - c: Multiplicative constant.
    """
    scaled_log_dimension = c * np.log(p)
    return scaled_log_dimension / n

# 2. Cross-Validation Approach
def choose_lambda_cv(X, Y, cv=5):
    """Return the regularization parameter selected by `LassoCV`.

    Parameters:
    - X: Feature matrix.
    - Y: Target vector.
    - cv: Cross-validation setting forwarded to `LassoCV`.
    """
    selector = LassoCV(cv=cv)
    selector.fit(X, Y)
    return selector.alpha_

# 3. Information Criteria Approach
def choose_lambda_information_criteria(X, Y, criterion='AIC'):
    """Select the LASSO regularization parameter by an information criterion.

    A LASSO model is fitted for every alpha on the grid built by `LassoCV`,
    each fit is scored with `n * log(rss / n) + weight * k` (where `k` is the
    number of non-zero coefficients), and the alpha with the smallest score
    is returned.

    Parameters:
    - X: Feature matrix of shape (n, p).
    - Y: Target vector.
    - criterion: Which complexity weight to use:
      'AIC' -> 2, 'BIC' -> log(n), 'GIC' -> log(log(n)) * log(p).

    Returns:
    - The alpha on the grid that minimises the chosen criterion.

    Raises:
    - ValueError: If `criterion` is not 'AIC', 'BIC', or 'GIC'.
    """
    if criterion not in ('AIC', 'BIC', 'GIC'):
        raise ValueError("Invalid criterion. Choose 'AIC', 'BIC', or 'GIC'.")

    n, p = X.shape
    if criterion == 'AIC':
        complexity_weight = 2.0
    elif criterion == 'BIC':
        complexity_weight = np.log(n)
    else:
        complexity_weight = np.log(np.log(n)) * np.log(p)

    # Reuse the data-driven alpha grid that LassoCV constructs.
    alphas = LassoCV(cv=5).fit(X, Y).alphas_

    # warm_start lets each fit begin from the previous solution on the grid.
    path_model = Lasso(warm_start=True)
    best_alpha = alphas[0]
    best_score = np.inf
    for alpha in alphas:
        path_model.set_params(alpha=alpha).fit(X, Y)
        # predict() includes the fitted intercept in the residuals.
        residual = Y - path_model.predict(X)
        # Floor the RSS so a perfect fit does not produce log(0).
        rss = max(float(np.sum(residual ** 2)), np.finfo(float).tiny)
        k = np.count_nonzero(path_model.coef_)
        score = n * np.log(rss / n) + complexity_weight * k
        if score < best_score:
            best_alpha, best_score = alpha, score
    return best_alpha

# 4. Bayesian Optimization Approach
def choose_lambda_bayesian(X, Y):
    """Return the `alpha` chosen by `BayesSearchCV` for a `Lasso` estimator.

    The search draws `alpha` log-uniformly from [1e-4, 1.0], runs 50
    iterations with 5-fold cross-validation, and scores candidates by
    negative mean squared error.

    Parameters:
    - X: Feature matrix.
    - Y: Target vector.
    """
    alpha_space = {'alpha': (1e-4, 1.0, 'log-uniform')}
    optimizer = BayesSearchCV(
        estimator=Lasso(),
        search_spaces=alpha_space,
        n_iter=50,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    optimizer.fit(X, Y)
    best_settings = optimizer.best_params_
    return best_settings['alpha']



# Fit and Evaluate LASSO Model

In [None]:
def fit_and_evaluate(X, Y, alpha, method_name):
    """
    Train a LASSO model on 80% of the data and score it on the held-out 20%.

    Parameters:
    - X: Feature matrix
    - Y: Target vector
    - alpha: Regularization parameter passed to `Lasso`
    - method_name: Label stored under 'Method' in the result

    Returns:
    - Dictionary with keys 'Method', 'Alpha', 'MSE' (held-out mean squared
      error) and 'Coefficients' (fitted `coef_`).
    """
    # Fixed seed keeps the train/test partition the same for every method.
    partition = train_test_split(X, Y, test_size=0.2, random_state=42)
    train_features, holdout_features, train_target, holdout_target = partition

    estimator = Lasso(alpha=alpha)
    estimator.fit(train_features, train_target)

    holdout_mse = mean_squared_error(holdout_target, estimator.predict(holdout_features))

    summary = {'Method': method_name, 'Alpha': alpha}
    summary['MSE'] = holdout_mse
    summary['Coefficients'] = estimator.coef_
    return summary


# Compare Methods and Evaluate Performance

In [None]:
def compare_lambda_methods_and_evaluate(X, Y):
    """
    Run every lambda-selection routine, fit a LASSO model for each, and report.

    Parameters:
    - X: Feature matrix
    - Y: Target vector

    Returns:
    - List of the dictionaries produced by `fit_and_evaluate`, in the order
      Theoretical, Cross-Validation, AIC, BIC, GIC, Bayesian. A one-line
      summary per method is also printed.
    """
    n_samples, n_features = X.shape

    # All penalties are selected before any model is evaluated.
    labelled_alphas = [
        ("Theoretical", choose_lambda_theoretical(n_samples, n_features)),
        ("Cross-Validation", choose_lambda_cv(X, Y)),
        ("AIC", choose_lambda_information_criteria(X, Y, criterion='AIC')),
        ("BIC", choose_lambda_information_criteria(X, Y, criterion='BIC')),
        ("GIC", choose_lambda_information_criteria(X, Y, criterion='GIC')),
        ("Bayesian", choose_lambda_bayesian(X, Y)),
    ]

    results = [
        fit_and_evaluate(X, Y, chosen_alpha, label)
        for label, chosen_alpha in labelled_alphas
    ]

    print("Comparison of Lambda Selection Methods and Their Performance:")
    for result in results:
        print(f"Method: {result['Method']}, Alpha: {result['Alpha']:.4f}, MSE: {result['MSE']:.4f}")
    return results