# Part 1

In [2]:
import numpy as np

from sklearn.linear_model import Lasso, LassoCV

In [3]:
def simulate_lasso_data(n, p, rng, *, sparsity=0.95, SNR=2.0, beta_scale=5.0):
    """Simulate data for Project 3, Part 1.

    Parameters
    ----------
    n : int
        Number of samples
    p : int
        Number of features
    rng : numpy.random.Generator
        Random number generator (e.g. from `numpy.random.default_rng`)
    sparsity : float in (0, 1)
        Percentage of zero elements in simulated regression coefficients
    SNR : positive float
        Signal-to-noise ratio (see explanation above)
    beta_scale : float
        Scaling for the coefficient to make sure they are large

    Returns
    -------
    X : `n x p` numpy.array
        Matrix of features
    y : `n` numpy.array
        Vector of responses
    beta : `p` numpy.array
        Vector of regression coefficients
    """
    X = rng.standard_normal(size=(n, p))
    
    q = int(np.ceil((1.0 - sparsity) * p))
    beta = np.zeros((p,), dtype=float)
    beta[:q] = beta_scale * rng.standard_normal(size=(q,))
    
    sigma = np.sqrt(np.sum(np.square(X @ beta)) / (n - 1)) / SNR

    y = X @ beta + sigma * rng.standard_normal(size=(n,))

    # Shuffle columns so that non-zero features appear
    # not simply in the first (1 - sparsity) * p columns
    idx_col = rng.permutation(p)
    
    return X[:, idx_col], y, beta[idx_col]

In [8]:
n_simulations = 10
n_features = 1000
n_observations = [250, 500, 750]
sparsity_values = [0.5, 0.7, 0.8, 0.9, 0.95]
rng = np.random.default_rng()

results = {}
for N in n_observations:
    results[f"N={N}"] = {}
    for sparsity in sparsity_values:
        min_alphas = []
        one_std_alphas = []
        for ix in n_simulations:
            X, y, beta = simulate_lasso_data(N, n_features, rng, sparsity=sparsity)
            lcv = LassoCV(cv=10, random_state=0).fit(X,y)
            min_alphas.append(lcv.aplha_)
            
            # Searching for the mse_path_ row corresponding to alpha_
            # and calculate the std over all folds for alpha_.
            std = 0
            for idx, alpha in enumerate(lcv.alphas_):
                if alpha == alpha_:
                    std = np.std(lcv.mse_path_[idx,:])
                    break
                    
            # Find the largest alpha in alphas_ within one std 
            # (std of MSE for alpha_ across all folds) of alpha_.
            one_std_alpha = lcv.alpha_
            ubnd = lcv.alpha_ + std
            for alpha in lcv.alphas_:
                if lcv.alpha_ < one_std_alpha <= ubnd:
                    one_std_alpha = alpha              
            one_std_alphas.appned(one_std_alpha)
            
            
        results[f"N={N}"][f"SPRSTY={sparsity}"] = {
            "min_alphas": min_alphas
            "1se_alphas": one_std_alphas
        }
            
            

X, y, beta = simulate_lasso_data(100, 200, rng)

2.919248435537968
1.4450257237555304
2.2454234728328246
0.18296791533914877
0.829862625710573
2.8142138464734754
2.438358157042785
2.1835183463992522
1.9946977026783155
0.9745340518433181
1.1779367445108737
1.2630175725874135
1.1810356181238635
1.0991742046017778


KeyboardInterrupt: 