In [1]:
import numpy as np
import pandas as pd

In [206]:
data = pd.read_csv("data/Coding2_Data.csv")

var_names = data.columns
y = data[['Y']].to_numpy()
X = data.drop(['Y'], axis = 1).to_numpy()
print(X.shape, y.shape)

(506, 13) (506, 1)


In [254]:
def one_var_lasso(v, z, penalty):
    n = v.shape[0]
    z_norm = z.T @ z
    a = (v.T @ z) / z_norm

    ita = (penalty * n) / z_norm

    if a > ita:
        return a - ita
    elif a < -ita:
        return a + ita
    else:
        return 0

In [244]:
def MyLasso(X, y, lam_seq, maxit = 100):
    # Input
    # X: n-by-p design matrix without the intercept 
    # y: n-by-1 response vector 
    # lam.seq: sequence of lambda values (arranged from large to small)
    # maxit: number of updates for each lambda 
    
    # Output
    # B: a (p+1)-by-len(lam.seq) coefficient matrix 
    #    with the first row being the intercept sequence 
    n, p = X.shape
    nlam = len(lam_seq)
    B = np.zeros((p+1, nlam))
    
    ##############################
    # YOUR CODE: 
    # (1) newX = Standardizad X; 
    # (2) Record the centers and scales used in (1) 
    ##############################
    newX = np.copy(X)
    means = np.zeros(M)
    sd = np.zeros(M)
    for j in range(M):
        means[j] = np.mean(newX[:, j])
        sd[j] = np.mean(newX[:, j])
        newX[:, j] = (newX[:, j] - means[j]) / sd[j]
    y_mean = np.mean(y)

    # Initilize coef vector b and residual vector r
    b = np.zeros(p)
    r = y
    
    # Triple nested loop
    for m in range(nlam):
        for step in range(maxit):
            for j in range(p):
                X_j = newX[:, j].reshape(-1,1)
                r = r + X_j * b[j]
                b[j] = one_var_lasso(r, X_j, lam_seq[m])
                r = r - X_j * b[j]
        B[1:, m] = b 
        
    ##############################
    # YOUR CODE:
    # Scale back the coefficients;
    # Update the intercepts stored in B[, 0]
    ##############################
    for m in range(nlam):
        B[0, m] = y_mean
        B[1:, m] *= sd
    print(means)
    
    return(B)

In [252]:
log_lam_seq = np.linspace(-1, -8, num = 80)
lam_seq = np.exp(log_lam_seq)
myout = MyLasso(X, y, lam_seq, maxit = 100)

[-0.78043626  1.13636364  2.16019166  0.06916996 -0.61002573  1.83186406
  5.06079038  1.1880324   1.86766099  5.93140464  2.15005374  3.56674032
  3.41767318]


In [216]:
lasso_coef = pd.read_csv("data/Coding2_lasso_coefs.csv").to_numpy()
print(lasso_coef[:, 0])

[3.03451287 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


In [253]:
# for j in range(len(lam_seq)):
print(myout[:, 0])
abs(myout - lasso_coef).max()

[ 3.03451287e+00 -1.71634468e-01 -2.96692564e-02  1.87225258e+00
  3.88827107e-04 -1.79007924e+00  1.31387068e+02  1.37501380e+01
  1.72822778e-02 -3.98969232e-01 -2.11738455e+02  7.60991903e+00
 -1.82546058e+00  1.22236130e+01]


211.7384549959159