In [8]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from sklearn.linear_model import LassoLars, LinearRegression
from sklearn.metrics import mean_squared_error

from RegressionModel import RegressionModel
from utils import *

In [9]:
# Read the design matrix and the response from CSV (the first column is the index),
# standardize the features with the project helper and flatten y to 1-D.
X = standardize(
    pd.read_csv("data/efron2004_x.csv", sep=",", index_col=0).values
)
y = pd.read_csv("data/efron2004_y.csv", sep=",", index_col=0).values.ravel()

In [10]:
# L1 penalty strength; passed to RegressionModel and reused by the objective
# and LassoLars cells below.
lam = 1

In [11]:
# Project-local Lasso solver configured with the penalty `lam`, a tolerance of
# 1e-4 and at most 1000 iterations.
model = RegressionModel(lam=lam, tol=1e-4, max_iter=1000)

In [12]:
# Fit with shuffling enabled; judging by the names and the displayed result,
# fit() returns a (converged flag, iteration count) pair.
conv, it = model.fit(X, y, shuffle=True)

# In-sample mean squared error of the fitted model.
y_pred = model.predict(X)
mse = mean_squared_error(y_true=y, y_pred=y_pred)
(mse, conv, it)

[8 2 1 4 0 7 3 5 6 9]
[3 8 0 7 5 6 4 9 2 1]
[2 5 8 3 6 0 4 9 1 7]
[6 7 0 2 5 1 4 8 3 9]
[1 8 7 3 9 6 4 5 2 0]
[3 9 8 2 5 7 4 1 6 0]
[1 0 3 2 5 6 9 7 4 8]
[0 5 6 3 7 1 4 9 2 8]
[0 7 8 4 2 3 5 6 9 1]
[1 9 2 6 8 5 4 0 7 3]
[5 8 7 2 0 1 3 6 9 4]
[1 0 2 6 3 8 5 4 9 7]
[1 9 5 2 8 4 0 6 7 3]
[3 2 9 8 0 6 5 4 7 1]
[7 9 5 1 4 2 8 3 6 0]
[6 5 0 2 8 1 7 4 3 9]
[1 7 8 3 9 2 5 0 6 4]


(0.5004903026709713, True, 16)

In [13]:
# Indicator of the selected coefficients: 0 where |w| < 1e-8, 1 otherwise.
# np.abs() is used because model.weights is a NumPy array, and ndarrays have
# no .abs() method (the method form raises AttributeError).
np.where(np.abs(model.weights) < 1e-8, 0, 1)

array([ 0.        , -1.25464347,  6.6456634 ,  3.18044378,  0.        ,
        0.        , -2.40408075,  0.        ,  5.86553119,  0.09183343])

In [16]:
# Number of coefficients that are not numerically zero (threshold 1e-8).
np.sum(np.where(np.abs(model.weights) < 1e-8, 0, 1))

6

In [64]:
# Penalized objective of the current fit, shown as (total, data-fit term, L1 term).
half_mean_sq_err = 1 / (2 * len(X)) * np.sum((y - y_pred) ** 2)
l1_penalty = lam * np.sum(np.abs(model.weights))
half_mean_sq_err + l1_penalty, half_mean_sq_err, l1_penalty

(19.692474685640313, 0.2502450753058891, 19.442229610334426)

In [65]:
# Display the coefficient vector of the fitted RegressionModel.
model.weights

array([ 0.        , -1.25466995,  6.64565839,  3.18044834,  0.        ,
        0.        , -2.40411938,  0.        ,  5.86551573,  0.09181781])

In [66]:
# Reference fit with scikit-learn's LassoLars: the penalty is lam divided by the
# number of samples and no intercept is fitted. Note that `model` is rebound here.
model = LassoLars(alpha=lam / len(X), fit_intercept=False).fit(X, y)

# In-sample mean squared error of the reference fit.
y_pred = model.predict(X)
mse = mean_squared_error(y_true=y, y_pred=y_pred)
mse

0.5004904374245501

In [67]:
# Same objective breakdown as for the coordinate-descent fit, now evaluated on
# the scikit-learn coefficients: (total, data-fit term, L1 term).
# NOTE(review): the estimator was fitted with alpha = lam / len(X), while the
# penalty term here is weighted by lam — confirm the intended scaling.
half_mean_sq_err = 1 / (2 * len(X)) * np.sum((y - y_pred) ** 2)
l1_penalty = lam * np.sum(np.abs(model.coef_))
half_mean_sq_err + l1_penalty, half_mean_sq_err, l1_penalty

(19.69241144267663, 0.2502452187122751, 19.442166223964353)

In [68]:
# Display the coefficients of the most recently fitted scikit-learn estimator
# for comparison with the coordinate-descent weights.
model.coef_

array([ 0.        , -1.25464197,  6.64568016,  3.1804226 ,  0.        ,
        0.        , -2.40409038,  0.        ,  5.86550413,  0.09182698])

In [42]:
# Unpenalized least-squares baseline without an intercept. `model` is rebound again.
model = LinearRegression(fit_intercept=False).fit(X, y)

# In-sample mean squared error of the baseline.
y_pred = model.predict(X)
mse = mean_squared_error(y_true=y, y_pred=y_pred)
mse

0.48115950993832063

In [43]:
# Display the coefficients of the most recently fitted scikit-learn estimator.
model.coef_

array([ -0.13001884,  -3.11430123,   6.75066232,   4.21254835,
       -10.28733834,   6.1910424 ,   1.31216923,   2.29936317,
         9.75614628,   0.87818624])

In [None]:
# Problem sizes for the experiment, written as (n, p) pairs and unpacked into the
# two parallel lists that the benchmark loop zips together.
ns, ps = map(list, zip(
    (100, 1000),
    (100, 5000),
    (100, 20000),
    (1000, 100),
    (5000, 100),
))

# Values of the rho argument passed to generate_data.
rhos = [0, 0.1, 0.2, 0.5, 0.9, 0.95]

# Number of repetitions per (n, p, rho) configuration.
L = 1

In [None]:
def eval(model, X, y):
    """Fit ``model`` on ``(X, y)`` and report its in-sample MSE and elapsed time.

    NOTE: the name shadows Python's built-in ``eval``; it is kept because other
    cells call this helper under that name.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X, y : training data; the predictions on ``X`` are scored against ``y``.

    Returns
    -------
    tuple
        ``(mse, elapsed_seconds)``. The elapsed time covers fitting,
        prediction and the MSE computation.
    """
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments (unlike time.time()).
    start = time.perf_counter()
    model.fit(X, y)
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    return mse, time.perf_counter() - start

In [None]:
# Benchmark the coordinate-descent Lasso against the scikit-learn baselines.
# For every (n, p) size pair and every rho, L datasets are generated; the MSE and
# run time of each method are averaged over the repetitions and one row per
# method is appended to `results`.
results = []
for n, p in tqdm(zip(ns, ps), total=len(ns)):
    for rho in rhos:
        mse_coord, mse_lars, mse_lr = [], [], []
        time_coord, time_lars, time_lr = [], [], []
        its, convs = [], []
        for _ in range(L):
            X, y = generate_data(n, p, rho)
            X = standardize(X)

            model = RegressionModel(lam=1, tol=1e-4, max_iter=100)
            start = time.perf_counter()
            # fit() returns (converged, iterations) in that order, as in the
            # single-dataset cell above; unpacking it the other way round
            # swaps the "it" and "conv" columns of the results.
            conv, it = model.fit(X, y)
            y_pred = model.predict(X)
            mse = mean_squared_error(y, y_pred)
            mse_coord.append(mse)
            time_coord.append(time.perf_counter() - start)
            its.append(it)
            convs.append(conv)

            # Same parametrization as the validated single-dataset comparison:
            # penalty scaled by the number of samples and no intercept, so the
            # baseline solves the same problem as RegressionModel with lam=1.
            model = LassoLars(alpha=1 / len(X), fit_intercept=False)
            mse, ex_time = eval(model, X, y)
            mse_lars.append(mse)
            time_lars.append(ex_time)

            # No intercept here either, matching the single-dataset baseline.
            model = LinearRegression(fit_intercept=False)
            mse, ex_time = eval(model, X, y)
            mse_lr.append(mse)
            time_lr.append(ex_time)

        results.append(["coord", n, p, rho, np.mean(mse_coord), np.mean(time_coord), np.mean(its), np.mean(convs)])
        # The scikit-learn baselines report no iteration/convergence data here;
        # pad with NaN so every row has the same length.
        results.append(["LARS", n, p, rho, np.mean(mse_lars), np.mean(time_lars), np.nan, np.nan])
        results.append(["LR", n, p, rho, np.mean(mse_lr), np.mean(time_lr), np.nan, np.nan])

In [None]:
# Collect the benchmark rows into a table: one row per method and configuration.
df = pd.DataFrame(
    data=results,
    columns=["method", "n", "p", "rho", "mse", "time", "it", "conv"],
)
df.head(n=5)

In [None]:
# Persist the benchmark table to results.csv without the row index.
df.to_csv("results.csv", index=False)

In [41]:
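# Open questions and notes:
# - Should the models fit an intercept (fit_intercept)?
# - Which lambda should be used? The paper gives no value.
# - Which tolerance should be used? The paper gives no value; more generally, which stopping rule?
# - LassoLars performs worse.
# - The results differ from the example in the paper.
# - Something about normalization.
# - Is the report OK?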