In [1]:
import numpy as np

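*compute_loss* computes the loss of the parameters w with respect to the data tx and the labels y. With mse = True it returns the MSE loss (with the conventional factor 1/2, i.e. $\frac{1}{2N} \sum_n e_n^2$ where $e = y - tx \cdot w$); with mse = False it returns the MAE loss $\frac{1}{N} \sum_n |e_n|$.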

In [2]:
def compute_loss(y, tx, w, mse=True):
    """Return the loss of the linear model ``tx @ w`` against the labels ``y``.

    Parameters
    ----------
    y : array of labels, indexed by sample along its first axis.
    tx : data matrix, one row per sample.
    w : parameter vector, one entry per column of ``tx``.
    mse : when True (default) return the half mean squared error
        ``1/(2N) * e.T @ e``; when False return the mean absolute error
        ``mean(|e|)``, where ``e = y - tx @ w``.

    Returns
    -------
    The selected loss value.
    """
    n_samples = y.shape[0]
    if not mse:
        return np.mean(np.abs(y - tx @ w))
    residual = y - tx @ w
    half_mean_scale = 1 / (2 * n_samples)
    return (half_mean_scale * residual.T) @ residual

*mse_gradient* computes the gradient of the MSE loss function with respect to the parameters w, given the labels y and the data tx.

In [2]:
def mse_gradient(y, tx, w):
    """Return the gradient of the half-MSE loss with respect to ``w``.

    With ``e = y - tx @ w`` and ``N = y.shape[0]`` the result is
    ``-(1/N) * tx.T @ e``.

    Parameters
    ----------
    y : array of labels, indexed by sample along its first axis.
    tx : data matrix, one row per sample.
    w : parameter vector, one entry per column of ``tx``.
    """
    residual = y - tx @ w
    neg_inv_n = -(1 / y.shape[0])
    return (neg_inv_n * tx.T) @ residual

*mae_gradient* computes a (sub)gradient of the MAE loss function with respect to the parameters w, given the labels y and the data tx.

In [None]:
def mae_gradient(y, tx, w):
    """Return a subgradient of the MAE loss with respect to ``w``.

    With ``e = y - tx @ w``, each sample contributes ``-sign(e_n) * tx[n]``
    and the contributions are averaged over the samples. Works for any
    number of feature columns in ``tx``.

    Parameters
    ----------
    y : array of labels, one per sample.
    tx : data matrix, one row per sample.
    w : parameter vector, one entry per column of ``tx``.

    Returns
    -------
    Array with one entry per column of ``tx``.
    """
    e = y - tx @ w
    # d|e_n|/dw = -sign(e_n) * tx[n]. |.| is not differentiable at 0; there
    # any factor in [-1, 1] is a valid subgradient and -1 is used.
    direction = np.where(e < 0, 1, -1)
    # Broadcast the per-sample sign over every column of tx instead of
    # stacking a fixed number of copies, so the column count is not hard-coded.
    return np.mean(direction.reshape(-1, 1) * tx, axis=0)

### least_squares_GD
*least_squares_GD* applies full gradient descent with respect to the MSE loss function. y are the labels, tx is the data, initial_w is the initial parameter vector, max_iters is the number of steps of the algorithm and gamma is the step size.
At each step, this performs $\underline{w}^{(t+1)} = \underline{w}^{(t)} - \gamma \nabla L(\underline{w}^{(t)})$.

In [4]:
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Run full-batch gradient descent on the MSE loss.

    Parameters
    ----------
    y : array of labels, one per sample.
    tx : data matrix, one row per sample.
    initial_w : starting parameter vector (not modified in place).
    max_iters : number of gradient steps to take.
    gamma : step size.

    Returns
    -------
    (loss, w) : the MSE loss of the final parameters as computed by
        ``compute_loss``, followed by the final parameters.
    """
    w = initial_w
    for _ in range(max_iters):
        # w <- w - gamma * grad L(w), with the gradient taken over all samples.
        w = w - gamma * mse_gradient(y, tx, w)
    loss = compute_loss(y, tx, w)
    return loss, w

### least_squares_SGD
*least_squares_SGD* applies mini-batch stochastic gradient descent with respect to the MSE loss function. y are the labels, tx is the data, initial_w is the initial parameter vector, max_iters is the number of steps of the algorithm, gamma is the step size and batch_size is the number of samples $|B|$ drawn at each step.
At each step, this performs $\underline{w}^{(t+1)} = \underline{w}^{(t)} - \gamma \, \underline{g}$ where $\underline{g} = \frac{1}{|B|} \sum\limits_{n \in B} \nabla L_n(\underline{w}^{(t)})$ and $B$ is a batch of samples drawn at random without replacement.

In [1]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma, batch_size = 1):
    """Run mini-batch stochastic gradient descent on the MSE loss.

    Parameters
    ----------
    y : array of labels, one per sample.
    tx : data matrix, one row per sample.
    initial_w : starting parameter vector (not modified in place).
    max_iters : number of gradient steps to take.
    gamma : step size.
    batch_size : number of distinct samples drawn for each step (default 1).

    Returns
    -------
    (loss, w) : the MSE loss of the final parameters over the full data set
        as computed by ``compute_loss``, followed by the final parameters.

    Notes
    -----
    Batches are drawn with NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible runs.
    """
    w = initial_w
    sample_indices = np.arange(y.shape[0])

    for _ in range(max_iters):
        # Shuffling the index list and keeping its head draws `batch_size`
        # distinct samples without copying the whole of y and tx each step.
        np.random.shuffle(sample_indices)
        batch = sample_indices[:batch_size]

        # Gradient of the MSE loss restricted to the batch:
        # (1/|B|) * sum over n in B of grad L_n(w).
        grad = mse_gradient(y[batch], tx[batch], w)
        w = w - gamma * grad

    loss = compute_loss(y, tx, w)
    return loss, w

### least_squares
Given tx of shape (N, D+1) and y of shape (N), finds the least-squares solution, namely $\arg\min_{B} \| Y - XB \|^2$,
which is $B = (X^T X)^{-1} (X^T Y)$.

In [None]:
def least_square_loss(tx, y, w):
    """Return the mean squared residual of the linear model ``tx @ w``.

    Computes ``(1/N) * e @ e`` with ``e = y - tx @ w.T`` and
    ``N = y.shape[0]``. Note there is no factor 1/2 here.

    Parameters
    ----------
    tx : data matrix, one row per sample.
    y : array of labels, one per sample.
    w : parameter vector, one entry per column of ``tx``.
    """
    residual = y - tx @ w.T
    inv_n = 1 / y.shape[0]
    return (inv_n * residual) @ residual.T

In [3]:
def least_squares(tx, y):
    """Solve the least-squares problem with the normal equations.

    Finds ``w`` such that ``(tx.T @ tx) @ w = tx.T @ y``.

    Parameters
    ----------
    tx : data matrix, one row per sample.
    y : array of labels, one per sample.

    Returns
    -------
    (w, loss) : the solution and its loss as computed by
        ``least_square_loss``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``tx.T @ tx`` is singular.
    """
    gram = tx.T @ tx
    moment = tx.T @ y
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, better numerical accuracy and less work.
    w = np.linalg.solve(gram, moment)
    loss = least_square_loss(tx, y, w)
    return w, loss

### ridge regression
Given tx of shape (N, D+1), lambda and y of shape (N), finds the regularized least-squares solution, namely $\arg\min_{B} \| Y - XB \|^2 + \lambda \|B\|^2$,
which is $B = (X^T X + \lambda I_{D+1})^{-1} (X^T Y)$.

In [4]:
def ridge_regression_loss(tx, y, w, lambda_):
    """Return the ridge objective: data-fit loss plus an L2 penalty on ``w``.

    The data term is ``least_square_loss(tx, y, w)`` and the penalty is
    ``lambda_ * w.T @ w``.

    Parameters
    ----------
    tx : data matrix, one row per sample.
    y : array of labels, one per sample.
    w : parameter vector, one entry per column of ``tx``.
    lambda_ : regularization strength.
    """
    data_term = least_square_loss(tx, y, w)
    penalty = (lambda_ * w.T) @ w
    return data_term + penalty

In [5]:
def ridge_regression(tx, y, lambda_):
    """Solve ridge regression with the regularized normal equations.

    Finds ``w`` such that ``(tx.T @ tx + lambda_ * I) @ w = tx.T @ y``,
    where ``I`` is the identity of size ``tx.shape[1]``.

    Parameters
    ----------
    tx : data matrix, one row per sample.
    y : array of labels, one per sample.
    lambda_ : regularization strength added to the diagonal.

    Returns
    -------
    (w, loss) : the solution and its loss as computed by
        ``ridge_regression_loss``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized matrix is singular.
    """
    regularized_gram = tx.T @ tx + np.eye(tx.shape[1]) * lambda_
    moment = tx.T @ y
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, better numerical accuracy and less work.
    w = np.linalg.solve(regularized_gram, moment)
    loss = ridge_regression_loss(tx, y, w, lambda_)
    return w, loss