In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize

# Gradient Descent Algorithm

We can think of a function of two or more variables. We have an objective function e.g. $$\large L(\beta_1, \beta_2)$$

We hope to share some ideas when $$L(\vec{\beta})$$ is convex in the vector $\vec{\beta}$

In particular we are interested in the situation when $L$ is the regularized sum of squarred errors, such as:

$$L: = \frac{1}{n}\displaystyle\sum_{i=1}^{n} (y_i-\sum_{j=1}^{p} x_{ij} \cdot \beta_j)^2 +P_{\alpha}(\beta)$$

For example consider the Elastic Net:

$$P_{\alpha}(\beta) = \alpha \cdot (\lambda\cdot \sum |\beta_j| + (1-\lambda ) \cdot \sum (\beta_j^2))$$

We can compute the partial derivatives with repect to $\beta_j$:

$$\large \frac{\partial L}{\partial \beta_j}=-2\cdot\displaystyle\sum_{i=1}^{n} (y_i-\sum_{j=1}^{p} x_{ij} \cdot \beta_j)\cdot (x_{ij}) + \alpha\cdot\lambda\cdot\text{sign}(\beta_j) + \alpha\cdot (1-\lambda)\cdot 2\cdot\beta_j$$

In sklearn you provide the L1_ratio that is $$\frac{\lambda}{1-\lambda}$$
We want to know how to update $\vec{\beta}$ so that we make the best progress in minimizing $L.$ We can think updating $\vec{\beta}$ by going $t$ units in a direction $\vec{v}$. So we ask, what is $\vec{v}$ that makes the best progress?

We consider $g(t):= L(\vec{\beta} + t \cdot \vec{v})$ SO the derivative of $g$ is

$$\large g'(t)= \nabla L \cdot \vec{v}$$ so for the most dramatic decrease of the objective we need $\vec{v}=-\nabla L$

<font color='red' size=5pt>This works well if $L$ is convex in $\beta$ and the only issue would when there is a very shallow basin of the minimum for $L.$</font>

The main goal is to approximate a ground truth coefficient vector in probability.

$$\large y = X\cdot \beta^* +\text{noise}$$

We can see the noise as $$\large \sigma\cdot \epsilon$$

So we see that:

$$\large y - X\beta^*=\sigma \cdot \epsilon$$ 

Now think about taking a partial derivative with respect to $\beta_j$ for 

$$\|y-X\beta\|_2$$

and we get 

$$- \frac{(y-X\beta)h_j(X)}{\|y-X\beta\|_2}$$

This suggests that we should rather minimize $\|y-X\beta\|_2$ + a regularization term


## Good reads about soft-thresholding:

https://www.kaggle.com/residentmario/soft-thresholding-with-lasso-regression

https://eeweb.engineering.nyu.edu/iselesni/lecture_notes/SoftThresholding.pdf



In [4]:
data = pd.read_csv('Data/Concrete_Data.csv')
y = data[data.columns[-1]]
x = data[data.columns[0:7]]
 
scaler = StandardScaler()
xscaled = scaler.fit_transform(x)

In [5]:
def gradient_mse(beta, x, y, alpha,l): # we defined a function that computes the gradient of the objective function
    n = len(y) # the number of observations
    y_hat = x.dot(beta).flatten()
    error = (y - y_hat)
    mse = (1.0 /n) * np.sum(np.square(error)) + alpha*(l*np.sum(np.abs(beta))+(1-l)*np.sum(beta**2)) # here we have the ridge penalty
    gradient = -(2.0 /n) * error.dot(x) + 2*(1-l)*alpha*beta+alpha*l*np.sign(beta) # the penalty is baked into the gradient as well
    return gradient, mse

In [6]:
# data & hyper-parameters
alpha = 1
l = 1
x = xscaled

In [7]:
def objective(beta):
  n = len(y) # the number of observations
  y_hat = x.dot(beta).flatten()
  error = (y - y_hat)
  mse = (1.0 /n) * np.sum(np.square(error)) + alpha*(l*np.sum(np.abs(beta))+(1-l)*np.sum(beta**2))
  return mse

In [8]:
def gradient(beta):
  n = len(y) # the number of observations
  y_hat = x.dot(beta).flatten()
  error = (y - y_hat)
  gradient = -(2.0 /n) * error.dot(x) + 2*(1-l)*alpha*beta+alpha*l*np.sign(beta)
  return gradient

In [9]:
b0 = np.zeros(x.shape[1])

In [10]:
output = minimize(objective, b0, method='L-BFGS-B', jac=gradient,options={'gtol': 1e-8, 'maxiter': 50000,'maxls': 25,'disp': True})

In [11]:
coef = output.x

In [12]:
mse(y,xscaled.dot(coef))

1439.3405549300053

In [13]:
coef

array([ 8.74876380e+00,  4.35249941e+00,  9.94314631e-01, -2.31920333e+00,
        2.91437468e+00, -7.72372844e-06, -8.91124760e-01])

In [14]:
xscaled.dot(coef)

array([18.50979363, 18.50979214,  0.84271722, ..., -8.29196791,
       -2.40782287, -0.74459015])

## SCAD

In [15]:
def scad_penalty(beta_hat, lambda_val, a_val):
    is_linear = (np.abs(beta_hat) <= lambda_val)
    is_quadratic = np.logical_and(lambda_val < np.abs(beta_hat), np.abs(beta_hat) <= a_val * lambda_val)
    is_constant = (a_val * lambda_val) < np.abs(beta_hat)
    
    linear_part = lambda_val * np.abs(beta_hat) * is_linear
    quadratic_part = (2 * a_val * lambda_val * np.abs(beta_hat) - beta_hat**2 - lambda_val**2) / (2 * (a_val - 1)) * is_quadratic
    constant_part = (lambda_val**2 * (a_val + 1)) / 2 * is_constant
    return linear_part + quadratic_part + constant_part
    
def scad_derivative(beta_hat, lambda_val, a_val):
    return lambda_val * ((beta_hat <= lambda_val) + (a_val * lambda_val - beta_hat)*((a_val * lambda_val - beta_hat) > 0) / ((a_val - 1) * lambda_val) * (beta_hat > lambda_val))

In [20]:
def scad(beta):
  beta = beta.flatten()
  beta = beta.reshape(-1,1)
  n = len(y)
  return 1/n*np.sum((y-x.dot(beta))**2) + np.sum(scad_penalty(beta,lam,a))
  
def dscad(beta):
  beta = beta.flatten()
  beta = beta.reshape(-1,1)
  n = len(y)
  return np.array(-2/n*np.transpose(x).dot(y-x.dot(beta))+scad_derivative(beta,lam,a)).flatten()

In [21]:
p = x.shape[1]
b0 = np.random.normal(1,1,p)

In [24]:
lam = 1
a = 2
output = minimize(scad, b0, method='L-BFGS-B', jac=dscad,options={'gtol': 1e-8, 'maxiter': 50000,'maxls': 25,'disp': True})

ValueError: Data must be 1-dimensional