In [None]:
# default_exp updates

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# updates

The various update rules that can be used by the ```GradientDescent``` optimizer are collected here. ```GradientDescent``` expects update rules to be instances of a class with an ```evaluate``` method that takes as input a gradient estimate, a learning rate, and the current iteration (Comment: this seems like a suboptimal requirement), and returns a proposed parameter update. The reason update rules are required to be structured as classes is so that they can store any information they need to update the parameters. All update rules should return a numpy array of floats to be used as a step to update the model parameters. 

Many of the update steps are implemented based on [this review](https://arxiv.org/abs/1609.04747)



In [None]:
#hide
from nbdev.showdoc import *



In [None]:
#export
import numpy
import scipy
from abc import ABC, abstractmethod 

In [None]:
#export

class UpdateBase(ABC):
    """Abstract interface for the update rules used by ``GradientDescent``.

    Concrete rules are classes (rather than plain functions) so that they can
    keep whatever state they need between iterations.
    """

    @abstractmethod
    def evaluate(self, ghat, nu, t=0.):
        """Return the proposed parameter step. This is the workhorse of the class.

        Parameters
        ----------
        ghat :
            Gradient estimate for the current iteration.
        nu :
            Learning rate for the current iteration.
        t :
            Current iteration number; rules that do not depend on it may
            ignore it.

        Returns
        -------
        The step to be used to update the model parameters.
        """


(Note: I may want to do some restructuring and move the parameter stepsize, decay to the update rules)

### The standard SPSA step

The default step used by Spall's classic SPSA algorithm is 

$$\theta_{t+1}=\theta_t - a_t \hat{g}$$

where $\hat{g}$ is the gradient estimate and $a_t$ is the learning rate at iteration $t$.

In [None]:
#export
class StandardSPSA(UpdateBase):
    """Classic SPSA step: the gradient estimate scaled by the learning rate."""

    def __init__(self):
        # The rule is stateless, so there is nothing to set up.
        pass

    def evaluate(self, ghat, nu, t=0.):
        """Return ``nu * ghat``.

        Parameters
        ----------
        ghat :
            Gradient estimate.
        nu :
            Learning rate.
        t :
            Iteration number; accepted for interface compatibility and unused.
        """
        step = nu * ghat
        return step

### The ADAGRAD step update

ADAGRAD

The update rule is
$$G_0=0$$
$$G_t=G_{t-1}+(\hat{g}_t)^2$$
$$\theta_{t+1}=\theta_t - a_t \frac{\hat{g}_t}{\sqrt{G_t+\epsilon}} $$

This step update requires a small term ```eps``` to be defined to prevent division-by-zero errors. By default this is set to ```1e-8```.

Note that typically, ADAGRAD is employed with a fixed learning rate. By default, however, ```GradientDescent``` uses a decreasing series of learning rates.

In [None]:
#export
class ADAGRAD(UpdateBase):
    """ADAGRAD update rule.

    Keeps a running element-wise sum ``G_t`` of the squared gradient estimates
    and proposes the step ``nu * ghat / sqrt(G_t + eps)``.

    Parameters
    ----------
    eps : float, optional
        Small constant added to ``G_t`` inside the square root to avoid
        division by zero. Defaults to ``1e-8``.
    """

    def __init__(self, eps=1e-8):
        self.eps = eps
        # Accumulated squared gradients. Allocated on the first call to
        # ``evaluate``, once the shape of the gradient estimate is known.
        self.G_t = None

    def evaluate(self, ghat, nu, t=0.):
        """Accumulate ``ghat**2`` and return the proposed step.

        Parameters
        ----------
        ghat : array_like
            Gradient estimate. Scalars and sequences are accepted and
            converted to a numpy array.
        nu :
            Learning rate.
        t :
            Iteration number; accepted for interface compatibility and unused.

        Returns
        -------
        numpy.ndarray
            ``nu * ghat / sqrt(G_t + eps)``, with the same shape as ``ghat``
            (a numpy scalar when ``ghat`` is a scalar).
        """
        # Plain scalars/lists have no ``.shape``; an ndarray passes through as is.
        ghat = numpy.asarray(ghat)
        if self.G_t is None:
            self.G_t = numpy.zeros(ghat.shape)
        self.G_t += ghat**2
        return nu * ghat / (self.G_t + self.eps)**.5

### The ADAM step update

Adaptive Moment Estimation (Adam) 

In [None]:
#export
class ADAM(UpdateBase):
    """Adaptive Moment Estimation (Adam) update rule.

    Tracks exponentially decaying averages of the gradient estimate (``m_t``)
    and of its element-wise square (``v_t``), applies the ``1 - beta**t`` bias
    corrections, and proposes the step ``nu * m_hat / (sqrt(v_hat) + eps)``.

    Parameters
    ----------
    beta1 : float, optional
        Decay rate of the first-moment average. Defaults to ``0.9``.
    beta2 : float, optional
        Decay rate of the second-moment average. Defaults to ``0.999``.
    eps : float, optional
        Small constant added to ``sqrt(v_hat)`` to avoid division by zero.
        Defaults to ``1e-8``.
    """

    def __init__(self, beta1=.9, beta2=.999, eps=1e-8):
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # Latest moment estimates. They stay one-element lists so that code
        # reading ``m_t[-1]`` / ``v_t[-1]`` keeps working.
        self.m_t = [0.]
        self.v_t = [0.]

    def evaluate(self, ghat, nu, t):
        """Update the moment estimates and return the proposed step.

        Parameters
        ----------
        ghat :
            Gradient estimate.
        nu :
            Learning rate.
        t :
            Iteration number used in the bias corrections; must be positive.

        Returns
        -------
        ``nu * m_hat / (sqrt(v_hat) + eps)``.

        Raises
        ------
        ValueError
            If ``t <= 0``, for which ``1 - beta**t`` is zero or negative and
            the bias correction is undefined.
        """
        # Validate before touching state so a rejected call leaves it intact.
        if t <= 0:
            raise ValueError(
                "ADAM requires a positive iteration number t; got %r" % (t,))

        # Update the gradient histories. Only the latest value is ever read,
        # so it replaces the previous one instead of growing a list per call.
        m_new = self.beta1*self.m_t[-1] + (1-self.beta1)*ghat
        v_new = self.beta2*self.v_t[-1] + (1-self.beta2)*ghat**2
        self.m_t = [m_new]
        self.v_t = [v_new]

        # Compute the bias corrections
        m_hat = m_new/(1.-self.beta1**t)
        v_hat = v_new/(1.-self.beta2**t)

        # Compute the proposed step
        return nu*m_hat/(v_hat**.5+self.eps)

### The Nesterov-accelerated ADAM step update

In [None]:
#export
class NADAM(UpdateBase):
    """Nesterov-accelerated Adam (NADAM) update rule.

    Tracks the same decaying averages ``m_t`` and ``v_t`` as Adam and proposes
    the step::

        nu / (sqrt(v_hat) + eps) * (beta1 * m_hat + (1 - beta1) * ghat / (1 - beta1**t))

    Parameters
    ----------
    beta1 : float, optional
        Decay rate of the first-moment average. Defaults to ``0.9``.
    beta2 : float, optional
        Decay rate of the second-moment average. Defaults to ``0.999``.
    eps : float, optional
        Small constant added to ``sqrt(v_hat)`` to avoid division by zero.
        Defaults to ``1e-8``.
    """

    def __init__(self, beta1=.9, beta2=.999, eps=1e-8):
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # Latest moment estimates. They stay one-element lists so that code
        # reading ``m_t[-1]`` / ``v_t[-1]`` keeps working.
        self.m_t = [0.]
        self.v_t = [0.]

    def evaluate(self, ghat, nu, t):
        """Update the moment estimates and return the proposed step.

        Parameters
        ----------
        ghat :
            Gradient estimate.
        nu :
            Learning rate.
        t :
            Iteration number used in the bias corrections; must be positive.

        Returns
        -------
        The proposed step (see the class docstring for the formula).

        Raises
        ------
        ValueError
            If ``t <= 0``, for which ``1 - beta**t`` is zero or negative and
            the bias correction is undefined.
        """
        # Validate before touching state so a rejected call leaves it intact.
        if t <= 0:
            raise ValueError(
                "NADAM requires a positive iteration number t; got %r" % (t,))

        # Update the gradient histories. Only the latest value is ever read,
        # so it replaces the previous one instead of growing a list per call.
        m_new = self.beta1*self.m_t[-1] + (1-self.beta1)*ghat
        v_new = self.beta2*self.v_t[-1] + (1-self.beta2)*ghat**2
        self.m_t = [m_new]
        self.v_t = [v_new]

        # The momentum term is dropped until t reaches 2, because the
        # (t-1) exponent below would make the denominator zero at t == 1.
        # NOTE(review): the current m_t is corrected with exponent t-1, while
        # the review this notebook cites uses m_t / (1 - beta1**t). Kept as
        # is to preserve existing behaviour -- confirm which form is intended.
        if t >= 2:
            m_hat = m_new/(1-self.beta1**(t-1))
        else:
            m_hat = 0.
        v_hat = v_new/(1-self.beta2**t)

        scale = nu/(v_hat**.5+self.eps)
        momentum_term = self.beta1*m_hat
        gradient_term = (1-self.beta1)*ghat/(1-self.beta1**t)
        return scale*(momentum_term+gradient_term)