In [None]:
# default_exp updates

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# updates - Strategies for updating parameters



This module collects the update rules that can be used by the `GradientDescent` optimizer.

***

`GradientDescent` uses two kinds of update rules

1. Update steps (required): Determines how to update the model parameters based on the gradient estimate
2. Acceptance criteria: Decides whether a parameter update should be accepted or rejected. `GradientDescent` always rejects parameter updates that yield an invalid cost (i.e. ```nan``` or ```inf```), but additional rules can be provided.
 

Many of the update steps are implemented based on [this review](https://arxiv.org/abs/1609.04747)


In [None]:
#hide
from nbdev.showdoc import *



In [None]:
#export
from abc import ABC, abstractmethod, abstractproperty

import numpy
import scipy
import scipy.stats

In [None]:
#export

class UpdateBase(ABC):
    """A helper class for constructing update rules.

    Subclasses implement ``step_update`` to turn a gradient estimate into a
    proposed parameter step. ``evaluate`` wraps that call and, when
    ``max_step`` is set, rescales the proposal so that its largest absolute
    component does not exceed ``max_step``.
    """

    # Upper bound on the largest absolute component of a step.
    # ``None`` disables the cap.
    max_step = None

    def evaluate(self, ghat, nu, t=0.):
        """Compute the proposed step and cap its size if ``max_step`` is set.

        Parameters
        ----------
        ghat : gradient estimate, forwarded unchanged to ``step_update``.
        nu : learning rate, forwarded unchanged to ``step_update``.
        t : current iteration, forwarded unchanged to ``step_update``.

        Returns
        -------
        The value returned by ``step_update``; if its largest absolute
        component exceeds ``max_step`` it is uniformly rescaled so that the
        largest absolute component equals ``max_step``.

        Raises
        ------
        ValueError
            If ``max_step`` is set but is not a number greater than zero.
        """
        step = self.step_update(ghat, nu, t)

        if self.max_step is None:
            return step

        # I would prefer these be called during creation
        try:
            limit = float(self.max_step)
        except (TypeError, ValueError):
            raise ValueError("max_step must be a number greater than zero")
        # ``not limit > 0`` (rather than ``limit <= 0``) also rejects nan.
        if not limit > 0:
            raise ValueError("max_step must be a number greater than zero")
        # Keep the normalised float on the instance.
        self.max_step = limit

        # If the proposed step is too large, rescale it. The multiplication is
        # deliberately not in-place: an in-place ``*=`` fails on integer
        # arrays and would mutate an array owned by ``step_update``.
        max_proposal = numpy.max(numpy.abs(step))
        if max_proposal > limit:
            step = step * (limit / max_proposal)

        return step

    @abstractmethod
    def step_update(self, ghat, nu, t=0.):
        """Return the raw (uncapped) step. This is the workhorse of the class.

        Concrete rules must accept the gradient estimate ``ghat``, the
        learning rate ``nu`` and the iteration ``t``, because that is how
        ``evaluate`` calls this method.
        """
    



In [None]:
#hide

## I should test that max step works properly

***
`GradientDescent` expects update steps to be instances of a class with an ```evaluate``` method that takes as input a gradient estimate, a learning rate, and the current iteration (Comment: this seems like a suboptimal requirement), and returns a proposed parameter update. The reason update rules are required to be structured as classes is so that they can store any information they need to update the parameters. All update rules should return a numpy array of floats to be used as a step to update the model parameters. 

### The standard SPSA step

The default step used by Spall's classic SPSA algorithm is 

$$\theta_{t+1}=\theta_t - a_t \hat{g}$$

where $\hat{g}$ is the gradient estimate and $a_t$ is the learning rate at iteration $t$.

In [None]:
#export
class StandardSPSA(UpdateBase):
    """A standard gradient descent update: the step is ``nu * ghat``.

    Parameters
    ----------
    max_step : optional cap on the largest absolute component of a step,
        applied by ``UpdateBase.evaluate``. Defaults to ``None`` (no cap),
        matching the other update rules in this module.
    """

    def __init__(self, max_step=None):
        self.max_step = max_step

    def step_update(self, ghat, nu, t=0.):
        """Return the learning rate ``nu`` times the gradient estimate ``ghat``.

        ``t`` is accepted for interface compatibility and is not used.
        """
        return nu*ghat

### The ADAGRAD step update

ADAGRAD

The update rule is
$$G_0=0$$
$$G_t=G_{t-1}+(\hat{g}_t)^2$$
$$\theta_{t+1}=\theta_t - a_t \frac{\hat{g}_t}{\sqrt{G_t+\epsilon}} $$

This step update requires a small term ```eps``` to be defined to prevent divide-by-zero errors. By default this is set to ```1e-8```.



In [None]:
#export
class ADAGRAD(UpdateBase):
    """Adagrad update rule.

    Keeps a running sum ``G_t`` of squared gradient estimates and divides each
    step by ``sqrt(G_t + eps)``.
    """

    def __init__(self, eps=1e-8, max_step=None):
        self.eps = eps
        self.max_step = max_step
        # Allocated on the first call, once the gradient's shape is known.
        self.G_t = None

    def step_update(self, ghat, nu, t=0.):
        """Accumulate ``ghat**2`` into ``G_t`` and return the scaled step.

        ``t`` is accepted for interface compatibility and is not used.
        """
        first_call = self.G_t is None
        if first_call:
            self.G_t = numpy.zeros(ghat.shape)

        # In-place accumulation keeps ``self.G_t`` the same array object.
        accumulated = self.G_t
        accumulated += ghat * ghat

        scale = (accumulated + self.eps) ** .5
        return nu * ghat / scale

### The ADAM step update

Adaptive Moment Estimation (Adam) 

In [None]:
#export
class ADAM(UpdateBase):
    """The Adam (Adaptive Moment Estimation) gradient descent update.

    Maintains exponentially decaying averages of the gradient estimate
    (``m_t``) and of its square (``v_t``), applies the bias corrections
    ``1 - beta**t`` and returns ``nu * m_hat / (sqrt(v_hat) + eps)``.

    Parameters
    ----------
    beta1 : decay rate of the first-moment average.
    beta2 : decay rate of the second-moment average.
    eps : small term added to the denominator of the step.
    max_step : optional cap applied by ``UpdateBase.evaluate``.
    """

    def __init__(self, beta1=.9, beta2=.999, eps=1e-8, max_step=None):
        self.max_step = max_step
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # Full moment histories are kept; only the last entry is read here.
        self.m_t = [0.]
        self.v_t = [0.]

    def step_update(self, ghat, nu, t):
        """Update the moment estimates with ``ghat`` and return the step.

        ``t`` is the iteration used as the exponent in the bias correction.
        ``UpdateBase.evaluate`` defaults to ``t=0.``, for which ``1 - beta**t``
        is zero, so when ``t <= 0`` the number of moment updates performed so
        far is used instead. Positive ``t`` is used exactly as given.
        """
        # Update the gradient histories
        self.m_t.append(self.beta1*self.m_t[-1] + (1-self.beta1)*ghat)
        self.v_t.append(self.beta2*self.v_t[-1] + (1-self.beta2)*ghat**2)

        # Guard against a zero or negative bias-correction denominator.
        if t <= 0:
            t = len(self.m_t) - 1

        # compute the bias corrections
        m_hat = self.m_t[-1]/(1. - self.beta1**t)
        v_hat = self.v_t[-1]/(1. - self.beta2**t)

        # compute the proposed step
        return nu*m_hat/(v_hat**.5 + self.eps)

### The Nesterov-accelerated ADAM step update

In [None]:
#export
class NADAM(UpdateBase):
    """The Nesterov-accelerated Adam (Nadam) gradient descent update.

    Maintains the same moment averages as Adam (``m_t`` and ``v_t``) and
    combines a bias-corrected momentum term with a bias-corrected current
    gradient term.

    Parameters
    ----------
    beta1 : decay rate of the first-moment average.
    beta2 : decay rate of the second-moment average.
    eps : small term added to the denominator of the step.
    max_step : optional cap applied by ``UpdateBase.evaluate``.
    """

    def __init__(self, beta1=.9, beta2=.999, eps=1e-8, max_step=None):
        self.max_step = max_step
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # Full moment histories are kept; only the last entry is read here.
        self.m_t = [0.]
        self.v_t = [0.]

    def step_update(self, ghat, nu, t):
        """Update the moment estimates with ``ghat`` and return the step.

        ``t`` is the iteration used as the exponent in the bias corrections.
        ``UpdateBase.evaluate`` defaults to ``t=0.``, for which ``1 - beta**t``
        is zero, so when ``t <= 0`` the number of moment updates performed so
        far is used instead. Positive ``t`` is used exactly as given.
        """
        # Update the gradient histories
        self.m_t.append(self.beta1*self.m_t[-1] + (1-self.beta1)*ghat)
        self.v_t.append(self.beta2*self.v_t[-1] + (1-self.beta2)*ghat**2)

        # Guard against a zero or negative bias-correction denominator.
        if t <= 0:
            t = len(self.m_t) - 1

        # NOTE(review): this divides the *current* first moment by the (t-1)
        # correction. The formulation in the review cited by this notebook
        # uses m_t / (1 - beta1**t). Left unchanged; confirm which is intended.
        if t >= 2:
            m_hat = self.m_t[-1]/(1 - self.beta1**(t-1))
        else:
            m_hat = 0.
        v_hat = self.v_t[-1]/(1 - self.beta2**t)

        scaled_rate = nu/(v_hat**.5 + self.eps)
        momentum_term = self.beta1*m_hat
        gradient_term = (1 - self.beta1)*ghat/(1 - self.beta1**t)
        return scaled_rate*(momentum_term + gradient_term)

## Acceptance criteria (not really developed or used at the moment)

In [None]:
#export 

class AcceptanceBase(ABC):
    """Abstract base class for acceptance criteria.

    Concrete criteria must provide both ``initialize`` and ``evaluate``.
    """

    @abstractmethod
    def initialize(self):
        """Set up the criterion; called when ``GradientDescent`` is instantiated."""

    @abstractmethod
    def evaluate(self):
        """Decide on a proposed update. This is the workhorse of the class."""

***
An acceptance criterion evaluates the cost after an update step and compares it to past evaluations of the cost function to decide whether it should be accepted or rejected. `GradientDescent` expects acceptance criteria to be objects of a class with two required methods:

1. ```initialize```: This is called when a `GradientDescent` object is initialized and performs any routines needed to set up the acceptance criterion.  For example, an acceptance criterion based on the amount of noise in the cost function might evaluate the cost function some number of times and compute its standard deviation. This method will be passed the entire `GradientDescent` object, so it has access to all information stored therein.
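2. ```evaluate```: This is called after each proposed update with the resulting cost and the current iteration. It returns `True` if the update should be accepted and `False` if it should be rejected.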

In [None]:
#export
class BlockWithLocalResiduals(AcceptanceBase):
    """Block updates whose cost rises too far above the last accepted cost.

    This acceptance rule computes a streaming linear regression of the
    accepted costs against iteration to estimate how the cost function has
    changed over a sliding window. A new cost is rejected when it exceeds the
    most recently accepted cost by more than ``delta`` standard deviations:

    * up to and including iteration ``window_size`` the standard deviation is
      the spread of repeated cost evaluations measured in ``initialize``;
    * afterwards it is the spread of the residuals about the regression line.

    Parameters
    ----------
    delta : number of standard deviations a cost may rise before rejection.
    window_size : number of repeated cost evaluations in ``initialize`` and
        the iteration after which the regression residuals are used.
    """

    def __init__(self, delta=2., window_size=100):
        self.k = window_size
        self.history = []   # accepted costs inside the window
        self.times = []     # iterations at which those costs were accepted
        self.delta = delta

    def initialize(self, opt):
        """Estimate the cost noise and seed the history.

        Reads ``opt.cost.evaluate``, ``opt.theta`` and ``opt.cost_history``.
        """
        # Evaluate the cost function repeatedly at the current parameters.
        self.SD = numpy.std([opt.cost.evaluate(opt.theta) for _ in range(self.k)])
        self.history.append(opt.cost_history[-1])
        self.times.append(0.)

    def update_regression(self):
        """Refit the regression of accepted cost against iteration."""
        self.linreg = scipy.stats.linregress(self.times, self.history)
        self.slope = self.linreg.slope
        self.intercept = self.linreg.intercept

    def evaluate(self, cost, t):
        """Return ``True`` to accept ``cost`` at iteration ``t``, else ``False``.

        Accepted costs are appended to the history, and the oldest entry is
        dropped once the window is full.
        """
        # A line through fewer than three points has no meaningful residuals
        # (and a single point cannot be fitted), so use the initial noise
        # estimate until enough accepted points exist. ``t == self.k`` is
        # handled here too; previously it matched neither branch.
        if t <= self.k or len(self.history) < 3:
            spread = self.SD
        else:
            self.update_regression()
            expected = self.slope*numpy.array(self.times) + self.intercept
            # Residuals are measured about the fitted line.
            spread = numpy.std(numpy.array(self.history) - expected)

        # ``not (a > b)`` keeps the original outcome for nan costs (accepted).
        accept = not (cost > self.history[-1] + self.delta*spread)

        if accept:
            self.history.append(cost)
            self.times.append(t)
            # Only drop the oldest entry when the window is actually full, so
            # it can refill after early rejections.
            if t > self.k and len(self.history) > self.k:
                self.history.pop(0)
                self.times.pop(0)
        return accept
            