In [None]:
# default_exp optimizers

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# optimizers - The GradientDescent implementation


The class used to perform gradient descent.
***


In [None]:
#hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#hide
from nbdev.showdoc import *
import pymc3 as pm
import hypothesis



In [None]:
#export
import numpy
import scipy
from gradless.gradient import SPSAGradient


In [None]:
#export
class GradientDescent():
    """Iterative optimizer that descends a cost using an estimated gradient.

    Every call to `update_params` asks `gradient` for a gradient estimate at
    the current parameters, asks `update` to turn that estimate into a step,
    and moves `theta` by that step unless the proposal is rejected.

    Attributes set by the constructor:
        cost: the model being optimized; must expose `evaluate(theta)`.
        update: object exposing `evaluate(ghat, a_k, t)` that returns the step.
        gradient: object exposing `cost`, `set_cost(cost)` and
            `evaluate(theta, c_k, gradient_reps=..., update_rvs=...)`.
        acceptance_rule: optional object exposing `initialize(optimizer)` and
            `evaluate(new_cost, t)`.
        t: number of accepted updates so far (stored as a float).
        theta: current parameter values.
        theta_hist: list of accepted parameter values, starting with `x_0`.
        cost_history: list of costs matching `theta_hist` entry for entry.
    """

    def __init__(self,x_0, model, update, gradient=None, acceptance_rule=None,
                 param_stepsize=1, param_stepdecay=.4, param_decay_offset=0, 
                 grad_stepsize=1, grad_stepdecay=.2, 
                seed=None):
        """Set up the optimizer at the starting point `x_0`.

        Args:
            x_0: initial guess of the model parameters.
            model: object whose `evaluate(theta)` returns the cost as a single
                number (a float or a one-element array).
            update: update rule used to convert a gradient into a step.
            gradient: gradient estimator; `SPSAGradient()` is used when None.
            acceptance_rule: optional rule deciding whether a proposed update
                is kept.
            param_stepsize, param_stepdecay, param_decay_offset: constants of
                the parameter step schedule (see `param_step`).
            grad_stepsize, grad_stepdecay: constants of the perturbation
                schedule (see `grad_step`).
            seed: optional integer passed to `numpy.random.seed`.

        Raises:
            AssertionError: if `seed` is not an integer, or if the cost
                evaluated at `x_0` is not a single numeric value.
        """
        if seed is not None:
            # Explicit check instead of `assert` so it survives `python -O`.
            # numpy integer types are accepted; bool stays rejected, as before.
            seed_is_integer=isinstance(seed, (int, numpy.integer)) and not isinstance(seed, bool)
            if not seed_is_integer:
                raise AssertionError("seed must be an integer")
            numpy.random.seed(seed)
        # store the model
        self.cost=model

        # Call the model once to make sure it returns a single numeric value.
        # numpy.isnan alone accepts arrays of any shape, so the element count
        # is checked explicitly; a multi-element cost would otherwise only
        # fail later, inside update_params, with a less helpful error.
        scalar_cost_message="The cost function must return a float or an array with shape (1,) (e.g. not an array)"
        try:
            test_val=self.cost.evaluate(x_0)
            numpy.isnan(test_val)
            cost_is_scalar=numpy.size(test_val)==1
        except Exception as err:
            raise AssertionError(scalar_cost_message) from err
        if not cost_is_scalar:
            raise AssertionError(scalar_cost_message)

        self.update=update
        if gradient is None: gradient=SPSAGradient()
        self.gradient=gradient

        #if the gradient was passed without cost being defined, set the cost
        if self.gradient.cost is None:
            self.gradient.set_cost(self.cost)

        self.param_stepsize=param_stepsize
        self.param_stepdecay=param_stepdecay
        self.param_decay_offset=param_decay_offset
        self.grad_stepsize=grad_stepsize
        self.grad_stepdecay=grad_stepdecay
        self.t=0.
        # The cost is evaluated a second time here (rather than reusing the
        # validation value) so the number of model evaluations performed by
        # the constructor stays exactly what it has always been.
        self.cost_history=[self.cost.evaluate(x_0)]

        self.theta_hist=[x_0]
        self.theta=x_0

        self.acceptance_rule=acceptance_rule
        if self.acceptance_rule is not None:
            self.acceptance_rule.initialize(self)

    def _reject_step(self):
        """Undo the iteration-counter increment of a rejected proposal.

        Returns the empty tuple that `update_params` has always returned for
        a rejected update.
        """
        self.t-=1
        return ()

    def _is_blocked(self, new_cost, block_val):
        """Tell whether `new_cost` is too much worse than the last accepted cost.

        While `t` is below 100 the proposal is blocked when its cost exceeds
        1.5 times the last accepted cost (`block_val` is not used in this
        phase).  Afterwards it is blocked when the cost exceeds the last
        accepted cost by more than `block_val` standard deviations of the
        last 100 recorded costs.
        """
        previous_cost=self.cost_history[-1]
        if self.t<100:
            return new_cost>(1.5*previous_cost)
        sd_cost=numpy.std(self.cost_history[-100:])
        return new_cost>(block_val*sd_cost+previous_cost)

    def update_params (self, gradient_reps=1,block_val=None, update_rvs=False):
        """Propose one parameter update and apply it unless it is rejected.

        Args:
            gradient_reps: forwarded to `gradient.evaluate`.
            block_val: when not None, enables the cost-increase guard described
                in `_is_blocked`.
            update_rvs: forwarded to `gradient.evaluate`.

        Returns:
            None when the update is accepted (`theta`, `theta_hist` and
            `cost_history` are extended); an empty tuple when it is rejected
            (state is left unchanged and `t` is restored).
        """
        # t is advanced first because both step schedules read self.t
        self.t+=1

        ### get the gradient
        c_k=self.grad_step()
        ghat= self.gradient.evaluate( self.theta, c_k, gradient_reps=gradient_reps, update_rvs=update_rvs )

        ### determine the proposed step
        a_k=self.param_step()
        step=self.update.evaluate(ghat, a_k ,self.t)

        ### evaluate the proposal
        new_theta=self.theta-step
        new_cost=self.cost.evaluate(new_theta)

        #Always reject nans
        if numpy.isnan(new_cost):
            return self._reject_step()

        #Evaluate the acceptance criterion here
        if self.acceptance_rule is not None:
            accept=self.acceptance_rule.evaluate(new_cost, self.t)
            # `== False` is kept deliberately: only an explicit False-like
            # answer rejects; any other return value (e.g. None) accepts.
            if accept==False:
                return self._reject_step()

        if block_val is not None and self._is_blocked(new_cost, block_val):
            return self._reject_step()

        ### record the accepted update
        self.theta_hist.append(new_theta)
        self.theta=new_theta
        self.cost_history.append(new_cost)

    def fit(self, niter=10000, init_grad_reps=100):
        """This performs a set number of gradient descent iterations, along with some initialization

        NOTE: not implemented yet.  The method is currently a no-op and both
        arguments are ignored; call `update_params` in a loop instead.
        """
        pass

    def param_step(self):
        """This determines the step size used to update the model parameters.

        a_k= a/(t+A)**alpha

        with a=`param_stepsize`, A=`param_decay_offset` and
        alpha=`param_stepdecay`.  `update_params` calls this after
        incrementing `t`; calling it directly while `t + A` is zero divides
        by zero for a positive decay exponent.
        """
        return  (self.param_stepsize / (self.t+self.param_decay_offset)**self.param_stepdecay)

    def grad_step(self):
        """This determines the step size used to perturb the parameters during the gradient approximation

        c_k= c/t**gamma

        with c=`grad_stepsize` and gamma=`grad_stepdecay`.  `update_params`
        calls this after incrementing `t`; calling it directly while `t` is
        zero divides by zero for a positive decay exponent.
        """
        return (self.grad_stepsize/(self.t)**self.grad_stepdecay)
        


The general class used to perform gradient descent is `GradientDescent`, which is modelled after the default implementation of Spall's SPSA optimization scheme outlined [here](https://www.jhuapl.edu/SPSA/PDF-SPSA/Spall_An_Overview.PDF). The default behaviour can be changed by choosing a different update rule, which embeds the SPSA gradient estimate inside a more efficient gradient descent algorithm such as ADAM or ADAGRAD.

### Usage


The ```GradientDescent``` class has two general classes of arguments:

1. Arguments that determine how model parameters are updated (x_0, model, update, gradient, acceptance_rule)

 * x_0 (required): An initial guess of the model parameters where the optimizer will begin
 
 * model (required): The model to be optimized. Generally this should be an instance of the `Model` class.
 
 * update (required): This should be a class that proposes a parameter update based on the gradient. See `updates` for details
 
 * gradient : This should be an instance of a class that provides an estimate of the gradient. By default, uses `SPSAGradient`. See ```gradient``` for more details
 
 * acceptance_rule (optional): An AcceptanceRule class may be passed here to define rules for accepting or rejecting parameter updates. See `updates` for details

2. Arguments related to how steps are performed (i.e. learning rate) and the gradient is approximated

The following parameters control how the model parameters are updated. All of them have default values (`param_stepsize=1`, `param_stepdecay=0.4`, `param_decay_offset=0`):

 * param_stepsize
 * param_stepdecay
 * param_decay_offset

The learning rate at iteration ```t``` is calculated as

```learning_rate = param_stepsize / (param_decay_offset + t) ** param_stepdecay```

A constant learning rate equal to `param_stepsize` is obtained by setting `param_stepdecay = 0`.

The following parameters control how the parameters are perturbed during the gradient approximation. Both have default values (`grad_stepsize=1`, `grad_stepdecay=0.2`):

 * grad_stepsize
 * grad_stepdecay


The perturbation step at iteration ```t``` is calculated as

```C_t = grad_stepsize / ( t ** grad_stepdecay )```

# hide

## Tests

I want to make sure I can initialize the optimizer


In [None]:
#hide 
import numpy
from gradless import optimizers, costs, gradient, updates
from matplotlib import pyplot
import hypothesis
from hypothesis import given
import hypothesis.strategies as st
import hypothesis.extra.numpy as hypo_numpy
from hypothesis import note
from numpy.testing import *


def test_GradientDescent_initialization():
    """Smoke test: build a fresh GradientDescent for 1 to 9 parameters and
    run two updates on each, so that any failure in construction or in the
    update call chain surfaces as an exception."""
    def sum_of_squares(params):
        return (params**2).sum()

    # Hyper-parameters shared by every optimizer built below
    schedule=dict(param_stepsize=2, param_stepdecay=.4, param_decay_offset=0,
                  grad_stepsize=1, grad_stepdecay=.2, seed=2)

    for n_params in range(1,10):
        initial_guess=numpy.array([50]*n_params)
        objective=costs.Model(sum_of_squares)
        spsa_rule=updates.StandardSPSA(max_step=.2)
        optimizer=GradientDescent(initial_guess, objective, spsa_rule, **schedule)
        # two consecutive updates exercise every call made by update_params
        for _ in range(2):
            optimizer.update_params()
test_GradientDescent_initialization()

I want to make sure the optimizer still converges after whatever updates I've made. I'll find the minimum of a quadratic function $f(x)=\sum_i x_i^2$ for both a one-parameter and a two-parameter model, running the optimizer for 1000 iterations in each case.

In [None]:
#hide


# I need a test to ensure the optimizer runs and hasn't been broken




def test_GradientDescent_convergence_one_param():
    """Checks that 1000 updates drive a single parameter of a quadratic
    cost from 50 to its minimum at 0."""
    def sum_of_squares(params):
        return (params**2).sum()

    initial_guess=numpy.array([50])
    expected_minimum=0

    objective=costs.Model(sum_of_squares)
    spsa_rule=updates.StandardSPSA(max_step=.2)
    optimizer=GradientDescent(
        initial_guess, objective, spsa_rule,
        param_stepsize=2, param_stepdecay=.4, param_decay_offset=0,
        grad_stepsize=1, grad_stepdecay=.2, seed=2,
    )

    n_updates=1000
    for _ in range(n_updates):
        optimizer.update_params()

    assert_almost_equal(optimizer.theta, expected_minimum)

test_GradientDescent_convergence_one_param()

def test_GradientDescent_convergence_two_param():
    """Checks that 1000 updates drive both parameters of a quadratic cost
    from (50, -25) to the minimum at the origin."""
    def sum_of_squares(params):
        return (params**2).sum()

    initial_guess=numpy.array([50,-25])
    expected_minimum=numpy.array([0,0])

    objective=costs.Model(sum_of_squares)
    spsa_rule=updates.StandardSPSA(max_step=.2)
    optimizer=GradientDescent(
        initial_guess, objective, spsa_rule,
        param_stepsize=2, param_stepdecay=.4, param_decay_offset=0,
        grad_stepsize=1, grad_stepdecay=.2, seed=2,
    )

    n_updates=1000
    for _ in range(n_updates):
        optimizer.update_params()

    assert_almost_equal(optimizer.theta, expected_minimum)

test_GradientDescent_convergence_two_param()
    
# model=costs.Model(quadratic)
# update_rule=updates.StandardSPSA(max_step=.2)
# test_convergence(numpy.array([50.,-25]),[0,0], model,update_rule)  
