diff --git a/lasagne/updates.py b/lasagne/updates.py
index 61ee4c14..ab008b5f 100644
--- a/lasagne/updates.py
+++ b/lasagne/updates.py
@@ -42,12 +42,15 @@
 
 Examples
 --------
+Using :func:`nesterov_momentum` to define an update dictionary for a toy
+example network:
+
 >>> import lasagne
 >>> import theano.tensor as T
 >>> import theano
 >>> from lasagne.nonlinearities import softmax
 >>> from lasagne.layers import InputLayer, DenseLayer, get_output
->>> from lasagne.updates import sgd, apply_momentum
+>>> from lasagne.updates import nesterov_momentum
 >>> l_in = InputLayer((100, 20))
 >>> l1 = DenseLayer(l_in, num_units=3, nonlinearity=softmax)
 >>> x = T.matrix('x') # shp: num_batch x num_features
@@ -55,9 +58,26 @@
 >>> l_out = get_output(l1, x)
 >>> params = lasagne.layers.get_all_params(l1)
 >>> loss = T.mean(T.nnet.categorical_crossentropy(l_out, y))
->>> updates_sgd = sgd(loss, params, learning_rate=0.0001)
->>> updates = apply_momentum(updates_sgd, params, momentum=0.9)
->>> train_function = theano.function([x, y], updates=updates)
+>>> updates = nesterov_momentum(loss, params, learning_rate=1e-4, momentum=.9)
+>>> train_fn = theano.function([x, y], updates=updates)
+
+With :func:`apply_momentum` and :func:`apply_nesterov_momentum`, we can add
+momentum to optimization schemes that do not usually support this:
+
+>>> updates = lasagne.updates.rmsprop(loss, params, learning_rate=0.0001)
+>>> updates = lasagne.updates.apply_momentum(updates, params, momentum=0.9)
+
+All optimization schemes support symbolic variables for their hyperparameters,
+such as shared variables. This allows varying hyperparameters during training
+without recompiling the training function. Note that the dtypes must match the
+dtypes of the network parameters, which follow Theano's ``floatX`` setting.
+In the following example, we use :func:`lasagne.utils.floatX` to ensure this:
+
+>>> eta = theano.shared(lasagne.utils.floatX(0.001))
+>>> updates = lasagne.updates.adam(loss, params, learning_rate=eta)
+>>> train_fn = theano.function([x, y], updates=updates)
+>>> # we can now modify the learning rate at any time during training:
+>>> eta.set_value(lasagne.utils.floatX(eta.get_value() * 0.1))
 """
 
 from collections import OrderedDict
@@ -554,13 +574,13 @@ def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
         A scalar loss expression, or a list of gradient expressions
     params : list of shared variables
         The variables to generate update expressions for
-    learning_rate : float
+    learning_rate : float or symbolic scalar
         Learning rate
-    beta1 : float
+    beta1 : float or symbolic scalar
         Exponential decay rate for the first moment estimates.
-    beta2 : float
+    beta2 : float or symbolic scalar
         Exponential decay rate for the second moment estimates.
-    epsilon : float
+    epsilon : float or symbolic scalar
         Constant for numerical stability.
 
     Returns
@@ -622,13 +642,13 @@ def adamax(loss_or_grads, params, learning_rate=0.002, beta1=0.9,
         A scalar loss expression, or a list of gradient expressions
     params : list of shared variables
         The variables to generate update expressions for
-    learning_rate : float
+    learning_rate : float or symbolic scalar
         Learning rate
-    beta1 : float
+    beta1 : float or symbolic scalar
         Exponential decay rate for the first moment estimates.
-    beta2 : float
+    beta2 : float or symbolic scalar
        Exponential decay rate for the weighted infinity norm estimates.
-    epsilon : float
+    epsilon : float or symbolic scalar
         Constant for numerical stability.
 
     Returns
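
Beyond the patch itself, here is a minimal sketch (not part of the diff) of how the symbolic learning rate documented by this change might drive a simple step-decay schedule during training. The toy network, the random data, the decay factor, and the epoch count are illustrative assumptions only:

import numpy as np
import theano
import theano.tensor as T
import lasagne

# toy network, mirroring the docstring example in the patch
x = T.matrix('x')
y = T.ivector('y')
l_in = lasagne.layers.InputLayer((None, 20), input_var=x)
l1 = lasagne.layers.DenseLayer(l_in, num_units=3,
                               nonlinearity=lasagne.nonlinearities.softmax)
prediction = lasagne.layers.get_output(l1)
loss = T.mean(T.nnet.categorical_crossentropy(prediction, y))
params = lasagne.layers.get_all_params(l1, trainable=True)

# a shared variable as learning rate: it can be changed later without
# recompiling the training function
eta = theano.shared(lasagne.utils.floatX(0.01))
updates = lasagne.updates.adam(loss, params, learning_rate=eta)
train_fn = theano.function([x, y], loss, updates=updates)

# random data standing in for a real training set
X_data = np.random.rand(100, 20).astype(theano.config.floatX)
y_data = np.random.randint(0, 3, size=100).astype('int32')

for epoch in range(30):
    train_fn(X_data, y_data)
    if (epoch + 1) % 10 == 0:  # halve the learning rate every 10 epochs
        eta.set_value(lasagne.utils.floatX(eta.get_value() * 0.5))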