"""
Provides some minimal help with building loss expressions for training or
validating a neural network.
Six functions build element- or item-wise loss expressions from network
predictions and targets:
.. autosummary::
:nosignatures:
binary_crossentropy
categorical_crossentropy
squared_error
binary_hinge_loss
multiclass_hinge_loss
huber_loss
A convenience function aggregates such losses into a scalar expression
suitable for differentiation:
.. autosummary::
:nosignatures:
aggregate
Note that these functions only serve to make your code more readable; they are
completely optional. Essentially, any differentiable scalar Theano expression
can be used as a training objective.
Finally, two functions compute evaluation measures that are useful for
validation and testing only, not for training:
.. autosummary::
:nosignatures:
binary_accuracy
categorical_accuracy
Those can also be aggregated into a scalar expression if needed.
Examples
--------
Assuming you have a simple neural network for 3-way classification:
>>> from lasagne.layers import InputLayer, DenseLayer, get_output
>>> from lasagne.nonlinearities import softmax, rectify
>>> l_in = InputLayer((100, 20))
>>> l_hid = DenseLayer(l_in, num_units=30, nonlinearity=rectify)
>>> l_out = DenseLayer(l_hid, num_units=3, nonlinearity=softmax)
And Theano variables representing your network input and targets:
>>> import theano
>>> data = theano.tensor.matrix('data')
>>> targets = theano.tensor.matrix('targets')
You'd first construct an element-wise loss expression:
>>> from lasagne.objectives import categorical_crossentropy, aggregate
>>> predictions = get_output(l_out, data)
>>> loss = categorical_crossentropy(predictions, targets)
Then aggregate it into a scalar (you could also just call ``mean()`` on it):
>>> loss = aggregate(loss, mode='mean')
Finally, this gives a loss expression you can pass to any of the update
methods in :mod:`lasagne.updates`. For validation of a network, you will
usually want to repeat these steps with deterministic network output, i.e.,
without dropout or any other nondeterministic computation in between:
>>> test_predictions = get_output(l_out, data, deterministic=True)
>>> test_loss = categorical_crossentropy(test_predictions, targets)
>>> test_loss = aggregate(test_loss)
This gives a loss expression good for monitoring validation error.
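As a sketch of how the training loss from above could actually be used with
:mod:`lasagne.updates` (assuming plain stochastic gradient descent here; any
other update method works the same way):

>>> from lasagne.layers import get_all_params
>>> from lasagne.updates import sgd
>>> params = get_all_params(l_out, trainable=True)
>>> updates = sgd(loss, params, learning_rate=0.01)
>>> train_fn = theano.function([data, targets], loss, updates=updates)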
"""
import warnings
import theano.tensor
from .utils import as_theano_expression
__all__ = [
"binary_crossentropy",
"categorical_crossentropy",
"squared_error",
"aggregate",
"binary_hinge_loss",
"multiclass_hinge_loss",
"huber_loss",
"binary_accuracy",
"categorical_accuracy"
]
def align_targets(predictions, targets):
"""Helper function turning a target 1D vector into a column if needed.
This way, combining a network of a single output unit with a target vector
    works as expected by most users, rather than broadcasting outputs against
    targets.
Parameters
----------
predictions : Theano tensor
Expression for the predictions of a neural network.
targets : Theano tensor
Expression or variable for corresponding targets.
Returns
-------
predictions : Theano tensor
The predictions unchanged.
targets : Theano tensor
If `predictions` is a column vector and `targets` is a 1D vector,
returns `targets` turned into a column vector. Otherwise, returns
`targets` unchanged.
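
    Examples
    --------
    A minimal illustration of the reshaping behavior (the variable names are
    only for this sketch): a 1D target vector is turned into a column to match
    column-shaped predictions.

    >>> import theano.tensor as T
    >>> from lasagne.objectives import align_targets
    >>> predictions = T.col('predictions')  # single output unit per example
    >>> targets = T.vector('targets')       # 1D target vector
    >>> predictions, targets = align_targets(predictions, targets)
    >>> targets.ndim
    2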
"""
if (getattr(predictions, 'broadcastable', None) == (False, True) and
getattr(targets, 'ndim', None) == 1):
targets = as_theano_expression(targets).dimshuffle(0, 'x')
return predictions, targets
def binary_crossentropy(predictions, targets):
"""Computes the binary cross-entropy between predictions and targets.
.. math:: L = -t \\log(p) - (1 - t) \\log(1 - p)
Parameters
----------
predictions : Theano tensor
Predictions in (0, 1), such as sigmoidal output of a neural network.
targets : Theano tensor
Targets in [0, 1], such as ground truth labels.
Returns
-------
Theano tensor
An expression for the element-wise binary cross-entropy.
Notes
-----
This is the loss function of choice for binary classification problems
and sigmoid output units.
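
    Examples
    --------
    A minimal sketch of building the symbolic loss for a network with sigmoid
    outputs (the variable names here are only illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import binary_crossentropy
    >>> predictions = T.matrix('predictions')  # sigmoid outputs in (0, 1)
    >>> targets = T.matrix('targets')          # ground truth in {0, 1}
    >>> loss = binary_crossentropy(predictions, targets)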
"""
predictions, targets = align_targets(predictions, targets)
return theano.tensor.nnet.binary_crossentropy(predictions, targets)
def categorical_crossentropy(predictions, targets):
"""Computes the categorical cross-entropy between predictions and targets.
.. math:: L_i = - \\sum_j{t_{i,j} \\log(p_{i,j})}
:math:`p` are the predictions, :math:`t` are the targets, :math:`i`
denotes the data point and :math:`j` denotes the class.
Parameters
----------
predictions : Theano 2D tensor
Predictions in (0, 1), such as softmax output of a neural network,
with data points in rows and class probabilities in columns.
targets : Theano 2D tensor or 1D tensor
Either targets in [0, 1] matching the layout of `predictions`, or
a vector of int giving the correct class index per data point.
In the case of an integer vector argument, each element
represents the position of the '1' in a one-hot encoding.
Returns
-------
Theano 1D tensor
An expression for the item-wise categorical cross-entropy.
Notes
-----
This is the loss function of choice for multi-class classification
problems and softmax output units. For hard targets, i.e., targets
that assign all of the probability to a single class per data point,
providing a vector of int for the targets is usually slightly more
efficient than providing a matrix with a single 1.0 per row.
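
    Examples
    --------
    A small sketch using integer class targets (names are illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import categorical_crossentropy
    >>> predictions = T.matrix('predictions')  # rows of class probabilities
    >>> targets = T.ivector('targets')         # correct class index per row
    >>> loss = categorical_crossentropy(predictions, targets)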
"""
return theano.tensor.nnet.categorical_crossentropy(predictions, targets)
def squared_error(a, b):
"""Computes the element-wise squared difference between two tensors.
.. math:: L = (p - t)^2
Parameters
----------
a, b : Theano tensor
The tensors to compute the squared difference between.
Returns
-------
Theano tensor
An expression for the element-wise squared difference.
Notes
-----
This is the loss function of choice for many regression problems
or auto-encoders with linear output units.
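
    Examples
    --------
    A sketch for a network with a single linear output unit; the 1D target
    vector is aligned to a column automatically (names are illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import squared_error
    >>> predictions = T.col('predictions')  # single linear output unit
    >>> targets = T.vector('targets')       # real-valued ground truth
    >>> loss = squared_error(predictions, targets)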
"""
a, b = align_targets(a, b)
return theano.tensor.square(a - b)
def aggregate(loss, weights=None, mode='mean'):
"""Aggregates an element- or item-wise loss to a scalar loss.
Parameters
----------
loss : Theano tensor
The loss expression to aggregate.
weights : Theano tensor, optional
The weights for each element or item, must be broadcastable to
the same shape as `loss` if given. If omitted, all elements will
be weighted the same.
mode : {'mean', 'sum', 'normalized_sum'}
Whether to aggregate by averaging, by summing or by summing and
dividing by the total weights (which requires `weights` to be given).
Returns
-------
Theano scalar
A scalar loss expression suitable for differentiation.
Notes
-----
By supplying binary weights (i.e., only using values 0 and 1), this
function can also be used for masking out particular entries in the
loss expression. Note that masked entries still need to be valid
    values; not-a-numbers (NaNs) will propagate through.
When applied to batch-wise loss expressions, setting `mode` to
``'normalized_sum'`` ensures that the loss per batch is of a similar
magnitude, independent of associated weights. However, it means that
a given data point contributes more to the loss when it shares a batch
with low-weighted or masked data points than with high-weighted ones.
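
    Examples
    --------
    A sketch of masking out entries with binary weights and normalizing by the
    number of unmasked entries (names are illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import squared_error, aggregate
    >>> predictions, targets = T.matrix('p'), T.matrix('t')
    >>> mask = T.matrix('mask')  # 1 for entries to keep, 0 for masked ones
    >>> loss = squared_error(predictions, targets)
    >>> loss = aggregate(loss, weights=mask, mode='normalized_sum')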
"""
if weights is not None:
loss = loss * weights
if mode == 'mean':
return loss.mean()
elif mode == 'sum':
return loss.sum()
elif mode == 'normalized_sum':
if weights is None:
raise ValueError("require weights for mode='normalized_sum'")
return loss.sum() / weights.sum()
else:
raise ValueError("mode must be 'mean', 'sum' or 'normalized_sum', "
"got %r" % mode)
def binary_hinge_loss(predictions, targets, delta=1, log_odds=None,
binary=True):
"""Computes the binary hinge loss between predictions and targets.
.. math:: L_i = \\max(0, \\delta - t_i p_i)
Parameters
----------
predictions : Theano tensor
Predictions in (0, 1), such as sigmoidal output of a neural network
(or log-odds of predictions depending on `log_odds`).
targets : Theano tensor
Targets in {0, 1} (or in {-1, 1} depending on `binary`), such as
ground truth labels.
delta : scalar, default 1
The hinge loss margin
log_odds : bool, default None
``False`` if predictions are sigmoid outputs in (0, 1), ``True`` if
predictions are sigmoid inputs, or log-odds. If ``None``, will assume
``True``, but warn that the default will change to ``False``.
binary : bool, default True
``True`` if targets are in {0, 1}, ``False`` if they are in {-1, 1}
Returns
-------
Theano tensor
An expression for the element-wise binary hinge loss
Notes
-----
This is an alternative to the binary cross-entropy loss for binary
classification problems.
Note that it is a drop-in replacement only when giving ``log_odds=False``.
Otherwise, it requires log-odds rather than sigmoid outputs. Be aware that
depending on the Theano version, ``log_odds=False`` with a sigmoid
output layer may be less stable than ``log_odds=True`` with a linear layer.
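
    Examples
    --------
    A sketch that feeds linear network outputs (log-odds) directly, following
    the note above on numerical stability (names are illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import binary_hinge_loss
    >>> scores = T.matrix('scores')    # linear outputs, i.e., log-odds
    >>> targets = T.matrix('targets')  # labels in {0, 1}
    >>> loss = binary_hinge_loss(scores, targets, log_odds=True)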
"""
    if log_odds is None:  # pragma: no cover
        warnings.warn(
            "The `log_odds` argument to `binary_hinge_loss` will change "
            "its default to `False` in a future version. Explicitly give "
            "`log_odds=True` to retain current behavior in your code, "
            "but also check the documentation if this is what you want.",
            FutureWarning)
        log_odds = True
if not log_odds:
predictions = theano.tensor.log(predictions / (1 - predictions))
if binary:
targets = 2 * targets - 1
predictions, targets = align_targets(predictions, targets)
return theano.tensor.nnet.relu(delta - predictions * targets)
def multiclass_hinge_loss(predictions, targets, delta=1):
"""Computes the multi-class hinge loss between predictions and targets.
    .. math:: L_i = \\max(0, \\max_{j \\not = t_i} p_{i,j} - p_{i,t_i} + \\delta)
Parameters
----------
predictions : Theano 2D tensor
Predictions in (0, 1), such as softmax output of a neural network,
with data points in rows and class probabilities in columns.
targets : Theano 2D tensor or 1D tensor
Either a vector of int giving the correct class index per data point
or a 2D tensor of one-hot encoding of the correct class in the same
layout as predictions (non-binary targets in [0, 1] do not work!)
delta : scalar, default 1
The hinge loss margin
Returns
-------
Theano 1D tensor
An expression for the item-wise multi-class hinge loss
Notes
-----
This is an alternative to the categorical cross-entropy loss for
multi-class classification problems
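
    Examples
    --------
    A small sketch with integer class targets (names are illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import multiclass_hinge_loss
    >>> predictions = T.matrix('predictions')  # one row of scores per example
    >>> targets = T.ivector('targets')         # correct class index per row
    >>> loss = multiclass_hinge_loss(predictions, targets, delta=1)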
"""
num_cls = predictions.shape[1]
if targets.ndim == predictions.ndim - 1:
targets = theano.tensor.extra_ops.to_one_hot(targets, num_cls)
elif targets.ndim != predictions.ndim:
raise TypeError('rank mismatch between targets and predictions')
corrects = predictions[targets.nonzero()]
rest = theano.tensor.reshape(predictions[(1-targets).nonzero()],
(-1, num_cls-1))
rest = theano.tensor.max(rest, axis=1)
return theano.tensor.nnet.relu(rest - corrects + delta)
def huber_loss(predictions, targets, delta=1):
""" Computes the huber loss between predictions and targets.
    .. math::
        L_i = \\frac{(p_i - t_i)^2}{2}, \\qquad |p_i - t_i| \\le \\delta \\\\
        L_i = \\delta \\, (|p_i - t_i| - \\frac{\\delta}{2}), \\qquad |p_i - t_i| > \\delta
Parameters
----------
predictions : Theano 2D tensor or 1D tensor
Prediction outputs of a neural network.
targets : Theano 2D tensor or 1D tensor
Ground truth to which the prediction is to be compared
with. Either a vector or 2D Tensor.
delta : scalar, default 1
        The point at which the loss changes from quadratic to linear. The
        default of 1 corresponds to the `SmoothL1Loss` used in the
        Fast R-CNN paper [1]_.
Returns
-------
Theano tensor
        An expression for the element-wise Huber loss [2]_.
Notes
-----
This is an alternative to the squared error for
regression problems.
References
----------
    .. [1] Ross Girshick (2015):
           Fast R-CNN
           https://arxiv.org/pdf/1504.08083.pdf
    .. [2] Peter J. Huber (1964):
           Robust Estimation of a Location Parameter
           https://projecteuclid.org/euclid.aoms/1177703732
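
    Examples
    --------
    A minimal sketch for a robust regression loss; ``delta`` controls where the
    loss switches from quadratic to linear (names are illustrative):

    >>> import theano.tensor as T
    >>> from lasagne.objectives import huber_loss
    >>> predictions = T.matrix('predictions')
    >>> targets = T.matrix('targets')
    >>> loss = huber_loss(predictions, targets, delta=1)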
"""
predictions, targets = align_targets(predictions, targets)
abs_diff = abs(targets - predictions)
ift = 0.5 * squared_error(targets, predictions)
iff = delta * (abs_diff - delta / 2.)
return theano.tensor.switch(abs_diff <= delta, ift, iff)
def binary_accuracy(predictions, targets, threshold=0.5):
"""Computes the binary accuracy between predictions and targets.
    .. math:: L_i = \\mathbb{I}(t_i = \\mathbb{I}(p_i \\ge \\alpha))
Parameters
----------
predictions : Theano tensor
Predictions in [0, 1], such as a sigmoidal output of a neural network,
giving the probability of the positive class
targets : Theano tensor
Targets in {0, 1}, such as ground truth labels.
threshold : scalar, default: 0.5
Specifies at what threshold to consider the predictions being of the
positive class
Returns
-------
Theano tensor
An expression for the element-wise binary accuracy in {0, 1}
Notes
-----
This objective function should not be used with a gradient calculation;
its gradient is zero everywhere. It is intended as a convenience for
validation and testing, not training.
To obtain the average accuracy, call :func:`theano.tensor.mean()` on the
result, passing ``dtype=theano.config.floatX`` to compute the mean on GPU.
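
    Examples
    --------
    A sketch of a mean validation accuracy expression (names are illustrative):

    >>> import theano
    >>> import theano.tensor as T
    >>> from lasagne.objectives import binary_accuracy
    >>> predictions = T.matrix('predictions')  # probabilities in [0, 1]
    >>> targets = T.matrix('targets')          # labels in {0, 1}
    >>> acc = binary_accuracy(predictions, targets, threshold=0.5)
    >>> acc = theano.tensor.mean(acc, dtype=theano.config.floatX)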
"""
predictions, targets = align_targets(predictions, targets)
predictions = theano.tensor.ge(predictions, threshold)
return theano.tensor.eq(predictions, targets)
def categorical_accuracy(predictions, targets, top_k=1):
"""Computes the categorical accuracy between predictions and targets.
.. math:: L_i = \\mathbb{I}(t_i = \\operatorname{argmax}_c p_{i,c})
Can be relaxed to allow matches among the top :math:`k` predictions:
.. math::
L_i = \\mathbb{I}(t_i \\in \\operatorname{argsort}_c (-p_{i,c})_{:k})
Parameters
----------
predictions : Theano 2D tensor
Predictions in (0, 1), such as softmax output of a neural network,
with data points in rows and class probabilities in columns.
targets : Theano 2D tensor or 1D tensor
Either a vector of int giving the correct class index per data point
        or a 2D tensor of one-hot encodings of the correct class in the same
layout as predictions
top_k : int
Regard a prediction to be correct if the target class is among the
`top_k` largest class probabilities. For the default value of 1, a
prediction is correct only if the target class is the most probable.
Returns
-------
Theano 1D tensor
An expression for the item-wise categorical accuracy in {0, 1}
Notes
-----
    This is not a differentiable function, as it includes an argmax. This
    objective function should never be used with a gradient calculation. It
    is intended as a convenience for validation and testing, not training.
To obtain the average accuracy, call :func:`theano.tensor.mean()` on the
result, passing ``dtype=theano.config.floatX`` to compute the mean on GPU.
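
    Examples
    --------
    A sketch of a mean top-3 accuracy expression for monitoring (names are
    illustrative):

    >>> import theano
    >>> import theano.tensor as T
    >>> from lasagne.objectives import categorical_accuracy
    >>> predictions = T.matrix('predictions')  # one row of scores per example
    >>> targets = T.ivector('targets')         # correct class index per row
    >>> acc = categorical_accuracy(predictions, targets, top_k=3)
    >>> acc = theano.tensor.mean(acc, dtype=theano.config.floatX)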
"""
if targets.ndim == predictions.ndim:
targets = theano.tensor.argmax(targets, axis=-1)
elif targets.ndim != predictions.ndim - 1:
raise TypeError('rank mismatch between targets and predictions')
if top_k == 1:
# standard categorical accuracy
top = theano.tensor.argmax(predictions, axis=-1)
return theano.tensor.eq(top, targets)
else:
# top-k accuracy
top = theano.tensor.argsort(predictions, axis=-1)
# (Theano cannot index with [..., -top_k:], we need to simulate that)
top = top[[slice(None) for _ in range(top.ndim - 1)] +
[slice(-top_k, None)]]
targets = theano.tensor.shape_padaxis(targets, axis=-1)
return theano.tensor.any(theano.tensor.eq(top, targets), axis=-1)