In [57]:
import torch
from torch import nn
from torch.nn import functional as F

In [58]:
y_true = torch.randint(0, 5, (10,))
y_true

tensor([4, 3, 2, 3, 3, 1, 0, 2, 4, 4])

In [59]:
y_pred = torch.rand(10, 5)
y_pred

tensor([[0.8002, 0.7749, 0.2756, 0.3778, 0.4834],
        [0.6982, 0.4399, 0.6566, 0.9451, 0.7807],
        [0.6962, 0.7916, 0.8821, 0.2757, 0.3177],
        [0.0843, 0.2305, 0.5761, 0.4651, 0.0891],
        [0.6787, 0.9949, 0.4036, 0.0659, 0.5659],
        [0.7988, 0.6487, 0.9356, 0.4983, 0.8143],
        [0.7884, 0.2093, 0.0569, 0.0449, 0.0860],
        [0.1711, 0.2202, 0.2939, 0.4170, 0.0136],
        [0.5033, 0.0894, 0.4785, 0.1606, 0.6598],
        [0.6039, 0.5352, 0.4168, 0.8218, 0.6274]])

# CrossEntropyLoss

In [60]:
nn.CrossEntropyLoss()(y_pred, y_true)

tensor(1.5312)

# LogSoftmax + NLLLoss = CrossEntropyLoss

In [61]:
nn.NLLLoss()(torch.log_softmax(y_pred, dim=-1), y_true)

tensor(1.5312)

## LogSoftmax = Log(Softmax(x))

In [62]:
torch.log(torch.softmax(y_pred, dim=-1))

tensor([[-1.3739, -1.3992, -1.8985, -1.7963, -1.6907],
        [-1.6288, -1.8871, -1.6704, -1.3819, -1.5463],
        [-1.5360, -1.4406, -1.3501, -1.9565, -1.9145],
        [-1.8343, -1.6881, -1.3425, -1.4536, -1.8296],
        [-1.5188, -1.2025, -1.7938, -2.1316, -1.6316],
        [-1.5610, -1.7110, -1.4241, -1.8614, -1.5455],
        [-1.1028, -1.6819, -1.8342, -1.8462, -1.8051],
        [-1.6704, -1.6213, -1.5476, -1.4244, -1.8278],
        [-1.5076, -1.9215, -1.5324, -1.8503, -1.3511],
        [-1.6154, -1.6842, -1.8026, -1.3975, -1.5920]])

In [63]:
nn.NLLLoss()(torch.log(torch.softmax(y_pred, dim=-1)), y_true)

tensor(1.5312)

# label_smoothing

In [64]:
label_smoothing = 0.1

In [65]:
nn.CrossEntropyLoss(label_smoothing=label_smoothing)(y_pred, y_true)

tensor(1.5413)

## 尝试复现label_smoothing

In [66]:
y_true_ont_hot = F.one_hot(y_true)
y_true_ont_hot = y_true_ont_hot.type(torch.float32)
y_true_ont_hot

tensor([[0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.]])

In [67]:
y_true_ont_hot == 1

tensor([[False, False, False, False,  True],
        [False, False, False,  True, False],
        [False, False,  True, False, False],
        [False, False, False,  True, False],
        [False, False, False,  True, False],
        [False,  True, False, False, False],
        [ True, False, False, False, False],
        [False, False,  True, False, False],
        [False, False, False, False,  True],
        [False, False, False, False,  True]])

In [68]:
y_true_ont_hot[y_true_ont_hot == 1] = 1 - label_smoothing
y_true_ont_hot[y_true_ont_hot == 0] = label_smoothing / (5 - 1)

In [69]:
y_true_ont_hot

tensor([[0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
        [0.0250, 0.0250, 0.0250, 0.9000, 0.0250],
        [0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.0250, 0.9000, 0.0250],
        [0.0250, 0.0250, 0.0250, 0.9000, 0.0250],
        [0.0250, 0.9000, 0.0250, 0.0250, 0.0250],
        [0.9000, 0.0250, 0.0250, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
        [0.0250, 0.0250, 0.0250, 0.0250, 0.9000]])

In [70]:
y_pred.shape, y_true_ont_hot.shape

(torch.Size([10, 5]), torch.Size([10, 5]))

In [71]:
nn.CrossEntropyLoss()(y_pred, y_true_ont_hot)

tensor(1.5439)

# 多类别分类(一个目标有多个标签)

In [76]:
y_true_ont_hot_ = F.one_hot(y_true)
y_true_ont_hot_

tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1]])

In [77]:
# 伪造多类别
y_true_ont_hot_[0, 1] = 1
y_true_ont_hot_[1, 1:3] = 1
y_true_ont_hot_[2, 2] = 1
y_true_ont_hot_[3, 0] = 1
y_true_ont_hot_[4, 3] = 1
y_true_ont_hot_[5, 3] = 1
y_true_ont_hot_[6, 0:2] = 1
y_true_ont_hot_[7, 2] = 1
y_true_ont_hot_[8, 1] = 1
y_true_ont_hot_[9, 0:3] = 1
y_true_ont_hot_

tensor([[0, 1, 0, 0, 1],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 0, 0],
        [1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [1, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1],
        [1, 1, 1, 0, 1]])

## BCELoss = Binary Cross Entropy

In [78]:
nn.BCELoss()(torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32))

tensor(0.7704)

## BCEWithLogitsLoss = Signoid + BCELoss

This loss combines a `Sigmoid` layer and the `BCELoss` in one single class. This version is more numerically stable than using a plain `Sigmoid` followed by a `BCELoss` as, by combining the operations into one layer we take advantage of the log-sum-exp trick for numerical stability.

该损失将 `Sigmoid` 层和 `BCELoss` 组合在一个类中。 这个版本比使用简单的 `Sigmoid` 后跟 `BCELoss` 在数值上更稳定，因为通过将操作组合到一层，我们利用 log-sum-exp 技巧来实现数值稳定性。

In [87]:
nn.BCEWithLogitsLoss()(y_pred, y_true_ont_hot_.type(torch.float32))

tensor(0.7704)

## Log(Sigmoid(x)) + NLLLoss

In [91]:
nn.NLLLoss()(torch.log(torch.sigmoid(y_pred)).flatten(), y_true_ont_hot_.flatten())

tensor(0.3710)