In [1]:
import torch
from torch import nn, Tensor
from torch.nn import functional as F

In [2]:
# Re-seed the global RNG so the sampled labels are reproducible.
torch.manual_seed(42)
# 10 integer class labels drawn from {0, 1, 2, 3, 4} (upper bound 5 is exclusive).
y_true = torch.randint(0, 5, (10,))
y_true

tensor([2, 2, 1, 4, 1, 0, 0, 4, 0, 3])

In [3]:
# Re-seed again so y_pred does not depend on how many random numbers were
# consumed by earlier cells.
torch.manual_seed(42)
# Fake model outputs: 10 samples x 5 classes, uniform in [0, 1). They are used
# as raw scores (logits) below, not as normalized probabilities.
y_pred = torch.rand(10, 5)
y_pred

tensor([[0.8823, 0.9150, 0.3829, 0.9593, 0.3904],
        [0.6009, 0.2566, 0.7936, 0.9408, 0.1332],
        [0.9346, 0.5936, 0.8694, 0.5677, 0.7411],
        [0.4294, 0.8854, 0.5739, 0.2666, 0.6274],
        [0.2696, 0.4414, 0.2969, 0.8317, 0.1053],
        [0.2695, 0.3588, 0.1994, 0.5472, 0.0062],
        [0.9516, 0.0753, 0.8860, 0.5832, 0.3376],
        [0.8090, 0.5779, 0.9040, 0.5547, 0.3423],
        [0.6343, 0.3644, 0.7104, 0.9464, 0.7890],
        [0.2814, 0.7886, 0.5895, 0.7539, 0.1952]])

# CrossEntropyLoss

## 手动实现 CrossEntropyLoss

In [4]:
def cross_entropyloss(
    y_pred: torch.Tensor, y_true: torch.Tensor, reduction="mean"
) -> Tensor:
    """Cross-entropy loss between raw logits and integer class labels.

    Per sample this is ``-log(softmax(y_pred))[y_true]``, which equals the sum
    over classes of ``-log(softmax(y_pred)) * one_hot(y_true)``.

    Args:
        y_pred (Tensor): unnormalized scores (logits) of shape ``(N, C)``.
        y_true (Tensor): int64 class indices of shape ``(N,)``, each in ``[0, C)``.
        reduction (str, optional): 'mean' | 'sum' | 'none'. Defaults to 'mean'.

    Returns:
        Tensor: a scalar for 'mean' and 'sum'; the per-sample losses of shape
        ``(N,)`` for 'none'.

    Raises:
        ValueError: if ``reduction`` is not one of 'mean', 'sum', 'none'.
    """
    if reduction not in ("mean", "sum", "none"):
        raise ValueError(f"{reduction} is not a valid value for reduction")

    # log_softmax uses the log-sum-exp trick, so it stays finite where
    # log(softmax(x)) would underflow to log(0) = -inf for confident logits.
    log_probs = torch.log_softmax(y_pred, dim=-1)

    # Picking the target column is the same as multiplying by a one-hot mask
    # and summing over classes, but it does not rely on one_hot() guessing the
    # number of classes from the largest label present in the batch, and it
    # never evaluates 0 * inf (= nan) for the non-target classes.
    per_sample = -log_probs.gather(-1, y_true.unsqueeze(-1)).squeeze(-1)

    if reduction == "sum":
        return per_sample.sum()
    if reduction == "mean":
        return per_sample.mean()
    return per_sample

In [5]:
# reduction="mean": module API, functional API and the manual version should
# all print the same value.
print(nn.CrossEntropyLoss(reduction="mean")(y_pred, y_true))
print(F.cross_entropy(y_pred, y_true, reduction="mean"))
print(cross_entropyloss(y_pred, y_true, reduction="mean"))

tensor(1.6211)
tensor(1.6211)
tensor(1.6211)


In [6]:
# reduction="sum": same three-way comparison, summed over the samples.
print(nn.CrossEntropyLoss(reduction="sum")(y_pred, y_true))
print(F.cross_entropy(y_pred, y_true, reduction="sum"))
print(cross_entropyloss(y_pred, y_true, reduction="sum"))

tensor(16.2108)
tensor(16.2108)
tensor(16.2108)


In [7]:
# reduction="none": same three-way comparison, one loss value per sample.
print(nn.CrossEntropyLoss(reduction="none")(y_pred, y_true))
print(F.cross_entropy(y_pred, y_true, reduction="none"))
print(cross_entropyloss(y_pred, y_true, reduction="none"))

tensor([1.9654, 1.4071, 1.7677, 1.5602, 1.5892, 1.6320, 1.2770, 1.9243, 1.6820,
        1.4059])
tensor([1.9654, 1.4071, 1.7677, 1.5602, 1.5892, 1.6320, 1.2770, 1.9243, 1.6820,
        1.4059])
tensor([1.9654, 1.4071, 1.7677, 1.5602, 1.5892, 1.6320, 1.2770, 1.9243, 1.6820,
        1.4059])


## CrossEntropyLoss = LogSoftmax + NLLLoss

In [8]:
# Apply log_softmax over the class dimension, then NLLLoss on the integer
# labels; this should reproduce the CrossEntropyLoss "mean" value.
F.nll_loss(torch.log_softmax(y_pred, dim=-1), y_true, reduction="mean")

tensor(1.6211)

### LogSoftmax = Log(Softmax(x))

In [9]:
# Log-probabilities built in two explicit steps: softmax over classes, then log.
torch.log(torch.softmax(y_pred, dim=-1))

tensor([[-1.4660, -1.4332, -1.9654, -1.3889, -1.9578],
        [-1.5999, -1.9442, -1.4071, -1.2600, -2.0676],
        [-1.4267, -1.7677, -1.4919, -1.7936, -1.6202],
        [-1.7582, -1.3022, -1.6137, -1.9210, -1.5602],
        [-1.7609, -1.5892, -1.7336, -1.1989, -1.9252],
        [-1.6320, -1.5427, -1.7022, -1.3543, -1.8954],
        [-1.2770, -2.1533, -1.3426, -1.6454, -1.8909],
        [-1.4577, -1.6887, -1.3627, -1.7120, -1.9243],
        [-1.6820, -1.9519, -1.6059, -1.3699, -1.5273],
        [-1.8784, -1.3712, -1.5703, -1.4059, -1.9645]])

In [10]:
# Same NLLLoss call, fed with the two-step log(softmax(x)) instead of log_softmax.
F.nll_loss(torch.log(torch.softmax(y_pred, dim=-1)), y_true, reduction="mean")

tensor(1.6211)

## label_smoothing

In [11]:
label_smoothing = 0.1

In [12]:
# Reference value: built-in label smoothing applied to the integer labels.
nn.CrossEntropyLoss(label_smoothing=label_smoothing)(y_pred, y_true)

tensor(1.6227)

### 尝试复现label_smoothing

In [13]:
# One-hot encode the integer labels ...
y_true_ont_hot = F.one_hot(y_true)
# ... and cast to float32 so the entries can later hold fractional
# (smoothed) values.
y_true_ont_hot = y_true_ont_hot.type(torch.float32)
y_true_ont_hot

tensor([[0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.]])

In [14]:
y_true_ont_hot == 1

tensor([[False, False,  True, False, False],
        [False, False,  True, False, False],
        [False,  True, False, False, False],
        [False, False, False, False,  True],
        [False,  True, False, False, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [False, False, False, False,  True],
        [ True, False, False, False, False],
        [False, False, False,  True, False]])

In [15]:
# Hand-rolled smoothing, done in place: the true class gets 1 - eps and eps is
# split evenly over the other (5 - 1) classes. Statement order matters: after
# the first line no entry equals 1, and the second line rewrites the entries
# that are still 0.
# NOTE(review): this is not the target that
# nn.CrossEntropyLoss(label_smoothing=eps) uses. Per the PyTorch docs the
# one-hot target is mixed with a uniform distribution over ALL C classes,
# i.e. (1 - eps) * one_hot + eps / C, which gives 0.92 / 0.02 for eps = 0.1,
# C = 5 rather than 0.9 / 0.025. That would explain why the two losses compared
# further down do not match. Confirm before relying on these targets.
y_true_ont_hot[y_true_ont_hot == 1] = 1 - label_smoothing
y_true_ont_hot[y_true_ont_hot == 0] = label_smoothing / (5 - 1)

In [16]:
y_true_ont_hot

tensor([[0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.9000, 0.0250, 0.0250],
        [0.0250, 0.9000, 0.0250, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
        [0.0250, 0.9000, 0.0250, 0.0250, 0.0250],
        [0.9000, 0.0250, 0.0250, 0.0250, 0.0250],
        [0.9000, 0.0250, 0.0250, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.0250, 0.0250, 0.9000],
        [0.9000, 0.0250, 0.0250, 0.0250, 0.0250],
        [0.0250, 0.0250, 0.0250, 0.9000, 0.0250]])

In [17]:
y_pred.shape, y_true_ont_hot.shape

(torch.Size([10, 5]), torch.Size([10, 5]))

In [18]:
nn.CrossEntropyLoss(label_smoothing=label_smoothing)(y_pred, y_true)

tensor(1.6227)

In [19]:
# The result differs from the built-in label_smoothing loss.
# NOTE(review): the soft targets built by hand are 0.9 / 0.025, whereas the
# PyTorch docs describe label smoothing as (1 - eps) * one_hot + eps / C
# (0.92 / 0.02 here), so the two are not expected to agree exactly.
nn.CrossEntropyLoss()(y_pred, y_true_ont_hot)

tensor(1.6231)

# 多标签分类(一个目标有多个标签)

In [20]:
# Fresh integer one-hot copy of the labels; the next cell edits it in place to
# build multi-label targets.
y_true_ont_hot_ = F.one_hot(y_true)
y_true_ont_hot_

tensor([[0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0]])

In [21]:
# Fabricate multi-label targets: switch on extra entries in place so that each
# row has more than one positive class.
y_true_ont_hot_[0, 1] = 1
y_true_ont_hot_[1, 1:3] = 1
y_true_ont_hot_[2, 2] = 1
y_true_ont_hot_[3, 0] = 1
y_true_ont_hot_[4, 3] = 1
y_true_ont_hot_[5, 3] = 1
y_true_ont_hot_[6, 0:2] = 1
y_true_ont_hot_[7, 2] = 1
y_true_ont_hot_[8, 1] = 1
y_true_ont_hot_[9, 0:3] = 1
y_true_ont_hot_

tensor([[0, 1, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [1, 0, 0, 0, 1],
        [0, 1, 0, 1, 0],
        [1, 0, 0, 1, 0],
        [1, 1, 0, 0, 0],
        [0, 0, 1, 0, 1],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0]])

## 手动实现 BCELoss

In [43]:
def bce_loss(y_pred: torch.Tensor, y_true: torch.Tensor, reduction="mean") -> Tensor:
    """Binary cross-entropy between probabilities and (possibly soft) targets.

    Element-wise:
        result = - log(y_pred) * y_true - log(1 - y_pred) * (1 - y_true)

    Args:
        y_pred (Tensor): predicted probabilities in ``[0, 1]`` (e.g. sigmoid
            outputs), same shape as ``y_true``.
        y_true (Tensor): targets in ``[0, 1]``, same shape as ``y_pred``.
        reduction (str, optional): 'mean' | 'sum' | 'none'. Defaults to 'mean'.

    Returns:
        Tensor: a scalar for 'mean' and 'sum'; the element-wise losses for 'none'.

    Raises:
        ValueError: if ``reduction`` is not one of 'mean', 'sum', 'none'.
    """
    if reduction not in ("mean", "sum", "none"):
        raise ValueError(f"{reduction} is not a valid value for reduction")

    # Clamp the log terms at -100, as torch.nn.BCELoss does. Without it a
    # prediction of exactly 0 or 1 yields log(0) = -inf, and the term that
    # should vanish becomes 0 * -inf = nan.
    log_p = torch.log(y_pred).clamp(min=-100)
    log_not_p = torch.log(1 - y_pred).clamp(min=-100)
    result = -log_p * y_true - log_not_p * (1 - y_true)

    if reduction == "sum":
        return result.sum()
    if reduction == "mean":
        # Average over every element of the loss tensor.
        return result.mean()
    return result

In [48]:
# Two-class distribution with a hard target:
#       y_pred, y_true
#       0.7      1
#       0.3      0
# sum   1        1
# BCE evaluated on either row alone gives the same value as the full two-class
# cross-entropy written out in the last print.
print(bce_loss(torch.tensor(0.7), torch.tensor(1)))
print(bce_loss(torch.tensor(0.3), torch.tensor(0)))
print(
    -torch.log(torch.tensor(0.7)) * torch.tensor(1)
    - torch.log(torch.tensor(0.3)) * torch.tensor(0)
)

tensor(0.3567)
tensor(0.3567)
tensor(0.3567)
tensor(0.5514)
tensor(0.5514)
tensor(0.5514)


In [46]:
# Two-class distribution with a soft target:
#       y_pred, y_true
#       0.6      0.9
#       0.4      0.1
# sum   1        1
# Again, BCE on either row alone gives the same value as the full two-class
# cross-entropy written out in the last print.
print(bce_loss(torch.tensor(0.6), torch.tensor(0.9)))
print(bce_loss(torch.tensor(0.4), torch.tensor(0.1)))
print(
    -torch.log(torch.tensor(0.6)) * torch.tensor(0.9)
    - torch.log(torch.tensor(0.4)) * torch.tensor(0.1)
)

tensor(0.5514)

## BCELoss = Binary Cross Entropy

In [23]:
# Cross-entropy can be computed on these targets too, but it treats the
# classes as mutually exclusive.
nn.CrossEntropyLoss()(y_pred, y_true_ont_hot)

tensor(1.6231)

In [24]:
# BCELoss needs pred and target of the same shape. The raw scores are passed
# through sigmoid first; the built-in and the manual version are compared
# with reduction="mean".
print(nn.BCELoss(reduction="mean")(torch.sigmoid(y_pred), y_true_ont_hot))
print(bce_loss(torch.sigmoid(y_pred), y_true_ont_hot, reduction="mean"))

tensor(0.9070)
tensor(0.9070)


In [25]:
# BCELoss needs pred and target of the same shape; same comparison with
# reduction="sum".
print(nn.BCELoss(reduction="sum")(torch.sigmoid(y_pred), y_true_ont_hot))
print(bce_loss(torch.sigmoid(y_pred), y_true_ont_hot, reduction="sum"))

tensor(45.3489)
tensor(45.3489)


In [26]:
# BCELoss needs pred and target of the same shape; same comparison with
# reduction="none" (one loss per element).
print(nn.BCELoss(reduction="none")(torch.sigmoid(y_pred), y_true_ont_hot))
print(bce_loss(torch.sigmoid(y_pred), y_true_ont_hot, reduction="none"))

tensor([[1.2065, 1.2290, 0.5582, 1.2597, 0.8975],
        [1.0230, 0.8232, 0.4524, 1.2468, 0.7586],
        [1.2425, 0.4991, 1.1978, 1.0026, 1.1123],
        [0.9200, 1.2087, 1.0064, 0.8286, 0.4906],
        [0.8303, 0.5408, 0.8452, 1.1723, 0.7446],
        [0.5944, 0.8796, 0.7928, 0.9900, 0.6961],
        [0.4217, 0.7296, 1.2091, 1.0121, 0.8677],
        [1.1571, 1.0088, 1.2214, 0.9946, 0.5708],
        [0.4889, 0.8828, 1.0924, 1.2507, 1.1438],
        [0.8367, 1.1436, 1.0160, 0.4610, 0.7906]])
tensor([[1.2065, 1.2290, 0.5582, 1.2597, 0.8975],
        [1.0230, 0.8232, 0.4524, 1.2468, 0.7586],
        [1.2425, 0.4991, 1.1978, 1.0026, 1.1123],
        [0.9200, 1.2087, 1.0064, 0.8286, 0.4906],
        [0.8303, 0.5408, 0.8452, 1.1723, 0.7446],
        [0.5944, 0.8796, 0.7928, 0.9900, 0.6961],
        [0.4217, 0.7296, 1.2091, 1.0121, 0.8677],
        [1.1571, 1.0088, 1.2214, 0.9946, 0.5708],
        [0.4889, 0.8828, 1.0924, 1.2507, 1.1438],
        [0.8367, 1.1436, 1.0160, 0.4610, 0.7906]]

In [27]:
# Cross-entropy can be computed on the multi-label targets too (cast to float
# first), but it treats the classes as mutually exclusive.
nn.CrossEntropyLoss()(y_pred, y_true_ont_hot_.type(torch.float32))

tensor(3.5679)

In [28]:
# BCELoss needs pred and target of the same shape. Multi-label targets are
# cast to float32; built-in vs. manual version with reduction="mean".
print(
    nn.BCELoss(reduction="mean")(
        torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32)
    )
)
print(
    bce_loss(
        torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32), reduction="mean"
    )
)

tensor(0.7695)
tensor(0.7695)


In [29]:
# BCELoss needs pred and target of the same shape; multi-label comparison with
# reduction="sum".
print(
    nn.BCELoss(reduction="sum")(
        torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32)
    )
)
print(
    bce_loss(
        torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32), reduction="sum"
    )
)

tensor(38.4762)
tensor(38.4762)


In [30]:
# BCELoss needs pred and target of the same shape; multi-label comparison with
# reduction="none" (one loss per element).
print(
    nn.BCELoss(reduction="none")(
        torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32)
    )
)
print(
    bce_loss(
        torch.sigmoid(y_pred), y_true_ont_hot_.type(torch.float32), reduction="none"
    )
)

tensor([[1.2286, 0.3368, 0.5199, 1.2837, 0.9073],
        [1.0381, 0.5731, 0.3731, 1.2703, 0.7620],
        [1.2659, 0.4398, 0.3501, 1.0168, 1.1308],
        [0.5013, 1.2308, 1.0207, 0.8353, 0.4278],
        [0.8370, 0.4966, 0.8526, 0.3614, 0.7472],
        [0.5675, 0.8886, 0.7978, 0.4565, 0.6962],
        [0.3265, 0.6562, 1.2312, 1.0267, 0.8762],
        [1.1773, 1.0233, 0.3400, 1.0084, 0.5366],
        [0.4255, 0.5275, 1.1102, 1.2744, 1.1635],
        [0.5623, 0.3746, 0.4412, 0.3856, 0.7955]])
tensor([[1.2286, 0.3368, 0.5199, 1.2837, 0.9073],
        [1.0381, 0.5731, 0.3731, 1.2703, 0.7620],
        [1.2659, 0.4398, 0.3501, 1.0168, 1.1308],
        [0.5013, 1.2308, 1.0207, 0.8353, 0.4278],
        [0.8370, 0.4966, 0.8526, 0.3614, 0.7472],
        [0.5675, 0.8886, 0.7978, 0.4565, 0.6962],
        [0.3265, 0.6562, 1.2312, 1.0267, 0.8762],
        [1.1773, 1.0233, 0.3400, 1.0084, 0.5366],
        [0.4255, 0.5275, 1.1102, 1.2744, 1.1635],
        [0.5623, 0.3746, 0.4412, 0.3856, 0.7955]]

## BCEWithLogitsLoss = Sigmoid + BCELoss

This loss combines a `Sigmoid` layer and the `BCELoss` in one single class. This version is more numerically stable than using a plain `Sigmoid` followed by a `BCELoss` as, by combining the operations into one layer we take advantage of the log-sum-exp trick for numerical stability.

该损失将 `Sigmoid` 层和 `BCELoss` 组合在一个类中。 这个版本比使用简单的 `Sigmoid` 后跟 `BCELoss` 在数值上更稳定，因为通过将操作组合到一层，我们利用 log-sum-exp 技巧来实现数值稳定性。

In [31]:
# BCEWithLogitsLoss also needs pred and target of the same shape, but takes the
# raw scores directly (no explicit sigmoid call).
nn.BCEWithLogitsLoss()(y_pred, y_true_ont_hot)

tensor(0.9070)

In [32]:
# BCEWithLogitsLoss also needs pred and target of the same shape; here with the
# multi-label targets cast to float32 and the raw scores passed directly.
nn.BCEWithLogitsLoss()(y_pred, y_true_ont_hot_.type(torch.float32))

tensor(0.7695)