In [1]:
import torch
import torch.optim as optim
import torch.nn.functional as f
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision import datasets

# 多分类问题 (利用softmax函数进行多分类)

## Softmax Layer Theory

Suppose $Z^l \in \mathbb{R}^K$ is the output of the last linear layer, with components $Z_i$. The softmax function is then defined as:

$$ P(y = i) = \frac{e^{Z_i}}{\sum_{j=0}^{K-1} e^{Z_j}} , \quad i \in \{0, \cdots, K-1 \}$$

## Cross Entropy in numpy

In [2]:
# One-hot target (true class is index 0) and the raw scores for three classes.
y = np.array([1, 0, 0])
z = np.array([0.2, 0.1, -0.1])

# Softmax: exponentiate once, then divide by the total so the entries sum to 1.
exp_z = np.exp(z)
y_pred = exp_z / exp_z.sum()

# Cross entropy: the one-hot target keeps only the log-probability of the
# true class, which is then negated.
loss = -(y * np.log(y_pred)).sum()
print(loss)

0.9729189131256584


## Cross Entropy in pytorch

In [3]:
# One sample whose true class index is 0, and its three raw scores
# as a (1, 3) float tensor.
y = torch.tensor([0], dtype=torch.long)
z = torch.tensor([[0.2, 0.1, -0.1]], dtype=torch.float32)

# The criterion receives the raw scores and the integer class index directly.
criterion = torch.nn.CrossEntropyLoss()
loss = criterion(z, y)
print(loss)

tensor(0.9729)


## Step-1 Prepare Dataset

Note the use of `transforms.ToTensor()` to convert the images from PIL format to Tensor format.
$$ \mathbb Z^{28 \times 28}, pixel \in \{0, \cdots, 255 \} \rightarrow \mathbb R^{1 \times 28 \times 28}, pixel \in [0, 1] $$

And we use `transforms.Normalize()` to normalize the data in this way:
$$ Pixel_{norm} = \frac{Pixel_{origin} - mean}{std} $$

In [4]:
batch_size = 64

# Preprocessing applied to every image when it is read from the dataset.
transform = transforms.Compose([
    transforms.ToTensor(),                       # PIL image -> PyTorch tensor
    transforms.Normalize((0.1307,), (0.3081,)),  # (pixel - mean) / std with mean=0.1307, std=0.3081
])

# Arguments shared by the training and the test split.
mnist_kwargs = dict(root='./dataset/mnist', transform=transform, download=True)

train_dataset = datasets.MNIST(train=True, **mnist_kwargs)
test_dataset = datasets.MNIST(train=False, **mnist_kwargs)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Step-2 Define Model

In [5]:
class Net(torch.nn.Module):
    """Fully connected classifier: 784 -> 512 -> 256 -> 128 -> 64 -> 10.

    ``forward`` returns the output of the last linear layer as-is; no softmax
    is applied inside the model.
    """

    def __init__(self):
        super().__init__()
        linear = torch.nn.Linear
        self.layer1 = linear(28 * 28, 512)
        self.layer2 = linear(512, 256)
        self.layer3 = linear(256, 128)
        self.layer4 = linear(128, 64)
        self.layer5 = linear(64, 10)

    def forward(self, x):
        """Flatten ``x`` into rows of 28 * 28 values and apply the five layers."""
        hidden = x.view(-1, 28 * 28)
        # Every layer except the last one is followed by a ReLU.
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4):
            hidden = f.relu(layer(hidden))
        return self.layer5(hidden)


model = Net()

## Step-3 Define Loss and Optimizer

In [6]:
# Loss applied to the model's outputs and the integer class labels.
criterion = torch.nn.CrossEntropyLoss()

# SGD over all model parameters; the last argument is the momentum.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.5,
)

## Step-4 Training the Model

In [7]:
def train(epoch: int):
    """Run one pass over ``train_loader``, updating ``model`` via ``optimizer``.

    After every 100th batch the mean loss of the last 100 batches is printed,
    tagged with the 1-based epoch and batch numbers.

    Args:
        epoch: Zero-based epoch index; only used in the progress message.
    """
    window_loss = 0.0
    for step, (inputs, target) in enumerate(train_loader, start=1):
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        loss = criterion(model(inputs), target)
        loss.backward()
        optimizer.step()

        window_loss += loss.item()
        if step % 100 != 0:
            continue

        # A full window of 100 batches is complete: report its mean and reset.
        print(f'[{epoch + 1}, {step:5d}] loss: {window_loss / 100:.3f}')
        window_loss = 0.0

def test():
    """Evaluate ``model`` on ``test_loader`` and print the top-1 accuracy.

    The model is switched to evaluation mode for the pass and its previous
    training/evaluation mode is restored afterwards, so calling this function
    between training epochs leaves the model's mode as it found it. Gradient
    tracking is disabled while predictions are computed.
    """
    correct = 0
    total = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in test_loader:
                outputs = model(images)
                # Predicted class = index of the largest score in each row.
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Restore whichever mode the model was in before evaluation.
        model.train(was_training)

    print(f'Accuracy on test set: {100 * correct / total: .4f}%')

if __name__ == '__main__':
    num_epochs = 10
    for epoch in range(num_epochs):
        train(epoch)
        # Evaluate once every 10 epochs, i.e. after epochs 10, 20, ... (1-based).
        if (epoch + 1) % 10 == 0:
            test()

[1,   100] loss: 2.295
[1,   200] loss: 2.264
[1,   300] loss: 2.114
[1,   400] loss: 1.538
[1,   500] loss: 0.890
[1,   600] loss: 0.620
[1,   700] loss: 0.480
[1,   800] loss: 0.442
[1,   900] loss: 0.412
[2,   100] loss: 0.359
[2,   200] loss: 0.340
[2,   300] loss: 0.308
[2,   400] loss: 0.297
[2,   500] loss: 0.261
[2,   600] loss: 0.264
[2,   700] loss: 0.240
[2,   800] loss: 0.233
[2,   900] loss: 0.223
[3,   100] loss: 0.209
[3,   200] loss: 0.189
[3,   300] loss: 0.184
[3,   400] loss: 0.173
[3,   500] loss: 0.176
[3,   600] loss: 0.158
[3,   700] loss: 0.153
[3,   800] loss: 0.149
[3,   900] loss: 0.142
[4,   100] loss: 0.142
[4,   200] loss: 0.129
[4,   300] loss: 0.126
[4,   400] loss: 0.130
[4,   500] loss: 0.135
[4,   600] loss: 0.121
[4,   700] loss: 0.109
[4,   800] loss: 0.101
[4,   900] loss: 0.109
[5,   100] loss: 0.095
[5,   200] loss: 0.098
[5,   300] loss: 0.092
[5,   400] loss: 0.097
[5,   500] loss: 0.108
[5,   600] loss: 0.086
[5,   700] loss: 0.099
[5,   800] 

KeyboardInterrupt: 

## 作业

- [Cross Entropy Loss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss) 和 [NLL Loss](https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html#torch.nn.NLLLoss) 的区别
- 尝试理解: Cross Entropy Loss $\Leftrightarrow$ Log Softmax + NLL Loss
- 完成Kaggle中的[Otto Group Product Classification任务](https://www.kaggle.com/c/otto-group-product-classification-challenge/data)

## Understanding

Cross Entropy Loss 和 NLL Loss 都是常用的分类任务的损失函数，它们的区别在于 Cross Entropy Loss 内部包含了 softmax 和 log 操作，而 NLL Loss 只是对输入的对数概率向量和目标标签进行负对数似然计算¹²。也就是说，Cross Entropy Loss = NLL Loss + softmax + log。因此，如果使用 Cross Entropy Loss 作为损失函数，神经网络的最后一层就不需要加入 softmax 或者 log softmax，而如果使用 NLL Loss，就需要在最后一层加入 log softmax³⁴。下面是一个简单的例子，说明两种损失函数的计算过程：

---

```python
import torch
from torch import nn

# Random stand-in for the output of a network's last layer:
# 3 samples (rows) x 4 classes (columns).
logits = torch.randn(3, 4)
# Target class index for each row: row 0 -> class 1, row 1 -> class 0, row 2 -> class 2.
label = torch.tensor([1, 0, 2])

# Route 1: LogSoftmax turns every row into log-probabilities (all <= 0),
# then NLLLoss reduces them to a single scalar loss.
log_softmax = nn.LogSoftmax(dim=1)
nll_criterion = nn.NLLLoss()
nll_value = nll_criterion(log_softmax(logits), label)
print(nll_value)  # a scalar tensor, e.g. tensor(1.7075); the number depends on the random draw

# Route 2: CrossEntropyLoss takes the raw scores directly.
ce_criterion = nn.CrossEntropyLoss()
ce_value = ce_criterion(logits, label)
print(ce_value)  # same number as printed above

# Both routes give the same loss.
assert torch.allclose(nll_value, ce_value)
```

---

可以看到，两种损失函数的结果是一样的，只是 CrossEntropyLoss 不需要额外的 log softmax 操作。⁵

Source: Conversation with Bing, 2/21/2024 \
(1) 神经网络中NLLLoss和CrossEntropyLoss的快速理解 - 知乎. https://zhuanlan.zhihu.com/p/589631793. \
(2) 详解pytorch的损失函数：NLLLoss()和CrossEntropyLoss() - 知乎. https://zhuanlan.zhihu.com/p/570118948. \
(3) 详解pytorch的损失函数：NLLLoss()和CrossEntropyLoss() - 知乎. https://bing.com/search?q=Cross+Entropy+Loss+%e5%92%8c+NLL+Loss+%e7%9a%84%e5%8c%ba%e5%88%ab. \
(4) pytorch中F.cross_entropy和F.nll_loss的区别-CSDN博客. https://blog.csdn.net/code_plus/article/details/115481575. \
(5) pytorch中CrossEntropyLoss和NLLLoss的区别与联系 - CSDN博客. https://blog.csdn.net/qq_25105061/article/details/107381316.