# 第一章节
## 2.多分类学习

### 数据集准备

In [1]:
import torch

In [3]:
from torchvision import datasets

In [5]:
from torchvision import transforms

In [6]:
from torch.utils.data import DataLoader

In [35]:
# Preprocessing pipeline with a single step: convert each image with ToTensor.
transform = transforms.Compose([transforms.ToTensor()])

In [36]:
# Number of samples per mini-batch, shared by the training and validation loaders.
batch_size = 10

In [37]:
# Training split of FashionMNIST stored under the current directory
# (download=True), served in shuffled mini-batches of `batch_size`.
train_loader = DataLoader(
    datasets.FashionMNIST(
        root=".",
        train=True,
        transform=transform,
        download=True,
    ),
    batch_size=batch_size,
    shuffle=True,
)

In [38]:
# Held-out split of FashionMNIST (train=False) used only for evaluation.
# Shuffling is disabled: accuracy does not depend on sample order, so a fixed
# order keeps evaluation deterministic and skips a needless permutation.
valid_loader = DataLoader(
    dataset=datasets.FashionMNIST(
        root=".",
        train=False,
        transform=transform,
        download=True,
    ),
    batch_size=batch_size,
    shuffle=False,
)

### 模型参数初始化
图片输入为28 * 28 = 784， 因为总共有10个类别，所以w维度为 784 * 10, b的维度为10

In [47]:
# Peek at one training batch and show the shape of its image tensor.
next(iter(train_loader))[0].size()

torch.Size([10, 1, 28, 28])

In [123]:
# Peek at one training batch and show its labels as a column vector.
next(iter(train_loader))[1].view(-1, 1)

tensor([[2],
        [7],
        [4],
        [5],
        [7],
        [9],
        [2],
        [9],
        [8],
        [8]])

In [109]:
# Flatten every image of one batch to 784 values and show the resulting shape.
next(iter(train_loader))[0].view(-1, 784).size()

torch.Size([10, 784])

In [58]:
# Weight matrix (784 inputs x 10 classes) sampled from a normal distribution
# with mean 0 and std 1; autograd tracks it.
# NOTE(review): a unit std over 784 inputs may give large initial logits;
# a smaller std might train more stably - confirm before changing.
w = torch.normal(mean=0.0, std=1.0, size=(784, 10), requires_grad=True)

In [59]:
# Bias vector with one zero-initialised entry per class; autograd tracks it.
b = torch.zeros((10,), requires_grad=True)

In [130]:
# Trainable parameters grouped in one list; the SGD step and the gradient
# reset in the training loop iterate over it.
params = [w, b]

### 模型定义
softmax运算会先通过exp函数对每个元素做指数运算，再对exp矩阵同行元素求和，最后令矩阵每行各元素与该行元素之和相除。这样一来，最终得到的矩阵每行元素和为1且非负。因此，该矩阵每行都是合法的概率分布。softmax运算的输出矩阵中的任意一行元素代表了一个样本在各个输出类别上的预测概率。

![image.png](attachment:c12f59f9-3e8c-44c9-8b30-34adf95abe29.png)


In [60]:
#### softmax 函数练习

In [111]:
# Random 10 x 10 matrix used as a toy input for trying out softmax.
prod_a = torch.randn(10, 10)

In [116]:
def softmax(x):  # input shape: (number of samples, number of classes)
    """Return the row-wise softmax of a 2-D tensor.

    Each row is exponentiated and divided by its own sum, so every output row
    is non-negative and sums to 1.

    The per-row maximum is subtracted before exponentiating. Softmax is
    invariant to adding a constant to a row, so the result is unchanged, but
    ``exp`` can no longer overflow to ``inf`` (which would produce ``nan``
    after the division) when the inputs are large.
    """
    # The shift is a constant per row; detaching it leaves gradients unchanged.
    row_max = x.max(dim=1, keepdim=True).values.detach()
    x_exp = torch.exp(x - row_max)
    # dim=1 sums across the columns of each row; keepdim keeps shape (rows, 1).
    part = x_exp.sum(dim=1, keepdim=True)
    return x_exp / part  # broadcasting divides every row by its own sum
    

In [117]:
# Apply softmax to the toy matrix; each row of `a` should now sum to 1.
a = softmax(prod_a)

In [118]:
# Sanity check: sum every row of the softmax output.
a.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000])

In [121]:
def net(x):
    """Softmax-regression forward pass.

    Reshapes ``x`` into rows of 784 values, applies the affine map defined by
    the module-level ``w`` and ``b``, and returns the softmax of the result.
    """
    flat = x.view(-1, 784)
    logits = flat.matmul(w) + b
    return softmax(logits)

### 损失函数定义
为了得到标签的预测概率，我们可以使用gather函数。在下面的例子中，变量y_hat是2个样本在3个类别上的预测概率，变量y是这2个样本的标签类别。通过使用gather函数，我们得到了2个样本的标签的预测概率。注意，与数学表述中标签类别离散值从1开始逐一递增不同，在代码中，标签类别的离散值是从0开始逐一递增的。

In [137]:
# Two samples, each with predicted probabilities over three classes.
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
# Class index of each sample's label (0-based).
y = torch.tensor([0, 2], dtype=torch.long)
# For every row, pick the probability stored at that row's label index.
torch.gather(y_hat, 1, y.view(-1, 1))

tensor([[0.1000],
        [0.5000]])

In [138]:
# Show the labels as a column vector, the index layout passed to gather above.
y.unsqueeze(1)

tensor([[0],
        [2]])

### 交叉熵的定义
![image.png](attachment:2646fc7e-558c-4608-b4b8-eafb5141cc41.png)

In [128]:
def cross_entropy(yhat, y):
    """Per-sample cross-entropy loss.

    ``yhat`` holds one row of class probabilities per sample and ``y`` the
    label index of each sample. The result is a column (one row per sample)
    containing the negative log of the probability assigned to the label.
    """
    # gather along dim=1 needs one column index per row, hence the reshape.
    label_idx = y.view(-1, 1)
    label_prob = yhat.gather(1, label_idx)
    return -label_prob.log()

### 计算分类准确率
给定一个类别的预测概率分布y_hat，我们把预测概率最大的类别作为输出类别。如果它与真实类别y一致，说明这次预测是正确的。分类准确率即正确预测数量与总预测数量之比。

为了演示准确率的计算，下面定义准确率accuracy函数。其中y_hat.argmax(dim=1)返回矩阵y_hat每行中最大元素的索引，且返回结果与变量y形状相同。相等条件判断式(y_hat.argmax(dim=1) == y)是一个类型为BoolTensor的Tensor（PyTorch 1.2之前为ByteTensor），我们用float()将其转换为值为0（相等为假）或1（相等为真）的浮点型Tensor。

In [129]:
def accuracy(y_hat, y):
    """Return the fraction of rows of ``y_hat`` whose arg-max equals ``y``.

    The result is a plain Python float.
    """
    predicted = y_hat.argmax(dim=1)
    hits = (predicted == y).float()  # 1.0 where the prediction matches, else 0.0
    return hits.mean().item()

In [129]:
def evaluate_accuracy(data_iter, net):
    """Return the classification accuracy of ``net`` over a whole data iterator.

    The previous version declared ``(y_hat, y)`` but looped over an undefined
    ``data_iter`` and ignored both arguments, so every call raised NameError.
    The iterator and the model are now the two positional parameters.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches, ``y`` holding the
            label index of each sample.
        net: callable mapping a batch ``X`` to one row of class scores per
            sample.

    Returns:
        Number of correct arg-max predictions divided by the number of samples.

    Raises:
        ValueError: if ``data_iter`` yields no samples.
    """
    acc_sum, n = 0.0, 0
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for X, y in data_iter:
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    if n == 0:
        raise ValueError("data_iter yielded no samples")
    return acc_sum / n

### 梯度下降

In [132]:
def SGD(params, lr, batch_size):
    """Apply one mini-batch gradient-descent step in place.

    Every tensor in ``params`` is decreased by ``lr * grad / batch_size``.
    The update goes through ``.data``, so autograd does not record it.
    """
    for p in params:
        step = lr * p.grad / batch_size
        p.data.sub_(step)

### 训练模型

In [167]:
# Train the softmax-regression model with mini-batch SGD and report, after
# every epoch, the accuracy on the training and validation loaders.
epochs = 5
lr = 0.1
lossfn = cross_entropy
for epoch in range(epochs):
    # ---- optimisation pass over the training data ----
    total_loss = 0.0
    for data, label in train_loader:
        pred = net(data)
        loss = lossfn(pred, label).sum()
        # Accumulate a plain float: adding the tensor itself would keep every
        # batch's autograd graph reachable for the whole epoch.
        total_loss += loss.item()
        loss.backward()
        # Scale by the size of this batch rather than a hard-coded 10, so a
        # different batch size or a short final batch is averaged correctly.
        SGD(params, lr, label.shape[0])
        for param in params:
            param.grad.zero_()

    # ---- evaluation passes: no gradients are needed ----
    with torch.no_grad():
        acc_sum, n = 0.0, 0
        for idx, (data, label) in enumerate(train_loader):
            guess = net(data).argmax(dim=1)  # computed once, reused below
            if idx == 0:
                print (guess)
            acc_sum += (guess == label).float().sum().item()
            n += label.shape[0]
        print (f"acc_sum: {acc_sum}, n: {n}")
        print('epoch %d, train acc %.3f, loss %.3f'
                  % (epoch + 1, acc_sum / n, total_loss / n))

        acc_sum, n = 0.0, 0
        for data, label in valid_loader:
            acc_sum += (net(data).argmax(dim=1) == label).float().sum().item()
            n += label.shape[0]
        print('epoch %d, test acc %.3f'
                  % (epoch + 1, acc_sum / n))


tensor([2, 8, 9, 0, 5, 3, 2, 3, 0, 2])
acc_sum: 49115.0, n: 60000
epoch 1, train acc 0.819, loss 0.558
epoch 1, test acc 0.797
tensor([4, 1, 4, 2, 1, 9, 8, 7, 2, 5])
acc_sum: 50347.0, n: 60000
epoch 2, train acc 0.839, loss 0.551
epoch 2, test acc 0.815
tensor([0, 8, 7, 0, 9, 9, 1, 1, 3, 5])
acc_sum: 50294.0, n: 60000
epoch 3, train acc 0.838, loss 0.550
epoch 3, test acc 0.807
tensor([6, 9, 5, 7, 5, 6, 6, 3, 7, 8])
acc_sum: 48741.0, n: 60000
epoch 4, train acc 0.812, loss 0.545
epoch 4, test acc 0.785
tensor([7, 4, 3, 0, 4, 0, 2, 0, 0, 0])
acc_sum: 50983.0, n: 60000
epoch 5, train acc 0.850, loss 0.537
epoch 5, test acc 0.825
