# Using TensorBoard with PyTorch - Deep Learning Metrics

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)

from torch.utils.tensorboard import SummaryWriter   # 用来将网络数据发送到tensorboard中

  warn(f"Failed to load image Python extension: {e}")


In [3]:
# Record the library versions this notebook was executed with
# (torch first, then torchvision, one per line).
print(torch.__version__, torchvision.__version__, sep="\n")

1.13.1
0.14.1


In [4]:
def get_num_correct(preds, labels):
    """Count the predictions in a batch that match their labels.

    Args:
        preds: tensor of per-class scores with the classes along dim 1;
            the predicted class is the argmax over that dimension.
        labels: target class indices, compared element-wise with the
            predicted classes.

    Returns:
        The number of matching positions as a plain Python number
        (the result of ``.item()`` on the summed comparison).
    """
    predicted_classes = torch.argmax(preds, dim=1)
    matches = predicted_classes == labels
    return matches.sum().item()

class Network(nn.Module):
    """Small convolutional classifier: two conv stages and three linear layers.

    Each conv stage is ``Conv2d(kernel_size=5) -> ReLU -> 2x2 max-pool``.
    The first linear layer expects ``12 * 4 * 4`` features, i.e. 12 channels
    of 4x4 feature maps after the second pooling step (what a 1-channel
    28x28 input produces). ``forward`` returns raw, un-normalised scores for
    10 classes; no softmax is applied here because the notebook trains with
    ``F.cross_entropy``, which works on raw scores.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor: 1 -> 6 -> 12 channels, 5x5 kernels.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 12, kernel_size=5)

        # Linear (fully connected / dense) head: 192 -> 120 -> 60 -> 10.
        self.fc1 = nn.Linear(12 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 60)
        self.out = nn.Linear(60, 10)

    def forward(self, t):
        """Run the network on ``t`` and return the 10 per-class scores."""
        # Conv layers carry no activation of their own, so ReLU is applied
        # explicitly. Max-pooling has no weights, so it is a plain function
        # call rather than a module attribute.
        for conv in (self.conv1, self.conv2):
            t = F.max_pool2d(F.relu(conv(t)), kernel_size=2, stride=2)

        # Flatten by hand before the linear layers.
        # NOTE: -1 lets reshape infer the leading dimension, so feature maps
        # of a different size can be folded into the batch axis instead of
        # raising an error.
        flat = t.reshape(-1, 12 * 4 * 4)

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))

        # Output layer: raw scores, softmax intentionally omitted.
        return self.out(hidden)
    
    
# FashionMNIST training split stored under ./data/FashionMNIST
# (download=True lets torchvision fetch it); every image goes through
# ToTensor before it is returned.
train_set = torchvision.datasets.FashionMNIST(
    './data/FashionMNIST',
    train=True,
    transform=transforms.Compose([transforms.ToTensor()]),
    download=True,
)

# Serve the training set in batches of 100; all other DataLoader options
# are left at their defaults.
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=100)

## Starting out with TensorBoard (Network Graph and Images)

In [5]:
# Send one batch of images and the model graph to TensorBoard.
network = Network()
images, labels = next(iter(train_loader))    # first batch from the loader
grid = torchvision.utils.make_grid(images)   # tile the batch into one image

# Using the writer as a context manager closes it even if one of the
# add_* calls raises, so no event file is left open.
with SummaryWriter() as tb:
    tb.add_image("image", grid)          # log the image grid
    tb.add_graph(network, images)        # log the network graph, traced with this batch

In [7]:
# Train a fresh network for 10 epochs, logging per-epoch scalars and the
# conv1 weight/bias/gradient histograms to TensorBoard.
network = Network()
optimizer = optim.Adam(network.parameters(), lr=0.01)   # Adam optimizer

images, labels = next(iter(train_loader))
grid = torchvision.utils.make_grid(images)

# The context manager guarantees the writer is closed even if training
# raises part-way through.
with SummaryWriter() as tb:
    tb.add_image("image", grid)         # log the image grid
    tb.add_graph(network, images)       # log the network graph

    for epoch in range(10):
        total_loss = 0
        total_correct = 0
        for batch in train_loader:      # get batch
            images, labels = batch

            preds = network(images)                 # forward pass
            loss = F.cross_entropy(preds, labels)   # calculate loss

            # Gradients accumulate across backward() calls, so clear them
            # before computing new ones.
            optimizer.zero_grad()
            loss.backward()     # calculate gradients
            optimizer.step()    # update weights

            total_loss += loss.item()
            total_correct += get_num_correct(preds, labels)

        # One scalar point per epoch for each metric.
        tb.add_scalar("Loss", total_loss, epoch)
        tb.add_scalar("Number Correct", total_correct, epoch)
        tb.add_scalar("Accuracy", total_correct / len(train_set), epoch)

        # Histograms of the first conv layer's parameters and of the
        # gradient left by the last batch of the epoch.
        tb.add_histogram("conv1.bias", network.conv1.bias, epoch)
        tb.add_histogram("conv1.weight", network.conv1.weight, epoch)
        tb.add_histogram("conv1.weight.grad", network.conv1.weight.grad, epoch)

        print("epoch", epoch, "total_correct:", total_correct, "loss:", total_loss)

epoch 0 total_correct: 47184 loss: 336.92509910464287
epoch 1 total_correct: 51538 loss: 230.06804628670216
epoch 2 total_correct: 52374 loss: 206.95529808104038
epoch 3 total_correct: 52672 loss: 198.8001050800085
epoch 4 total_correct: 52965 loss: 190.70277906954288
epoch 5 total_correct: 53239 loss: 184.03623577952385
epoch 6 total_correct: 53300 loss: 180.3358246088028
epoch 7 total_correct: 53453 loss: 178.6155837327242
epoch 8 total_correct: 53520 loss: 176.7629645317793
epoch 9 total_correct: 53662 loss: 173.8362509161234


In [8]:
# Train a fresh network for one epoch and log a histogram for every
# parameter tensor (and its gradient) instead of hand-picking conv1.
network = Network()
optimizer = optim.Adam(network.parameters(), lr=0.01)   # Adam optimizer

images, labels = next(iter(train_loader))
grid = torchvision.utils.make_grid(images)

# The context manager guarantees the writer is closed even if training
# raises part-way through.
with SummaryWriter() as tb:
    tb.add_image("image", grid)         # log the image grid
    tb.add_graph(network, images)       # log the network graph

    for epoch in range(1):
        total_loss = 0
        total_correct = 0
        for batch in train_loader:      # get batch
            images, labels = batch

            preds = network(images)                 # forward pass
            loss = F.cross_entropy(preds, labels)   # calculate loss

            # Gradients accumulate across backward() calls, so clear them
            # before computing new ones.
            optimizer.zero_grad()
            loss.backward()     # calculate gradients
            optimizer.step()    # update weights

            total_loss += loss.item()
            total_correct += get_num_correct(preds, labels)

        # One scalar point per epoch for each metric.
        tb.add_scalar("Loss", total_loss, epoch)
        tb.add_scalar("Number Correct", total_correct, epoch)
        tb.add_scalar("Accuracy", total_correct / len(train_set), epoch)

        # Histogram of every layer's weights/biases and their gradients,
        # keyed by the parameter's qualified name (e.g. "conv1.weight").
        for name, weight in network.named_parameters():
            tb.add_histogram(name, weight, epoch)
            tb.add_histogram(f"{name}.grad", weight.grad, epoch)

        print("epoch", epoch, "total_correct:", total_correct, "loss:", total_loss)

epoch 0 total_correct: 46521 loss: 353.7856948375702


In [9]:
# List every named parameter tensor of the network together with its shape.
for name, weight in network.named_parameters():
    print(f"{name} {weight.shape}")

conv1.weight torch.Size([6, 1, 5, 5])
conv1.bias torch.Size([6])
conv2.weight torch.Size([12, 6, 5, 5])
conv2.bias torch.Size([12])
fc1.weight torch.Size([120, 192])
fc1.bias torch.Size([120])
fc2.weight torch.Size([60, 120])
fc2.bias torch.Size([60])
out.weight torch.Size([10, 60])
out.bias torch.Size([10])


In [10]:
# List the gradient shape of every named parameter; relies on a backward
# pass having populated .grad (the previous training cell does that).
for name, weight in network.named_parameters():
    print(f"{name}.grad {weight.grad.shape}")

conv1.weight.grad torch.Size([6, 1, 5, 5])
conv1.bias.grad torch.Size([6])
conv2.weight.grad torch.Size([12, 6, 5, 5])
conv2.bias.grad torch.Size([12])
fc1.weight.grad torch.Size([120, 192])
fc1.bias.grad torch.Size([120])
fc2.weight.grad torch.Size([60, 120])
fc2.bias.grad torch.Size([60])
out.weight.grad torch.Size([10, 60])
out.bias.grad torch.Size([10])


# Hyperparameter Tuning

In [11]:
from itertools import product

In [12]:
# Hyperparameter search space: each key maps to the list of candidate
# values to try for that setting.
parameters = {
    "lr": [0.01, 0.001],
    "batch_size": [10, 100, 1000],
    "shuffle": [True, False],
}

In [13]:
# Collect the candidate-value lists (in the dict's key order) into a list
# so they can be unpacked into itertools.product.
param_values = list(parameters.values())
param_values

[[0.01, 0.001], [10, 100, 1000], [True, False]]

In [14]:
# Enumerate every combination of the candidate values (Cartesian product),
# one combination per line.
for lr, batch_size, shuffle in product(*param_values):
    print(f"{lr} {batch_size} {shuffle}")

0.01 10 True
0.01 10 False
0.01 100 True
0.01 100 False
0.01 1000 True
0.01 1000 False
0.001 10 True
0.001 10 False
0.001 100 True
0.001 100 False
0.001 1000 True
0.001 1000 False


In [17]:
# Hyperparameter sweep: train a fresh network for 5 epochs for every
# (lr, batch_size, shuffle) combination and log each run to its own
# TensorBoard run directory.
for lr, batch_size, shuffle in product(*param_values):
    comment = f"batch_size={batch_size} lr={lr} shuffle={shuffle}"

    # training process
    network = Network()
    # Pass `shuffle` through to the loader; otherwise the shuffle=True and
    # shuffle=False runs would train on identically ordered data and differ
    # only in their run name.
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=shuffle
    )
    optimizer = optim.Adam(network.parameters(), lr=lr)

    images, labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)

    # SummaryWriter uses `comment` in the run name, so every run is
    # identifiable by its hyperparameter values. The context manager closes
    # the writer at the end of *each* run (not just the last one), and also
    # when training raises.
    with SummaryWriter(comment=comment) as tb:
        tb.add_image("image", grid)         # log the image grid
        tb.add_graph(network, images)       # log the network graph

        for epoch in range(5):
            total_loss = 0
            total_correct = 0
            for batch in train_loader:      # get batch
                images, labels = batch
                preds = network(images)                 # forward pass
                loss = F.cross_entropy(preds, labels)   # calculate loss (batch mean)

                # Gradients accumulate across backward() calls, so clear
                # them before computing new ones.
                optimizer.zero_grad()
                loss.backward()     # calculate gradients
                optimizer.step()    # update weights

                # cross_entropy returns the mean over the batch; scale by
                # the number of samples actually in this batch so totals
                # are comparable across batch sizes and stay correct if the
                # final batch is shorter than batch_size.
                total_loss += loss.item() * labels.size(0)
                total_correct += get_num_correct(preds, labels)

            # One scalar point per epoch for each metric.
            tb.add_scalar("Loss", total_loss, epoch)
            tb.add_scalar("Number Correct", total_correct, epoch)
            tb.add_scalar("Accuracy", total_correct / len(train_set), epoch)

            # Histogram of every parameter tensor and its gradient.
            for name, weight in network.named_parameters():
                tb.add_histogram(name, weight, epoch)
                tb.add_histogram(f"{name}.grad", weight.grad, epoch)

            print("epoch", epoch, "total_correct:", total_correct, "loss:", total_loss)

epoch 0 total_correct: 45495 loss: 38554.31323544588
epoch 1 total_correct: 48130 loss: 32533.88545371301
epoch 2 total_correct: 48428 loss: 32515.408893563435
epoch 3 total_correct: 48587 loss: 32911.61459766561
epoch 4 total_correct: 48615 loss: 32486.069676255574
epoch 0 total_correct: 46429 loss: 36786.75653106999
epoch 1 total_correct: 48587 loss: 32004.350982231117
epoch 2 total_correct: 48838 loss: 31077.62347409269
epoch 3 total_correct: 48821 loss: 31543.936647808878
epoch 4 total_correct: 49104 loss: 31230.010721804574
epoch 0 total_correct: 45676 loss: 37385.97422838211
epoch 1 total_correct: 50726 loss: 25126.520177721977
epoch 2 total_correct: 51479 loss: 23055.81790059805
epoch 3 total_correct: 51792 loss: 22320.764541625977
epoch 4 total_correct: 52071 loss: 21543.058294057846
epoch 0 total_correct: 47617 loss: 33181.12445324659
epoch 1 total_correct: 51632 loss: 22732.87554383278
epoch 2 total_correct: 52338 loss: 20794.487385451794
epoch 3 total_correct: 52644 loss: 20