In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter


In [2]:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
        
        self.fc1 = nn.Linear(in_features=300, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    
    def forward(self, x):
#         1ayer 1
        t = self.conv1(x)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)
        
#         layer 2
        t = self.conv2(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)
        
#         layer 3
        t = t.reshape(-1, 300)
        t = self.fc1(t)
        t = F.relu(t)
        
#         layer 4
        t = self.fc2(t)
        t = F.relu(t)
        
#         layer 5
        t = self.out(t)
    
        return t

In [4]:
def get_num_correct(pred, labels):
    return pred.argmax(dim=1).eq(labels).sum().item()

In [5]:
batch_size = 100
lr = 0.01

network = Network()

trainset = torchvision.datasets.CIFAR10("../pytorch_test/testdata", 
                                        train=False, 
                                        download=False, 
                                        transform=torchvision.transforms.ToTensor())
loader = DataLoader(trainset, batch_size=batch_size)

optimizer = optim.Adam(network.parameters(), lr=lr)

In [24]:
comment = f'batch_size {batch_size} lr {lr}'
tb = SummaryWriter(comment=comment)

### 多轮超参数组合

In [2]:
from itertools import product

from collections import namedtuple
from collections import OrderedDict
import time

import pandas as pd

from IPython.display import clear_output, display


In [20]:
# 不同超参数组合，利用tensorboard方便调参
param_dict = dict(
    batch_size = [10, 100],
    lr = [0.01, 0.001]
)
param_dict

{'batch_size': [10, 100], 'lr': [0.01, 0.001]}

In [25]:
param_list = [v for v in param_dict.values()]
param_list

[[10, 100], [0.01, 0.001]]

In [26]:
epochs = 3

for batch_size, lr in product(*param_list):
    network = Network()
    
    loader = DataLoader(trainset, batch_size=batch_size)
    optimizer = optim.Adam(network.parameters(), lr=lr)

    comment = f'batch_size {batch_size} lr {lr}'
    tb = SummaryWriter(comment=comment)

#     comment = f'batch_size:{batch_size} lr={lr}'
#     tb = SummaryWriter(log_dir=".\\runs\\2024-4-15C", comment=comment)

    for epoch in range(epochs):
        total_correct = 0
        total_loss = 0

        for batch in loader:
            images, labels = batch

            preds = network(images)

            loss = F.cross_entropy(preds, labels)

        #     计算梯度之前，由于梯度会累积，要清零
            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

    #         使不同batch_size的loss可比较
            total_loss += loss.item() * batch_size

            total_correct += get_num_correct(preds, labels)

        # add info to tb
        tb.add_scalar("Loss", total_loss, epoch)
        tb.add_scalar("Number_Correct", total_correct, epoch)
        tb.add_scalar("Accuracy", total_correct / len(trainset), epoch)

    #     tb.add_histogram("conv1.weight", network.conv1.weight, epoch)
    #     tb.add_histogram("conv1.weight.grad", network.conv1.weight.grad, epoch)
    #     tb.add_histogram("conv1.bias", network.conv1.bias, epoch)

    #   del hard code
        for name, weight in network.named_parameters():
            tb.add_histogram(name, weight, epoch)
            tb.add_histogram(f'{name}.grad', weight.grad, epoch)


        print(f'epoch {epoch}: total_correct:{total_correct}， total_loss: {total_loss}')

epoch 0: total_correct:1000， total_loss: 23060.205283164978
epoch 1: total_correct:1000， total_loss: 23060.205283164978
epoch 2: total_correct:1000， total_loss: 23060.205283164978
epoch 0: total_correct:1000， total_loss: 23053.6079788208
epoch 1: total_correct:1000， total_loss: 23053.6079788208
epoch 2: total_correct:1000， total_loss: 23053.6079788208
epoch 0: total_correct:1000， total_loss: 23051.1855840683
epoch 1: total_correct:1000， total_loss: 23051.1855840683
epoch 2: total_correct:1000， total_loss: 23051.1855840683
epoch 0: total_correct:1000， total_loss: 23039.924359321594
epoch 1: total_correct:1000， total_loss: 23039.924359321594
epoch 2: total_correct:1000， total_loss: 23039.924359321594


### 封装类，优化代码

In [7]:
class RunManager:
    def __init__(self):
#         self.epoch_num = 0 # 总数
        self.epoch_count = 0 # 当前epoch
        self.epoch_loss = 0
        self.epoch_correct_num = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        
        self.network = None
        self.loader = None        
        self.tb = None
    
    def begin_run(self, run, network, loader):
        self.run_params = run
        self.run_count += 1
        self.run_start_time = time.time() 
        
        self.network = network
        self.loader = loader
        #     comment：tensorboard文件目录后缀，用于区分不同超参数组合
        self.tb = SummaryWriter(comment=f"-{run}")
        
    def end_run(self):
        self.tb.close()
        self.epoch_count = 0
        
    def begin_epoch(self):
        self.epoch_count += 1
        self.epoch_start_time = time.time()
        self.epoch_loss = 0
        self.epoch_correct_num = 0
        
    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        acc = self.epoch_correct_num / len(self.loader.dataset)
        loss = self.epoch_loss / len(self.loader.dataset)
        
        self.tb.add_scalar("Accuracy", acc, self.epoch_count)
        self.tb.add_scalar("Loss", loss, self.epoch_count)
        
        for name, weight in self.network.named_parameters():
            self.tb.add_histogram(name, weight, self.epoch_count)
            self.tb.add_histogram(f'{name}.grad', weight.grad, self.epoch_count)
            
        result = OrderedDict()
        result['run'] = self.run_count
        result['epoch'] = self.epoch_count
        result['acc'] = acc
        result['loss'] = loss
        result['run_duration'] = run_duration
        result['epoch_duration'] = epoch_duration
        for k, v in self.run_params._asdict().items():
            result[k] = v
        
        self.run_data.append(result)
        
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')
        
        # notebook使用
        clear_output(wait=True)
        display(df)
        
        
    def track_loss(self, loss):
        self.epoch_loss += loss.item() * self.loader.batch_size
    
    def track_correct(self, preds, labels):
        self.epoch_correct_num += self._get_num_correct(preds, labels)
    
#     @torch.no_grad() #推理阶段使用
    def _get_num_correct(self, pred, labels):
        return pred.argmax(dim=1).eq(labels).sum().item()
        
    
    def save(self, filename):
#         保存为csv
        pd.DataFrame.from_dict(
            self.run_data,
            orient='columns'
        ).to_csv(f'{filename}.csv')
    
#         保存为json
#         with open(f'{filename}.json', 'w', encoding='utf-8') as f:
#             json.dump(self.run_data, f, ensure_ascii=False, indent=4)
    

In [8]:
# 构建RunBuilder，方便超参数元组获取, 最外层循环

class RunBuilder:
    @staticmethod
    def get_runs(params):
        Run = namedtuple('Run', params.keys())
        
        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))
        
        return runs

In [6]:
# 不同超参数组合，利用tensorboard方便调参
param_dict = OrderedDict(
    batch_size = [10, 100],
    lr = [0.01, 0.001]
)
param_dict

OrderedDict([('batch_size', [10, 100]), ('lr', [0.01, 0.001])])

In [7]:
trainset = torchvision.datasets.CIFAR10("../pytorch_test/testdata", 
                                        train=False, 
                                        download=False, 
                                        transform=torchvision.transforms.ToTensor())

In [8]:
epochs = 3 # 也可加入 param_dict

r = RunManager()

for run in RunBuilder.get_runs(param_dict):
    network = Network() # network 权重随机初始化，在测试不同超参数时不符合控制变量法
    loader = DataLoader(trainset, batch_size=run.batch_size)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    r.begin_run(run, network, loader)
    for epoch in range(epochs):
        
        r.begin_epoch()
        for batch in loader:
            images, labels = batch
            preds = network(images)
            loss = F.cross_entropy(preds, labels)

        #     计算梯度之前，由于梯度会累积，要清零
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            r.track_loss(loss)
            r.track_correct(preds, labels)

        r.end_epoch()
#         print(f'epoch {epoch}: total_correct:{total_correct}， total_loss: {total_loss}')
    
    r.end_run()

# 保存文件
r.save('first_trial')

Unnamed: 0,run,epoch,acc,loss,run_duration,epoch_duration,batch_size,lr
0,1,1,0.0957,2.305632,3.949818,3.947663,10,0.01
1,1,2,0.0977,2.305288,8.986098,4.973881,10,0.01
2,1,3,0.0977,2.305291,13.805143,4.772722,10,0.01
3,2,1,0.2208,2.070462,4.154894,4.152935,10,0.001
4,2,2,0.3308,1.813963,8.785284,4.549803,10,0.001
5,2,3,0.385,1.658225,13.372157,4.500383,10,0.001
6,3,1,0.1187,2.290103,2.360542,2.358541,100,0.01
7,3,2,0.2377,2.020378,4.907425,2.470867,100,0.01
8,3,3,0.2913,1.891319,7.449193,2.465861,100,0.01
9,4,1,0.1657,2.186775,2.511396,2.508522,100,0.001


In [11]:
pd.DataFrame.from_dict(r.run_data).sort_values('acc', ascending=False)

Unnamed: 0,run,epoch,acc,loss,run_duration,epoch_duration,batch_size,lr
5,2,3,0.385,1.658225,13.372157,4.500383,10,0.001
4,2,2,0.3308,1.813963,8.785284,4.549803,10,0.001
11,4,3,0.3173,1.853072,7.623313,2.41689,100,0.001
8,3,3,0.2913,1.891319,7.449193,2.465861,100,0.01
10,4,2,0.2781,1.962341,5.117784,2.530179,100,0.001
7,3,2,0.2377,2.020378,4.907425,2.470867,100,0.01
3,2,1,0.2208,2.070462,4.154894,4.152935,10,0.001
9,4,1,0.1657,2.186775,2.511396,2.508522,100,0.001
6,3,1,0.1187,2.290103,2.360542,2.358541,100,0.01
1,1,2,0.0977,2.305288,8.986098,4.973881,10,0.01


In [10]:
# 上图中可以发现最佳组合：Run(batch_size=10, lr=0.001)
run = RunBuilder.get_runs(param_dict)
run

[Run(batch_size=10, lr=0.01),
 Run(batch_size=10, lr=0.001),
 Run(batch_size=100, lr=0.01),
 Run(batch_size=100, lr=0.001)]

### RunBuilder test

In [13]:
param_dict = dict(
    batch_size=[10, 100],
    lr=[0.1, 0.01],
    shuffle=[True, False]
)

In [16]:
param_list = [v for v in param_dict.values()]
param_list

[[10, 100], [0.1, 0.01], [True, False]]

In [18]:
for batch_size, lr, shuffle in product(*param_list):
    print(f'{batch_size}, {lr}, {shuffle}')

10, 0.1, True
10, 0.1, False
10, 0.01, True
10, 0.01, False
100, 0.1, True
100, 0.1, False
100, 0.01, True
100, 0.01, False


In [5]:
param_dict = OrderedDict(
    batch_size=[10, 100],
    lr=[0.1, 0.01],
    shuffle=[True, False]
)

In [34]:
param_dict.keys()

odict_keys(['batch_size', 'lr', 'shuffle'])

In [6]:
# 构建RunBuilder，方便超参数元组获取

class RunBuilder:
    @staticmethod
    def get_runs(params):
        Run = namedtuple('Run', params.keys())
        
        runs = []
        for v in product(*params.values()):
            print(v)
            print("===")
            print(*v)
            print("=====")
            runs.append(Run(*v))
        
        return runs

In [7]:
runs = RunBuilder.get_runs(param_dict)

(10, 0.1, True)
===
10 0.1 True
=====
(10, 0.1, False)
===
10 0.1 False
=====
(10, 0.01, True)
===
10 0.01 True
=====
(10, 0.01, False)
===
10 0.01 False
=====
(100, 0.1, True)
===
100 0.1 True
=====
(100, 0.1, False)
===
100 0.1 False
=====
(100, 0.01, True)
===
100 0.01 True
=====
(100, 0.01, False)
===
100 0.01 False
=====


In [11]:
runs[0]._asdict()

{'batch_size': 10, 'lr': 0.1, 'shuffle': True}

In [33]:
runs

[Run(batch_size=10, lr=0.1, shuffle=True),
 Run(batch_size=10, lr=0.1, shuffle=False),
 Run(batch_size=10, lr=0.01, shuffle=True),
 Run(batch_size=10, lr=0.01, shuffle=False),
 Run(batch_size=100, lr=0.1, shuffle=True),
 Run(batch_size=100, lr=0.1, shuffle=False),
 Run(batch_size=100, lr=0.01, shuffle=True),
 Run(batch_size=100, lr=0.01, shuffle=False)]

In [35]:
for run in runs:
    print(run.batch_size)
    print(run.lr)
    print(run.shuffle)

10
0.1
True
10
0.1
False
10
0.01
True
10
0.01
False
100
0.1
True
100
0.1
False
100
0.01
True
100
0.01
False


### Normalize(Standardize 更具体的数据缩放变换)

In [5]:
# 数据预处理阶段，读取trainset，计算mean，std，之后训练时重新加载trainset，这里给出假设值，测试重新加载
mean = 0.
std = 1.

trainset = torchvision.datasets.CIFAR10("../pytorch_test/testdata", 
                                        train=False, 
                                        download=False, 
                                        transform=torchvision.transforms.Compose([
                                            torchvision.transforms.ToTensor(),
                                            torchvision.transforms.Normalize(mean, std)
                                        ]))

### Sequential Module

In [7]:
layers = OrderedDict([
    ('flat', nn.Flatten(start_dim=1)),
    ('hidden', nn.Linear(in_features=784, out_features=392)),
    ('output', nn.Linear(in_features=392, out_features=10))
])

In [8]:
network2 = nn.Sequential(layers)
network2

Sequential(
  (flat): Flatten(start_dim=1, end_dim=-1)
  (hidden): Linear(in_features=784, out_features=392, bias=True)
  (output): Linear(in_features=392, out_features=10, bias=True)
)

In [9]:
# 最初定义的Network类，可使用nn.Sequential构建等价的网络

network_seq1 = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=300, out_features=120),
    nn.ReLU(),
    nn.Linear(in_features=120, out_features=60),
    nn.ReLU(),
    nn.Linear(in_features=60, out_features=10)
)
network_seq1

Sequential(
  (0): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (4): ReLU()
  (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): Flatten(start_dim=1, end_dim=-1)
  (7): Linear(in_features=300, out_features=120, bias=True)
  (8): ReLU()
  (9): Linear(in_features=120, out_features=60, bias=True)
  (10): ReLU()
  (11): Linear(in_features=60, out_features=10, bias=True)
)

In [10]:
# 使用OrderedDict 创建，指定每层的名字

seq2_layers = OrderedDict([
    ('conv1', nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)),
    ('relu1', nn.ReLU()),
    ('maxPool1', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('conv2', nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)),
    ('relu2', nn.ReLU()),
    ('maxPool2', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('flatten', nn.Flatten(start_dim=1)),
    ('lin1', nn.Linear(in_features=300, out_features=120)),
    ('relu3', nn.ReLU()),
    ('lin2', nn.Linear(in_features=120, out_features=60)),
    ('relu4', nn.ReLU()),
    ('output', nn.Linear(in_features=60, out_features=10))
    
])

network_seq2 = nn.Sequential(seq2_layers)
network_seq2

Sequential(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxPool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxPool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lin1): Linear(in_features=300, out_features=120, bias=True)
  (relu3): ReLU()
  (lin2): Linear(in_features=120, out_features=60, bias=True)
  (relu4): ReLU()
  (output): Linear(in_features=60, out_features=10, bias=True)
)

In [11]:
# 增加 Batch_norm

seq_batch_layers = OrderedDict([
    ('conv1', nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)),
    ('relu1', nn.ReLU()),
    ('maxPool1', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('batchNorm1', nn.BatchNorm2d(6)),
    
    ('conv2', nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)),
    ('relu2', nn.ReLU()),
    ('maxPool2', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('flatten', nn.Flatten(start_dim=1)),
    ('lin1', nn.Linear(in_features=300, out_features=120)),
    ('relu3', nn.ReLU()),
    
    ('batchNorm2', nn.BatchNorm1d(120)),
    
    ('lin2', nn.Linear(in_features=120, out_features=60)),
    ('relu4', nn.ReLU()),
    ('output', nn.Linear(in_features=60, out_features=10))
    
])

network_seq3 = nn.Sequential(seq_batch_layers)
network_seq3

Sequential(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxPool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (batchNorm1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxPool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lin1): Linear(in_features=300, out_features=120, bias=True)
  (relu3): ReLU()
  (batchNorm2): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin2): Linear(in_features=120, out_features=60, bias=True)
  (relu4): ReLU()
  (output): Linear(in_features=60, out_features=10, bias=True)
)

In [12]:
net_dict = {
    'network1': network_seq2,
    'network2': network_seq3
}
net_dict

{'network1': Sequential(
   (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
   (relu1): ReLU()
   (maxPool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
   (relu2): ReLU()
   (maxPool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   (flatten): Flatten(start_dim=1, end_dim=-1)
   (lin1): Linear(in_features=300, out_features=120, bias=True)
   (relu3): ReLU()
   (lin2): Linear(in_features=120, out_features=60, bias=True)
   (relu4): ReLU()
   (output): Linear(in_features=60, out_features=10, bias=True)
 ),
 'network2': Sequential(
   (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
   (relu1): ReLU()
   (maxPool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   (batchNorm1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
   (r

In [14]:
list(net_dict.keys())

['network1', 'network2']

In [4]:
data = torch.ones(size=(2, 2, 3, 4))
data[0][0][0][0] = 25
print("data = ", data)

data =  tensor([[[[25.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.]],

         [[ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.]]],


        [[[ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.]],

         [[ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.]]]])


In [9]:
x = torch.cat((data[0][0], data[1][0]), dim=1)
x.shape

torch.Size([3, 8])

In [10]:
x_mean = torch.Tensor.mean(x)
x_var = torch.Tensor.var(x, False)
print(x_mean)
print(x_var)

tensor(2.)
tensor(23.)


### batch_norm 测试

In [3]:
layers_normal = OrderedDict([
    ('conv1', nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)),
    ('relu1', nn.ReLU()),
    ('maxPool1', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('conv2', nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)),
    ('relu2', nn.ReLU()),
    ('maxPool2', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('flatten', nn.Flatten(start_dim=1)),
    ('lin1', nn.Linear(in_features=300, out_features=120)),
    ('relu3', nn.ReLU()),
    ('lin2', nn.Linear(in_features=120, out_features=60)),
    ('relu4', nn.ReLU()),
    ('output', nn.Linear(in_features=60, out_features=10))
    
])

In [4]:
layers_with_batch = OrderedDict([
    ('conv1', nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)),
    ('relu1', nn.ReLU()),
    ('maxPool1', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('batchNorm1', nn.BatchNorm2d(6)),
    
    ('conv2', nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)),
    ('relu2', nn.ReLU()),
    ('maxPool2', nn.MaxPool2d(kernel_size=2, stride=2)),
    
    ('flatten', nn.Flatten(start_dim=1)),
    ('lin1', nn.Linear(in_features=300, out_features=120)),
    ('relu3', nn.ReLU()),
    
    ('batchNorm2', nn.BatchNorm1d(120)),
    
    ('lin2', nn.Linear(in_features=120, out_features=60)),
    ('relu4', nn.ReLU()),
    ('output', nn.Linear(in_features=60, out_features=10))
    
])

In [18]:
class NetworkFactory:
    @staticmethod
    def get_factory(name):
        if name == "network":
            torch.manual_seed(50)
            return nn.Sequential(layers_normal)
        elif name == "network_with_batchnorm":
            torch.manual_seed(50)
            return nn.Sequential(layers_with_batch)
        else:
            return None
        

In [9]:
# 不同超参数组合，利用tensorboard方便调参
param_dict = OrderedDict(
    batch_size = [10],
    lr = [0.001],
    network = ['network', 'network_with_batchnorm']
)
param_dict

OrderedDict([('batch_size', [10]),
             ('lr', [0.001]),
             ('network', ['network', 'network_with_batchnorm'])])

In [17]:
param_dict.values()

odict_values([[10], [0.001], ['network', 'network_with_batchnorm']])

In [16]:
for x in product(*param_dict.values()):
    print(x)

(10, 0.001, 'network')
(10, 0.001, 'network_with_batchnorm')


In [19]:
trainset = torchvision.datasets.CIFAR10("../pytorch_test/testdata", 
                                        train=False, 
                                        download=False, 
                                        transform=torchvision.transforms.ToTensor())

In [20]:
loader = DataLoader(trainset, batch_size=len(trainset), num_workers=1)

In [23]:
# 等价写法
# data = next(iter(loader))
# data[0].mean(), data[0].std()

images, labels = next(iter(loader))
images.mean(), images.std()

(tensor(0.4766), tensor(0.2512))

In [24]:
mean = images.mean()
std = images.std()

In [26]:
images.shape

torch.Size([10000, 3, 32, 32])

In [27]:
images.numel()

30720000

In [None]:
# trainset = torchvision.datasets.CIFAR10("../pytorch_test/testdata", 
#                                         train=False, 
#                                         download=False, 
#                                         transform=torchvision.transforms.ToTensor())
# loader = DataLoader(trainset, batch_size=len(trainset), num_workers=1)
# images, labels = next(iter(loader))

# mean = images.mean()
# std = images.std()



# pixel_total = images.numel()

# images, labels = next(iter(loader))
# mean = images.sum() / pixel_total
# powsum = (images - mean).pow(2).sum()

# std = torch.sqrt(powsum / pixel_total)

In [28]:
trainset_normal = torchvision.datasets.CIFAR10("../pytorch_test/testdata", 
                                        train=False, 
                                        download=False, 
                                        transform=torchvision.transforms.Compose([
                                            torchvision.transforms.ToTensor(),
                                            torchvision.transforms.Normalize(mean, std)
                                        ]))

In [29]:
epochs = 3

r = RunManager()

for run in RunBuilder.get_runs(param_dict):
    network = NetworkFactory.get_factory(run.network)
    loader = DataLoader(trainset_normal, batch_size=run.batch_size)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    r.begin_run(run, network, loader)
    for epoch in range(epochs):
        
        r.begin_epoch()
        for batch in loader:
            images, labels = batch
            preds = network(images)
            loss = F.cross_entropy(preds, labels)

        #     计算梯度之前，由于梯度会累积，要清零
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            r.track_loss(loss)
            r.track_correct(preds, labels)

        r.end_epoch()
#         print(f'epoch {epoch}: total_correct:{total_correct}， total_loss: {total_loss}')
    
    r.end_run()

# 保存文件
r.save('batch_compare')

Unnamed: 0,run,epoch,acc,loss,run_duration,epoch_duration,batch_size,lr,network
0,1,1,0.3259,1.825468,4.553105,4.551018,10,0.001,network
1,1,2,0.4309,1.55561,9.347095,4.688157,10,0.001,network
2,1,3,0.4855,1.417843,14.191598,4.762349,10,0.001,network
3,2,1,0.3414,1.79544,4.944481,4.942454,10,0.001,network_with_batchnorm
4,2,2,0.4357,1.553575,10.358989,5.312581,10,0.001,network_with_batchnorm
5,2,3,0.4785,1.457439,15.701468,5.232157,10,0.001,network_with_batchnorm


In [32]:
pd.DataFrame.from_dict(r.run_data).sort_values('acc', ascending=False)

Unnamed: 0,run,epoch,acc,loss,run_duration,epoch_duration,batch_size,lr,network
2,1,3,0.4855,1.417843,14.191598,4.762349,10,0.001,network
5,2,3,0.4785,1.457439,15.701468,5.232157,10,0.001,network_with_batchnorm
4,2,2,0.4357,1.553575,10.358989,5.312581,10,0.001,network_with_batchnorm
1,1,2,0.4309,1.55561,9.347095,4.688157,10,0.001,network
3,2,1,0.3414,1.79544,4.944481,4.942454,10,0.001,network_with_batchnorm
0,1,1,0.3259,1.825468,4.553105,4.551018,10,0.001,network


In [31]:
# for run in RunBuilder.get_runs(param_dict):
#     network = NetworkFactory.get_factory(run.network)
#     print(network)
#     print("=========")

Sequential(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxPool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxPool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lin1): Linear(in_features=300, out_features=120, bias=True)
  (relu3): ReLU()
  (lin2): Linear(in_features=120, out_features=60, bias=True)
  (relu4): ReLU()
  (output): Linear(in_features=60, out_features=10, bias=True)
)
Sequential(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxPool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (batchNorm1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxPool2): MaxPool2d(kernel_s

In [36]:
for run in RunBuilder.get_runs(param_dict):
    print(run.batch_size)
    loader = DataLoader(trainset_normal, batch_size=run.batch_size)
    print(len(loader)) # 有几个batch
    print(loader.num_workers)
    print(loader.batch_size)
    print(len(loader.dataset)) # 有多少数据
    print("=========")

10
1000
0
10
10000
10
1000
0
10
10000


### 多头注意力

In [4]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiheadAttention).__init__()
        
        self.d_model = d_model
        self.n_head = n_head
        
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        
        self.softmax = nn.Softmax(dim=-1)
    
    def forward(self, q, k, v):
        batch, time, dimension = q.shape
        
        n_d = self.d_model / self.n_head
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        
        q = q.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)     
        v = v.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        
        score = q @ k.transpose(2, 3) / math.sqrt(n_d)
        mask = torch.tril(torch.ones(time, time, dtype=bool))
        score = score.masked_fill(mask == 0, float("-inf"))
        score = self.softmax(score) @ v
        
        # 连接多个头
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, time, dimension)
        
        # 输入到最后的线性层
        output = self.w_combine(score)
        
        return output

In [13]:
ts = torch.tensor([[1., 2, 3], [4,5,6], [7,8,9]])
ts.dtype

torch.float32

In [14]:
mask = torch.tril(torch.ones(3, 3, dtype=bool))
mask

tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])

In [15]:
ts.masked_fill(mask == 0, float("-inf"))

tensor([[1., -inf, -inf],
        [4., 5., -inf],
        [7., 8., 9.]])

In [16]:
ts

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])

In [27]:
ts = torch.Tensor(2, 3, 4, 5)
ts.shape

torch.Size([2, 3, 4, 5])

In [29]:
ts.transpose(0, 1).shape

torch.Size([3, 2, 4, 5])

In [30]:
ts.transpose(0, 2).shape

torch.Size([4, 3, 2, 5])

In [31]:
ts.transpose(0, 3).shape

torch.Size([5, 3, 4, 2])

# Transformer 学习

# 嵌入表示层

In [3]:
# 原始序列不具有token之间的相对位置信息
# 最初单词（token）表示和位置编码相加，利用三角函数的性质加入单词之间的距离信息

class PositionalEncoder(nn.Module):
    def __init__(self, dmodel, max_seq_len=80):
        super().__init__()
        self.dmodel = dmodel
        
        # 根据 pos 和 i 创建一个常量 PE 矩阵
        pe = torch.zeros(max_seq_len, dmodel)
        for pos in range(max_seq_len):
            for i in range(0, dmodel, 2):
                pe[pos, i] = math.sin(pos / (10000 ** (2 * i / dmodel)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** (2 * (i + 1) / dmodel)))
        
#         在最外层（第0维度）增加一个维度，以适应batch
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x * math.sqrt(self.dmodel)
        seq_len = x.size(1)
        x = x + Variable(self.pe[:, :seq_len], requires_grad=False).cuda()
        return x


In [12]:
ts = torch.tensor([[1., 2.], [4,5], [7,8]])
ts

tensor([[1., 2.],
        [4., 5.],
        [7., 8.]])

In [14]:
ts.size(0)

3

# 多头注意力层

In [16]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
        
    def attention(q, k, v, d_k, mask=None, dropout=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        # 掩盖掉那些为了填补长度增加的单元，使其通过 softmax 计算后为 0
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
            scores = F.softmax(scores, dim=-1)
        if dropout is not None:
            scores = dropout(scores)
            output = torch.matmul(scores, v)
        return output
    
    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # 进行线性操作划分为成 h 个头
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
        
        # 矩阵转置
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        # 计算 attention
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # 连接多个头并输入到最后的线性层
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output

# 位置感知前馈层

In [18]:
class FeedForward(nn.Module):
    def __init__(self, dmodel, d_ff=2048, dropout=0.1):
        super().__init__()
        
        self.linear1 = nn.Linear(dmodel, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, dmodel)
    
    def forward(self, x):
        x = self.dropout(F.relu(self.linear1(x)))
        x = self.linear2(x)
        return x

# 残差连接与层归一化

In [19]:
class NormLayer(nn.Module):
    def __init__(self, dmodel, eps=1e-6):
        super().__init__()
        
        self.size = dmodel
        # 层归一化包含两个可学习的参数
        self.alpha = nn.Parameter(torch.ones(dmodel))
        self.bias = nn.Parameter(torch.zeros(dmodel))
        
        self.eps = eps
    
    # x 是残差连接后的tensor
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [20]:
ts = torch.tensor([[1., 2.], [4,5], [7,8]])
ts

tensor([[1., 2.],
        [4., 5.],
        [7., 8.]])

In [28]:
ts.std(dim=-1, keepdim=True)

tensor([[0.7071],
        [0.7071],
        [0.7071]])

# Transformer完整架构代码

In [29]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
    
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)
        
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)
    

In [30]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        
    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)
    
    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [31]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)
        
    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output