## 1. subsequent_mask

> 生成掩码矩阵，用于在解码时遮盖后续位置。返回形状为 (1, size, size) 的布尔型 torch.Tensor，其中 True 表示解码时可以看到的位置，False 表示被遮蔽的位置；可以看到，随着解码的进行，每一步可见的位置逐渐增加。

In [2]:
import numpy as np
import torch

In [3]:
def subsequent_mask(size):
    """Build the look-ahead mask for a sequence of length ``size``.

    Returns a boolean tensor of shape ``(1, size, size)``. Entry
    ``[0, i, j]`` is True when ``j <= i`` (position ``i`` may look at
    position ``j``) and False for every later position.
    """
    shape = (1, size, size)
    # Ones strictly above the main diagonal mark the positions to hide.
    hidden = np.triu(np.ones(shape, dtype='uint8'), k=1)
    # Visible positions are exactly the ones that are not hidden.
    return torch.from_numpy(hidden) == 0

In [5]:
# Build the mask for a length-5 sequence and show both its value and its type.
ans = subsequent_mask(5)
print(ans, type(ans))

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]]) <class 'torch.Tensor'>


## 2. Batch 类

In [8]:
import numpy as np
import torch
from torch.autograd import Variable

In [94]:
class Batch:
    """Hold one batch of data together with the masks used in training.

    Always set:
        src: the source tensor, stored as given.
        src_mask: ``src != pad`` with a size-1 axis inserted at position -2.

    Set only when ``trg`` is provided:
        trg: ``trg`` without its last column.
        trg_y: ``trg`` without its first column.
        trg_mask: padding mask of ``self.trg`` combined with the
            look-ahead mask (see ``make_std_mask``).
        ntokens: number of entries of ``trg_y`` that differ from ``pad``.
    """

    def __init__(self, src, trg=None, pad=0):
        self.src = src
        # Boolean mask marking the source entries that are not padding.
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is None:
            return
        # Drop the last target column / drop the first target column.
        shifted_in, shifted_out = trg[:, :-1], trg[:, 1:]
        self.trg = shifted_in
        self.trg_y = shifted_out
        self.trg_mask = self.make_std_mask(shifted_in, pad)
        # Count of non-pad entries in the shifted target.
        self.ntokens = (shifted_out != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Create a mask that hides both padding and later positions.

        The padding mask ``(tgt != pad)`` gets a size-1 axis at -2 and is
        AND-ed with the square look-ahead mask built from ``tgt.size(-1)``,
        so broadcasting yields one square mask per sequence.
        """
        keep = (tgt != pad).unsqueeze(-2)
        look_ahead = subsequent_mask(tgt.size(-1)).type_as(keep.data)
        return keep & Variable(look_ahead)

### unsqueeze

> unsqueeze 的作用是在指定位置插入一个大小为 1 的维度（升维）。举例来说，含两个元素、形状为 (2,) 的数据在末尾升维后形状变为 (2, 1)，即两行一列；下面的例子中，形状为 (1, 2, 3) 的张量经 unsqueeze(-2) 后形状变为 (1, 2, 1, 3)。

In [34]:
# unsqueeze(-2) inserts a size-1 axis just before the last one:
# shape (1, 2, 3) becomes (1, 2, 1, 3).
a = torch.ones((1,2,3))
print(a, a.shape)
b = a.unsqueeze(-2)
print(b, b.shape)

tensor([[[1., 1., 1.],
         [1., 1., 1.]]]) torch.Size([1, 2, 3])
tensor([[[[1., 1., 1.]],

         [[1., 1., 1.]]]]) torch.Size([1, 2, 1, 3])


In [35]:
# Compare against the pad value first, then add the size-1 axis at -2;
# the result is a boolean mask with the unsqueezed shape.
pad = 0
c = (a != pad).unsqueeze(-2)
c

tensor([[[[True, True, True]],

         [[True, True, True]]]])

### torch 索引与切片

In [55]:
# Random 3x4 matrix used as the subject of the indexing examples.
d = torch.randn(3,4)
d

tensor([[ 0.6570,  0.7658,  0.0468,  1.2239],
        [-0.6662, -0.3183,  1.6796,  0.5557],
        [ 1.0227, -1.0913, -0.0940,  0.4967]])

> 方括号内用逗号分隔各个维度；每个维度上既可以用整数取单个索引，也可以用冒号进行切片。

In [56]:
d[:, 0]  # column 0 of every row

tensor([ 0.6570, -0.6662,  1.0227])

In [57]:
d[1:, 0]  # column 0 of rows 1 onward

tensor([-0.6662,  1.0227])

In [59]:
d[:-1, 0]  # column 0 of every row except the last

tensor([ 0.6570, -0.6662])

In [62]:
d[0, 0]  # the element at row 0, column 0

tensor(0.6570)

In [65]:
d[:, :-1]  # drop the last entry along the second dimension

tensor([[ 0.6570,  0.7658,  0.0468],
        [-0.6662, -0.3183,  1.6796],
        [ 1.0227, -1.0913, -0.0940]])

In [66]:
d[:, 1:]  # drop the first entry along the second dimension

tensor([[ 0.7658,  0.0468,  1.2239],
        [-0.3183,  1.6796,  0.5557],
        [-1.0913, -0.0940,  0.4967]])

In [91]:
# Padding-style mask for a (2, 2, 3) integer tensor where 0 is the masked value;
# unsqueeze(-2) turns the (2, 2, 3) boolean mask into (2, 2, 1, 3).
e = torch.tensor([[[1,0,2],[0,3,4]], [[5,0,5], [7,8,0]]])
f = (e != 0).unsqueeze(-2)
print(e)
print(f)

tensor([[[1, 0, 2],
         [0, 3, 4]],

        [[5, 0, 5],
         [7, 8, 0]]])
tensor([[[[ True, False,  True]],

         [[False,  True,  True]]],


        [[[ True, False,  True]],

         [[ True,  True, False]]]])


In [93]:
# The (1, 3, 3) look-ahead mask broadcasts against the (2, 2, 1, 3) mask f,
# giving a (2, 2, 3, 3) result that is True only where both masks are True.
g = subsequent_mask(3)
print(g)
print(f & g)

tensor([[[ True, False, False],
         [ True,  True, False],
         [ True,  True,  True]]])
tensor([[[[ True, False, False],
          [ True, False, False],
          [ True, False,  True]],

         [[False, False, False],
          [False,  True, False],
          [False,  True,  True]]],


        [[[ True, False, False],
          [ True, False, False],
          [ True, False,  True]],

         [[ True, False, False],
          [ True,  True, False],
          [ True,  True, False]]]])


## NoamOpt类

> 优化器类，更新模型参数及调整学习率

In [102]:
import numpy as np
import torch
from torch.autograd import Variable
import time

In [100]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate before every update.

    The rate used at step ``s`` is::

        factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)

    It grows linearly while ``s < warmup`` and then decays with the
    inverse square root of ``s``.

    Args:
        model_size: value whose inverse square root scales the rate.
        factor: constant multiplier applied to the rate.
        warmup: step count at which the two branches of ``min`` meet.
        optimizer: wrapped optimizer; must provide ``param_groups``
            (dicts with an ``'lr'`` key) and ``step()``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance the step counter, apply the new rate, and update parameters."""
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate for ``step`` (default: the current step).

        Step 0, the state before the first ``step()`` call, yields 0.0.
        """
        if step is None:
            step = self._step
        if step == 0:
            # 0 ** -0.5 raises ZeroDivisionError, while the warm-up branch
            # (step * warmup ** -1.5) is exactly 0 at step 0, so that is
            # the value min() would pick.
            return 0.0
        return self.factor * (
            self.model_size ** (-0.5)
            * min(step ** (-0.5), step * self.warmup ** (-1.5))
        )
    
    
def get_std_opt(model):
    """Return a NoamOpt around Adam configured with this file's default settings.

    The model size is read from ``model.src_embed[0].d_model``; factor is 2,
    warm-up is 4000 steps, and Adam starts at ``lr=0`` with
    ``betas=(0.9, 0.98)`` and ``eps=1e-9``.
    """
    # Feature dimension of the embedding vectors.
    d_model = model.src_embed[0].d_model
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    # warmup is a hyper-parameter.
    return NoamOpt(model_size=d_model, factor=2, warmup=4000, optimizer=adam)

## LabelSmoothing类

> 标签平滑：先把目标（tgt）的 one-hot 分布平滑，再计算 loss，用于防止过拟合。个人感觉必要性不大，实际使用中直接实例化一个现成的 loss 对象（例如 `nn.CrossEntropyLoss`，较新版本的 PyTorch 可通过其 `label_smoothing` 参数实现标签平滑）可能就足够。

In [2]:
import numpy as np
import torch
from torch.autograd import Variable
import time
from torch import nn

In [2]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    For each row the target class receives ``1 - smoothing`` and every other
    class except ``padding_idx`` receives ``smoothing / (size - 2)``. The
    ``padding_idx`` column is always zero, and rows whose target equals
    ``padding_idx`` are zeroed entirely, so they add nothing to the loss.

    Args:
        size: number of classes; ``forward`` asserts ``x.size(1) == size``.
            Must differ from 2, because the smoothing mass is divided by
            ``size - 2``.
        padding_idx: class index that marks padding.
        smoothing: probability mass spread over the non-target classes.

    Attributes:
        true_dist: smoothed distribution built by the latest ``forward``
            call (``None`` before the first call).
    """

    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        # reduction='sum' is the replacement for the deprecated size_average=False.
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        """Return the summed KL divergence between ``x`` and the smoothed target.

        Args:
            x: 2-D tensor with ``size`` columns; it is passed to
                ``nn.KLDivLoss`` as the input, which expects log-probabilities.
            target: 1-D integer tensor of class indices, one per row of ``x``.
        """
        assert x.size(1) == self.size
        labels = target.detach()
        # Start every cell at the smoothing share; the result is a fresh
        # tensor that does not track gradients.
        true_dist = torch.full_like(x, self.smoothing / (self.size - 2))
        true_dist.scatter_(1, labels.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero whole rows whose label is padding. A boolean mask works for
        # zero, one, or many padded rows, unlike a squeezed nonzero() index.
        true_dist.masked_fill_((labels == self.padding_idx).unsqueeze(1), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, true_dist)

### nonzero

> 返回 tensor 中非零元素（对布尔条件而言即满足条件的元素）的索引；结果是一个二维张量，每一行是一个元素的坐标。

In [3]:
# nonzero on a boolean condition returns one row of coordinates per True
# element, so the result is 2-D (here 4 matches, 2 coordinates each).
a = torch.tensor([[1,2,3], [3,2,1]])
b = torch.nonzero(a <= 2)
b.dim()

2

In [19]:
# target is floating point with the same shape as x, so CrossEntropyLoss
# treats it as per-class weights: loss = -sum(target * log_softmax(x)).
x = torch.tensor([1,1,2], dtype = torch.float64)
target = torch.tensor([1,1,1], dtype = torch.float64)
loss = nn.CrossEntropyLoss()
loss(x, target)

tensor(3.6543, dtype=torch.float64)

## SimpleLossCompute类

In [None]:
class SimpleLossCompute:
    """Callable that computes the loss, back-propagates, and optionally steps.

    Args:
        generator: callable applied to the model output before the loss.
        criterion: callable ``(predictions, targets) -> loss``.
        opt: optional wrapper exposing ``step()`` and ``optimizer.zero_grad()``;
            when ``None`` only the backward pass is run.
    """

    def __init__(self, generator, criterion, opt=None):
        self.generator, self.criterion, self.opt = generator, criterion, opt

    def __call__(self, x, y, norm):
        """Return the un-normalised loss value for one batch.

        The loss is divided by ``norm`` before ``backward()``; the returned
        value multiplies it back so the caller receives the summed loss.
        """
        scores = self.generator(x)
        # Flatten to (rows, classes) and (rows,) as the criterion expects.
        flat_scores = scores.contiguous().view(-1, scores.size(-1))
        flat_labels = y.contiguous().view(-1)
        loss = self.criterion(flat_scores, flat_labels) / norm
        loss.backward()
        optimizer_wrapper = self.opt
        if optimizer_wrapper is not None:
            optimizer_wrapper.step()
            optimizer_wrapper.optimizer.zero_grad()
        return loss.data.item() * norm

## run_epoch

In [96]:
import numpy as np
import torch
from torch.autograd import Variable
import time

In [97]:
def run_epoch(data_iter, model, loss_compute):
    """Run one pass over ``data_iter`` and return the loss per token.

    For each batch the model's ``forward`` is called with the batch's
    ``src``, ``trg``, ``src_mask`` and ``trg_mask``; ``loss_compute`` then
    receives the output, ``batch.trg_y`` and ``batch.ntokens``. Progress is
    printed whenever the batch index satisfies ``i % 50 == 1``.

    Returns:
        Sum of all batch losses divided by the sum of all ``ntokens``.
    """
    window_start = time.time()
    loss_sum = 0
    token_sum = 0
    window_tokens = 0
    for step, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        n_tok = batch.ntokens
        loss = loss_compute(out, batch.trg_y, n_tok)
        loss_sum += loss
        token_sum += n_tok
        window_tokens += n_tok
        # Log once every 50 batches (indices 1, 51, 101, ...).
        if step % 50 != 1:
            continue
        elapsed = time.time() - window_start
        print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                (step, loss / n_tok, window_tokens / elapsed))
        # Restart the throughput window.
        window_start = time.time()
        window_tokens = 0
    return loss_sum / token_sum

## greedy_decode

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Decode greedily, appending the highest-scoring token at each step.

    The output starts as a 1x1 tensor holding ``start_symbol`` and grows by
    one column per iteration, so the result has shape ``(1, max_len)`` when
    ``max_len >= 1``. Only the first row's prediction is used at each step
    (``next_word.data[0]``), i.e. a single sequence is decoded.

    Args:
        model: object providing ``encode``, ``decode`` and ``generator``.
        src: source tensor; its dtype is used for the generated tokens.
        src_mask: mask passed to ``encode`` and ``decode``.
        max_len: total output length including the start symbol.
        start_symbol: token id placed at position 0.
    """
    memory = model.encode(src, src_mask)
    decoded = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        # Look-ahead mask sized to what has been generated so far.
        look_ahead = subsequent_mask(decoded.size(1)).type_as(src.data)
        hidden = model.decode(memory, src_mask,
                              Variable(decoded), Variable(look_ahead))
        # Score only the last position and keep the arg-max token.
        scores = model.generator(hidden[:, -1])
        _, best = torch.max(scores, dim=1)
        best_token = best.data[0]
        new_column = torch.ones(1, 1).type_as(src.data).fill_(best_token)
        decoded = torch.cat([decoded, new_column], dim=1)
    return decoded

# 总结：

> 总体来说，这个包的功能就是定义了一些训练用的工具，如优化器封装、类似 dataloader 的批数据（Batch）与掩码构造、损失计算等；具体情境中还需要根据自己的任务重新定义这些功能。

In [21]:
from torch.nn.modules import Transformer

In [26]:
# Smoke test of nn.Transformer with batch_first=True: a (2, 3, 6) source and
# a (2, 1, 6) target produce an output shaped like the target.
net = Transformer(d_model = 6, nhead = 2, dim_feedforward = 20, batch_first = True)
x = torch.randn(2,3,6)
out = torch.randn(2,1,6)
net(x, out)

tensor([[[-0.6576, -1.0978,  1.6274, -0.8762,  0.0320,  0.9722]],

        [[ 0.3052, -1.4259,  0.9165, -1.3449,  0.6635,  0.8857]]],
       grad_fn=<NativeLayerNormBackward0>)

In [56]:
# net

In [10]:
# 3 x 10 tensor of random integers drawn from [1, 10).
x = torch.randint(1, 10, (3, 10))
x

tensor([[8, 9, 8, 2, 9, 3, 3, 5, 4, 1],
        [8, 8, 2, 9, 7, 2, 4, 2, 2, 3],
        [7, 9, 4, 4, 3, 7, 3, 9, 1, 5]])

In [11]:
# Look up a 4-dimensional vector for every index: (3, 10) -> (3, 10, 4).
# Equal indices map to identical rows.
embd = torch.nn.Embedding(10, 4)
embd(x)

tensor([[[ 1.3096,  0.4286,  0.5982, -0.9742],
         [ 0.9723, -1.3930,  1.2595,  0.7597],
         [ 1.3096,  0.4286,  0.5982, -0.9742],
         [ 0.2701, -1.0025,  2.3486, -0.6930],
         [ 0.9723, -1.3930,  1.2595,  0.7597],
         [-0.6460, -0.2801,  0.1728,  0.7232],
         [-0.6460, -0.2801,  0.1728,  0.7232],
         [-1.4891,  1.3442,  0.0901,  0.8951],
         [ 0.3977, -1.6202, -0.0646, -0.6646],
         [-0.9903, -0.9874, -0.6288,  0.3669]],

        [[ 1.3096,  0.4286,  0.5982, -0.9742],
         [ 1.3096,  0.4286,  0.5982, -0.9742],
         [ 0.2701, -1.0025,  2.3486, -0.6930],
         [ 0.9723, -1.3930,  1.2595,  0.7597],
         [-0.2337,  0.8694,  1.4526, -0.4023],
         [ 0.2701, -1.0025,  2.3486, -0.6930],
         [ 0.3977, -1.6202, -0.0646, -0.6646],
         [ 0.2701, -1.0025,  2.3486, -0.6930],
         [ 0.2701, -1.0025,  2.3486, -0.6930],
         [-0.6460, -0.2801,  0.1728,  0.7232]],

        [[-0.2337,  0.8694,  1.4526, -0.4023],
         

In [8]:
# 2 x 10 tensor of random integers drawn from [1, 10).
x = torch.randint(1, 10, (2, 10))
x

tensor([[4, 3, 3, 2, 6, 3, 9, 1, 9, 2],
        [2, 6, 1, 5, 7, 5, 4, 3, 9, 2]])

In [9]:
# Copy x detached from autograd, then cast the copy to float64.
y = x.clone().detach()
z = y.to(torch.float64)
z

tensor([[4., 3., 3., 2., 6., 3., 9., 1., 9., 2.],
        [2., 6., 1., 5., 7., 5., 4., 3., 9., 2.]], dtype=torch.float64)

In [11]:
# Length of the last dimension.
z.size(-1)

10

In [15]:
# torch.clone gives a copy with the same values and dtype as z.
w = torch.clone(z)
w

tensor([[4., 3., 3., 2., 6., 3., 9., 1., 9., 2.],
        [2., 6., 1., 5., 7., 5., 4., 3., 9., 2.]], dtype=torch.float64)

In [27]:
import torch 
from torch import nn 
from torch.nn.modules import Transformer

class Net(nn.Module):
    """Toy model: shared embedding -> one-layer Transformer -> scalar head.

    Args:
        vocab: number of rows in the embedding table.
        dim: embedding size, also used as the Transformer's ``d_model``.
    """

    def __init__(self, vocab, dim) -> None:
        super().__init__()
        self.embd = nn.Embedding(vocab, dim)
        transformer_cfg = dict(
            d_model=dim,
            nhead=2,
            num_encoder_layers=1,
            num_decoder_layers=1,
            dim_feedforward=16,
            batch_first=True,
        )
        self.transformer = Transformer(**transformer_cfg)
        # Projects every decoder position to a single value.
        self.proj = nn.Linear(dim, 1)

    def forward(self, x, y, tgt_mask):
        """Embed source ``x`` and target ``y`` with the same table and decode.

        The final ``squeeze()`` removes every size-1 axis, not only the
        trailing one produced by ``proj``; a batch of one therefore loses
        its batch axis as well.
        """
        src_emb, tgt_emb = self.embd(x), self.embd(y)
        decoded = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        return self.proj(decoded).squeeze()

In [50]:
# Instantiate with vocab=10, dim=4 and display the module tree.
net = Net(10, 4)
net

Net(
  (embd): Embedding(10, 4)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
          )
          (linear1): Linear(in_features=4, out_features=16, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=16, out_features=4, bias=True)
          (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0): TransformerDecoderLayer(
          (self_attn): MultiheadAttention(
           

In [54]:
# Collect the Transformer's direct child modules into an nn.Sequential.
# NOTE(review): this only regroups the modules; calling newnet(x) is not
# expected to behave like net(src, tgt) -- confirm before using it.
net = Transformer(num_encoder_layers=1, num_decoder_layers=1)
# print(net, id(net))
newnet = nn.Sequential(*list(net.children()))
# print(newnet, id(newnet))

In [59]:
# Same construction greedy_decode uses for its initial output: a 1x1 tensor
# holding the start symbol (0 here), cast to src's dtype.
src = torch.randint(1, 10, (2, 5))
res = torch.ones(1, 1).fill_(0).type_as(src.data)
res

tensor([[0]])

In [61]:
# Append one more token (1) along dim=1, growing the shape from (1, 1) to (1, 2).
res = torch.cat([res, torch.ones(1, 1).type_as(src.data).fill_(1)], dim=1)
print(res)
print(res.shape)

tensor([[0, 1]])
torch.Size([1, 2])


In [78]:
def attn(query, key, value):
    """Swap the first two axes of the inputs and return the resulting shapes.

    Object identity is respected so that shared inputs are transposed only
    once: if all three are the same object one transpose is reused for all;
    if only ``key`` and ``value`` are the same object, ``value`` reuses the
    transposed ``key``.

    Returns:
        Tuple ``(query.shape, key.shape, value.shape)`` after transposition.
    """
    if key is not value:
        # Three independent tensors: transpose each one.
        q_t, k_t, v_t = (t.transpose(1, 0) for t in (query, key, value))
    elif query is key:
        # One tensor plays all three roles.
        q_t = k_t = v_t = query.transpose(1, 0)
    else:
        # key and value share storage; query is separate.
        q_t = query.transpose(1, 0)
        k_t = key.transpose(1, 0)
        v_t = k_t
    return q_t.shape, k_t.shape, v_t.shape

In [79]:
# attn = torch.nn.MultiheadAttention(embed_dim=4, num_heads=1)
# key and value are the same object while query is separate, so this call
# exercises the branch where value reuses the transposed key.
query = torch.randn(1, 1, 4)
key = value = torch.randn(1, 10, 4)
attn(query, key, value)
# query.dim()

(torch.Size([1, 1, 4]), torch.Size([10, 1, 4]), torch.Size([10, 1, 4]))