Transformer-解码器部分
===

![images](images/028.png)

- 由N个解码器层堆叠而成
- 每个解码器层由三个子层连接结构组成
- 第一个子层连接结构包括一个多头自注意力子层和规范化层以及一个残差连接
- 第二个子层连接结构包括一个多头注意力子层(常规注意力, 其Q来自解码器, K和V来自编码器的输出memory)和规范化层以及一个残差连接
- 第三个子层连接结构包括一个前馈全连接子层和规范化层以及一个残差连接

# 1.解码器层
作为解码器的组成单元, 每个解码器层根据给定的输入向目标方向进行特征提取操作, 即解码过程.

In [4]:
from lib.PositionalEncoding import PositionalEncoding
from lib.Embeddings import Embeddings
import torch

# Hyper-parameters shared by the embedding and positional-encoding layers.
d_model = 512  # model/embedding width passed to both layers
dropout = 0.2  # dropout rate passed to PositionalEncoding
max_len = 60   # max_len argument of PositionalEncoding
vocab = 1000   # second argument of Embeddings (presumably the vocabulary size)

pe = PositionalEncoding(d_model, dropout, max_len)
emb = Embeddings(d_model, vocab)
# x is a 2 x 4 tensor of long-integer token ids.
# torch.autograd.Variable is deprecated (plain tensors carry autograd state
# since PyTorch 0.4), so the tensor is built directly with torch.tensor.
x = torch.tensor([[100, 2, 421, 508], [491, 998, 1, 221]], dtype=torch.long)
embr = emb(x)
# The embedding output (2 x 4 x 512) becomes the input of the positional encoder.
x = embr
pe_result = pe(x)
x = pe_result
# Bare expression: the notebook displays the value of x for this cell.
x

tensor([[[-27.7572,  29.6178,  33.7259,  ...,   0.0000,  -4.9358,  23.9576],
         [  0.0000, -47.3996,  -8.3996,  ...,   0.0000,  28.9097,  42.4398],
         [ 39.8116, -13.7847,   0.0000,  ...,  -0.0000,   0.0000, -21.2629],
         [ -0.0000, -28.5188,  25.8290,  ..., -14.9359, -17.5258,  18.3313]],

        [[  0.0000,   5.1443,   4.8328,  ...,  51.9236,  -4.9627,   0.0000],
         [ -2.8123,   5.7593,  29.0198,  ...,  28.0604,   0.0000,  24.1394],
         [-16.3330,  -0.0000, -30.9909,  ...,   1.3160,  17.7589,  -0.0000],
         [-15.2141,  -0.0000,   2.5617,  ...,   0.0000,  -0.6455,  22.9396]]],
       grad_fn=<MulBackward0>)

In [6]:
from lib.MultiHeadedAttention import MultiHeadedAttention
from lib.PositionwiseFeedForward import PositionwiseFeedForward
import copy
from lib.EncoderLayer import EncoderLayer
from lib.Encoder import Encoder
from lib.DecoderLayer import DecoderLayer

# The decoder layer is instantiated much like the encoder layer; the only
# extra argument is src_attn, which is the same class as self_attn.
head = 8
size = 512
d_ff = 64

# Build two independent attention modules. Binding both names to a single
# instance would hand the very same module object to both attention slots of
# the decoder layer; the Decoder cell further down likewise passes separate
# (deep-copied) modules.
self_attn = MultiHeadedAttention(head, d_model, dropout)
src_attn = MultiHeadedAttention(head, d_model, dropout)

# The position-wise feed-forward layer is built the same way as before.
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
# x should be the embedded target sequence; it has the same form as the embedded
# source sequence, so pe_result (still bound to x) stands in for it here.

# memory is the encoder output, so run an encoder over x first.
c = copy.deepcopy
attn = MultiHeadedAttention(head, d_model)
layer = EncoderLayer(size, c(attn), c(ff), dropout)
# Plain tensor instead of the deprecated torch.autograd.Variable wrapper.
mask = torch.zeros(8, 4, 4)
# N is the number of encoder layers stacked inside the encoder.
N = 8
en = Encoder(layer, N)
en_result = en(x, mask)
memory = en_result

# In practice source_mask and target_mask differ; the same mask is reused for
# both here to keep the example simple.
source_mask = target_mask = mask
dl = DecoderLayer(size, self_attn, src_attn, ff, dropout)
dl_result = dl(x, memory, source_mask, target_mask)
print(dl_result)
print(dl_result.shape)

tensor([[[-27.9119,  29.4952,  34.2609,  ...,   1.0484,  -4.9317,  23.4846],
         [  0.3368, -47.0936,  -8.6950,  ...,   0.6991,  28.6446,  42.1238],
         [ 40.0375, -13.9946,   0.5067,  ...,   0.8552,   0.1961, -21.8864],
         [  1.0073, -28.5008,  25.9462,  ..., -14.4552, -17.4981,  18.0162]],

        [[  0.8172,   4.6504,   5.5159,  ...,  51.9619,  -5.1755,   0.2245],
         [ -2.2191,   5.2802,  30.0046,  ...,  28.0927,  -0.3330,  24.4751],
         [-15.7612,  -0.9180, -29.8939,  ...,   1.9881,  17.6986,   0.5592],
         [-14.5487,  -0.1786,   3.3623,  ...,  -0.4087,  -0.3726,  23.0910]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


# 2.解码器
根据编码器的结果以及上一次预测的结果, 对下一次可能出现的'值'进行特征表示.

In [7]:
import copy
from lib.MultiHeadedAttention import MultiHeadedAttention
from lib.PositionwiseFeedForward import PositionwiseFeedForward
from lib.DecoderLayer import DecoderLayer
from lib.Decoder import Decoder
# Decoder is constructed from a decoder layer `layer` and the layer count N.
size = 512
d_model = 512
head = 8
d_ff = 64
dropout = 0.2
c = copy.deepcopy
attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
# Each sub-layer slot receives its own deep copy of the prototype module.
layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
N = 8

# The forward inputs are the same as for a single decoder layer.
x = pe_result
memory = en_result
# Plain tensor instead of the deprecated torch.autograd.Variable wrapper.
mask = torch.zeros(8, 4, 4)
source_mask = target_mask = mask

de = Decoder(layer, N)
de_result = de(x, memory, source_mask, target_mask)
print(de_result)
print(de_result.shape)

tensor([[[-0.8165,  1.2935,  1.1358,  ..., -0.1313, -0.0205,  0.8622],
         [ 0.1687, -1.7185, -0.4857,  ..., -0.1639,  1.3899,  1.6002],
         [ 2.0399, -0.3050,  0.1206,  ..., -0.2707,  0.0828, -0.8054],
         [ 0.2085, -0.9998,  0.9480,  ..., -0.8538, -0.5872,  0.6681]],

        [[ 0.2624,  0.2000,  0.0410,  ...,  2.1882, -0.2590, -0.0736],
         [ 0.0030,  0.3192,  1.0609,  ...,  1.0737, -0.1478,  0.8669],
         [-0.4218,  0.0692, -1.2368,  ...,  0.0297,  0.5732,  0.1169],
         [-0.3526,  0.2811,  0.1286,  ...,  0.1508, -0.0563,  1.0132]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
