In [1]:
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode, Derek Miller @dmmiller612, modify by wmathor
  Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch
              https://github.com/JayParks/transformer
'''
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# Special symbols used in the toy corpus:
#   S: start-of-sequence marker, prepended to the decoder input
#   E: end-of-sequence marker, appended to the decoder target output
#   P: padding symbol that fills sequences shorter than the fixed length
sentences = [
    # enc_input              dec_input            dec_output
    ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
    ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E'],
]

# --- Vocabularies ---
# Source (encoder-side) vocabulary.
# The padding symbol must map to index 0: get_attn_pad_mask() detects padding
# with `eq(0)` and the loss is built with `ignore_index=0`.
src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4, 'cola': 5}
src_vocab_size = len(src_vocab)  # 6 here; in a real setting this is the size of the whole German vocabulary

# Target (decoder-side) vocabulary.
tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'coke': 5, 'S': 6, 'E': 7, '.': 8}
# Invert the mapping explicitly instead of relying on dict insertion order
# happening to match the stored indices (which `enumerate(tgt_vocab)` did).
idx2word = {idx: word for word, idx in tgt_vocab.items()}
tgt_vocab_size = len(tgt_vocab)  # 9 here; in a real setting this is the size of the whole English vocabulary

src_len = 5  # enc_input max sequence length
tgt_len = 6  # dec_input (= dec_output) max sequence length

# Transformer hyper-parameters
d_model = 512  # embedding size
d_ff = 2048  # hidden dimension of the position-wise feed-forward network
d_k = d_v = 64  # per-head dimension of K (= Q) and V
n_layers = 6  # number of encoder layers and of decoder layers
n_heads = 8  # number of heads in multi-head attention

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sentences

[['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
 ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']]

In [3]:
src_vocab

{'P': 0, 'bier': 4, 'cola': 5, 'ein': 3, 'ich': 1, 'mochte': 2}

In [None]:
src_vocab_size 

6

In [None]:
tgt_vocab


{'.': 8,
 'E': 7,
 'P': 0,
 'S': 6,
 'a': 3,
 'beer': 4,
 'coke': 5,
 'i': 1,
 'want': 2}

In [None]:
idx2word

{0: 'P',
 1: 'i',
 2: 'want',
 3: 'a',
 4: 'beer',
 5: 'coke',
 6: 'S',
 7: 'E',
 8: '.'}

In [3]:
def make_data(sentences):
    """Turn raw sentence triples into index tensors.

    Each element of `sentences` is indexed as [enc_input, dec_input, dec_output],
    every item being a whitespace-separated string. Encoder tokens are looked up
    in the module-level `src_vocab`, decoder tokens in `tgt_vocab`.

    Returns:
        (enc_inputs, dec_inputs, dec_outputs): three torch.LongTensor objects with
        one row per sentence, e.g. for the toy corpus
        [[1, 2, 3, 4, 0], ...], [[6, 1, 2, 3, 4, 8], ...], [[1, 2, 3, 4, 8, 7], ...].

    Raises:
        KeyError: if a token is missing from the corresponding vocabulary.
    """
    def encode(text, vocab):
        # Map every whitespace-separated token to its vocabulary index.
        return [vocab[token] for token in text.split()]

    enc_rows, dec_in_rows, dec_out_rows = [], [], []
    for triple in sentences:
        enc_rows.append(encode(triple[0], src_vocab))
        dec_in_rows.append(encode(triple[1], tgt_vocab))
        dec_out_rows.append(encode(triple[2], tgt_vocab))

    return torch.LongTensor(enc_rows), torch.LongTensor(dec_in_rows), torch.LongTensor(dec_out_rows)

# Tokenize the toy corpus once; these three tensors back the Dataset/DataLoader built below.
enc_inputs, dec_inputs, dec_outputs = make_data(sentences)
"""自定义DataLoader"""
class MyDataSet(Data.Dataset):
    """Dataset that serves aligned (enc_input, dec_input, dec_output) samples.

    The three containers are stored as given and indexed in parallel; the
    dataset length is the first dimension of `enc_inputs`.
    """

    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        super().__init__()
        self.enc_inputs, self.dec_inputs, self.dec_outputs = enc_inputs, dec_inputs, dec_outputs

    def __len__(self):
        # Number of samples = number of rows of the encoder input.
        num_samples = self.enc_inputs.shape[0]
        return num_samples

    def __getitem__(self, idx):
        # Return the idx-th entry of each container as a 3-tuple.
        columns = (self.enc_inputs, self.dec_inputs, self.dec_outputs)
        return tuple(column[idx] for column in columns)

# Batches of 2 samples, reshuffled every epoch.
loader = Data.DataLoader(dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs), batch_size=2, shuffle=True)

In [None]:
enc_inputs

tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]])

In [None]:
enc_inputs.shape

torch.Size([2, 5])

In [None]:
dec_inputs

tensor([[6, 1, 2, 3, 4, 8],
        [6, 1, 2, 3, 5, 8]])

In [None]:
dec_outputs

tensor([[1, 2, 3, 4, 8, 7],
        [1, 2, 3, 5, 8, 7]])

In [None]:
loader

<torch.utils.data.dataloader.DataLoader at 0x7f073769d1d0>

In [None]:
[*loader] #一个batch 一个两个样本（句子） 表示K Q不一定要维度一样

[[tensor([[1, 2, 3, 4, 0],
          [1, 2, 3, 5, 0]]), tensor([[6, 1, 2, 3, 4, 8],
          [6, 1, 2, 3, 5, 8]]), tensor([[1, 2, 3, 4, 8, 7],
          [1, 2, 3, 5, 8, 7]])]]

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: embedding size of the inputs (odd values are supported).
        dropout: dropout probability applied after the encodings are added.
        max_len: largest sequence length (number of positions) that can be
            encoded. This is a bound on positions within a sequence, not a
            vocabulary size.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        # pos is the index of a token inside its sequence: 0, 1, ..., max_len - 1
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        # The sin/cos terms share the factor 10000^(-2i / d_model); it is computed
        # as exp(2i * -ln(10000) / d_model), with 2i = 0, 2, 4, ...
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)  # even feature indices
        # For odd d_model there is one fewer odd index than even index, so trim
        # the last frequency; without this the assignment fails on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd feature indices
        pe = pe.unsqueeze(1)  # [max_len, 1, d_model]; broadcasts over the batch axis

        # A buffer follows the module across devices and is saved in the
        # state dict, but it is not a parameter and is never updated by the optimizer.
        self.register_buffer('pe', pe)

    def forward(self, x):
        '''
        x: [seq_len, batch_size, d_model]

        The same encoding is added at a given position for every sequence in
        the batch. Returns a tensor of the same shape as x.
        '''
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
 


In [None]:
torch.zeros(5000, d_model).shape

torch.Size([5000, 512])

In [None]:
torch.arange(0, 5000, dtype=torch.float).unsqueeze(1).shape

torch.Size([5000, 1])

In [None]:
torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)).shape

torch.Size([256])

In [None]:
pe = torch.zeros(5000, d_model)
position = torch.arange(0, 5000, dtype=torch.float).unsqueeze(1) 
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term) #从0开始到最后面，步长为2，其实代表的就是偶数位置 512   div_term为256
pe[:, 1::2] = torch.cos(position * div_term) # 从1开始到最后面，步长为2，其实代表的就是奇数位置


In [None]:
pe.shape


torch.Size([5000, 512])

In [None]:
pe.unsqueeze(0).shape

torch.Size([1, 5000, 512])

In [None]:
pe.unsqueeze(0).transpose(0, 1)

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00]],

        [[ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
           1.0366e-04,  1.0000e+00]],

        [[ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
           2.0733e-04,  1.0000e+00]],

        ...,

        [[ 9.5625e-01, -2.9254e-01,  9.3594e-01,  ...,  8.5926e-01,
           4.9515e-01,  8.6881e-01]],

        [[ 2.7050e-01, -9.6272e-01,  8.2251e-01,  ...,  8.5920e-01,
           4.9524e-01,  8.6876e-01]],

        [[-6.6395e-01, -7.4778e-01,  1.4615e-03,  ...,  8.5915e-01,
           4.9533e-01,  8.6871e-01]]])

In [None]:
pe.unsqueeze(0).transpose(0, 1).shape

torch.Size([5000, 1, 512])

In [None]:
 pe=pe.unsqueeze(0).transpose(0, 1)

In [5]:
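## 4. get_attn_pad_mask

## Suppose the sentence length is 5. In the attention layer, after computing Q·K^T / sqrt(d_k) and before the softmax,
## the score matrix has shape len_input x len_input: how strongly each token attends to every token (itself included).
## We therefore need a mask of the same shape that marks the PAD positions; those scores are filled with a very large negative value (-1e9) before the softmax.
## Note that the resulting mask has shape batch_size x len_q x len_k: only the PAD symbols in k are marked, the PAD symbols in q are not, because that is unnecessary.
## seq_q and seq_k need not have the same length (e.g. the source is German and the target English, and the two sentences may contain different numbers of words).
# In encoder-decoder (cross) attention, q comes from the decoder and k from the encoder, so the model only needs to know where the encoder-side PAD symbols are; the decoder-side PAD information is not used in that layer.

# Purpose of the pad mask: when the value vectors are averaged with the attention weights, the weight alpha_ij of a PAD key becomes 0, so attention ignores PAD vectors.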
def get_attn_pad_mask(seq_q, seq_k):
    '''
    Build the padding mask for an attention score matrix.

    seq_q: [batch_size, len_q] token indices of the queries (only its length is used)
    seq_k: [batch_size, len_k] token indices of the keys
    len_q / len_k may each be src_len or tgt_len.

    Returns a bool tensor [batch_size, len_q, len_k] in which True marks a key
    position holding the PAD index 0, i.e. a position to be masked out.
    '''
    _, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # Index 0 is the PAD token; insert a singleton query axis -> [batch_size, 1, len_k]
    key_is_pad = (seq_k.data == 0)[:, None, :]
    # Every query row shares the same key mask (expand creates a view, no copy).
    return key_is_pad.expand(batch_size, len_q, len_k)


In [None]:
enc_inputs.shape

torch.Size([2, 5])

In [None]:
enc_inputs.size()

torch.Size([2, 5])

In [None]:
enc_inputs.data

tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]])

In [None]:
enc_inputs.data.eq(0)

tensor([[False, False, False, False,  True],
        [False, False, False, False,  True]])

In [None]:
enc_inputs.data.eq(0).shape

torch.Size([2, 5])

In [None]:
enc_inputs.data.eq(0).unsqueeze(1)

tensor([[[False, False, False, False,  True]],

        [[False, False, False, False,  True]]])

In [None]:
enc_inputs.data.eq(0).unsqueeze(1).shape

torch.Size([2, 1, 5])

In [None]:
enc_inputs.data.eq(0).unsqueeze(1).expand(2, 5,5)

tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]],

        [[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])

In [None]:
yy=enc_inputs.data.eq(0).unsqueeze(1).expand(2, 5,5)

In [None]:
enc_inputs.data.eq(0).unsqueeze(1).expand(2, 5,5).shape

torch.Size([2, 5, 5])

In [6]:

def get_attn_subsequence_mask(seq):
    '''
    Build the "no peeking ahead" mask used by decoder self-attention.

    seq: [batch_size, tgt_len]

    Returns a uint8 CPU tensor [batch_size, tgt_len, tgt_len] whose strictly
    upper-triangular entries are 1 (future positions, to be masked) and whose
    remaining entries are 0.
    '''
    batch_size, steps = seq.size(0), seq.size(1)
    all_ones = torch.ones(batch_size, steps, steps, dtype=torch.float64)
    # diagonal=1 keeps only entries above the main diagonal, for every batch element.
    future_positions = torch.triu(all_ones, diagonal=1)
    return future_positions.byte()  # [batch_size, tgt_len, tgt_len]

In [20]:
#seq=dec_inputs
dec_inputs

tensor([[6, 1, 2, 3, 5, 8],
        [6, 1, 2, 3, 4, 8]], device='cuda:0')

In [21]:
dec_inputs.shape

torch.Size([2, 6])

In [23]:
attn_shape = [dec_inputs.size(0),dec_inputs.size(1), dec_inputs.size(1)]
np.triu(np.ones(attn_shape), k=1)

array([[[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]],

       [[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]]])

In [24]:
np.triu(np.ones(attn_shape), k=1).shape

(2, 6, 6)

In [25]:
torch.from_numpy(np.triu(np.ones(attn_shape), k=1)).byte()

tensor([[[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]],

        [[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)

In [7]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V with masking."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, len_q, len_k]; True / non-zero marks
                   positions that must NOT be attended to.

        Returns:
            context: [batch_size, n_heads, len_q, d_v], the attention-weighted sum of V
            attn:    [batch_size, n_heads, len_q, len_k], the attention weights
        '''
        # Scale by the actual key width instead of the module-level d_k constant,
        # so the layer stays correct for any head size.
        scale = math.sqrt(K.size(-1))
        scores = torch.matmul(Q, K.transpose(-1, -2)) / scale  # [batch_size, n_heads, len_q, len_k]

        # Masked positions get a very large negative score, so their softmax weight
        # is effectively 0. The mask is cast to bool because uint8 masks (such as the
        # one get_attn_subsequence_mask returns) are not accepted by recent PyTorch.
        # Note: a row whose keys are all masked ends up with uniform weights.
        scores = scores.masked_fill(attn_mask.bool(), -1e9)

        attn = torch.softmax(scores, dim=-1)  # attention matrix
        context = torch.matmul(attn, V)  # [batch_size, n_heads, len_q, d_v]
        return context, attn

In [None]:
#yy, Q,K,V
Q.shape, K.shape

(torch.Size([2, 8, 5, 64]), torch.Size([2, 8, 5, 64]))

In [None]:
yy.shape

torch.Size([2, 5, 5])

In [None]:
 K.shape,K.transpose(-1, -2).shape

(torch.Size([2, 8, 5, 64]), torch.Size([2, 8, 64, 5]))

In [None]:
scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)

In [None]:
scores.shape

torch.Size([2, 8, 5, 5])

In [None]:
yy.shape

torch.Size([2, 5, 5])

In [None]:
scores.masked_fill_(yy, -1e9).shape

torch.Size([2, 8, 5, 5])

In [None]:
V.shape

torch.Size([2, 8, 5, 64])

In [8]:
  """这个Attention类可以实现:
    Encoder的Self-Attention
    Decoder的Masked Self-Attention
    Encoder-Decoder的Attention
    """
class MultiHeadAttention(nn.Module):
    """Multi-head attention with a residual connection and layer normalisation.

    The same class implements encoder self-attention, masked decoder
    self-attention and encoder-decoder attention; which one it is depends only
    on the tensors and the mask passed to forward().
    """

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        # Linear maps producing the per-head queries, keys and values.
        # Q and K must share the width d_k, otherwise their dot product is undefined.
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)
        self.layer_norm = nn.LayerNorm(d_model)
        # Parameter-free, so building it once here does not change the state dict.
        self.attention = ScaledDotProductAttention()

    def forward(self, input_Q, input_K, input_V, attn_mask):
        '''
        Steps: project and split into heads, compute the attention weights,
        then the attention-weighted values.

        input_Q: [batch_size, len_q, d_model]
        input_K: [batch_size, len_k, d_model]
        input_V: [batch_size, len_v(=len_k), d_model]
        attn_mask: [batch_size, len_q, len_k]

        Returns:
            output: [batch_size, len_q, d_model], i.e. LayerNorm(attention output + input_Q)
            attn:   [batch_size, n_heads, len_q, len_k]
        '''
        residual, batch_size = input_Q, input_Q.size(0)
        # (B, S, D) -proj-> (B, S, D_new) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        # -1 is used for the sequence axis because its length differs between batches/inputs.
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size, n_heads, len_q, d_k]
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size, n_heads, len_k, d_k]
        V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch_size, n_heads, len_v(=len_k), d_v]

        # Copy the mask for every head: [batch_size, n_heads, len_q, len_k]
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        # context: [batch_size, n_heads, len_q, d_v], attn: [batch_size, n_heads, len_q, len_k]
        context, attn = self.attention(Q, K, V, attn_mask)
        # Merge the heads back: [batch_size, len_q, n_heads * d_v]
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v)

        output = self.fc(context)  # [batch_size, len_q, d_model]
        # layer_norm is a registered sub-module and already lives on the module's
        # device; forcing .cuda() here broke CPU execution and re-moved the layer
        # on every call.
        return self.layer_norm(output + residual), attn


In [None]:
enc_outputs.shape

torch.Size([2, 5, 512])

In [None]:
W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)

In [None]:
W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
W_V = nn.Linear(d_model, d_v * n_heads, bias=False)

In [None]:
W_K(enc_outputs).view(2, -1, n_heads, d_k).shape

torch.Size([2, 5, 8, 64])

In [None]:
W_Q(enc_outputs).view(2, -1, n_heads, d_k).shape


torch.Size([2, 5, 8, 64])

In [None]:
W_Q(enc_outputs)==W_K(enc_outputs)

tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]]])

In [None]:
W_Q(enc_outputs).view(2, 5, n_heads, d_k).shape

torch.Size([2, 5, 8, 64])

In [None]:
W_Q(enc_outputs).view(2, -1, n_heads, d_k).transpose(1,2).shape

torch.Size([2, 8, 5, 64])

In [None]:
yy #padmask

tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]],

        [[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])

In [None]:
yy.unsqueeze(1).repeat(1, n_heads, 1, 1)   

tensor([[[[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],

         [[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],

         [[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],

         [[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],

         [[False, False, False, False,  True],
     

In [None]:
yy.unsqueeze(1).shape 


torch.Size([2, 1, 5, 5])

In [None]:
yy.unsqueeze(1).repeat(1, n_heads, 1, 1).shape # 复制

torch.Size([2, 8, 5, 5])

In [None]:
yy=yy.unsqueeze(1).repeat(1, n_heads, 1, 1)

In [None]:
Q =W_Q(enc_outputs).view(2, -1, n_heads, d_k).transpose(1,2)  # Q: [batch_size, n_heads, len_q, d_k]  写-1不写定的原因是因为一句话不同batch可能长度不一样 八组不同的Q
K =W_K(enc_outputs).view(2, -1, n_heads, d_k).transpose(1,2)  # K: [batch_size, n_heads, len_k, d_k]
V =W_V(enc_outputs).view(2, -1, n_heads, d_v).transpose(1,2)  # V: [batch_size, n_heads, len_v(=len_k), d_v]


In [None]:
Q.shape

torch.Size([2, 8, 5, 64])

In [9]:
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear, plus residual and LayerNorm."""

    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_ff, bias=False),
            nn.ReLU(),
            nn.Linear(d_ff, d_model, bias=False)
        )
        # Registered once so that its affine parameters are moved with the module,
        # are visible to the optimizer and are saved in the state dict. Previously a
        # brand-new LayerNorm was created (and pushed to CUDA) on every forward
        # pass, so its weights could never be trained.
        # NOTE: this adds `layer_norm.weight` / `layer_norm.bias` to the state dict;
        # checkpoints saved with the old definition need `strict=False` to load.
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        '''
        inputs: [batch_size, seq_len, d_model]

        Returns LayerNorm(FFN(inputs) + inputs): [batch_size, seq_len, d_model]
        '''
        residual = inputs
        output = self.fc(inputs)
        return self.layer_norm(output + residual)  # [batch_size, seq_len, d_model]


In [10]:
# EncoderLayer: one encoder block = multi-head self-attention followed by the position-wise feed-forward network
class EncoderLayer(nn.Module):
    """Single encoder block (self-attention + feed-forward)."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        '''
        enc_inputs: [batch_size, src_len, d_model], embeddings with position encoding
                    (or the output of the previous encoder block)
        enc_self_attn_mask: [batch_size, src_len, src_len], the padding mask

        Returns:
            enc_outputs: [batch_size, src_len, d_model]
            attn: [batch_size, n_heads, src_len, src_len]
        '''
        # Self-attention: the same tensor acts as query, key and value.
        attended, attn = self.enc_self_attn(
            input_Q=enc_inputs,
            input_K=enc_inputs,
            input_V=enc_inputs,
            attn_mask=enc_self_attn_mask,
        )
        return self.pos_ffn(attended), attn


In [11]:
class DecoderLayer(nn.Module):
    """Single decoder block: masked self-attention, encoder-decoder attention, feed-forward."""

    def __init__(self):
        super().__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        '''
        dec_inputs: [batch_size, tgt_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, tgt_len, tgt_len]
        dec_enc_attn_mask: [batch_size, tgt_len, src_len]

        Returns:
            dec_outputs: [batch_size, tgt_len, d_model]
            dec_self_attn: [batch_size, n_heads, tgt_len, tgt_len]
            dec_enc_attn: [batch_size, n_heads, tgt_len, src_len]
        The two attention maps are returned for visualisation only.
        '''
        # Masked self-attention: Q, K and V all come from the decoder's own input.
        self_attended, dec_self_attn = self.dec_self_attn(
            input_Q=dec_inputs,
            input_K=dec_inputs,
            input_V=dec_inputs,
            attn_mask=dec_self_attn_mask,
        )
        # Encoder-decoder attention: Q comes from the decoder, K and V from the encoder.
        cross_attended, dec_enc_attn = self.dec_enc_attn(
            input_Q=self_attended,
            input_K=enc_outputs,
            input_V=enc_outputs,
            attn_mask=dec_enc_attn_mask,
        )
        return self.pos_ffn(cross_attended), dec_self_attn, dec_enc_attn


In [12]:
# The Encoder has three parts: token embedding, positional encoding, and a stack of
# attention + feed-forward blocks.
class Encoder(nn.Module):
    """Transformer encoder: embedding + positional encoding + n_layers EncoderLayer blocks."""

    def __init__(self):
        super().__init__()
        # Lookup table of shape [src_vocab_size, d_model]
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs):
        '''
        enc_inputs: [batch_size, src_len] token indices, e.g. [2, 5] for two sentences of 5 tokens

        Returns:
            enc_outputs: [batch_size, src_len, d_model]
            enc_self_attns: list with one [batch_size, n_heads, src_len, src_len]
                            tensor per layer (kept for plotting, e.g. heat maps)
        '''
        embedded = self.src_emb(enc_inputs)  # [batch_size, src_len, d_model]
        # PositionalEncoding expects [seq_len, batch_size, d_model], so swap the first
        # two axes on the way in and swap them back on the way out.
        seq_first = embedded.transpose(0, 1)
        hidden = self.pos_emb(seq_first).transpose(0, 1)  # [batch_size, src_len, d_model]
        # Marks the PAD key positions so attention can ignore them.
        pad_mask = get_attn_pad_mask(enc_inputs, enc_inputs)  # [batch_size, src_len, src_len]

        attn_per_layer = []
        # The output of each block is the input of the next one.
        for layer in self.layers:
            hidden, layer_attn = layer(hidden, pad_mask)
            attn_per_layer.append(layer_attn)
        return hidden, attn_per_layer


In [None]:
enc_inputs

tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]])

In [None]:
src_emb = nn.Embedding(src_vocab_size, d_model)

In [None]:
src_emb 

Embedding(6, 512)

In [None]:
src_emb(enc_inputs).shape

torch.Size([2, 5, 512])

In [None]:
enc_outputs=src_emb(enc_inputs)

In [None]:
enc_outputs.transpose(0, 1).shape

torch.Size([5, 2, 512])

In [None]:
pe.shape

torch.Size([5000, 1, 512])

In [None]:
x1=enc_outputs.transpose(0, 1)

In [None]:
x1.shape

torch.Size([5, 2, 512])

In [None]:
x1

tensor([[[ 2.0358,  0.0350,  1.3098,  ..., -0.2427, -0.8998,  1.1425],
         [ 2.0358,  0.0350,  1.3098,  ..., -0.2427, -0.8998,  1.1425]],

        [[-0.3532,  0.5222,  0.1360,  ...,  0.6693, -0.7690, -0.5097],
         [-0.3532,  0.5222,  0.1360,  ...,  0.6693, -0.7690, -0.5097]],

        [[-0.6686,  0.8590,  0.5581,  ...,  1.3988,  0.0475, -1.9611],
         [-0.6686,  0.8590,  0.5581,  ...,  1.3988,  0.0475, -1.9611]],

        [[ 1.2501,  0.0552, -0.3065,  ..., -0.4092, -0.3561, -0.3484],
         [ 0.7179,  0.3352, -0.9570,  ..., -0.3286, -0.4114, -0.9273]],

        [[-0.7196,  2.0046,  0.3455,  ...,  1.0214, -0.0042, -1.3199],
         [-0.7196,  2.0046,  0.3455,  ...,  1.0214, -0.0042, -1.3199]]],
       grad_fn=<TransposeBackward0>)

In [None]:
x1.size(0)

5

In [None]:
pe[:x1.size(0),:].shape

torch.Size([5, 1, 512])

In [None]:
pe[:x1.size(0),:] 

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00]],

        [[ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
           1.0366e-04,  1.0000e+00]],

        [[ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
           2.0733e-04,  1.0000e+00]],

        [[ 1.4112e-01, -9.8999e-01,  2.4509e-01,  ...,  1.0000e+00,
           3.1099e-04,  1.0000e+00]],

        [[-7.5680e-01, -6.5364e-01, -6.5717e-01,  ...,  1.0000e+00,
           4.1465e-04,  1.0000e+00]]])

In [None]:
(pe[:x1.size(0),:]+x1).shape

torch.Size([5, 2, 512])

In [13]:
class Decoder(nn.Module):
    """Transformer decoder: embedding + positional encoding + n_layers DecoderLayer blocks."""

    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)  # [tgt_vocab_size, d_model]
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        '''
        dec_inputs: [batch_size, tgt_len] token indices
        enc_inputs: [batch_size, src_len] token indices (only used to locate source PAD positions)
        enc_outputs: [batch_size, src_len, d_model]

        Returns:
            dec_outputs: [batch_size, tgt_len, d_model]
            dec_self_attns: list with one [batch_size, n_heads, tgt_len, tgt_len] tensor per layer
            dec_enc_attns: list with one [batch_size, n_heads, tgt_len, src_len] tensor per layer
        '''
        # All masks are placed on the device of the inputs instead of a hard-coded
        # .cuda(), so the decoder works on CPU as well and every mask is guaranteed
        # to sit on the same device as the scores it is applied to.
        device = dec_inputs.device

        dec_outputs = self.tgt_emb(dec_inputs)  # [batch_size, tgt_len, d_model]
        # PositionalEncoding expects [seq_len, batch_size, d_model].
        dec_outputs = self.pos_emb(dec_outputs.transpose(0, 1)).transpose(0, 1)  # [batch_size, tgt_len, d_model]

        # Padding mask of the decoder input (the toy data contains no PAD on the
        # decoder side, real data normally does): [batch_size, tgt_len, tgt_len]
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        # Masked self-attention: a position must not see later positions.
        # 1 = hidden, 0 = visible: [batch_size, tgt_len, tgt_len]
        dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs).to(device)
        # Combine both masks: a position is blocked if it is PAD or lies in the future.
        dec_self_attn_mask = dec_self_attn_pad_mask | dec_self_attn_subsequence_mask.bool()  # [batch_size, tgt_len, tgt_len]

        # Mask for encoder-decoder attention. The keys/values come from the encoder,
        # so only PAD positions of enc_inputs are marked; dec_inputs merely supplies
        # the query length for the expand. PAD symbols among the queries are ignored.
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)  # [batch_size, tgt_len, src_len]

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            # dec_outputs: [batch_size, tgt_len, d_model]
            # dec_self_attn: [batch_size, n_heads, tgt_len, tgt_len]
            # dec_enc_attn: [batch_size, n_heads, tgt_len, src_len]
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns


In [26]:
aa = get_attn_pad_mask(dec_inputs, dec_inputs).cuda()
bb=dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs).cuda()

In [30]:
aa

tensor([[[False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False]],

        [[False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False]]], device='cuda:0')

In [31]:
bb

tensor([[[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]],

        [[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]]], device='cuda:0', dtype=torch.uint8)

In [29]:
aa.shape,bb.shape

(torch.Size([2, 6, 6]), torch.Size([2, 6, 6]))

In [28]:
torch.gt((aa +bb), 0).cuda()

tensor([[[False,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False, False,  True,  True],
         [False, False, False, False, False,  True],
         [False, False, False, False, False, False]],

        [[False,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False, False,  True,  True],
         [False, False, False, False, False,  True],
         [False, False, False, False, False, False]]], device='cuda:0')

In [27]:
aa # Decoder输入序列的pad mask矩阵（这个例子中decoder是没有加pad的，实际应用中都是有pad填充的）

tensor([[[False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False]],

        [[False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False],
         [False, False, False, False, False, False]]], device='cuda:0')

In [None]:
tgt_emb = nn.Embedding(tgt_vocab_size, d_model)


In [None]:
tgt_emb 

Embedding(9, 512)

In [None]:
pos_emb = PositionalEncoding(d_model)

In [None]:
pos_emb 

PositionalEncoding(
  (dropout): Dropout(p=0.1, inplace=False)
)

In [14]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a final projection onto the target vocabulary."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder().cuda()
        self.decoder = Decoder().cuda()
        # Output layer: maps each d_model-sized decoder output to tgt_vocab_size
        # logits. This turns every target position into a classification over the
        # target vocabulary; a softmax over the logits gives per-word probabilities
        # and the most probable word is the translation for that position.
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False).cuda()

    def forward(self, enc_inputs, dec_inputs):
        '''
        enc_inputs: [batch_size, src_len], input of the encoder
        dec_inputs: [batch_size, tgt_len], input of the decoder

        Returns:
            dec_logits: [batch_size * tgt_len, tgt_vocab_size]
            enc_self_attns: n_layers tensors of [batch_size, n_heads, src_len, src_len]
            dec_self_attns: n_layers tensors of [batch_size, n_heads, tgt_len, tgt_len]
            dec_enc_attns: n_layers tensors of [batch_size, n_heads, tgt_len, src_len]
        '''
        # enc_outputs: [batch_size, src_len, d_model]; enc_self_attns holds the
        # softmax(QK^T) matrices, i.e. how strongly each token attends to the others.
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        # enc_inputs is passed along as well so the decoder can mask source PAD positions.
        decoded = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_outputs, dec_self_attns, dec_enc_attns = decoded
        # Project the decoder output onto the vocabulary: [batch_size, tgt_len, tgt_vocab_size]
        logits = self.projection(dec_outputs)
        # Flatten batch and time so the logits line up with a flattened target vector.
        flat_logits = logits.view(-1, logits.size(-1))
        return flat_logits, enc_self_attns, dec_self_attns, dec_enc_attns

# Build the network and place it on the GPU.
model = Transformer().cuda()

# Cross-entropy over the vocabulary scores; targets equal to 0 are ignored
# (index 0 is the padding symbol 'P' in tgt_vocab).
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Plain SGD with heavy momentum over all model parameters.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=0.001,
    momentum=0.99,
)

In [18]:
  tgt_len

6

In [19]:
tgt_vocab_size

9

In [15]:
# Training: 1000 passes over the data loader, one optimizer step per batch.
for epoch in range(1000):
    for enc_inputs, dec_inputs, dec_outputs in loader:
        # enc_inputs:  [batch_size, src_len]
        # dec_inputs:  [batch_size, tgt_len]
        # dec_outputs: [batch_size, tgt_len]
        enc_inputs = enc_inputs.cuda()
        dec_inputs = dec_inputs.cuda()
        dec_outputs = dec_outputs.cuda()

        # outputs: [batch_size * tgt_len, tgt_vocab_size]
        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)

        # The targets are flattened so they line up with the flattened logits.
        loss = criterion(outputs, dec_outputs.view(-1))
        print('Epoch: {:04d} loss = {:.6f}'.format(epoch + 1, loss))

        # Standard update: clear old gradients, back-propagate, apply the step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0001 loss = 2.306256
Epoch: 0002 loss = 2.222728
Epoch: 0003 loss = 1.922032
Epoch: 0004 loss = 1.692269
Epoch: 0005 loss = 1.544990
Epoch: 0006 loss = 1.389395
Epoch: 0007 loss = 1.115644
Epoch: 0008 loss = 0.914461
Epoch: 0009 loss = 0.707665
Epoch: 0010 loss = 0.475579
Epoch: 0011 loss = 0.453225
Epoch: 0012 loss = 0.372238
Epoch: 0013 loss = 0.310289
Epoch: 0014 loss = 0.255047
Epoch: 0015 loss = 0.212950
Epoch: 0016 loss = 0.138464
Epoch: 0017 loss = 0.119408
Epoch: 0018 loss = 0.081709
Epoch: 0019 loss = 0.060753
Epoch: 0020 loss = 0.055467
Epoch: 0021 loss = 0.043069
Epoch: 0022 loss = 0.041861
Epoch: 0023 loss = 0.040490
Epoch: 0024 loss = 0.041463
Epoch: 0025 loss = 0.035719
Epoch: 0026 loss = 0.033651
Epoch: 0027 loss = 0.026825
Epoch: 0028 loss = 0.023173
Epoch: 0029 loss = 0.017695
Epoch: 0030 loss = 0.017236
Epoch: 0031 loss = 0.013711
Epoch: 0032 loss = 0.016845
Epoch: 0033 loss = 0.009329
Epoch: 0034 loss = 0.010111
Epoch: 0035 loss = 0.007688
Epoch: 0036 loss = 0

Epoch: 0294 loss = 0.000086
Epoch: 0295 loss = 0.000044
Epoch: 0296 loss = 0.000015
Epoch: 0297 loss = 0.000013
Epoch: 0298 loss = 0.000031
Epoch: 0299 loss = 0.000012
Epoch: 0300 loss = 0.000013
Epoch: 0301 loss = 0.000020
Epoch: 0302 loss = 0.000008
Epoch: 0303 loss = 0.000017
Epoch: 0304 loss = 0.000013
Epoch: 0305 loss = 0.000020
Epoch: 0306 loss = 0.000026
Epoch: 0307 loss = 0.000031
Epoch: 0308 loss = 0.000030
Epoch: 0309 loss = 0.000015
Epoch: 0310 loss = 0.000011
Epoch: 0311 loss = 0.000013
Epoch: 0312 loss = 0.000011
Epoch: 0313 loss = 0.000024
Epoch: 0314 loss = 0.000007
Epoch: 0315 loss = 0.000018
Epoch: 0316 loss = 0.000011
Epoch: 0317 loss = 0.000012
Epoch: 0318 loss = 0.000007
Epoch: 0319 loss = 0.000013
Epoch: 0320 loss = 0.000015
Epoch: 0321 loss = 0.000026
Epoch: 0322 loss = 0.000016
Epoch: 0323 loss = 0.000018
Epoch: 0324 loss = 0.000011
Epoch: 0325 loss = 0.000015
Epoch: 0326 loss = 0.000013
Epoch: 0327 loss = 0.000015
Epoch: 0328 loss = 0.000014
Epoch: 0329 loss = 0

Epoch: 0587 loss = 0.000003
Epoch: 0588 loss = 0.000003
Epoch: 0589 loss = 0.000003
Epoch: 0590 loss = 0.000003
Epoch: 0591 loss = 0.000002
Epoch: 0592 loss = 0.000004
Epoch: 0593 loss = 0.000003
Epoch: 0594 loss = 0.000004
Epoch: 0595 loss = 0.000003
Epoch: 0596 loss = 0.000006
Epoch: 0597 loss = 0.000003
Epoch: 0598 loss = 0.000005
Epoch: 0599 loss = 0.000004
Epoch: 0600 loss = 0.000006
Epoch: 0601 loss = 0.000004
Epoch: 0602 loss = 0.000004
Epoch: 0603 loss = 0.000006
Epoch: 0604 loss = 0.000003
Epoch: 0605 loss = 0.000005
Epoch: 0606 loss = 0.000003
Epoch: 0607 loss = 0.000004
Epoch: 0608 loss = 0.000003
Epoch: 0609 loss = 0.000003
Epoch: 0610 loss = 0.000003
Epoch: 0611 loss = 0.000003
Epoch: 0612 loss = 0.000004
Epoch: 0613 loss = 0.000002
Epoch: 0614 loss = 0.000003
Epoch: 0615 loss = 0.000003
Epoch: 0616 loss = 0.000005
Epoch: 0617 loss = 0.000003
Epoch: 0618 loss = 0.000003
Epoch: 0619 loss = 0.000003
Epoch: 0620 loss = 0.000004
Epoch: 0621 loss = 0.000003
Epoch: 0622 loss = 0

Epoch: 0883 loss = 0.000003
Epoch: 0884 loss = 0.000002
Epoch: 0885 loss = 0.000001
Epoch: 0886 loss = 0.000003
Epoch: 0887 loss = 0.000002
Epoch: 0888 loss = 0.000002
Epoch: 0889 loss = 0.000002
Epoch: 0890 loss = 0.000002
Epoch: 0891 loss = 0.000002
Epoch: 0892 loss = 0.000003
Epoch: 0893 loss = 0.000002
Epoch: 0894 loss = 0.000002
Epoch: 0895 loss = 0.000001
Epoch: 0896 loss = 0.000003
Epoch: 0897 loss = 0.000003
Epoch: 0898 loss = 0.000003
Epoch: 0899 loss = 0.000003
Epoch: 0900 loss = 0.000003
Epoch: 0901 loss = 0.000002
Epoch: 0902 loss = 0.000002
Epoch: 0903 loss = 0.000002
Epoch: 0904 loss = 0.000002
Epoch: 0905 loss = 0.000001
Epoch: 0906 loss = 0.000002
Epoch: 0907 loss = 0.000002
Epoch: 0908 loss = 0.000001
Epoch: 0909 loss = 0.000003
Epoch: 0910 loss = 0.000003
Epoch: 0911 loss = 0.000002
Epoch: 0912 loss = 0.000003
Epoch: 0913 loss = 0.000002
Epoch: 0914 loss = 0.000002
Epoch: 0915 loss = 0.000002
Epoch: 0916 loss = 0.000002
Epoch: 0917 loss = 0.000001
Epoch: 0918 loss = 0

In [None]:
def greedy_decoder(model, enc_input, start_symbol, max_len=None):
    """
    Greedy decoding (beam search with K=1) for a single source sentence.

    At inference time the target sequence is unknown, so the decoder input is
    built one token at a time: the tokens generated so far are fed to the
    decoder, the highest-scoring word at the last position becomes the next
    token, and so on until the model predicts '.' or ``max_len`` tokens have
    been fed.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding

    Decoding runs in evaluation mode (dropout disabled) and without autograd
    tracking; the model's previous train/eval mode is restored before returning.

    :param model: Transformer Model; its ``encoder``, ``decoder`` and ``projection`` are called directly
    :param enc_input: The encoder input, a tensor holding one sentence, shape [1, src_len]
    :param start_symbol: The start symbol. In this notebook it is 'S', i.e. ``tgt_vocab["S"]`` (index 6)
    :param max_len: Upper bound on the number of tokens placed in the decoder input, so that
        decoding always terminates even if '.' is never predicted. Defaults to the
        notebook-level ``tgt_len``.
    :return: The target input, a tensor of shape [1, n] with the same dtype/device as
        ``enc_input``: the start symbol followed by the generated tokens. The
        terminating '.' itself is not appended.
    """
    if max_len is None:
        max_len = tgt_len
    stop_symbol = tgt_vocab["."]

    was_training = model.training
    model.eval()  # dropout must be off, otherwise the prediction is random
    try:
        with torch.no_grad():
            enc_outputs, enc_self_attns = model.encoder(enc_input)
            # Empty [1, 0] sequence living on the same device as the encoder input.
            dec_input = torch.zeros(1, 0, dtype=enc_input.dtype, device=enc_input.device)
            next_symbol = start_symbol
            for _step in range(max_len):
                next_token = torch.tensor([[next_symbol]], dtype=enc_input.dtype, device=enc_input.device)
                dec_input = torch.cat([dec_input, next_token], -1)
                dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
                projected = model.projection(dec_outputs)
                # Best-scoring word index at every position; only the last
                # position is the newly generated word.
                next_word = projected.squeeze(0).max(dim=-1, keepdim=False)[1][-1]
                print(next_word)
                next_symbol = next_word.item()
                if next_symbol == stop_symbol:
                    break
    finally:
        model.train(was_training)
    return dec_input

In [None]:
# Test: greedily decode every sentence of one batch and print the predicted words.
enc_inputs, _, _ = next(iter(loader))
enc_inputs = enc_inputs.cuda()

was_training = model.training
model.eval()  # switch dropout off so the printed translation is deterministic
try:
    with torch.no_grad():  # inference only, no gradients needed
        for i in range(len(enc_inputs)):
            enc_input = enc_inputs[i].view(1, -1)
            greedy_dec_input = greedy_decoder(model, enc_input, start_symbol=tgt_vocab["S"])
            predict, _, _, _ = model(enc_input, greedy_dec_input)
            # Index of the best-scoring word for every decoded position. Reducing
            # only over the last axis keeps the result 1-D even when a single
            # token was decoded (a blanket squeeze() would yield a 0-d tensor
            # that cannot be iterated).
            predict = predict.max(dim=-1)[1]
            print(enc_inputs[i], '->', [idx2word[n.item()] for n in predict])
finally:
    # Put the model back into whatever mode it was in, so re-running the
    # training cell afterwards still trains with dropout enabled.
    model.train(was_training)