In [281]:
import torch
import torch.nn as nn
import torch.functional as f
from torch.utils.data import Dataset,DataLoader
from torch import Tensor

# 通用编码器架构

In [5]:
class Encoder(nn.Module):
    """Abstract encoder interface for the encoder-decoder architecture.

    Concrete encoders subclass this and override ``forward``. Keyword
    arguments given to the constructor are accepted and ignored.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, x, *args):
        """Encode ``x``; the base class has no implementation and always raises."""
        raise NotImplementedError

# 通用解码器架构

In [317]:
class Decoder(nn.Module):
    """Abstract decoder interface for the encoder-decoder architecture.

    Concrete decoders override both ``init_state`` and ``forward``.
    Keyword arguments given to the constructor are accepted and ignored.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, x, state):
        """Decode ``x`` given ``state``; not implemented in the base class."""
        raise NotImplementedError

    def init_state(self, enc_outputs, *args):
        """Build the initial decoder state from the encoder outputs; not implemented here."""
        raise NotImplementedError

# 通用Seq2Seq架构

In [6]:
class EncoderDecoder(nn.Module):
    """Wire an encoder and a decoder together into a single seq2seq module."""

    def __init__(self, encoder: Encoder, decoder: Decoder, **kwargs):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_x, dec_x, *args):
        """Run the encoder, derive the decoder's initial state, then decode.

        :param enc_x: input passed to the encoder
        :param dec_x: input passed to the decoder
        :param args: extra positional arguments forwarded to both the
            encoder call and ``decoder.init_state``
        :return: whatever the decoder's ``forward`` returns
        """
        initial_state = self.decoder.init_state(self.encoder(enc_x, *args), *args)
        return self.decoder(dec_x, initial_state)

# 实现一个Seq2SeqEncoder

In [11]:
class Seq2SeqEncoder(Encoder):
    """Embedding + multi-layer GRU encoder operating on batch-first input."""

    def __init__(self, vocab_size, embedding_dim, num_hidden, num_layers, dropout=0, **kwargs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(
            input_size=embedding_dim,
            hidden_size=num_hidden,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )

    def forward(self, x, *args):
        """Encode a batch of token-id sequences.

        :param x: token ids, shape [batch_size, seq_len]
        :param args: unused
        :return: ``(output, state)`` where output has shape
            [batch_size, seq_len, num_hidden] (batch-first because the GRU
            is built with ``batch_first=True``) and state has shape
            [num_layers, batch_size, num_hidden]
        """
        # Embedding lookup: [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        token_vectors = self.embedding(x)
        # No initial hidden state is passed, so the GRU starts from zeros.
        return self.rnn(token_vectors)

试一试
构造一个[batch_size=4,seq_len=7]的批量数据
output形状:[batch_size,seq_len,num_hidden]
state形状 :[num_layer,batch_size,num_hidden]

In [140]:
# Smoke test: build a 6-layer encoder and push a dummy batch through it.
encoder = Seq2SeqEncoder(vocab_size=10,embedding_dim=256,num_hidden=256,num_layers=6)
# eval() switches the module to inference mode (dropout is 0 here anyway).
encoder.eval()
# Dummy batch of token ids: batch_size=4, seq_len=7, all zeros.
x = torch.zeros((4,7),dtype=torch.long)
output,state = encoder(x)
# Expected: output [4, 7, 256] and state [6, 4, 256].
output.shape,state.shape

(torch.Size([4, 7, 256]), torch.Size([6, 4, 256]))

# 实现一个Seq2SeqDecoder
需要注意的地方:
1.从编码器ht中获取额外的输入序列信息:只选了最后一个layer进行获取
2.torch下的RNN模型输出的state(ht)的形状为 [num_layer,batch_size,num_hidden]
3.给定的输入序列的形状为 [batch_size,seq_len] 在与2拼接的时候需要注意转置

In [316]:
class Seq2SeqDecoder(Decoder):
    """GRU decoder that conditions every time step on a context vector.

    The context vector is the top layer of the hidden state handed to
    ``forward``; it is concatenated with each token embedding before the
    GRU. ``num_hidden`` and ``num_layers`` must match the encoder's so the
    encoder's final hidden state can be used as the initial state.
    """

    def __init__(self, vocab_size, embedding_dim, num_hidden, num_layers, dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # GRU input = token embedding concatenated with the context vector.
        self.rnn = nn.GRU(embedding_dim + num_hidden, num_hidden, num_layers, dropout=dropout, batch_first=True)
        # Projects each GRU output onto vocabulary logits.
        self.classify = nn.Linear(num_hidden, vocab_size)

    def init_state(self, enc_outputs, *args):
        """Pick the initial decoder state out of the encoder's return value.

        :param enc_outputs: the ``(output, state)`` tuple returned by the encoder
        :param args: unused
        :return: the encoder's final hidden state,
            shape [num_layers, batch_size, num_hidden]
        """
        return enc_outputs[1]

    def forward(self, x, state):
        """Decode a batch of target token ids.

        :param x: token ids, shape [batch_size, seq_len]
        :param state: hidden state, shape [num_layers, batch_size, num_hidden]
        :return: ``(output, state)`` with output logits of shape
            [batch_size, seq_len, vocab_size] and the updated hidden state of
            shape [num_layers, batch_size, num_hidden]
        """
        # Embed the target tokens: [batch_size, seq_len, embedding_dim]
        x = self.embedding(x)
        # Use the top layer of the *passed-in* state as the context vector
        # (not a notebook-level global) and broadcast it over the time axis:
        # [batch_size, num_hidden] -> [batch_size, seq_len, num_hidden]
        context = state[-1].unsqueeze(1).expand(-1, x.shape[1], -1)
        # Concatenate on the feature axis:
        # [batch_size, seq_len, embedding_dim + num_hidden]
        x_and_context = torch.cat((x, context), dim=2)
        output, state = self.rnn(x_and_context, state)
        output = self.classify(output)
        return output, state

## Have a Try
解码器,编码器的 num_hidden,num_layer需要一致

In [214]:
# Smoke test for the decoder; reuses the `encoder` built in the earlier cell.
x = torch.zeros((4,7),dtype=torch.long)
# The encoder's final hidden state becomes the decoder's initial state.
encoder_output,decoder_state = encoder(x)
# num_hidden and num_layers must match the encoder's for the state to fit.
decoder = Seq2SeqDecoder(vocab_size=10, embedding_dim=256, num_hidden=256,num_layers=6)
decoder.eval()
output,state = decoder(x,decoder_state)
# Expected: logits [4, 7, 10] and state [6, 4, 256].
output.shape,state.shape

(torch.Size([4, 7, 10]), torch.Size([6, 4, 256]))

# 实现一个按有效长度屏蔽填充位置的函数
为了批量加载数据,额外定义了PAD,SOS,EOS词
对填充词元的预测应该排除在损失函数的计算之外

神奇的torch
tensor的形状[M,N]
对tensor进行 [:,None,:]的切片 会直接将tensor进行unsqueeze(1)的操作
返回的tensor形状[M,1,N]

In [284]:
def sequence_mask(x:torch.Tensor,valid_len,value=0):
    """Return a copy of ``x`` with positions past each row's valid length overwritten.

    The caller's tensor is left untouched: masking is applied to a clone, so
    the function is safe to re-run and also works on leaf tensors that
    require grad (an in-place write on those raises).

    :param x: tensor of shape [batch_size, seq_len, ...]
    :param valid_len: 1-D tensor of length batch_size; entry ``i`` is the
        number of leading positions of row ``i`` to keep
    :param value: fill value written to the masked positions
    :return: new tensor with the same shape and dtype as ``x``
    """
    max_length = x.size(1)
    # Broadcast [1, seq_len] position indices against [batch_size, 1] lengths
    # to get a [batch_size, seq_len] boolean "keep" mask.
    mask = torch.arange(max_length, dtype=torch.float32, device=x.device)[None, :] < valid_len[:, None]
    masked = x.clone()
    masked[~mask] = value
    return masked

构造一个batch数据来测试

In [286]:
# Toy batch: 2 sequences of length 3, with valid lengths 1 and 2.
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
valid_len = torch.tensor([1,2])
x

tensor([[1, 2, 3],
        [4, 5, 6]])

In [285]:
# Zero out everything past each row's valid length;
# expected rows are [1, 0, 0] and [4, 5, 0].
masked_x = sequence_mask(x,valid_len)
masked_x

tensor([[1, 0, 0],
        [4, 5, 0]])

# 拓展softmax遮蔽不相关的预测
首先是构造一个全1矩阵weights
用weights和valid_len调用sequence_mask生成屏蔽矩阵
然后按照常规流程计算交叉熵损失
最后用屏蔽矩阵将无效位置的损失置零,再沿序列维度(dim=1)对全部seq_len个位置取均值(被屏蔽的位置以0计入均值),返回每个样本的损失,形状为[batch_size]

In [308]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that zeroes out the contribution of padded positions."""

    def forward(self, input: Tensor, target: Tensor,valid_len:Tensor,value=0) -> Tensor:
        """Compute one masked loss value per sequence.

        :param input: logits, shape [batch_size, seq_len, vocab_size]
        :param target: class indices, shape [batch_size, seq_len]
        :param valid_len: number of valid leading positions per sequence,
            shape [batch_size]
        :param value: weight given to positions past the valid length
        :return: tensor of shape [batch_size]; per-token losses multiplied by
            the weights and averaged over all seq_len positions (masked
            positions count toward the denominator)
        """
        # Per-token losses are required, so the reduction is overridden on
        # every call regardless of what the constructor received.
        self.reduction = 'none'
        # Weight 1 inside the valid length, `value` beyond it.
        keep = sequence_mask(torch.ones_like(target), valid_len, value)
        # CrossEntropyLoss wants the class axis second: [batch_size, vocab_size, seq_len].
        per_token = super().forward(input.permute(0, 2, 1), target)
        return torch.mean(per_token * keep, dim=1)

试一试
input的形状[batch_size,seq_len,vocab_size]
target的形状[batch_size,seq_len]

In [309]:
# Uniform logits [batch_size=3, seq_len=4, vocab_size=10] and all-ones targets [3, 4].
# NOTE: `input` shadows the Python builtin of the same name from here on.
input,target = torch.ones(3, 4, 10),torch.ones((3, 4),dtype=torch.long)
# Valid lengths per sequence: fully valid, half valid, fully padded.
valid_len = torch.tensor([4,2,0])

In [312]:
# Instantiate the masked loss with the default CrossEntropyLoss arguments.
loss = MaskedSoftmaxCELoss()

In [314]:
# Uniform logits over 10 classes give ln(10) ~= 2.3026 per token; averaging
# over 4 positions with 4, 2 and 0 unmasked yields 2.3026, 1.1513 and 0.
loss(input,target,valid_len)

tensor([2.3026, 1.1513, 0.0000])

先下班
搞懂了一个点
对于Decoder训练时可以将真实标签作为(教师强制)一次性输入
但是Decoder在预测时,每次只能构造一个[1,1] (batch=1,长度为1的句子)输入
获取其output,进行argmax求当前预测概率最大的单词
然后用该单词构造batch作为下一次预测的输入,直到达到设定的最大长度,或者预测结果为EOS时停止