In [2]:
import sys
sys.path.append('/home/kesci/input/d2l9528/')
import collections
import d2l
import zipfile
from d2l.data.base import Vocab
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim

In [3]:
# 读取语料
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
      raw_text = f.read()
print(raw_text[0:1000])

Go.	Va !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
Hi.	Salut !	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)
Hi.	Salut.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)
Run!	Cours !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)
Run!	Courez !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)
Who?	Qui ?	CC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)
Wow!	Ça alors !	CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #374631 (zmoo)
Fire!	Au feu !	CC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #4627939 (sacredceltic)
Help!	À l'aide !	CC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #128430 (sysko)
Jump.	Saute.	CC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #2416938 (Phoenix)
Stop!	Ça suffit !	CC-BY 2.0 (France) Attribution: tato

In [4]:
# 数据清洗
def preprocess_raw(text):
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    out = ''
    for i, char in enumerate(text.lower()):  # 转化小写
        if char in (',', '!', '.') and i > 0 and text[i-1] != ' ':
            out += ' '  # 加上空格，以此一一对应
        out += char
    return out
text = preprocess_raw(raw_text)
print("查看清洗效果：", text[0:1000])

查看清洗效果： go .	va !	cc-by 2 .0 (france) attribution: tatoeba .org #2877272 (cm) & #1158250 (wittydev)
hi .	salut !	cc-by 2 .0 (france) attribution: tatoeba .org #538123 (cm) & #509819 (aiji)
hi .	salut .	cc-by 2 .0 (france) attribution: tatoeba .org #538123 (cm) & #4320462 (gillux)
run !	cours !	cc-by 2 .0 (france) attribution: tatoeba .org #906328 (papabear) & #906331 (sacredceltic)
run !	courez !	cc-by 2 .0 (france) attribution: tatoeba .org #906328 (papabear) & #906332 (sacredceltic)
who?	qui ?	cc-by 2 .0 (france) attribution: tatoeba .org #2083030 (ck) & #4366796 (gillux)
wow !	ça alors !	cc-by 2 .0 (france) attribution: tatoeba .org #52027 (zifre) & #374631 (zmoo)
fire !	au feu !	cc-by 2 .0 (france) attribution: tatoeba .org #1829639 (spamster) & #4627939 (sacredceltic)
help !	à l'aide !	cc-by 2 .0 (france) attribution: tatoeba .org #435084 (lukaszpp) & #128430 (sysko)
jump .	saute .	cc-by 2 .0 (france) attribution: tatoeba .org #631038 (shishir) & #2416938 (phoenix)
stop !	ça suffi

In [5]:
# 分词操作。
num_examples = 50000  # 先去部分进行分词操作，看看效果
source, target = [], []
for i, line in enumerate(text.split('\n')):  # 将英语和法语分别对应起来，放置同一列表
    if i > num_examples:
        break
    parts = line.split('\t')  # 以tab为分隔符
    if len(parts) >= 2:
        source.append(parts[0].split(' '))  # eng分割
        target.append(parts[1].split(' '))  # fra分割
source[0:4], target[0:4]

([['go', '.'], ['hi', '.'], ['hi', '.'], ['run', '!']],
 [['va', '!'], ['salut', '!'], ['salut', '.'], ['cours', '!']])

In [6]:
# 建立词典类
class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = collections.Counter(tokens)
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        if use_special_tokens:
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            tokens = ['<unk>']
        tokens += [token for token, freq in token_freqs if freq >= min_freq]  # 去除出现频率过小的单词
        self.idx_to_token = []
        self.token_to_idx = dict()
        for token in tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1
    
    def __len__(self):  # 专有方法重载
        return len(self.idx_to_token)
        
    def __getitem__(self, tokens):  # 专有方法重载
        if not isinstance(tokens, (list, tuple)):  # 判断类型
            return self.token_to_idx.get(tokens, self.unk)
        else:
            return [self.__getitem__(token) for token in tokens]
            
# 建立词典
def build_dict(tokens):
    word_list = [token for line in tokens for token in line]
    # word_list = []
    # for line in tokens:
    #     for word in line:
    #         word_list.append(word)
    return Vocab(word_list, min_freq=3, use_special_tokens=True)

src_vocab = build_dict(source)
len(src_vocab)

3789

In [7]:
# 补全语料长度（保证输入和输出长度一致）
def pad(line, max_len, padding_token):
    if len(line) > max_len:  # 句子长度大于设定值，组需要切割
        return line[:max_len]
    return line + [padding_token] * (max_len - len(line))  # 用指定padding符合补全
pad(src_vocab[source[0]], 10, src_vocab.pad)

[38, 4, 0, 0, 0, 0, 0, 0, 0, 0]

In [8]:
# 接下来需要将每个句子组合起来
def build_array(lines, vocab, max_len, is_source):
    # temp_lines = [vocab[line] for line in lines]
    temp_lines = []
    for line in lines:
        temp_lines.append(vocab[line])  # 自动调用上述字典类Vocab的专有方法__getitem__
    if not is_source:
        temp_lines = [[vocab.bos] + line + [vocab.eos] for line in temp_lines]
    array = torch.tensor([pad(line, max_len, vocab.pad) for line in temp_lines])  # 转化为符合训练的tensor格式
    valid_len = (array != vocab.pad).sum(1) #第一个维度-计算语料有效长度，后期计算loss值有用处
    return array, valid_len

In [9]:
def load_data_nmt(batch_size, max_len): # This function is saved in d2l.
    src_vocab, tgt_vocab = build_dict(source), build_dict(target)  # 分别建立源语料和目标语料词典
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)  # 语料格式转化
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    # 利用pytorch内置函数TensorDataset将语料合并，以便后期训练
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len) 
    # 利用pytorch内置函数DataLoader将语料以batch_size批量分割
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter

In [10]:
# 查看效果
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
for X, X_valid_len, Y, Y_valid_len, in train_iter:
    print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
        '\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
    break

X = tensor([[   5,   48,   79,  140,    4,    0,    0,    0],
        [  12,    8,    7, 1248,  383,    4,    0,    0]], dtype=torch.int32) 
Valid lengths for X = tensor([5, 6]) 
Y = tensor([[   1,    5,    9,  382,    7,   42,   60,    4],
        [   1,   15,   14,   19, 4797,  624,    4,    2]], dtype=torch.int32) 
Valid lengths for Y = tensor([8, 8])


In [11]:
# 以上处理好训练所需的数据（格式）
# 接下来开始搭建网络
class Encoder(nn.Module):
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError

class Decoder(nn.Module):
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError
        
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)
        
        

In [12]:
# 搭建seq2seq模型encoder部分-具体地以LSTM为基本单元
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)  #nn模块的词嵌入方法
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
    
    # 隐藏状态初始化
    def begin_state(self, batch_size, device):
        return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
                torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]

    def forward(self, X, *args):
        # batch_size表*个句子，seq_len表每个句子*个单词，embedding_size表每个单词*维向量表示
        X = self.embedding(X)  # embedding后shape:(batch_size, seq_len, embedding_size)
        X = X.transpose(0, 1)  # 将第一和第二维度进行调换。 X shape:(seq_len, batch_size, embedding_size)
        out, state = self.rnn(X)
        # 输出的out包含每个隐藏状态 shape: (seq_len, batch_size, num_hiddens)
        # state包含两个内容：最后一个时间步的隐层状态和记忆细胞 shape: (num_layers, batch_size, num_hiddens)
    
        return out, state

In [13]:
# 人为构造输入语料，4个句子，7单词/句子
# 观察下直观的效果
x = torch.zeros((4, 7), dtype=torch.long)
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=10, num_layers=2)
output, state = encoder(x)
output.shape, len(state), state[0].shape, state[1].shape

(torch.Size([7, 4, 10]), 2, torch.Size([2, 4, 10]), torch.Size([2, 4, 10]))

In [14]:
# 接下来是Seq2seq模型decoder部分
class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)  #nn模块的词嵌入方法
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens,vocab_size)  # 全连接层。用于映射至词典大小维度，方便后续匹配词向量
    
    # encoder作为输入的隐藏状态
    def init_state(self, enc_outputs, *args):
        return enc_outputs[1]
    
    def forward(self, X, *args):
        # batch_size表*个句子，seq_len表每个句子*个单词，embedding_size表每个单词*维向量表示
        X = self.embedding(X)  # embedding后shape:(batch_size, seq_len, embedding_size)
        X = X.transpose(0, 1)  # 将第一和第二维度进行调换。 X shape:(seq_len, batch_size, embedding_size)
        out, state = self.rnn(X)
        # 此处我们只需要out输出单词序列
        out = self.dense(out)
        out = out.transpose(0, 1)  # 还原初始shape
        return out, state

In [15]:
# 查看下decoder的直观效果
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
state = decoder.init_state(encoder(x))
out, state = decoder(x, state)
out.shape, len(state)

(torch.Size([4, 7, 10]), 2)

In [16]:
# 由于前面输入语料进行了补全
# 进行损失值计算时，补全的语料为无效部分
# 此时需要一个使预测结果补全部分无效化的函数
def SequenceMask(X, X_len,value=0):
    maxlen = X.size(1)
    mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]   
    X[~mask]=value
    return X
# 看下效果
X = torch.tensor([[1,2,3], [4,5,6]])
SequenceMask(X,torch.tensor([1,2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [17]:
# 损失函数使用交叉熵
# 由于涉及语料的有效长度，需要再定义
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    # pred shape: (batch_size, seq_len, vocab_size)
    # label shape: (batch_size, seq_len)
    # valid_length shape: (batch_size, )
    def forward(self, pred, label, valid_length):
        # the sample weights shape should be (batch_size, seq_len)
        weights = torch.ones_like(label)
        weights = SequenceMask(weights, valid_length).float()
        self.reduction='none'
        output=super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1,2), label)
        return (output*weights).mean(dim=1)

# 查看下效果
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0])).sum()

tensor(4.0295)

In [18]:
# 训练函数
def train_ch7(model, data_iter, lr, num_epochs):
    # model.to(device)
    loss = MaskedSoftmaxCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        l_sum, num_tokens_sum = 0.0, 0.0
        for data in data_iter:
            X, X_valid_len, Y, Y_valid_len = data
            # 由于decoder部分的输入组成为: BOS ...words EOS。 易知包含两类特殊标签
            # Y作为decoder输入是去除EOS标签，作为输出时则去除BOS标签，最终有效长度需再-1
            Y_input, Y_label, Y_valid_len = Y[:, :-1], Y[:, 1:], Y_valid_len-1  # ??
            
            Y_hat, _ = model(X, Y_input, X_valid_len, Y_valid_len)
            l = loss(Y_hat, Y_label, Y_valid_len).sum()
            l.backward()
            optimizer.zero_grad()
            with torch.no_grad():
                d2l.grad_clipping_nn(model, 5)
            num_tokens = Y_valid_len.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format( 
                  epoch, (l_sum/num_tokens_sum), time.time()-tic))

In [19]:
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(
    batch_size, max_len,num_examples)
encoder = Seq2SeqEncoder(
    len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
    len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs)

TypeError: train_ch7() takes 4 positional arguments but 5 were given