In [1]:
from __future__ import unicode_literals,print_function
from io import open
import unicodedata
import string
import re#正则表达式模块，帮助你检查一个字符串是否与某种模式匹配
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda")

In [2]:
SOS_token = 0#标记文本序列开始
EOS_token = 1#标记文本序列结束

class Lang:
    def __init__(self,name):
        self.name = name
        self.word2index = {}#把单词映射为索引的词典
        self.word2count = {}#统计出现过的单词出现次数
        self.index2word = {0:'SOS',1:'EOS'}#把索引映射为单词的词典
        self.n_words = 2#词典中单词数量
    def addSentence(self,sentence):
        #把句子按空格分割，把句子每个单词都加入词典
        for word in sentence.split(' '):#以空格为基准分割句子
            self.addWord(word)
    
    def addWord(self,word):
        #如果单词之前没有出现过，加入词典
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        #单词之前出现
        else:
            self.word2count[word] += 1

In [3]:
# 将unicode码转换为普通的ASCII码
def unicodeToAscii(s):
    return ''.join(
    c for c in unicodedata.normalize('NFD',s)
    if unicodedata.category(c)!='Mn')
#def sub(pattern, repl, string, count=0, flags=0):
#	    """Return the string obtained by replacing the leftmost
#	    non-overlapping occurrences of the pattern in string by the
#	    replacement repl.  repl can be either a string or a callable;
#	    if a string, backslash escapes in it are processed.  If it is
#	    a callable, it's passed the match object and must return
#	    a replacement string to be used."""
#	    return _compile(pattern, flags).sub(repl, string, count)

#将句子中所有字母转换为小写，一处一些不是字母的字符
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())#转换为小写
    s = re.sub(r"([.!?])",r"\1",s)# 在.!?前加一个空格
    s = re.sub(r"[^a-zA-Z.!?]+",r" ",s)#匹配多个连续的非字母，并将多个连续的非字母替换为一个' '
    return s
    

In [4]:
def readLangs(lang1, lang2, reverse = False):#加入bool reverse是因为方便双向翻译,即交换输入序列与输出序列 
    print("Reading lines...")
    #读入文件，按回车分行，每一行都存储在Lines中
    lines = open('data/%s-%s.txt'%(lang1,lang2),encoding = 'utf-8').read().strip().split('\n')#因为文件名长这样：eng-fra.txt
    
    #每一行 用tab分割，前面是英文。后面是法文
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]
    
    #reverse = True法译英
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang,output_lang,pairs
    

In [5]:
MAX_LENGTH = 10 #句子最大长度是10
# 过滤出一些长度不超过10，以一下前缀开头的句子做训练集
eng_prefixes = (
'i am','i m',
'he is','he s',
'she is','she s',
'you are','you re',
'we are','we re',
'they are','they re')
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and\
      len(p[1].split(' ')) < MAX_LENGTH and\
      p[1].startswith(eng_prefixes)#startswith()判断是否以此字符串从指定索引开始的子字符串是否以指定前缀开始

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


In [6]:
def prepareData(lang1, lang2, reverse= False):
    input_lang, output_lang,pairs = readLangs(lang1,lang2,reverse)
    print("Read %s sentence pairs %len(pairs)")
    pairs = filterPairs(pairs)
    print('Trimmed to %s sentence pairs'%len(pairs))
    print('Counting words...')
    for pair in pairs:
        input_lang.addSentence(pair[0])#input_lang为句子pair[0]创建词典
        output_lang.addSentence(pair[1])
    print('Counted words:')
    print(input_lang.name,input_lang.n_words)
    print(output_lang.name,output_lang.n_words)
    return input_lang,output_lang,pairs
        
input_lang,output_lang,pairs=prepareData('eng','fra',True)#法译英
print(random.choice(pairs))#返回一个列表，元组或字符串的随机项

Reading lines...
Read %s sentence pairs %len(pairs)
Trimmed to 12900 sentence pairs
Counting words...
Counted words:
fra 6501
eng 4389
['elle est rapide pour tout.', 'she is quick at everything.']


In [7]:
#创建句子的tensor
def indexesFromSentence(lang,sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]#返回lang.word2index[word] 索引

#在句子的tensor中，加入EOS符号
def tensorFromSentence(lang,sentence):
    # 对句子进行分割并遍历每一个词汇, 然后使用lang的word2index方法找到它对应的索引
    # 这样就得到了该句子对应的数值列表
    indexes = indexesFromSentence(lang,sentence)
    indexes.append(EOS_token)
    # 将其使用torch.tensor封装成张量, 并改变它的形状为nx1, 以方便后续计算
    return torch.tensor(indexes,dtype = torch.long,device = device).view(1,-1)

def tensorsFromPair(pair):
    #将语言对转换为张量对，参数pair为一个语言对
    input_tensor = tensorFromSentence(input_lang,pair[0])# 输入序列的张量
    target_tensor = tensorFromSentence(output_lang,pair[1])#输出序列的张量
    return (input_tensor,target_tensor)

sample_pairs = random.choice(pairs)
print(sample_pairs)
input_tensor,target_tensor = tensorsFromPair(sample_pairs)
print('input:',input_tensor)
print('target:',target_tensor)

['je suis en bas.', 'i m downstairs.']
input: tensor([[   5,   10,   13, 1086,    1]], device='cuda:0')
target: tensor([[  2,   3, 603,   1]], device='cuda:0')


In [8]:
class EncoderRNN(nn.Module):
    def __init__(self,input_size,hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1,1,-1)
        output, hidden = self.gru(embedded,hidden)
        return output,hidden

    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size, device=device)

    
        

In [9]:
#含注意力机制的解码器 （分解向量C）

       #初始化函数中的参数有4个, hidden_size代表解码器中GRU的输入尺寸，也是它的隐层节点数
       #output_size代表整个解码器的输出尺寸, 也是我们希望得到的指定尺寸即目标语言的词表大小
       # dropout_p代表我们使用dropout层时的置零比率，默认0.1, max_length代表句子的最大长度"""

        # 根据attention的QKV理论，attention的输入参数为三个Q，K，V，
        # 第一步，使用Q与K进行attention权值计算得到权重矩阵, 再与V做矩阵乘法, 得到V的注意力表示结果.
        # 这里常见的计算方式有三种:
        # 1，将Q，K进行纵轴拼接, 做一次线性变化, 再使用softmax处理获得结果最后与V做张量乘法
        # 2，将Q，K进行纵轴拼接, 做一次线性变化后再使用tanh函数激活, 然后再进行内部求和, 最后使用softmax处理获得结果再与V做张量乘法
        # 3，将Q与K的转置做点积运算, 然后除以一个缩放系数, 再使用softmax处理获得结果最后与V做张量乘法
 
        # 说明：当注意力权重矩阵和V都是三维张量且第一维代表为batch条数时, 则做bmm运算.
 
        # 第二步, 根据第一步采用的计算方法, 如果是拼接方法，则需要将Q与第二步的计算结果再进行拼接, 
        # 如果是转置点积, 一般是自注意力, Q与V相同, 则不需要进行与Q的拼接.因此第二步的计算方式与第一步采用的全值计算方法有关.
        # 第三步，最后为了使整个attention结构按照指定尺寸输出, 使用线性层作用在第二步的结果上做一个线性变换. 得到最终对Q的注意力表示.
 
        # 我们这里使用的是第一步中的第一种计算方式, 因此需要一个线性变换的矩阵, 实例化nn.Linear
        # 因为它的输入是Q，K的拼接, 所以输入的第一个参数是self.hidden_size * 2，第二个参数是self.max_length
        # 这里的Q是解码器的Embedding层的输出, K是解码器GRU的隐层输出，因为首次隐层还没有任何输出，会使用编码器的隐层输出
        # 而这里的V是编码器层的输出

         # 接着我们实例化另外一个线性层, 它是attention理论中的第四步的线性层，用于规范输出尺寸
        # 这里它的输入来自第三步的结果, 因为第三步的结果是将Q与第二步的结果进行拼接, 因此输入维度是self.hidden_size * 2
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, 
                dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size # 目标语言的单词数量
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2,self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2,self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        # input是编码器的上一步输出 或者 真实的前一个单词
        embedded = self.embedding(input).view(1,1,-1)
        embedded = self.dropout(embedded)
        
        # 计算注意力权重
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0],hidden[0]),1)),dim=1)
        
        # torch.bmm(a,b):计算两个tensor的Hadamard乘积，
        # tensor a 的大小为(b,h1,w),
        # tensor b 的大小为(b,w,h2)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), # 1, 1， max_length
                    encoder_outputs.unsqueeze(0)) # 1, max_length， hidden_size
        
        # 输出的attn_applied 大小为 (1, 1, hidden_size)
        # embedded: (1, 1, hidden_size)
        output = torch.cat((embedded[0], attn_applied[0]),1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output,hidden)

        output = F.log_softmax(self.out(output[0]),dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size,device=device)


In [10]:
#训练，用Teacing forcing稍微帮助他一下
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder,
      encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)  # 源语言句子长度
    target_length = target_tensor.size(0) # 目标语言句子长度

    encoder_outputs = torch.zeros(max_length,encoder.hidden_size,device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei],encoder_hidden)
        encoder_outputs[ei] = encoder_output[0,0]  # 保存encoder每一步的隐藏层状态
  
    decoder_input = torch.tensor([[SOS_token]],device=device) # decoder的第一个输入是SOS

    decoder_hidden = encoder_hidden # encoder最后一步隐藏层状态

    use_teacher_forcing = True if random.random()<teacher_forcing_ratio else False

    if use_teacher_forcing:
    # 强制输入target的input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
              decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]

    else:
    # 输入预测的input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
              decoder_input, decoder_hidden, encoder_outputs)
            topv,topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()

            loss += criterion(decoder_output, target_tensor[di])

            if decoder_input.item() == EOS_token: break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()/target_length


In [11]:
import time
import math
def asMinutes(s):
    m = math.floor(s/60)
    s -= m*60
    return '%dm%ds'%(m,s)
def timeSince(since,percent):
    now = time.time()
    s = now -since
    es = s/(percent)
    rs = es -s
    return '%s(-%S)'%(asMinutes(s),asMinutes(rs))

In [12]:
%matplotlib inline 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

def trainIters(encoder, decoder, n_iters, print_every=1000,
               plot_every=100,learning_rate = 0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0 
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(),lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(),lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1,n_iters+1):
        training_pair = training_pairs[iter-1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor,encoder,
                     decoder, encoder_optimizer, decoder_optimizer,criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every ==0:
            print_loss_avg = print_loss_total/print_every
            print_loss_total = 0 
            print("%s (%d %d%%) %.4f"%(timeSince(start,iter/n_iters),
             iter, iter / n_iters*100, print_loss_avg))
      
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total =0

    showPlot(plot_losses)

hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1,attn_decoder1,75000,print_every=5000)


RuntimeError: input.size(-1) must be equal to input_size. Expected 256, got 1280

In [None]:
def evaluate(encoder, decoder, sentence, max_length =MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang,sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length,encoder.hidden_size,device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
            encoder_outputs[ei] += encoder_output[0,0]

        decoder_input = torch.tensor([[SOS_token]],device=device)

        decoder_hidden=encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden,decoder_attention = decoder(decoder_input,decoder_hidden,encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di+1]
