In [1]:
from __future__ import unicode_literals, print_function, division
import pickle as pkl
from io import open
import unicodedata
import string
import re
import random
import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
from torch.autograd import Variable
from sacrebleu import corpus_bleu


In [2]:
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
teacher_forcing_ratio = 0.5
import math


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

## Data Pre-processing

In [4]:
SOS_token = 0
EOS_token = 1
PAD_IDX = 2
UNK_IDX = 3
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"PAD", 3:"UNK"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def normalizeString(s):
#     s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"&apos;m", r"am", s)
    s = re.sub(r"&apos;s", r"is", s)
    s = re.sub(r"&apos;re", r"are", s)
    s = re.sub(r"&apos;", r"", s)
    return s

In [6]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [7]:
def loadingLangs(sourcelang, targetlang, setname):
    input_ls = []
    output_ls = []
    print('Reading lines...')
    # Read the file 
    with open('../iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,sourcelang)) as f:
        for line in f.readlines():
            input_ls.append([normalizeString(word) for word in line.split()])
    with open('../iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,targetlang)) as f:
        for line in f.readlines():
            output_ls.append([normalizeString(word) for word in line.split()])
    pairs = list(zip(input_ls, output_ls))
    print('Read %s sentence pairs'%(len(input_ls)))
    input_lang = Lang(sourcelang)
    output_lang = Lang(targetlang)
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [8]:
source_tra, target_tra, pairs_tra = loadingLangs('zh', 'en', 'train')
source_val, target_val, pairs_val = loadingLangs('zh', 'en', 'dev')
source_tes, target_tes, pairs_tes = loadingLangs('zh', 'en', 'test')

Reading lines...
Read 213377 sentence pairs
Counting words...
Counted words:
zh 88918
en 69063
Reading lines...
Read 1261 sentence pairs
Counting words...
Counted words:
zh 6133
en 4015
Reading lines...
Read 1397 sentence pairs
Counting words...
Counted words:
zh 5215
en 3518


## Dataset 

In [9]:
print("95% of chinese sentences length = {0}".format(np.percentile([len(x[0]) for x in pairs_tra], 95)))
print("95% of english sentences length = {0}".format(np.percentile([len(x[1]) for x in pairs_tra], 95)))
print(random.choice(pairs_tra))

95% of chinese sentences length = 44.0
95% of english sentences length = 48.0
(['我', '电邮', '了', '杀手', '艾', '尔', 'Dunlap', '90', '年代', '开始', '的', '资产', '掠夺', '掠夺者'], ['I', 'emailed', '&quot;', 'Chainsaw', 'Al', '&quot;', 'Dunlap', ',', 'the', 'asset', 'stripper', 'from', 'the', '1990s', ' .'])


## Dataset

In [10]:
MAX_SENT_LEN = 38
BATCH_SIZE = 32

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,source,target):
    input_lang = source
    output_lang = target
    input_tensor = tensorFromSentence(input_lang, pair[0]).reshape((-1))
    target_tensor = tensorFromSentence(output_lang, pair[1]).reshape((-1))
    return (input_tensor, input_tensor.shape[0], target_tensor, target_tensor.shape[0])

In [12]:
class NMTDataset(Dataset):
    def __init__(self, source, target, pairs):
        self.source = source
        self.target = target
        self.pairs = pairs
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        inp_ten, inp_len, tar_ten, tar_len = tensorsFromPair(self.pairs[key], self.source, self.target)
        item = {}
        item['inputtensor'] = inp_ten[:MAX_SENT_LEN]
        item['inputlen'] = min(inp_len, MAX_SENT_LEN)
        item['targettensor'] = tar_ten[:MAX_SENT_LEN]
        item['targetlen'] = min(tar_len, MAX_SENT_LEN)
        return item

In [13]:
train_data = NMTDataset(source_tra, target_tra, pairs_tra)
val_data = NMTDataset(source_tra, target_tra, pairs_val)
test_data = NMTDataset(source_tra, target_tra, pairs_tes)

In [14]:
train_data.__getitem__(234)

{'inputtensor': tensor([  49,  871,   16, 1235,  454, 1112,    6,   84,   85,  322,  398,  310,
            6, 1236, 1237,  735,   57, 1238,  391,  621,  611,  612,  613,   84,
           85,   16, 1239,   18,  885,    6, 1240,    1]),
 'inputlen': 32,
 'targettensor': tensor([ 48,  89,  52,  53, 577, 206,  77,  30, 113,  25,  54,  21, 210, 831,
          21,  22,  23,  56,  77, 206,  52,  53, 921, 915,  44,   1]),
 'targetlen': 26}

## Dataloader

In [14]:
#collate function

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    src_data, tar_data, src_len, tar_len = [], [], [], []
    for datum in batch:        
        src_datum = np.pad(np.array(datum['inputtensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['inputlen'])),
                                mode="constant", constant_values=PAD_IDX)
        tar_datum = np.pad(np.array(datum['targettensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['targetlen'])),
                                mode="constant", constant_values=PAD_IDX)
        src_data.append(src_datum)
        tar_data.append(tar_datum)
        src_len.append(datum['inputlen'])
        tar_len.append(datum['targetlen'])
    return [torch.from_numpy(np.array(src_data)).to(device),torch.from_numpy(np.array(tar_data)).to(device),
               torch.from_numpy(np.array(src_len)).to(device),torch.from_numpy(np.array(tar_len)).to(device)]

In [15]:
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)

In [16]:
# sample data loader
count = 0
for data in train_loader:
    count+=1
    print('input sentence batch: ')
    print(data[0])
    print('input batch dimension: {}'.format(data[0].size()))
    print('target sentence batch: ')
    print(data[1])
    print('target batch dimension: {}'.format(data[1].size()))
    print('input sentence len: ')
    print(data[2])
    print('target sentence len: ')
    print(data[3])
    if count == 1:
        break

input sentence batch: 
tensor([[52808,   403,  6865,  ...,     2,     2,     2],
        [   16,  1584,    31,  ...,     2,     2,     2],
        [   49,   871,    16,  ...,     2,     2,     2],
        ...,
        [  277,  3093,   494,  ...,     2,     2,     2],
        [   23,   163,   240,  ...,     2,     2,     2],
        [   75,    20,   176,  ...,     2,     2,     2]])
input batch dimension: torch.Size([32, 38])
target sentence batch: 
tensor([[  201,    25,     6,  ...,     1,     2,     2],
        [   61,   584, 10675,  ...,     2,     2,     2],
        [   48,    89,   273,  ...,     2,     2,     2],
        ...,
        [   45,    25,     6,  ...,     2,     2,     2],
        [   51,    73,    25,  ...,     2,     2,     2],
        [  156,   130,  8298,  ...,     2,     2,     2]])
target batch dimension: torch.Size([32, 38])
input sentence len: 
tensor([23, 10, 17,  4, 38,  6, 12, 11,  8, 13, 17, 11, 16, 10,  1, 26, 13, 13,
        17,  9, 15,  5,  7,  4, 38,  7,

----------------------------------------------

In [17]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers = 1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True) 
        self.fc1 = nn.Linear(2*hidden_size, hidden_size)
    def initHidden(self, batch_size):
        return torch.zeros(2, batch_size, self.hidden_size, device=device) 
    
    def forward(self, input, hidden):
        batch_size = input.size()[0]
        embedded = self.embedding(input).view(1, batch_size, -1)  
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)  
        output = self.fc1(output)
        return output, hidden

In [21]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_SENT_LEN):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
#         print('input size is ',input.size())
        input = input.view(1,-1)
        batch_size = input.size()[1]
        
        embedded = self.embedding(input).view(1, batch_size, -1)
        embedded = self.dropout(embedded)
        
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)   
        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_outputs.transpose(0,1))
        
        output = torch.cat((embedded[0], attn_applied.transpose(0,1)[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)

        return output, hidden, attn_weights

In [20]:
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, mask = None):
    encoder_hidden = encoder.initHidden(BATCH_SIZE)
    encoder_optimizer.zero_grad()  
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size()[0] 
    target_length = target_tensor.size()[0]
    encoder_outputs = torch.zeros(target_length, BATCH_SIZE, encoder.hidden_size, device=device) 

    loss = 0
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden) 
#         print(encoder_output.size())
        encoder_outputs[ei] = encoder_output[0]
        
        
    encoder_hidden = nn.Linear(2*hidden_size,hidden_size)(
        torch.cat((encoder_hidden[0],encoder_hidden[1]),dim = 1)).unsqueeze(0)
    
    decoder_input = torch.tensor([[SOS_token]*32], device=device)  # decoder_input: torch.Size([1, 32])
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            
            decoder_input = target_tensor[di]  
            
            temp_loss = criterion(decoder_output, target_tensor[di])
            loss += temp_loss * mask[di:di+1].float()  
            ave_loss = loss.sum()/BATCH_SIZE 
            
    else:
        for di in range(target_length):
            
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
           

            decoder_input = topi.transpose(0,1).detach()  # detach from history as input
            
            temp_loss = criterion(decoder_output, target_tensor[di])
            loss += temp_loss * mask[di:di+1].float()
            ave_loss = loss.sum()/BATCH_SIZE  
            
    ave_loss.backward()
    
    
    encoder_optimizer.step()   
    decoder_optimizer.step()

    return ave_loss.item() / target_length

In [24]:
val_loader = torch.utils.data.DataLoader(val_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)

In [31]:
def evaluate_rnn(encoder, decoder, data_loader, max_length=MAX_SENT_LEN):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param sentence: string, a sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    @output decoder_attentions: a list of vector, each of which sums up to 1.0
    """
    encoder.eval()
    decoder.eval()
    corpus = []
    truths = []

    for i, (input_sentences, target_sentences,len1,len2) in enumerate(data_loader): 
#         print('v',i)
        input_tensor = input_sentences.transpose(0,1).to(device)  
        target_tensor = target_sentences.to(device)
        truths.append(target_tensor)
        input_length = input_tensor.size()[0]
        batch_size = input_tensor.size()[1]
#         print(batch_size)

    # encode the source lanugage
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_outputs = Variable(torch.zeros(max_length,batch_size, encoder.hidden_size)).to(device)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] =  encoder_output[0]
        encoder_hidden = nn.Linear(2*hidden_size,hidden_size)(
        torch.cat((encoder_hidden[0],encoder_hidden[1]),dim = 1)).unsqueeze(0)
        
    # decode the context vector
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        decoder_input = Variable(torch.LongTensor([[SOS_token]*batch_size])).to(device) # SOS
#         print(decoder_input.size()) #[1,32]
        # output of this function
        decoded_words = torch.zeros(batch_size, max_length)
        decoder_attentions = torch.zeros(max_length, max_length)
        # unfold
        for di in range(max_length):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # hint: print out decoder_output and decoder_attention
#             decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            ni = topi
        
            decoded_words[:,di] = ni.squeeze()

#             # stop unfolding whenever '<EOS>' token is returned
#             if ni == EOS_token:
#                 decoded_words.append('<EOS>')
#                 break
#             else:
#                 decoded_words.append(target_tra.index2word[ni])

            decoder_input = Variable(torch.LongTensor(ni.transpose(0,1))).to(device)
#             print(decoded_words.size())
        corpus.append(decoded_words)
#             attns.append(decoder_attentions[:di + 1])
    #truths = [t.transpose(0,1) for t in truths]
    return corpus, truths

In [32]:
def convert_idx_2_sent(pred_tensor, truth_tensor,lang_obj):
    pred_word_list = []
    truth_word_list = []
    for i in pred_tensor:
        if i.item() not in set([PAD_IDX,EOS_token,SOS_token]):
            pred_word_list.append(lang_obj.index2word[i.item()])
    for j in truth_tensor:
        if j.item() not in set([PAD_IDX,EOS_token,SOS_token]):
            truth_word_list.append(lang_obj.index2word[j.item()])
    pred_sent = (' ').join(pred_word_list)
    truth_sent = (' ').join(truth_word_list)
    return pred_sent, truth_sent

In [33]:
def bleu(corpus, truths):
    '''
    corpus: list, NBs * BATCHSIZE * MAX_LEN
    truths: list, NBs * BATCHSIZE * MAX_LEN
    
    return: array of length NBs, avg blue score for each batch
    '''
    n = len(corpus)
    bleus = [0]*n
    for i in range(n):
        pred, true = corpus[i], truths[i]
        sumbleu = 0.0
        for j in range(len(corpus[i])):
            pred_tensor, true_tensor = pred[j], true[j]
            pred_sent, true_sent = convert_idx_2_sent(pred_tensor, true_tensor, target_tra)
            sumbleu += corpus_bleu(true_sent, pred_sent).score
        avgbleu = sumbleu / len(corpus[i])
#         print(avgbleu)
        bleus[i] = avgbleu
    return bleus

In [34]:
hidden_size = 256
learning_rate=0.01
num_epoch = 10
print_every=1
plot_every=1

encoder1 = EncoderRNN(source_tra.n_words,hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, target_tra.n_words, dropout_p=0.1).to(device)

start = time.time()

encoder_optimizer = optim.Adam(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(attn_decoder1.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(reduce = False) 

for epoch in range(1, num_epoch + 1):
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
#         print(i)
        input_tensor = input_sentences.transpose(0,1)   
        target_tensor = target_sentences.transpose(0,1)
        mask = target_tensor.ge(1)   
        loss = train(input_tensor, target_tensor, encoder1,
                     attn_decoder1, encoder_optimizer, decoder_optimizer, criterion, mask = mask)
        print_loss_total += loss
        plot_loss_total += loss

        if i % print_every == 0:
            corpus, truths = evaluate_rnn(encoder1, attn_decoder1, val_loader, max_length=MAX_SENT_LEN)
            score_ls = bleu(corpus, truths)
            avg_score = np.array(score_ls).mean()
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}, Average BLEU: {}'.format(
                    timeSince(start, i + 1/len(train_loader)), epoch, num_epoch, i, 
                    len(train_loader),print_loss_avg, avg_score))

        if i > 0 and i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

                
    print(plot_losses)
        




Time: 1m 21s (- 9095m 58s), Epoch: [1/10], Step: [0/6669], Train Loss: 11.007058394582648, Average BLEU: 1.4285476040400251
Time: 2m 52s (- -1m 59s), Epoch: [1/10], Step: [1/6669], Train Loss: 7.059808831465872, Average BLEU: 1.4257662727480405


KeyboardInterrupt: 