In [1]:
from __future__ import unicode_literals, print_function, division
import pickle as pkl
from io import open
import unicodedata
import string
import re
import random
import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np, pandas as pd


In [2]:
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
teacher_forcing_ratio = 1
import math


In [3]:
from sacrebleu import raw_corpus_bleu, corpus_bleu

In [4]:
def mask_ind(arr):
    arr = arr.cpu().numpy()
    batch_size = arr.shape[1]

    for i in range(batch_size):
        if 1 in arr[:,i]:
            ind = np.where(arr[:,i]== 1)[0][0]
        
            arr[:,i][:ind+1]=1
            arr[:,i][ind+1:]=0
        else:
            arr[:,i]=1
        
    
    return arr, np.count_nonzero(arr)
                

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'

In [6]:
device

'cuda'

## Data Pre-processing

In [7]:
SOS_token = 0
EOS_token = 1
PAD_IDX = 2
UNK_IDX = 3
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"PAD", 3:"UNK"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [8]:
def normalizeString(s):
#     s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"&apos;m", r"am", s)
    s = re.sub(r"&apos;s", r"is", s)
    s = re.sub(r"&apos;re", r"are", s)
    s = re.sub(r"&apos;", r"", s)
    return s

In [9]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [10]:
def loadingLangs(sourcelang, targetlang, setname):
    input_ls = []
    output_ls = []
    print('Reading lines...')
    # Read the file 
    with open('../iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,sourcelang)) as f:
        for line in f.readlines():
            input_ls.append([normalizeString(word) for word in line.split()])
    with open('../iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,targetlang)) as f:
        for line in f.readlines():
            output_ls.append([normalizeString(word) for word in line.split()])
    pairs = list(zip(input_ls, output_ls))
    print('Read %s sentence pairs'%(len(input_ls)))
    input_lang = Lang(sourcelang)
    output_lang = Lang(targetlang)
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [11]:
source_tra, target_tra, pairs_tra = loadingLangs('zh', 'en', 'train')
source_val, target_val, pairs_val = loadingLangs('zh', 'en', 'dev')
source_tes, target_tes, pairs_tes = loadingLangs('zh', 'en', 'test')

Reading lines...
Read 213377 sentence pairs
Counting words...
Counted words:
zh 88918
en 69063
Reading lines...
Read 1261 sentence pairs
Counting words...
Counted words:
zh 6133
en 4015
Reading lines...
Read 1397 sentence pairs
Counting words...
Counted words:
zh 5215
en 3518


## Dataset 

In [12]:
print("95% of chinese sentences length = {0}".format(np.percentile([len(x[0]) for x in pairs_tra], 95)))
print("95% of english sentences length = {0}".format(np.percentile([len(x[1]) for x in pairs_tra], 95)))
print(random.choice(pairs_tra))

95% of chinese sentences length = 44.0
95% of english sentences length = 48.0
(['晶体', '晶体管', '变得', '越来', '越来越', '小', '才', '使', '这', '一切', '得以', '实现', '而', '技术', '更是', '得益', '得益于', '益于', '于此'], ['Transistors', 'are', 'getting', 'smaller', 'to', 'allow', 'this', 'to', 'happen', ',', 'and', 'technology', 'has', 'really', 'benefitted', 'from', 'that', ' .'])


## Dataset

In [13]:
MAX_SENT_LEN = 45
BATCH_SIZE = 80

In [14]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,source,target):
    input_lang = source
    output_lang = target
    input_tensor = tensorFromSentence(input_lang, pair[0]).reshape((-1))
    target_tensor = tensorFromSentence(output_lang, pair[1]).reshape((-1))
    return (input_tensor, input_tensor.shape[0], target_tensor, target_tensor.shape[0])


In [15]:
class NMTDataset(Dataset):
    def __init__(self, source, target, pairs):
        self.source = source
        self.target = target
        self.pairs = pairs
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        inp_ten, inp_len, tar_ten, tar_len = tensorsFromPair(self.pairs[key], self.source, self.target)
        item = {}
        item['inputtensor'] = inp_ten[:MAX_SENT_LEN]
        item['inputlen'] = min(inp_len, MAX_SENT_LEN)
        item['targettensor'] = tar_ten[:MAX_SENT_LEN]
        item['targetlen'] = min(tar_len, MAX_SENT_LEN)
        return item

In [16]:
train_data = NMTDataset(source_tra, target_tra, pairs_tra)
val_data = NMTDataset(source_tra, target_tra, pairs_val)
test_data = NMTDataset(source_tra, target_tra, pairs_tes)

In [17]:
train_data.__getitem__(234)

{'inputtensor': tensor([  49,  871,   16, 1235,  454, 1112,    6,   84,   85,  322,  398,  310,
            6, 1236, 1237,  735,   57, 1238,  391,  621,  611,  612,  613,   84,
           85,   16, 1239,   18,  885,    6, 1240,    1], device='cuda:0'),
 'inputlen': 32,
 'targettensor': tensor([ 48,  89,  52,  53, 577, 206,  77,  30, 113,  25,  54,  21, 210, 831,
          21,  22,  23,  56,  77, 206,  52,  53, 921, 915,  44,   1],
        device='cuda:0'),
 'targetlen': 26}

## Dataloader

In [18]:
#collate function

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    src_data, tar_data, src_len, tar_len = [], [], [], []
    for datum in batch:        
        src_datum = np.pad(np.array(datum['inputtensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['inputlen'])),
                                mode="constant", constant_values=PAD_IDX)
        tar_datum = np.pad(np.array(datum['targettensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['targetlen'])),
                                mode="constant", constant_values=PAD_IDX)
        src_data.append(src_datum)
        tar_data.append(tar_datum)
        src_len.append(datum['inputlen'])
        tar_len.append(datum['targetlen'])
    return [torch.from_numpy(np.array(src_data)).to(device),torch.from_numpy(np.array(tar_data)).to(device),
               torch.from_numpy(np.array(src_len)).to(device),torch.from_numpy(np.array(tar_len)).to(device)]

In [19]:
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)

val_loader = torch.utils.data.DataLoader(val_data,
                                           batch_size=BATCH_SIZE,shuffle=False, collate_fn=collate_func)


In [20]:
# sample data loader
count = 0
for data in train_loader:
    count+=1
    print('input sentence batch: ')
    print(data[0])
    print('input batch dimension: {}'.format(data[0].size()))
    print('target sentence batch: ')
    print(data[1])
    print('target batch dimension: {}'.format(data[1].size()))
    print('input sentence len: ')
    print(data[2])
    print('target sentence len: ')
    print(data[3])
    if count == 1:
        break

input sentence batch: 
tensor([[ 953, 3265,   49,  ...,    2,    2,    2],
        [  49,   15,  277,  ...,    2,    2,    2],
        [2370,    6,  185,  ...,    2,    2,    2],
        ...,
        [ 174, 2181,  100,  ..., 1339,   80, 1487],
        [ 619,  141,  235,  ...,    2,    2,    2],
        [ 354,  185,   15,  ...,    2,    2,    2]], device='cuda:0')
input batch dimension: torch.Size([80, 45])
target sentence batch: 
tensor([[ 156,  732,  707,  ...,    2,    2,    2],
        [  51,  185,   48,  ...,    2,    2,    2],
        [ 550,  446,  330,  ...,    2,    2,    2],
        ...,
        [  51,   81,   23,  ...,   90,  548, 5080],
        [ 879,  220,  417,  ...,    2,    2,    2],
        [  74,  254,  203,  ...,    2,    2,    2]], device='cuda:0')
target batch dimension: torch.Size([80, 45])
input sentence len: 
tensor([ 9,  9, 25, 15, 31, 11, 36, 35, 45,  9,  5, 29,  9, 14, 18, 45, 16, 37,
         7, 19, 11,  6, 28, 12, 15,  7, 23, 12, 20, 27, 15, 20,  4, 11, 45, 2

----------------------------------------------

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, n_layers = 1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.emb_size = emb_size

        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, bidirectional=False, batch_first = True) #in/out (batch, seq_len, feature_size)
        self.fc = nn.Linear(hidden_size, hidden_size)
        
    def initHidden(self,batch_size):
        return torch.zeros(self.n_layers, batch_size, self.hidden_size, device=device) 
    
    def forward(self, input, hidden):
        batch_size = input.size()[0]
        seq_len = input.size()[1]
        embedded = self.embedding(input).view(batch_size, seq_len, -1) 
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
            
        hidden = self.fc(hidden).view(batch_size,-1,self.hidden_size)
        return output, hidden

In [22]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, vocab_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size 
        
        self.embedding = nn.Embedding(num_embeddings = vocab_size,
                                      embedding_dim = hidden_size)
        self.gru = nn.GRU(input_size = hidden_size,
                          hidden_size = hidden_size, 
                          batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, input, hidden):
        #input shape: B*S (s=1) * I 
        #print('in decoder, input dimension is {} '.format(input.size()))
        input = input.view(-1,1)
        batch_size = input.size()[0]
        output = self.embedding(input).view(batch_size, 1, -1)
        
        hidden = hidden.view(1,batch_size,-1)
        for i in range(self.n_layers):
            #output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output))
        return output, hidden

    

In [23]:
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer):
    
    batch_size = input_tensor.size()[0]
    encoder_hidden = encoder.initHidden(batch_size)
    encoder_optimizer.zero_grad()  
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size()[1] 
    target_length = target_tensor.size()[1]
    

    loss = 0    
    _, hidden = encoder(input_tensor, encoder_hidden)
    
    
    decoder_input = torch.tensor([batch_size*[SOS_token]], device=device).view(batch_size,-1) 
    decoder_hidden = hidden.to(device)
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    
    if use_teacher_forcing:
        loss = 0 
        criterion = nn.NLLLoss(reduce = True, ignore_index = 2, reduction = 'mean') 

    # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            
            decoder_input = target_tensor[:,di] 
            temp_loss = criterion(decoder_output[:,-1,:], target_tensor[:,di])
            loss += temp_loss 
            
        ave_loss = loss/target_length
                        
    else:
        loss = None 
        criterion = nn.NLLLoss(reduce = False) 
        prediction = None
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):            
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            
            decoder_input = topi.squeeze().detach()
            
            if prediction is None:
                prediction = topi.view(1,-1)
            else:
                prediction = torch.cat((prediction, topi.view(1,-1)), dim=0)            
                            
            temp_loss = criterion(decoder_output[:,-1,:], target_tensor[:,di])
            
            if loss is None:
                loss = temp_loss.view(1,-1)
            else:
                loss = torch.cat((loss, temp_loss.view(1,-1)),dim=0)
            
       
        mask, count = mask_ind(prediction)
        total_loss = torch.sum(loss * torch.from_numpy(mask).float().to(device))
        ave_loss = total_loss/count

    ave_loss.backward()
    
    
    encoder_optimizer.step()   # update parameters
    decoder_optimizer.step()

    return ave_loss.item() 

### Evaluation

In [25]:
def convert_idx_2_sent_new(idx_tensor, lang_obj):
    word_list = []
    #truth_word_list = []
    for i in idx_tensor:
        if i.item() not in set([PAD_IDX,EOS_token,SOS_token]):
            word_list.append(lang_obj.index2word[i.item()])
#     for j in truth_tensor:
#         if j.item() not in set([PAD_IDX,EOS_token,SOS_token]):
#             truth_word_list.append(lang_obj.index2word[j.item()])
    sent = (' ').join(word_list)
    #truth_sent = (' ').join(truth_word_list)
    return sent


In [26]:
def bleu_new(corpus,truths):
    n = len(corpus)
    bleu = [0]*n
    for i in range(n):
        pred, true = corpus[i], truths[i]
        pred_ls = [convert_idx_2_sent_new(sent, target_tra) for sent in pred]
        true_ls = [convert_idx_2_sent_new(sent, target_tra) for sent in true]
        bleu[i] = corpus_bleu(pred_ls, [true_ls]).score
    return np.mean(bleu)


In [27]:
def evaluate(encoder, decoder, data_loader, max_length=MAX_SENT_LEN):
    start = time.time()
    encoder.eval()
    decoder.eval()
    inputs = []
    corpus = []
    truths = []
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(data_loader):
#         if i % 5 == 0:
#             print('Time: {}, Step: [{}/{}]'.format(
#                 timeSince(start, i + 1/len(train_loader)), i, len(data_loader)))
        inputs.append(input_sentences.to(device))
        input_tensor = input_sentences.to(device)
        truths.append(target_sentences.to(device))
        target_tensor = target_sentences.to(device) 
        #truths.append(target_tensor)
        input_length = input_tensor.size()[1]
        batch_size = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden(batch_size)
        #encoder_outputs = torch.zeros(max_length, batch_size, encoder.hidden_size, device=device)
        encoder_outputs, encoder_hidden = encoder(input_tensor, encoder_hidden)
        
        
        decoder_hidden = encoder_hidden.to(device)
        decoder_input = torch.tensor([batch_size*[SOS_token]], device=device).view(batch_size,-1) 
        decoded_words = torch.zeros(batch_size, max_length)
    
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            decoded_words[:,di] = topi.squeeze()
            decoder_input = topi.squeeze().detach()
        corpus.append(decoded_words)
        #print(inputs[0].size(), corpus[0].size(), truths[0].size())
    return inputs, corpus, truths



In [None]:
hidden_size = 2000
learning_rate = 0.0001
num_epoch = 5
print_every = 100
plot_every = 100

encoder1 = EncoderRNN(source_tra.n_words,hidden_size, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, target_tra.n_words).to(device)

start = time.time()

encoder_optimizer = optim.Adam(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder1.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(reduce = False) 

for epoch in range(1, num_epoch + 1):
    
    plot_bleu_score_val = []
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
        encoder1.train()
        decoder1.train()
        input_tensor = input_sentences
        target_tensor = target_sentences
        loss = train(input_tensor, target_tensor, encoder1,
                     decoder1, encoder_optimizer, decoder_optimizer)
        print_loss_total += loss
        plot_loss_total += loss
        if i > 0 and i % print_every == 0:
            inputs, corpus, truths = evaluate(encoder1, decoder1, val_loader, max_length=MAX_SENT_LEN)
            bleu_score_val_avg = bleu_new(corpus, truths)#np.mean(bleu_score_val)
            plot_bleu_score_val.append(bleu_score_val_avg)


            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}, BLEU: {}'.format(
                timeSince(start, i + 1/len(train_loader)), epoch, num_epoch, i, 
                len(train_loader),print_loss_avg, bleu_score_val_avg))
            print('\nInput> %s'%(' '.join([source_tra.index2word[i.item()] for i in inputs[0][3] if i.item() not in set([PAD_IDX,EOS_token,SOS_token])])))
            print('\nTarget= %s'%(convert_idx_2_sent_new(truths[0][3], target_tra)),
                    '\nPredict< %s' %(convert_idx_2_sent_new(corpus[0][3], target_tra)))
        if i > 0 and i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
                
    print(plot_losses)
        




Time: 2m 16s (- -3m 45s), Epoch: [1/5], Step: [100/2668], Train Loss: 3.226272175312042, BLEU: 0.2792153053314296

Input> 塔利 塔利班 走 了 父亲 大声 叫 着

Target= &quot; The Taliban are gone  ! &quot; my father shouted  . 
Predict<  . Auctioneers acetic rendition semi-literate Rodrigo haranguing tantrums  . nudism gastronomy cut-grass nudism tectonically restitution tectonically epistemological Twelve Powell gizmo gizmo governor gizmo Vertes proportional propel snowflake Kulp gibberish restate restate transcendental sub-section restitution tectonically epistemological Whoosh Shyam prejudices restate million-pound 30-square-mile Marsalisly
Time: 4m 31s (- -5m 30s), Epoch: [1/5], Step: [200/2668], Train Loss: 2.6513734745979307, BLEU: 0.30803700387774513

Input> 塔利 塔利班 走 了 父亲 大声 叫 着

Target= &quot; The Taliban are gone  ! &quot; my father shouted  . 
Predict<  . dissociated 1,020  . Nessun assumed consummatory  . pro-bono Vermeulen Mecca Mecca Mecca Mecca Mecca Mecca Mecca Mecca Mecca Mecca Mecca M