In [1]:
from __future__ import unicode_literals, print_function, division
import pickle as pkl
from io import open
import unicodedata
import string
import re
import random
import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np, pandas as pd


In [2]:
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
teacher_forcing_ratio = 0.5
import math


In [3]:
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

## Data Pre-processing

In [4]:
SOS_token = 0
EOS_token = 1
PAD_IDX = 2
UNK_IDX = 3
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"UNK", 3:"PAD"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def normalizeString(s):
#     s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"&apos;m", r"am", s)
    s = re.sub(r"&apos;s", r"is", s)
    s = re.sub(r"&apos;re", r"are", s)
    s = re.sub(r"&apos;", r"", s)
    return s

In [6]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [7]:
def loadingLangs(sourcelang, targetlang, setname):
    input_ls = []
    output_ls = []
    print('Reading lines...')
    # Read the file 
    with open('../iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,sourcelang)) as f:
        for line in f.readlines():
            input_ls.append([normalizeString(word) for word in line.split()])
    with open('../iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,targetlang)) as f:
        for line in f.readlines():
            output_ls.append([normalizeString(word) for word in line.split()])
    pairs = list(zip(input_ls, output_ls))
    print('Read %s sentence pairs'%(len(input_ls)))
    input_lang = Lang(sourcelang)
    output_lang = Lang(targetlang)
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [8]:
source_tra, target_tra, pairs_tra = loadingLangs('zh', 'en', 'train')
source_val, target_val, pairs_val = loadingLangs('zh', 'en', 'dev')
source_tes, target_tes, pairs_tes = loadingLangs('zh', 'en', 'test')

Reading lines...
Read 213377 sentence pairs
Counting words...
Counted words:
zh 88918
en 69063
Reading lines...
Read 1261 sentence pairs
Counting words...
Counted words:
zh 6133
en 4015
Reading lines...
Read 1397 sentence pairs
Counting words...
Counted words:
zh 5215
en 3518


## Dataset 

In [9]:
print("95% of chinese sentences length = {0}".format(np.percentile([len(x[0]) for x in pairs_tra], 95)))
print("95% of english sentences length = {0}".format(np.percentile([len(x[1]) for x in pairs_tra], 95)))
print(random.choice(pairs_tra))

95% of chinese sentences length = 44.0
95% of english sentences length = 48.0
(['也', '想', '向', '你', '保证', '虽然', '我', '是', '个', '摇滚', '明星', '但', '我', '的', '愿望', '并', '不', '包括', '按摩', '浴缸'], ['And', 'though', 'I', 'am', 'a', 'rock', 'star', ',', 'I', 'just', 'want', 'to', 'assure', 'you', 'that', 'none', 'of', 'my', 'wishes', 'will', 'include', 'a', 'hot', 'tub', ' .'])


## Dataset

In [10]:
MAX_SENT_LEN = 40
BATCH_SIZE = 32

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,source,target):
    input_lang = source
    output_lang = target
    input_tensor = tensorFromSentence(input_lang, pair[0]).reshape((-1))
    target_tensor = tensorFromSentence(output_lang, pair[1]).reshape((-1))
    return (input_tensor, input_tensor.shape[0], target_tensor, target_tensor.shape[0])

In [12]:
class NMTDataset(Dataset):
    def __init__(self, source, target, pairs):
        self.source = source
        self.target = target
        self.pairs = pairs
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        inp_ten, inp_len, tar_ten, tar_len = tensorsFromPair(self.pairs[key], self.source, self.target)
        item = {}
        item['inputtensor'] = inp_ten[:MAX_SENT_LEN]
        item['inputlen'] = min(inp_len, MAX_SENT_LEN)
        item['targettensor'] = tar_ten[:MAX_SENT_LEN]
        item['targetlen'] = min(tar_len, MAX_SENT_LEN)
        return item

In [13]:
train_data = NMTDataset(source_tra, target_tra, pairs_tra)
val_data = NMTDataset(source_val, target_val, pairs_val)
test_data = NMTDataset(source_tes, target_tes, pairs_tes)

In [14]:
train_data.__getitem__(234)

{'inputtensor': tensor([  49,  871,   16, 1235,  454, 1112,    6,   84,   85,  322,  398,  310,
            6, 1236, 1237,  735,   57, 1238,  391,  621,  611,  612,  613,   84,
           85,   16, 1239,   18,  885,    6, 1240,    1]),
 'inputlen': 32,
 'targettensor': tensor([ 48,  89,  52,  53, 577, 206,  77,  30, 113,  25,  54,  21, 210, 831,
          21,  22,  23,  56,  77, 206,  52,  53, 921, 915,  44,   1]),
 'targetlen': 26}

## Dataloader

In [15]:
#collate function

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    src_data, tar_data, src_len, tar_len = [], [], [], []
    for datum in batch:        
        src_datum = np.pad(np.array(datum['inputtensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['inputlen'])),
                                mode="constant", constant_values=PAD_IDX)
        tar_datum = np.pad(np.array(datum['targettensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['targetlen'])),
                                mode="constant", constant_values=PAD_IDX)
        src_data.append(src_datum)
        tar_data.append(tar_datum)
        src_len.append(datum['inputlen'])
        tar_len.append(datum['targetlen'])
    return [torch.from_numpy(np.array(src_data)).to(device),torch.from_numpy(np.array(tar_data)).to(device),
               torch.from_numpy(np.array(src_len)).to(device),torch.from_numpy(np.array(tar_len)).to(device)]

In [16]:
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)

In [17]:
# sample data loader
count = 0
for data in train_loader:
    count+=1
    print('input sentence batch: ')
    print(data[0])
    print('input batch dimension: {}'.format(data[0].size()))
    print('target sentence batch: ')
    print(data[1])
    print('target batch dimension: {}'.format(data[1].size()))
    print('input sentence len: ')
    print(data[2])
    print('target sentence len: ')
    print(data[3])
    if count == 1:
        break

input sentence batch: 
tensor([[43952, 43953, 43954,  ...,     2,     2,     2],
        [ 1318, 39803, 39804,  ...,     2,     2,     2],
        [ 1009,  8165,  1736,  ...,    31,   404,   283],
        ...,
        [  140,  1237,   110,  ...,     6,  4924,   181],
        [ 4632,   110, 13524,  ...,     2,     2,     2],
        [  282,   108,    31,  ...,     2,     2,     2]])
input batch dimension: torch.Size([32, 40])
target sentence batch: 
tensor([[30718,    16, 12286,  ...,     2,     2,     2],
        [   51,     6,  1182,  ..., 33744,    44,     1],
        [14539,  3301, 22080,  ...,   219,   135,     6],
        ...,
        [   48,    63, 17863,  ...,    56,   204,  7005],
        [   51,    73,   761,  ...,     2,     2,     2],
        [  246,    56,    53,  ...,     2,     2,     2]])
target batch dimension: torch.Size([32, 40])
input sentence len: 
tensor([13, 35, 40, 17, 12, 14, 19, 28, 15, 20, 14, 15, 12, 16, 16, 11, 40, 13,
         7, 14, 40, 22, 20, 22, 10, 25,

----------------------------------------------

In [18]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, n_layers = 1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, bidirectional=False) 
    def initHidden(self,BATCH_SIZE):
        return torch.zeros(1, BATCH_SIZE, self.hidden_size, device=device) 
    
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(MAX_SENT_LEN, BATCH_SIZE, -1)  
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)  
        return output, hidden

In [19]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, input, hidden):
        output = self.embedding(input).view(BATCH_SIZE, 1, -1)
        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output))

        return output, hidden

    

In [22]:
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, mask = None):
    encoder_hidden = encoder.initHidden(BATCH_SIZE)
    encoder_optimizer.zero_grad()  
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0) 
    target_length = target_tensor.size(0)
    

    loss = 0    
    _, hidden = encoder(input_tensor, encoder_hidden)
    
    
    decoder_input = torch.tensor([[SOS_token]*32], device=device)  
    decoder_hidden = hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    
    if use_teacher_forcing:
    # Teacher forcing: Feed the target as the next input
        for di in range(target_length):

            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)

            decoder_input = target_tensor[di]  
            temp_loss = criterion(decoder_output[:,-1,:], target_tensor[di])
            loss += temp_loss * mask[di:di+1].float()  
            ave_loss = loss.sum()/BATCH_SIZE 
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):            
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            
            temp_loss = criterion(decoder_output[:,-1,:], target_tensor[di])
            loss += temp_loss * mask[di:di+1].float()
            ave_loss = loss.sum()/BATCH_SIZE  

    ave_loss.backward()
    
    
    encoder_optimizer.step()   # update parameters
    decoder_optimizer.step()

    return ave_loss.item() / target_length

In [23]:
hidden_size = 256
learning_rate=0.01
num_epoch = 10
print_every=1
plot_every=1

encoder1 = EncoderRNN(source_tra.n_words,hidden_size, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, target_tra.n_words).to(device)

start = time.time()

encoder_optimizer = optim.Adam(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder1.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(reduce = False) 

for epoch in range(1, num_epoch + 1):
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
        input_tensor = input_sentences.transpose(0,1)   
        target_tensor = target_sentences.transpose(0,1)
        mask = target_tensor.ge(1)   
        loss = train(input_tensor, target_tensor, encoder1,
                     decoder1, encoder_optimizer, decoder_optimizer, criterion, mask = mask)
        print_loss_total += loss
        plot_loss_total += loss
        if i > 0 and i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}'.format(
                timeSince(start, i + 1/len(train_loader)), epoch, num_epoch, i, 
                len(train_loader),print_loss_avg))

        if i > 0 and i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
                
    print(plot_losses)
        




Time: 0m 12s (- -1m 59s), Epoch: [1/10], Step: [1/6669], Train Loss: 6.872272109985351
Time: 0m 19s (- -1m 50s), Epoch: [1/10], Step: [2/6669], Train Loss: 3.1727447509765625
Time: 0m 25s (- -1m 42s), Epoch: [1/10], Step: [3/6669], Train Loss: 3.2100948333740233
Time: 0m 32s (- -1m 35s), Epoch: [1/10], Step: [4/6669], Train Loss: 3.4660064697265627
Time: 0m 39s (- -1m 28s), Epoch: [1/10], Step: [5/6669], Train Loss: 3.467520523071289
Time: 0m 46s (- -1m 21s), Epoch: [1/10], Step: [6/6669], Train Loss: 3.4428646087646486
Time: 0m 52s (- -1m 14s), Epoch: [1/10], Step: [7/6669], Train Loss: 3.088993263244629
Time: 0m 59s (- -1m 8s), Epoch: [1/10], Step: [8/6669], Train Loss: 3.0875083923339846
Time: 1m 6s (- -1m 1s), Epoch: [1/10], Step: [9/6669], Train Loss: 3.43828125
Time: 1m 13s (- -2m 54s), Epoch: [1/10], Step: [10/6669], Train Loss: 3.439501190185547
Time: 1m 20s (- -2m 47s), Epoch: [1/10], Step: [11/6669], Train Loss: 3.432456207275391
Time: 1m 27s (- -2m 40s), Epoch: [1/10], Step:

KeyboardInterrupt: 