Import Dependencies and Preprocess Data for the Model

In [1]:
import requests
import torch
import torch.nn.functional as F
import torchtext

url = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/'


def _fetch_tokenized(filename, base_url=url, timeout=60):
  """Download one corpus file and return it as whitespace-tokenized sentences.

  Args:
    filename: File name appended to ``base_url`` (e.g. ``"train.en"``).
    base_url: URL prefix of the corpus directory.
    timeout: Seconds passed to ``requests.get`` so a stalled connection
      cannot hang the notebook forever.

  Returns:
    A list with one entry per line of the file; each entry is the list of
    tokens obtained with ``str.split()``.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  response = requests.get(base_url + filename, timeout=timeout)
  # Without this check an error page would be tokenized and silently used
  # as if it were corpus data.
  response.raise_for_status()
  return [line.split() for line in response.text.splitlines()]


train_en = _fetch_tokenized("train.en")
train_vi = _fetch_tokenized("train.vi")
test_en = _fetch_tokenized("tst2013.en")
test_vi = _fetch_tokenized("tst2013.vi")



# Default run configuration (a later cell may reassign these before training).
MODELNAME= "iwslt15-en-vi-rnn.model"
EPOCH= 10
BATCHSIZE= 32
LR= 0.0001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

#Make Vocabulary

def make_vocab(train_data, min_freq):
  """Build a frequency-filtered vocabulary from tokenized sentences.

  Args:
    train_data: Iterable of token lists.
    min_freq: Minimum number of occurrences a token needs to be kept.

  Returns:
    ``(vocablist, vocabidx)`` where ``vocablist`` is a list of
    ``(token, frequency)`` tuples starting with the four special tokens
    (frequency 0), and ``vocabidx`` maps each token to its integer id.
    The special tokens always map to ids 0-3.
  """
  specials = ['<unk>', '<pad>', '<cls>', '<eos>']

  # Count every token occurrence in the corpus.
  counts = {}
  for sentence in train_data:
    for word in sentence:
      counts[word] = counts.get(word, 0) + 1

  vocablist = [(special, 0) for special in specials]
  vocabidx = {}
  for word, count in counts.items():
    if count >= min_freq:
      # The id of a token is its position in vocablist.
      vocabidx[word] = len(vocablist)
      vocablist.append((word, count))

  # Assigned last so the reserved ids win even if a special token string
  # also occurs literally in the data.
  for position, special in enumerate(specials):
    vocabidx[special] = position
  return vocablist, vocabidx

# Build the source (English) and target (Vietnamese) vocabularies from the
# training split only; tokens seen fewer than 10 times are left out.
vocablist_en, vocabidx_en = make_vocab(train_en, 10)
vocablist_vi, vocabidx_vi = make_vocab(train_vi, 10)

#Preprocessing Dataset

def preprocess(data, vocabidx):
  """Wrap each sentence in '<cls>' ... '<eos>' and mask unknown tokens.

  Args:
    data: Iterable of token lists.
    vocabidx: Mapping whose keys are the known tokens.

  Returns:
    A list of new token lists; every token that is not a key of
    ``vocabidx`` is replaced by '<unk>'. The input lists are not modified.
  """
  return [
    ['<cls>'] + [tok if tok in vocabidx else '<unk>' for tok in sentence] + ['<eos>']
    for sentence in data
  ]

#Print example

# Add '<cls>'/'<eos>' markers and replace out-of-vocabulary tokens by '<unk>'.
# Only the English side of the test set is preprocessed; the Vietnamese side
# is kept raw (it is paired into test_data below as-is).
train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep = preprocess(test_en, vocabidx_en)
# Preview the first 10 entries of each list. Note that train and test entries
# with the same index i are unrelated sentences.
for i in range(10):
  print(train_en_prep[i])
  print(train_vi_prep[i])
  print(test_en_prep[i])


# Pair source/target sentences and sort by (source length, target length).
train_data = list(zip(train_en_prep, train_vi_prep))
train_data.sort(key = lambda x:(len(x[0]), len(x[1])))
# Each test item carries the preprocessed source plus the raw source and raw target tokens.
test_data = list(zip(test_en_prep, test_en, test_vi))

#Make batch for train data

def make_batch(data, batchsize):
  """Split (source, target) pairs into consecutive mini-batches.

  Args:
    data: Iterable of ``(en, vi)`` pairs.
    batchsize: Number of pairs after which a batch is closed.

  Returns:
    A list of ``(sources, targets)`` tuples, each holding two parallel lists.
    The final batch may be smaller than ``batchsize``; no empty batch is
    emitted.
  """
  batches = []
  src_chunk, tgt_chunk = [], []
  for src, tgt in data:
    src_chunk.append(src)
    tgt_chunk.append(tgt)
    if len(src_chunk) >= batchsize:
      # Batch is full: store it and start collecting a fresh one.
      batches.append((src_chunk, tgt_chunk))
      src_chunk, tgt_chunk = [], []
  if src_chunk:
    # Leftover pairs form a final, shorter batch.
    batches.append((src_chunk, tgt_chunk))
  return batches
# Group the (already length-sorted) sentence pairs into mini-batches of BATCHSIZE pairs.
train_data = make_batch(train_data, BATCHSIZE)

#padding for train data

def padding_batch(b):
  """Right-pad every token list in ``b`` with '<pad>' to a common length.

  The lists are modified in place; nothing is returned.

  Args:
    b: List of token lists. An empty list is accepted and left untouched
      (previously ``max`` raised ``ValueError`` on it).
  """
  if not b:
    return
  maxlen = max(len(tkl) for tkl in b)
  for tkl in b:
    # A non-positive multiplier yields an empty list, so the longest
    # sequence is left unchanged.
    tkl.extend(['<pad>'] * (maxlen - len(tkl)))

def padding(bb):
  """Pad both sides of every batch in ``bb`` in place.

  Args:
    bb: Iterable of ``(source_batch, target_batch)`` pairs; each side is
      handed to ``padding_batch`` independently, so source and target are
      padded to their own maximum lengths.
  """
  for src_batch, tgt_batch in bb:
    for side in (src_batch, tgt_batch):
      padding_batch(side)

# Pad all training batches in place so that, within one batch, every source
# list has the same length and every target list has the same length.
padding(train_data)


# Replace tokens by their integer vocabulary ids. Training batches become
# (source_ids, target_ids) pairs of nested lists; each test item keeps the raw
# English and Vietnamese token lists next to the id-encoded source.
train_data = [([[vocabidx_en[token] for token in tokenlist] for tokenlist in ben], 
               [[vocabidx_vi[token] for token in tokenlist] for tokenlist in bvi]) for ben, bvi in train_data]
test_data =  [([vocabidx_en[token] for token in enprep], en, vi) for enprep, en, vi in test_data]

['<cls>', 'Rachel', '<unk>', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
['<cls>', 'Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu', '<eos>']
['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>']
['<cls>', 'In', '4', 'minutes', ',', 'atmospheric', 'chemist', 'Rachel', '<unk>', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.', '<eos>']
['<cls>', 'Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', '<unk>', 'giới', 'th

In [2]:
class BiLSTMEncDec(torch.nn.Module):
    """Encoder-decoder translation model built from two stacked bidirectional LSTMs.

    The encoder consumes the source sentence one time step at a time; its
    final hidden and cell states initialise the decoder, which is trained
    with teacher forcing and decoded greedily in ``evaluate``.

    Args:
        vocablist_x: Source vocabulary list (its length sizes the encoder embedding).
        vocabidx_x: Source token -> id mapping; must contain '<pad>'.
        vocablist_y: Target vocabulary list (its length sizes the decoder
            embedding and the output layer).
        vocabidx_y: Target token -> id mapping; must contain '<pad>'.
        emb_dim: Embedding size (default 256, the original hard-coded value).
        hidden_dim: LSTM hidden size per direction (default 516, as before).
        num_layers: Number of stacked LSTM layers (default 2, as before).
        dropout: Dropout probability for the LSTMs and ``self.dropout``.

    Note:
        The decoder embedding used to be sized with the *source* vocabulary.
        It is now sized with the target vocabulary, so checkpoints written by
        the previous definition may no longer match ``decemb.weight``.
    """
    def __init__(self,vocablist_x,vocabidx_x,vocablist_y,vocabidx_y,
                 emb_dim=256,hidden_dim=516,num_layers=2,dropout=0.5):
        super(BiLSTMEncDec,self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        #Bidirection LSTM with Encoder and Decoder
        self.encemb = torch.nn.Embedding(len(vocablist_x),emb_dim, padding_idx=vocabidx_x['<pad>'])

        # Kept as an attribute for compatibility; it is not applied in
        # forward() or evaluate().
        self.dropout = torch.nn.Dropout(dropout)

        self.encrnn = torch.nn.LSTM(emb_dim,hidden_dim,num_layers,dropout = dropout,  bidirectional=True)

        # Decoder inputs are target-language ids, so the table must cover the
        # target vocabulary (it was sized with len(vocablist_x) before).
        self.decemb = torch.nn.Embedding(len(vocablist_y),emb_dim, padding_idx=vocabidx_y['<pad>'])

        self.decrnn = torch.nn.LSTM(emb_dim,hidden_dim,num_layers,dropout = dropout, bidirectional=True)

        # Two directions are concatenated, hence hidden_dim*2 input features.
        # Sized by vocablist_y so every id that can index vocablist_y has a logit.
        self.decout = torch.nn.Linear(hidden_dim*2,len(vocablist_y))

    def _encode(self,x):
        """Run the encoder over ``x`` (seq_len, batch) and return the final ``(h, c)``."""
        e_x = self.encemb(x)
        # One state per layer and direction; allocated on the input's device
        # so the module does not depend on a global device constant.
        state_shape = (self.num_layers*2,x.size(1),self.hidden_dim)
        h = torch.zeros(state_shape, dtype=torch.float32, device=x.device)
        c = torch.zeros(state_shape, dtype=torch.float32, device=x.device)
        # The sequence is fed step by step (as in the original code), carrying
        # the state forward between steps.
        for i in range(e_x.size(0)):
            _,(h,c) = self.encrnn(torch.unsqueeze(e_x[i],0),(h,c))
        return h,c

    def forward(self,x):
        """Compute the teacher-forced training loss for one batch.

        Args:
            x: Pair ``(source, target)`` of int64 tensors shaped
                (seq_len, batch).

        Returns:
            Scalar tensor: the sum over target time steps of the batch-mean
            cross entropy of predicting token ``i+1`` from token ``i``.
            NOTE(review): '<pad>' targets are not excluded from the loss.
        """
        #encoder
        #x: input
        #y:output
        x,y = x[0],x[1]
        h,c = self._encode(x)
        #decoder
        e_y = self.decemb(y)

        n_y = e_y.size()[0]
        loss = torch.zeros((), dtype=torch.float32, device=y.device)
        for i in range(n_y-1):
            out,(h,c) = self.decrnn(torch.unsqueeze(e_y[i],0),(h,c))
            # Drop only the length-1 time axis; a bare squeeze() would also
            # drop the batch axis when the batch holds a single sentence.
            out = torch.squeeze(out,0)
            loss = loss + F.cross_entropy(self.decout(out), y[i+1])
        return loss

    def evaluate(self,x,vocablist_y,vocabidx_y,max_len=30):
        """Greedily translate a single source sentence.

        Args:
            x: int64 tensor shaped (seq_len, 1).
            vocablist_y: Target vocabulary list used to map ids to tokens.
            vocabidx_y: Target token -> id mapping ('<cls>' and '<eos>' are read).
            max_len: Maximum number of tokens to generate (default 30).

        Returns:
            List of predicted target tokens, without '<cls>' and '<eos>'.
            The caller is responsible for putting the module in eval mode.
        """
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            # encoder
            h,c = self._encode(x)
            # decoder
            y = torch.tensor([vocabidx_y['<cls>']], device=x.device)
            pred = []
            for _ in range(max_len):
                e_y = self.decemb(y)
                out,(h,c) = self.decrnn(torch.unsqueeze(e_y,0),(h,c))
                out = out.view(1,2*self.hidden_dim)
                pred_id = int(self.decout(out).squeeze().argmax())
                #pred_id predicts the output word ID, if pred_id is equal to the ID of <eos> inference ends.
                if pred_id == vocabidx_y['<eos>']:
                    break
                pred.append(vocablist_y[pred_id][0])
                #The decoder processes 1 word by word and takes the resulting output as the next input
                y = torch.tensor([pred_id], device=x.device)
        return pred

In [3]:
# Filled by train(): BLEU scores from the periodic evaluations and the summed
# loss of every epoch.
list_train = []
list_loss = []
def train():
    """Train a fresh BiLSTMEncDec on ``train_data`` and checkpoint it to MODELNAME.

    Reads the module-level settings EPOCH, LR, DEVICE and MODELNAME at call
    time. After every epoch the summed batch loss is appended to
    ``list_loss``; on even-numbered epochs the weights are saved and the BLEU
    score returned by ``test_during_train`` is appended to ``list_train``.
    The final weights are saved once more when training ends.
    """
    model = BiLSTMEncDec(vocablist_en,vocabidx_en,vocablist_vi,vocabidx_vi).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr = LR)
    # Multiply the learning rate by 0.8 every 5 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
    for epoch in range(EPOCH):
        epoch_loss = 0
        for step, (ben, bvi) in enumerate(train_data):
            # Batches are stored as (batch, seq) lists; the model expects (seq, batch).
            src = torch.tensor(ben,dtype=torch.int64).transpose(0,1).to(DEVICE)
            tgt = torch.tensor(bvi,dtype=torch.int64).transpose(0,1).to(DEVICE)
            optimizer.zero_grad()
            batchloss = model((src,tgt))
            batchloss.backward()
            optimizer.step()
            batch_value = batchloss.item()
            epoch_loss = epoch_loss + batch_value
            if step % 100 == 0:
                print('step:',step,'batchloss:',batch_value)
        print('epoch',epoch,': loss',epoch_loss)
        list_loss.append(epoch_loss)
        if epoch % 2 == 0:
            # Checkpoint first: test_during_train reloads the weights from disk.
            torch.save(model.state_dict(),MODELNAME)
            list_train.append(test_during_train(epoch))
        scheduler.step()
    torch.save(model.state_dict(),MODELNAME)

def test_during_train(epoch):
    """Score the checkpoint stored at MODELNAME on ``test_data`` with BLEU.

    A new model instance is created and loaded from disk, so the model being
    trained is not touched (it stays in training mode).

    Args:
        epoch: Epoch number, used only in the printed message.

    Returns:
        The corpus BLEU score computed by ``torchtext.data.metrics.bleu_score``.
    """
    model = BiLSTMEncDec(vocablist_en,vocabidx_en,vocablist_vi,vocabidx_vi).to(DEVICE)
    # map_location lets a checkpoint saved on another device load onto DEVICE.
    model.load_state_dict(torch.load(MODELNAME, map_location=DEVICE))
    model.eval()
    ref = []
    pred =[]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for enprep, en, vi in test_data:
            # Shape (seq_len, 1): a single sentence as a batch of one.
            src = torch.tensor([enprep], dtype=torch.int64).transpose(0,1).to(DEVICE)
            p = model.evaluate(src, vocablist_vi,vocabidx_vi)
            # bleu_score expects a list of reference translations per candidate.
            ref.append([vi])
            pred.append(p)
    bleu = torchtext.data.metrics.bleu_score(pred,ref)
    print('bleu of epoch {}: '.format(epoch),bleu)
    return bleu


Training

In [4]:
# Settings for this training run. These assignments replace the values from
# the first cell (LR was 0.0001 there and MODELNAME named a different file).
EPOCH = 10
LR = 0.001
MODELNAME ='iwslt15-en-vi-lstm-lr-0.001.model'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

import time
# Time the whole training run; the printed number is the elapsed wall-clock
# time in seconds.
start = time.time()
train()
print("Done at:", time.time()-start)

step: 0 batchloss: 8.566254615783691
step: 100 batchloss: 25.310598373413086
step: 200 batchloss: 31.356935501098633
step: 300 batchloss: 51.43785095214844
step: 400 batchloss: 45.19828796386719
step: 500 batchloss: 35.16333770751953
step: 600 batchloss: 51.88880157470703
step: 700 batchloss: 43.972068786621094
step: 800 batchloss: 59.01970672607422
step: 900 batchloss: 46.668128967285156
step: 1000 batchloss: 60.2830924987793
step: 1100 batchloss: 45.3815803527832
step: 1200 batchloss: 62.09601974487305
step: 1300 batchloss: 52.25365447998047
step: 1400 batchloss: 63.21088409423828
step: 1500 batchloss: 54.49907684326172
step: 1600 batchloss: 87.90258026123047
step: 1700 batchloss: 66.7521743774414
step: 1800 batchloss: 58.169586181640625
step: 1900 batchloss: 77.09764862060547
step: 2000 batchloss: 74.25008392333984
step: 2100 batchloss: 64.33557891845703
step: 2200 batchloss: 82.91483306884766
step: 2300 batchloss: 84.86637115478516
step: 2400 batchloss: 84.03901672363281
step: 2500

In [5]:
def test():
    """Translate every test sentence with the saved model and report BLEU.

    Loads the checkpoint at MODELNAME into a new model, prints the source
    tokens, the reference and the machine translation for each test item,
    then prints the number of test items and the corpus BLEU score.
    Returns nothing.
    """
    model = BiLSTMEncDec(vocablist_en,vocabidx_en,vocablist_vi,vocabidx_vi).to(DEVICE)
    # map_location lets a checkpoint saved on another device load onto DEVICE.
    model.load_state_dict(torch.load(MODELNAME, map_location=DEVICE))
    model.eval()
    ref = []
    pred =[]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for enprep, en, vi in test_data:
            # Shape (seq_len, 1): a single sentence as a batch of one.
            src = torch.tensor([enprep], dtype=torch.int64).transpose(0,1).to(DEVICE)
            p = model.evaluate(src, vocablist_vi,vocabidx_vi)
            print('INPUT',en)
            print('REF',vi)
            print('MT',p)
            # bleu_score expects a list of reference translations per candidate.
            ref.append([vi])
            pred.append(p)
    bleu = torchtext.data.metrics.bleu_score(pred,ref)
    print('total:',len(test_data))
    print('bleu:',bleu)

Testing

In [6]:
# Evaluate the checkpoint currently named by MODELNAME on the test set.
test()

INPUT ['When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', 'Envy', '.', '&quot;']
REF ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'Tôi', 'nghĩ', 'rằng', 'BắcTriều', 'Tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&quot;', 'Chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&quot;']
MT ['Khi', 'tôi', 'lớn', 'lên', ',', 'tôi', 'là', 'một', 'nhà', 'thiên', 'văn', 'thế', 'giới', 'tên', 'là', '&quot;', 'Tôi', 'là', 'một', 'ngôi', 'sao', 'tuyệt', 'vời', 'và', 'tôi', 'nghĩ', 'rằng', '&quot;', 'Không', 'có']
INPUT ['And', 'I', 'was', 'very', 'proud', '.']
REF ['Tôi', 'đã', 'rất', 'tự', 'hào', 'về', 'đất', 'nước', 'tôi', '.']
MT ['Và', 'tôi', 'rất', 'tự', 'hào', '.']
INPUT ['In', 'school', ',', 'we', 'spent', 'a', 'lot', 'of', 'time', 'studying', 'the', 'history', 'of', 'Kim', 'Il-Sung'