## Preparing the Data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split

import os
from typing import List, Set, Dict, Tuple

SOS_token = 0
EOS_token = 1
PAD_token = 2


def open_file(file_loc: str) -> List:
    file = open(file_loc, "r")
    file_lines = file.readlines()
    file_lines = [list(map(int, line.split())) for line in file_lines]
    return file_lines


class SeqDataset(Dataset):
    def __init__(self, src_seq: List, tgt_seq: List):
        self.src_seq = src_seq
        self.tgt_seq = tgt_seq

        self.src_maxlen = len(max(self.src_seq, key=len))
        self.tgt_maxlen = len(max(self.tgt_seq, key=len))

    def __len__(self) -> int:
        return len(self.src_seq)
    
    def max_len_return(self):
        return self.src_maxlen, self.tgt_maxlen

    def __getitem__(self, idx: int) -> List:
        src = torch.tensor(self.src_seq[idx])
        tgt = torch.tensor(self.tgt_seq[idx])

        return src, tgt


def pad_collate(batch) -> (List, List):
    (xs, ys) = zip(*batch)

    x_lens = [len(x) for x in xs]
    y_lens = [len(y) for y in ys]

    x_pad = pad_sequence(xs, batch_first=True, padding_value=2)
    y_pad = pad_sequence(ys, batch_first=True, padding_value=2)

    return x_pad, y_pad


def get_loader(
    root_path: str, batch_size: int = 8, valid_ratio: float = 0.2, seed: int = 1234
):
    train_source_loc, train_target_loc = os.path.join(
        root_path, "train_source.txt"
    ), os.path.join(root_path, "train_target.txt")
    test_source_loc, test_target_loc = os.path.join(
        root_path, "test_source.txt"
    ), os.path.join(root_path, "test_target.txt")

    train_source, train_target = open_file(train_source_loc), open_file(
        train_target_loc
    )
    test_source, test_target = open_file(test_source_loc), open_file(test_target_loc)

    special_vocabs = ["<sos>", "<eos>", "<pad>"]

    # generate vocabulary dictionary
    def get_vocab(train_src, train_tgt, test_src, test_tgt):
        train_src_vocab, train_tgt_vocab = set(
            [i for line in train_src for i in line]
        ), set([i for line in train_tgt for i in line])
        test_src_vocab, test_tgt_vocab = set(
            [i for line in test_src for i in line]
        ), set([i for line in test_tgt for i in line])

        # Combine vocab of (train source - test source) & (train target - test target)
        train_src_vocab |= test_src_vocab
        train_tgt_vocab |= test_tgt_vocab

        # Add '<sos>', '<eos>', '<pad>' to the vocab
        src_vocab_dict, tgt_vocab_dict = {}, {}
        for idx, value in enumerate(special_vocabs):
            src_vocab_dict[idx] = value
            tgt_vocab_dict[idx] = value
        src_vocab_dict = {
            value + len(special_vocabs): idx + len(special_vocabs)
            for idx, value in enumerate(sorted(train_src_vocab))
        }
        tgt_vocab_dict = {
            value + len(special_vocabs): idx + len(special_vocabs)
            for idx, value in enumerate(sorted(train_tgt_vocab))
        }

        return src_vocab_dict, tgt_vocab_dict

    src_vocab, tgt_vocab = get_vocab(
        train_source, train_target, test_source, test_target
    )

    train_source = [
        [0] + [src_vocab[i + len(special_vocabs)] for i in seq] + [1]
        for seq in train_source
    ]
    train_target = [
        [0] + [tgt_vocab[i + len(special_vocabs)] for i in seq] + [1]
        for seq in train_target
    ]

    test_source = [
        [0] + [src_vocab[i + len(special_vocabs)] for i in seq] + [1]
        for seq in test_source
    ]
    test_target = [
        [0] + [tgt_vocab[i + len(special_vocabs)] for i in seq] + [1]
        for seq in test_target
    ]

    train_len = int(len(train_source) * (1.0-valid_ratio))
    val_len = len(train_source) - train_len

    seq_dataset = SeqDataset(train_source, train_target)
    train_dataset, val_dataset = random_split(
        seq_dataset, [train_len, val_len], generator=torch.Generator().manual_seed(seed)
    )
    test_dataset = SeqDataset(test_source, test_target)

    # import IPython; IPython.embed(); exit(1)
    train_iterator = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate
    )
    valid_iterator = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate
    )
    test_iterator = DataLoader(
        test_dataset, batch_size=batch_size, collate_fn=pad_collate
    )

    return train_iterator, valid_iterator, test_iterator, src_vocab, tgt_vocab

In [4]:
train_iterator, valid_iterator, test_iterator, src_vocab, tgt_vocab = get_loader('data', 8, 0.8, 1234)

Finally, we define the device and the data iterator.

In [5]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model Build

In [6]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

### Encoder Layer

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

### Mutli Head Attention Layer

In [8]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

### Position-wise Feedforward Layer

In [9]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

### Decoder

In [10]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

### Decoder Layer

In [11]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

### Seq2Seq

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

## Training the Seq2Seq Model

We can now define our encoder and decoders. This model is significantly smaller than Transformers used in research today, but is able to be run on a single GPU quickly.

In [13]:
INPUT_DIM = len(src_vocab)+3
OUTPUT_DIM = len(tgt_vocab)+3
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

src_maxlen = 86
tgt_maxlen = 56

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT,
              device,
              src_maxlen,)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device,
              tgt_maxlen,)

Then, use them to define our whole sequence-to-sequence encapsulating model.

In [14]:
SRC_PAD_IDX = 2
TRG_PAD_IDX = 2

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
        
        model.apply(initialize_weights);

LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

### Train & Evaluate

In [15]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)
        
        optimizer.zero_grad()
        
        output, _ = model(src, tgt[:,:-1])
                
        #output = [batch size, tgt len - 1, output dim]
        #tgt = [batch size, tgt len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:,1:].contiguous().view(-1)
                
        #output = [batch size * tgt len - 1, output dim]
        #tgt = [batch size * tgt len - 1]
            
        loss = criterion(output, tgt)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [16]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src, tgt = batch
            src, tgt = src.to(device), tgt.to(device)

            output, _ = model(src, tgt[:,:-1])
            
            #output = [batch size, tgt len - 1, output dim]
            #tgt = [batch size, tgt len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            tgt = tgt[:,1:].contiguous().view(-1)
            
            #output = [batch size * tgt len - 1, output dim]
            #tgt = [batch size * tgt len - 1]
            
            loss = criterion(output, tgt)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
N_EPOCHS = 25
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 12s
	Train Loss: 3.470 | Train PPL:  32.142
	 Val. Loss: 3.063 |  Val. PPL:  21.394
Epoch: 02 | Time: 0m 13s
	Train Loss: 2.762 | Train PPL:  15.826
	 Val. Loss: 2.693 |  Val. PPL:  14.772
Epoch: 03 | Time: 0m 13s
	Train Loss: 2.389 | Train PPL:  10.906
	 Val. Loss: 2.444 |  Val. PPL:  11.518
Epoch: 04 | Time: 0m 13s
	Train Loss: 2.111 | Train PPL:   8.260
	 Val. Loss: 2.340 |  Val. PPL:  10.377
Epoch: 05 | Time: 0m 12s
	Train Loss: 1.893 | Train PPL:   6.638
	 Val. Loss: 2.229 |  Val. PPL:   9.292
Epoch: 06 | Time: 0m 12s
	Train Loss: 1.698 | Train PPL:   5.462
	 Val. Loss: 2.174 |  Val. PPL:   8.797
Epoch: 07 | Time: 0m 13s
	Train Loss: 1.547 | Train PPL:   4.696
	 Val. Loss: 2.146 |  Val. PPL:   8.548
Epoch: 08 | Time: 0m 13s
	Train Loss: 1.396 | Train PPL:   4.038
	 Val. Loss: 2.097 |  Val. PPL:   8.144
Epoch: 09 | Time: 0m 13s
	Train Loss: 1.287 | Train PPL:   3.621
	 Val. Loss: 2.132 |  Val. PPL:   8.434
Epoch: 10 | Time: 0m 13s
	Train Loss: 1.177 | Train PPL

In [19]:
model.load_state_dict(torch.load('transformer_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 2.127 | Test PPL:   8.391 |


## ROUGE, BLEU

Metric 계산

In [23]:
from ignite.metrics.nlp import Bleu
from ignite.metrics import Rouge

def translate_seq(sequence, model, device, max_len):
    src_tensor = sequence.unsqueeze(0).to(device)

    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [SOS_token]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

        pred_token = output.argmax(2)[:,-1].item()

        trg_indexes.append(pred_token)
        if pred_token == EOS_token:
            break
    trg_indexes = cleanse_sent(trg_indexes)
    
    return trg_indexes

def cleanse_sent(sent):
    target_idx = -1
    for idx, s in enumerate(sent):
        if s == 1:
            target_idx = idx
            break

    return sent[1:target_idx]  

def calculate_metric(src_seq, tgt_seq, model, device, max_len):
    trgs = []
    pred_trgs = []
            
    pred_trg = translate_seq(src_seq, model, device, max_len)
    
    #cut off <eos> token
    pred_trgs.append(pred_trg)
    trgs.append(tgt_seq.tolist())
    
    pred_trgs = [[str(k) for k in i] for i in pred_trgs]
    trgs = [[str(l) for l in j] for j in trgs]
    
    # Rouge Score
    m = Rouge(variants=["L", 2], multiref="average")
    m.update((pred_trgs, [trgs]))
    rouge_score = m.compute()
    
    # BLEU Score
    m = Bleu(ngram=1)
    m.update((pred_trgs, [trgs]))
    bleu_score = m.compute()
  
    return rouge_score, bleu_score

In [21]:
total_rouge, total_bleu = 0,0

metric = {'total_bleu':0,
         'Rouge-L-P':0,
         'Rouge-L-R':0,
         'Rouge-L-F':0,
         'Rouge-2-P':0,
         'Rouge-2-R':0,
         'Rouge-2-F':0}
cnt = 0
for src, trg in test_iterator:
    
    for s,t in zip(src, trg):
        cnt += 1
        rouge, bleu = calculate_metric(s, cleanse_sent(t), model, device, tgt_maxlen)
        for key in rouge:
            metric[key] += rouge[key]
            
        metric['total_bleu'] += float(bleu)
        
        if cnt % 100 == 0:
            print(f"BLEU: {metric['total_bleu']}")
            print(f"Rouge-L-P: {metric['Rouge-L-P']}")
        
for key in metric:
    metric[key] = metric[key] / cnt

BLEU: 43.0603285159662
Rouge-L-P: 42.14671518980851
BLEU: 81.76030866455287
Rouge-L-P: 82.0476287061168
BLEU: 127.82132187474367
Rouge-L-P: 128.0728006099543
BLEU: 168.7391926340106
Rouge-L-P: 167.28728985639896
BLEU: 208.98806625394184
Rouge-L-P: 206.0593649623456
BLEU: 247.99786185436733
Rouge-L-P: 244.126969935157
BLEU: 283.8610139012826
Rouge-L-P: 282.16216419407306
BLEU: 327.4947838954629
Rouge-L-P: 326.3649982038611
BLEU: 370.86978234347
Rouge-L-P: 370.1740883702306
BLEU: 411.3853050219858
Rouge-L-P: 409.3214679131644
BLEU: 448.2910399737913
Rouge-L-P: 448.1708299477034
BLEU: 491.5397864414375
Rouge-L-P: 493.3557582200479
BLEU: 537.7142683222858
Rouge-L-P: 537.1584869084996
BLEU: 582.0223067640901
Rouge-L-P: 583.6833068040166
BLEU: 625.7822898430817
Rouge-L-P: 628.205297936379
BLEU: 665.5248883639716
Rouge-L-P: 667.5494697960863
BLEU: 706.1127099750936
Rouge-L-P: 709.4707896645708
BLEU: 745.4077314752836
Rouge-L-P: 747.2445211016321
BLEU: 790.8268317710276
Rouge-L-P: 791.78878049

In [24]:
metric

{'total_bleu': 0.4165198872586476,
 'Rouge-L-P': 0.41836511336864624,
 'Rouge-L-R': 0.4469015755764602,
 'Rouge-L-F': 0.4469015755764602,
 'Rouge-2-P': 0.2556817398603032,
 'Rouge-2-R': 0.27596785006571267,
 'Rouge-2-F': 0.27596785006571267}