<a href="https://colab.research.google.com/github/MohammadrezaPourreza/Seq2Seq/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First, download the data from GitHub and install the required packages (transformers, sent2vec, biobert_embedding, torchtext).

In [None]:
!git clone https://github.com/HamidSajjadi/Pubmed-T5Gan.git
!pip install transformers
!pip install sent2vec
!pip install biobert_embedding
!pip install torchtext

fatal: destination path 'Pubmed-T5Gan' already exists and is not an empty directory.


In this part we import the required packages for our project.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
import spacy
import numpy as np
import random
import string
import math
import time
import pandas as pd
import pickle
import sent2vec
from random import random as rand
from tqdm import tqdm
from torch.utils.data import DataLoader
from biobert_embedding.embedding import BiobertEmbedding
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
from gensim.models import FastText as ft
from gensim.test.utils import datapath

Setting seeds

In [None]:
# Seed every random number generator the notebook uses so that runs are repeatable.
SEED = 1234
# Python's built-in RNG decides the teacher-forcing coin flip (random.random()) in
# Seq2Seq.forward, so it has to be seeded as well.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

Then we should load data and process it to be suitable for Encoder

In [None]:
# Load the dataset, shuffle the rows, drop the columns that are not used and
# strip punctuation from both text columns.
df = pd.read_csv('/content/Pubmed-T5Gan/data/clef.csv')
df = df.sample(frac=1)
df = df.drop(['topic_id', 'related','orig'], axis=1)
# Build the punctuation-deleting table once instead of once per row.
_strip_punctuation = str.maketrans('', '', string.punctuation)
for _column in ('keywords', 'description'):
    df[_column] = df[_column].apply(lambda x: str(x).translate(_strip_punctuation))

# First 10% of the shuffled rows -> test, next 10% -> validation, remainder -> train.
# Positional slicing gives the same partition as np.split(df, [a, b]) without going
# through numpy's array-splitting code path on a DataFrame.
_test_end = int(.1*len(df))
_val_end = int(.2*len(df))
test_df = df.iloc[:_test_end]
val_df = df.iloc[_test_end:_val_end]
train_df = df.iloc[_val_end:]

# Written without header/index; the TabularDataset cell maps the columns by position.
# NOTE(review): assumes the remaining columns are (description, keywords) in that
# order -- confirm against the CSV.
test_df.to_csv('test.tsv', sep='\t', encoding='utf-8',index=False,header=False)
val_df.to_csv('val.tsv', sep='\t', encoding='utf-8',index=False,header=False)
train_df.to_csv('train.tsv', sep='\t', encoding='utf-8',index=False,header=False)


Preprocessing the data for the encoder and tokenizing it


In [None]:
#load the spaCy English pipeline (only its tokenizer is used)
spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """Split ``text`` into a list of token strings with the spaCy tokenizer."""
    return [tok.text for tok in spacy_en.tokenizer(text)]

# A single Field is shared by source (description) and target (keywords), so both
# sides use the same vocabulary and the same <sos>/<eos> markers.
Text = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)
train_data, valid_data, test_data = TabularDataset.splits(path = './',
    train='train.tsv',
    validation='val.tsv', test='test.tsv', format='tsv',
    fields=[('description', Text), ('keywords', Text)])
#attach pretrained GloVe vectors to the vocabulary.
#Every model class in this notebook reads Text.vocab.vectors; building the vocabulary
#without vectors leaves that attribute as None and the model cells fail.
Text.build_vocab(train_data,vectors="glove.6B.50d")
print(f"Unique tokens in target (keywrods) vocabulary: {len(Text.vocab)}")

#TPU
#device = xm.xla_device()

#Run on the GPU when one is visible to PyTorch, otherwise on the CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

BATCH_SIZE = 128

# Batches group examples of similar description length (sort_key) and are sorted
# inside each batch as well.
_dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    _dataset_splits,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda example: len(example.description),
    device=device)

Unique tokens in target (keywrods) vocabulary: 12388


In [None]:
# Shape of the pretrained embedding table attached to the vocabulary:
# (number of vocabulary entries, embedding width).
vocab_len, emb_dim = Text.vocab.vectors.shape
print(vocab_len)

**Building a sequence to sequence model**

**First we should create an embedding matrix**

creating embedding **matrix**

In [None]:
# Build a (vocabulary size x 768) matrix holding one BioBERT vector per vocabulary entry.
biobert = BiobertEmbedding()
matrix_len = len(Text.vocab)
weights_matrix = np.zeros((matrix_len, 768))
words_found = 0
# Walk the index -> token list (itos). Iterating the Vocab object itself does not
# yield the tokens, so the row index and the token would not line up.
for i, word in enumerate(tqdm(Text.vocab.itos, desc='BioBERT vectors')):
    try:
        # word_vector returns one vector per sub-token; keep the first one
        weights_matrix[i] = biobert.word_vector(str(word))[0]
        words_found += 1
    except KeyError:
        # no BioBERT vector for this token: fall back to a random row
        weights_matrix[i] = np.random.normal(scale=0.6, size=(768, ))
print(words_found)
def create_emb_layer(weights_matrix, non_trainable=False):
    """Create an ``nn.Embedding`` initialised from a 2-D weight matrix.

    Args:
        weights_matrix: 2-D NumPy array or torch tensor of shape
            (num_embeddings, embedding_dim).
        non_trainable: when True the embedding weights are frozen
            (``requires_grad = False``).

    Returns:
        Tuple ``(emb_layer, num_embeddings, embedding_dim)``.
    """
    # Accept NumPy input too: ndarray.size is an int (not callable) and
    # load_state_dict needs a tensor, so convert before reading the shape.
    weights = torch.as_tensor(weights_matrix, dtype=torch.float)
    num_embeddings, embedding_dim = weights.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights})
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


KeyboardInterrupt: ignored

**Encoder part without attention**

In [None]:
class Encoder(nn.Module):
    """Bidirectional multi-layer LSTM encoder over frozen pretrained embeddings.

    Args:
        field: object whose ``vocab.vectors`` is the 2-D pretrained embedding
            tensor (one row per vocabulary entry).
        hid_dim: hidden size of each LSTM direction.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self,field , hid_dim, n_layers, dropout):
        super().__init__()
        self.field = field
        pretrained = field.vocab.vectors
        self.input_dim, self.emb_dim = pretrained.size()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        # freeze=True keeps the pretrained vectors fixed during training
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
        self.rnn = nn.LSTM(
            input_size=self.emb_dim,
            hidden_size=self.hid_dim,
            num_layers=self.n_layers,
            dropout=dropout,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return the final LSTM state.

        Args:
            src: token indices, [src len, batch size].

        Returns:
            ``(hidden, cell)``, each [n layers * 2, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))
        # the per-step outputs are not needed; only the final state is handed on
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

**Decoder without attention**

In [None]:
class Decoder(nn.Module):
    """One-step LSTM decoder that continues from the encoder's final state.

    The LSTM is bidirectional so that its state has the same
    [n layers * 2, batch size, hid dim] layout as the state returned by the
    bidirectional encoder; the output projection therefore takes
    ``hid_dim * 2`` features.

    Args:
        field: object whose ``vocab.vectors`` is the 2-D pretrained embedding
            tensor; its row count is also the size of the output vocabulary.
        hid_dim: hidden size of each LSTM direction.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, field, hid_dim, n_layers, dropout):
        super().__init__()
        self.field = field
        pretrained = field.vocab.vectors
        self.output_dim, self.emb_dim = pretrained.size()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
        self.rnn = nn.LSTM(
            input_size=self.emb_dim,
            hidden_size=self.hid_dim,
            num_layers=self.n_layers,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc_out = nn.Linear(self.hid_dim*2, self.output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell = None):
        """Decode a single time step.

        Args:
            input: previous token indices, [batch size].
            hidden: LSTM hidden state, [n layers * 2, batch size, hid dim].
            cell: LSTM cell state with the same shape as ``hidden``.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` is
            [batch size, output dim] and the states keep their input shape.
        """
        # add a sequence axis of length 1: [batch size] -> [1, batch size]
        step_tokens = input.unsqueeze(0)
        # [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))
        # output = [1, batch size, hid dim * 2]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # drop the sequence axis before projecting onto the vocabulary
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

**Seq2Seq class without attention**

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper for the LSTM model (no attention).

    Args:
        encoder: module returning ``(hidden, cell)`` for a source batch.
        decoder: module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``; must expose ``output_dim``.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # the encoder's final state is fed to the decoder unchanged, so the
        # two networks must agree on state size and depth
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Run the encoder once, then decode one target position at a time.

        Args:
            src: source token indices, [src len, batch size].
            trg: target token indices, [trg len, batch size].
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (rather than the model's own prediction) as the next
                decoder input.

        Returns:
            Tensor [trg len, batch size, output dim]; position 0 stays zero
            because decoding starts from the <sos> row of ``trg``.
        """
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # final encoder state initialises the decoder
        hidden, cell = self.encoder(src)

        # first decoder input is the <sos> row
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            # coin flip: ground truth vs. the model's own best guess
            if random.random() < teacher_forcing_ratio:
                input = trg[t]
            else:
                input = output.argmax(1)
        return outputs

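**Creating the Encoder, Decoder, Attention and Seq2Seq classes with attention, all in one cell**

Note: this cell redefines `Encoder`, `Decoder` and `Seq2Seq`, replacing the non-attention versions defined above. Run only the definition cells of the variant you want to train, followed by the matching initialization cell.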

In [None]:
#encoder
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder for the attention model.

    Args:
        field: object whose ``vocab.vectors`` is the 2-D pretrained embedding
            tensor (one row per vocabulary entry).
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: size of the decoder hidden state the final encoder state
            is projected to.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, field, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.field = field
        pretrained = field.vocab.vectors
        self.input_dim, self.emb_dim = pretrained.size()
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
        self.rnn = nn.GRU(self.emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a source batch.

        Args:
            src: token indices, [src len, batch size].

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is
            [src len, batch size, enc hid dim * 2] and ``hidden`` is
            [batch size, dec hid dim].
        """
        # [src len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))
        # hidden = [2, batch size, enc hid dim], ordered (forward, backward)
        outputs, hidden = self.rnn(embedded)
        last_forward = hidden[-2,:,:]
        last_backward = hidden[-1,:,:]
        # join both directions and squash them into the decoder's state size
        joined = torch.cat((last_forward, last_backward), dim=1)
        hidden = torch.tanh(self.fc(joined))
        return outputs, hidden
#attention
class Attention(nn.Module):
    """Scores every source position against the current decoder state.

    Args:
        enc_hid_dim: hidden size of each encoder direction (the encoder
            outputs carry ``enc_hid_dim * 2`` features).
        dec_hid_dim: size of the decoder hidden state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: decoder state, [batch size, dec hid dim].
            encoder_outputs: [src len, batch size, enc hid dim * 2].

        Returns:
            Tensor [batch size, src len]; each row is a softmax distribution.
        """
        src_len = encoder_outputs.shape[0]
        # batch-first view of the encoder outputs: [batch size, src len, enc hid dim * 2]
        keys = encoder_outputs.permute(1, 0, 2)
        # one copy of the decoder state per source position: [batch size, src len, dec hid dim]
        query = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # energy = [batch size, src len, dec hid dim]
        energy = torch.tanh(self.attn(torch.cat((query, keys), dim=2)))
        # collapse the feature axis to one score per position: [batch size, src len]
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)
#decoder
class Decoder(nn.Module):
    """One-step GRU decoder that attends over the encoder outputs.

    Args:
        field: object whose ``vocab.vectors`` is the 2-D pretrained embedding
            tensor; its row count is also the size of the output vocabulary.
        enc_hid_dim: hidden size of each encoder direction.
        dec_hid_dim: hidden size of the decoder GRU.
        dropout: dropout probability applied to the embeddings.
        attention: module mapping ``(hidden, encoder_outputs)`` to attention
            weights of shape [batch size, src len].
    """

    def __init__(self, field, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.attention = attention
        self.field = field
        self.output_dim, self.emb_dim = self.field.vocab.vectors.size()
        self.embedding = nn.Embedding.from_pretrained(self.field.vocab.vectors, freeze=True)
        self.rnn = nn.GRU((enc_hid_dim * 2) + self.emb_dim, dec_hid_dim)
        # Use this instance's embedding width. The bare name `emb_dim` only
        # resolved when a notebook-level variable of that name happened to exist.
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + self.emb_dim, self.output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            input: previous token indices, [batch size].
            hidden: decoder state, [batch size, dec hid dim].
            encoder_outputs: [src len, batch size, enc hid dim * 2].

        Returns:
            ``(prediction, hidden)`` where ``prediction`` is
            [batch size, output dim] and ``hidden`` is [batch size, dec hid dim].
        """
        # [batch size] -> [1, batch size]
        input = input.unsqueeze(0)
        # embedded = [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(input))

        # attention weights, reshaped for bmm: [batch size, 1, src len]
        a = self.attention(hidden, encoder_outputs).unsqueeze(1)
        # [batch size, src len, enc hid dim * 2]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # attention-weighted sum of encoder outputs: [batch size, 1, enc hid dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        # back to sequence-first: [1, batch size, enc hid dim * 2]
        weighted = weighted.permute(1, 0, 2)

        # rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # one step, one layer, one direction: the step output is the new state
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        # predict from the GRU output, the context vector and the input embedding
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        return prediction, hidden.squeeze(0)
#seq2seq
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper for the attention model.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: module mapping ``(input, hidden, encoder_outputs)`` to
            ``(prediction, hidden)``; must expose ``output_dim``.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Encode ``src`` once, then decode one target position at a time.

        Args:
            src: source token indices, [src len, batch size].
            trg: target token indices, [trg len, batch size].
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (rather than the model's own prediction) as the next
                decoder input.

        Returns:
            Tensor [trg len, batch size, output dim]; position 0 stays zero
            because decoding starts from the <sos> row of ``trg``.
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # every encoder state (for attention) plus the projected final state
        encoder_outputs, hidden = self.encoder(src)

        # first decoder input is the <sos> row
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            # coin flip: ground truth vs. the model's own best guess
            if random.random() < teacher_forcing_ratio:
                input = trg[t]
            else:
                input = output.argmax(1)
        return outputs


# **Training the Seq2Seq model**

**initialization without attention**

In [None]:
# Hyperparameters for the LSTM encoder/decoder pair (no attention).
HID_DIM = 64
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# NOTE(review): these calls use the (field, hid_dim, n_layers, dropout) signatures of
# the non-attention classes. The attention cell defines classes with the same names
# (Encoder, Decoder, Seq2Seq), so this cell only works while the non-attention
# definitions are the ones currently bound to those names.
enc = Encoder(Text, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(Text, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Wrap both halves and move all parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

**Initialization with attention**

In [None]:
# Hyperparameters for the GRU + attention model.
ENC_HID_DIM = 32
DEC_HID_DIM = 32
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Build the three parts with explicit keyword arguments, then assemble them.
attn = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM)
enc = Encoder(
    field=Text,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    field=Text,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=DEC_DROPOUT,
    attention=attn,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device)
model = model.to(device)

**Initialize the weights**

In [None]:
def init_weights(m):
    """Re-initialise the trainable parameters of ``m`` uniformly in [-0.08, 0.08].

    Intended for ``model.apply(init_weights)``. Parameters with
    ``requires_grad == False`` are left untouched so that frozen pretrained
    embeddings are not replaced by random noise.

    Args:
        m: an ``nn.Module``; all parameters reachable from it are visited.
    """
    for param in m.parameters():
        if not param.requires_grad:
            # frozen (e.g. pretrained embedding table): keep the loaded values
            continue
        nn.init.uniform_(param.data, -0.08, 0.08)
        
# apply() calls init_weights on the model and each of its submodules; being the last
# expression of the cell, its return value (the model) is displayed as the cell output.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(12633, 50)
    (rnn): GRU(50, 32, bidirectional=True)
    (fc): Linear(in_features=64, out_features=32, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=96, out_features=32, bias=True)
      (v): Linear(in_features=32, out_features=1, bias=False)
    )
    (embedding): Embedding(12633, 50)
    (rnn): GRU(114, 32)
    (fc_out): Linear(in_features=146, out_features=12633, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

**trainable features**

In [None]:
def count_parameters(model):
    """Return how many scalar values ``model`` holds in parameters that require gradients."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,892,603 trainable parameters


**Focal loss for imbalanced dataset**

In [None]:
class FocalLoss(nn.modules.NLLLoss):
    """Focal loss on raw logits: ``(1 - p_t) ** gamma * CE`` per target.

    Args:
        weight: optional per-class weight tensor; acts as the alpha term that
            balances classes.
        gamma: focusing exponent; ``gamma = 0`` reduces to cross-entropy.
        reduction: ``'mean'`` (average over the targets that are not ignored),
            ``'sum'`` or ``'none'`` (one loss value per target).
        ignore_index: target value that contributes no loss (e.g. padding).
    """

    def __init__(self, weight=None, gamma=2,reduction='mean',ignore_index = -100):
        # the parent stores weight (as a buffer), reduction and ignore_index
        super(FocalLoss, self).__init__(weight,reduction=reduction,ignore_index= ignore_index)
        self.gamma = gamma

    def forward(self, input, target):
        """Compute the focal loss.

        Args:
            input: unnormalised logits, [N, C].
            target: class indices, [N].

        Returns:
            A scalar tensor for ``'mean'``/``'sum'``, or a tensor shaped like
            ``target`` for ``'none'``.
        """
        # Per-target negative log-likelihood; ignored targets come back as 0.
        nll = F.cross_entropy(input, target, reduction='none',
                              ignore_index=self.ignore_index)
        # Probability the model assigns to the true class (1 for ignored targets,
        # which makes their focal term vanish).
        pt = torch.exp(-nll)
        if self.weight is None:
            weighted_nll = nll
        else:
            # class weights scale the loss term but not the probability p_t
            weighted_nll = F.cross_entropy(input, target, weight=self.weight,
                                           reduction='none',
                                           ignore_index=self.ignore_index)
        # The modulating factor must be applied per target, before any reduction.
        focal_loss = (1 - pt) ** self.gamma * weighted_nll

        if self.reduction == 'none':
            return focal_loss
        if self.reduction == 'sum':
            return focal_loss.sum()
        # 'mean': average over the targets that actually count; clamp avoids 0/0
        # when every target in the batch is ignored.
        n_valid = (target != self.ignore_index).sum().clamp(min=1)
        return focal_loss.sum() / n_valid

**Optimizer and Loss function**

In [None]:
#Adam with L2 regularization (weight_decay) and a small learning rate
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
#alternative: Adam with its default settings, no regularization
#optimizer = optim.Adam(model.parameters())
#index of the padding token; targets equal to it are excluded from the loss
Text_PAD_IDX = Text.vocab.stoi[Text.pad_token]
#alternative: focal loss
#criterion = FocalLoss(ignore_index = Text_PAD_IDX)
#cross-entropy over the vocabulary, skipping padded target positions
criterion = nn.CrossEntropyLoss(ignore_index = Text_PAD_IDX)

**Training method**

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: callable as ``model(src, trg)`` returning
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``description`` (source)
            and ``keywords`` (target).
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking ``(logits [N, C], targets [N])``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        src, trg = batch.description, batch.keywords
        optimizer.zero_grad()
        logits = model(src, trg)
        vocab_size = logits.shape[-1]
        # position 0 is the <sos> slot and is never predicted, so drop it, then
        # flatten (time, batch) into one axis for the loss
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        # clip the global gradient norm before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(iterator)

**Evaluation method**

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss without updating the model.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)`` returning
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``description`` (source)
            and ``keywords`` (target).
        criterion: loss taking ``(logits [N, C], targets [N])``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.description, batch.keywords
            # ratio 0: the decoder always consumes its own predictions
            logits = model(src, trg, 0)
            vocab_size = logits.shape[-1]
            # drop the <sos> slot and flatten (time, batch) for the loss
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)
            running_loss += criterion(flat_logits, flat_targets).item()
    return running_loss / len(iterator)

**Epoch time**

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    # whatever is left after removing the whole minutes, fraction dropped
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

**Training the model**

In [None]:
# Number of passes over the training set and the gradient-norm clipping threshold.
N_EPOCHS = 40
CLIP = 1
# Uncomment to resume from a previously saved checkpoint.
#model.load_state_dict(torch.load('tut1-model.pt'))
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep only the weights of the epoch with the lowest validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    # Perplexity (PPL) is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


**Test the model**

In [None]:
# Restore the checkpoint written by the training loop (the epoch with the best
# validation loss); map_location='cpu' lets the file load without a GPU.
model.load_state_dict(torch.load('tut1-model.pt', map_location='cpu'))
# Score the restored model on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 5.878 | Test PPL: 357.247 |


**Create output method**

In [None]:
def create_query(sentence, src_field, trg_field, model, device, max_len = 150):
    """Greedily decode a keyword sequence for one source sentence.

    Works with both model variants defined in this notebook: the attention
    model (its decoder has an ``attention`` attribute; the encoder returns
    ``(encoder_outputs, hidden)``) and the LSTM model (the encoder returns
    ``(hidden, cell)``).

    Args:
        sentence: a raw string, or an already tokenized list of strings.
        src_field: source field providing ``tokenize``, ``init_token``,
            ``eos_token`` and ``vocab.stoi``.
        trg_field: target field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        model: trained model exposing ``encoder`` and ``decoder``.
        device: device the input tensors are moved to.
        max_len: maximum number of tokens to generate.

    Returns:
        List of predicted target tokens without the leading <sos>; the list
        ends with <eos> when the model produced it within ``max_len`` steps.
    """
    model.eval()

    if isinstance(sentence, str):
        # Tokenize with the field's own tokenizer so inference matches the
        # preprocessing used for training (and no spaCy model is reloaded per call).
        tokens = [token.lower() for token in src_field.tokenize(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    # [src len] -> [src len, 1]: a batch containing a single sentence
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    # The two model variants differ in what the encoder returns and in what the
    # decoder expects as its third argument.
    uses_attention = hasattr(model.decoder, 'attention')
    eos_index = trg_field.vocab.stoi[trg_field.eos_token]
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    with torch.no_grad():
        if uses_attention:
            encoder_outputs, hidden = model.encoder(src_tensor)
        else:
            hidden, cell = model.encoder(src_tensor)

        for _ in range(max_len):
            # feed back the most recent token
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
            if uses_attention:
                output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
            else:
                output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)
            if pred_token == eos_index:
                break

    # skip the leading <sos>
    return [trg_field.vocab.itos[i] for i in trg_indexes[1:]]

**test the code**

In [None]:
# Show source, reference keywords and model prediction for test examples 10..19.
for example_index in range(10, 20):
  example = test_data.examples[example_index]
  src = example.description
  trg = example.keywords
  print(f'src = {src}')
  print(f'trg = {trg}')
  # the tokens are already split, so create_query skips its own tokenization
  query = create_query(src, Text, Text, model, device)
  print(f'predicted trg = {query}')

src = ['controlled', 'field', 'trials', 'and', 'laboratory', 'studies', 'on', 'the', 'effectiveness', 'of', 'typhoid', 'vaccines', 'in', 'poland', '196164']
trg = ['adolescent', 'adult', 'animals', 'child', 'child', 'preschool', 'humans', 'in', 'vitro', 'techniques', 'middle', 'aged', 'poland', 'rabbits', 'typhoid', 'fever', 'typhoidparatyphoid', 'vaccines']


RuntimeError: ignored

References:
https://github.com/bentrevett/pytorch-seq2seq