In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import codecs
import csv
import itertools
import logging
import math
import os
import pickle
import random
import re
import statistics
import sys
import unicodedata
from functools import partial
from io import open

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
# `import tqdm` alone does not load the notebook submodule used by train().
import tqdm.notebook
from torch import optim
from torch.jit import script, trace
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
#from google.colab import files

In [2]:
# General util functions
def make_dir_if_not_exists(directory):
	"""Create ``directory`` (and any missing parents) unless it already exists.

	Args:
		directory: Path of the directory to create.

	Raises:
		OSError: If the path cannot be created, e.g. it exists as a regular file
			by the time ``os.makedirs`` runs.
	"""
	if not os.path.exists(directory):
		logging.info("Creating new directory: {}".format(directory))
		# exist_ok=True tolerates the directory being created by someone else
		# between the existence check above and this call.
		os.makedirs(directory, exist_ok=True)

def print_list(l, K=None):
	"""Print the items of ``l`` one per line, then a blank line.

	Args:
		l: Iterable of items to print.
		K: Optional cut-off. Printing stops as soon as the running index
			equals ``K``; with the default ``None`` every item is printed.
	"""
	# Keep items while the running index differs from K.
	leading = itertools.takewhile(lambda pair: pair[0] != K, enumerate(l))
	for _, item in leading:
		print(item)
	print()

def remove_multiple_spaces(string):
	"""Collapse each whitespace run in ``string`` to one space and trim both ends."""
	collapsed = re.sub(r'\s+', ' ', string)
	return collapsed.strip()

def save_in_pickle(save_object, save_file):
	"""Serialize ``save_object`` with pickle into the file at ``save_file``.

	The file is opened in binary write mode, so existing content is replaced.
	"""
	with open(save_file, "wb") as sink:
		pickle.Pickler(sink).dump(save_object)

def load_from_pickle(pickle_file):
	"""Read and return the object pickled in the file at ``pickle_file``.

	NOTE: unpickling can execute arbitrary code; only load files you trust.
	"""
	with open(pickle_file, "rb") as source:
		return pickle.Unpickler(source).load()

def save_in_txt(list_of_strings, save_file):
	"""Write each string to ``save_file`` on its own line.

	Every string is stripped of surrounding whitespace before a single
	newline is appended.
	"""
	with open(save_file, "w") as writer:
		writer.writelines(f"{entry.strip()}\n" for entry in list_of_strings)

def load_from_txt(txt_file):
	"""Return the lines of ``txt_file`` as a list, each stripped of surrounding whitespace."""
	with open(txt_file, "r") as reader:
		return [raw_line.strip() for raw_line in reader]

In [3]:
import pandas as pd

# Report CUDA availability, then pick the GPU when present and the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


In [4]:
# Load the poem table from CSV. The preview below shows the columns
# author, content, poem name, age and type.
data_file = 'with_epoque.csv'
data = pd.read_csv(data_file)
# Sanity check: number of rows read, followed by the first rows.
print(len(data))
print(data.head())

573
                                    author  \
0                      WILLIAM SHAKESPEARE   
1  DUCHESS OF NEWCASTLE MARGARET CAVENDISH   
2                           THOMAS BASTARD   
3                           EDMUND SPENSER   
4                        RICHARD BARNFIELD   

                                             content  \
0  Let the bird of loudest lay\r\nOn the sole Ara...   
1  Sir Charles into my chamber coming in,\r\nWhen...   
2  Our vice runs beyond all that old men saw,\r\n...   
3  Lo I the man, whose Muse whilome did maske,\r\...   
4  Long have I longd to see my love againe,\r\nSt...   

                                 poem name          age                  type  
0               The Phoenix and the Turtle  Renaissance  Mythology & Folklore  
1                 An Epilogue to the Above  Renaissance  Mythology & Folklore  
2                       Book 7, Epigram 42  Renaissance  Mythology & Folklore  
3  from The Faerie Queene: Book I, Canto I  Renaissance  Mytho

In [5]:
def make_data_training(df, char_max_line = 20):
    """Flatten a table of poems into one training row per verse line.

    Each poem's ``content`` is split on ``'\\r\\n'``. A line is kept when it is
    not blank and contains at most ``char_max_line`` space-separated tokens.

    Args:
        df: DataFrame with a ``content`` column (poem text) and a
            ``poem name`` column (poem title).
        char_max_line: Maximum number of space-separated tokens a kept line
            may have. Despite its name this is a word count, not a character
            count; the name is kept for backward compatibility.

    Returns:
        DataFrame with columns ``text``, ``context`` and ``target``. ``text``
        and ``target`` both hold the verse line; ``context`` holds the poem
        title converted to ``str``.
    """
    records = []
    for content, title in zip(df['content'], df['poem name']):
        # Empty CSV cells are read by pandas as float NaN; such rows have no
        # text to split, so skip them instead of failing on ``.split``.
        if not isinstance(content, str):
            continue
        title_text = str(title)
        for line in content.split('\r\n'):
            if line.strip() and len(line.split(' ')) <= char_max_line:
                records.append((line, title_text, line))

    return pd.DataFrame(records, columns=['text', 'context', 'target'])


#Defining torch dataset class for poems
class PoemDataset(Dataset):
    """Thin ``Dataset`` wrapper around a DataFrame: item ``idx`` is the row at position ``idx``."""

    def __init__(self, df):
        # Keep a reference to the frame; no copy is made.
        self.df = df

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return row

    def __len__(self):
        return len(self.df)

In [6]:
# Build the per-line training table; lines with more than 30 space-separated tokens are dropped.
df = make_data_training(data, char_max_line = 30)

In [7]:
# Reserved vocabulary entries. Their ids (below) occupy the first five slots.
pad_word = "<pad>"
bos_word = "<bos>"
eos_word = "<eos>"
unk_word = "<unk>"
# The separator is a plain word: normalize_sentence replaces every character
# outside a-zA-Z.!? with a space, so a bracketed token placed in raw text
# would not survive tokenization.
sep_word = "sep"

# Integer ids of the reserved entries, in the same order as the words above.
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3
sep_id = 4
    
def normalize_sentence(s):
    """Normalize raw text for whitespace tokenization.

    Applied in order: put a space before each of ``.``, ``!`` and ``?``;
    replace every run of characters other than ASCII letters and those three
    marks with one space; collapse whitespace runs and trim the ends.
    """
    rewrites = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in rewrites:
        s = re.sub(pattern, replacement, s)
    return s.strip()

class Vocabulary:
    """Two-way mapping between lower-cased words and integer ids, with word counts.

    The mapping starts with the five reserved entries (pad, bos, eos, unk,
    sep); further words are appended by ``add_words_from_sentence``.
    """

    def __init__(self):
        self.word_to_id = {pad_word: pad_id, bos_word: bos_id, eos_word:eos_id, unk_word: unk_id, sep_word: sep_id}
        # Occurrence counts of words seen by add_words_from_sentence.
        self.word_count = {}
        self.id_to_word = {pad_id: pad_word, bos_id: bos_word, eos_id: eos_word, unk_id: unk_word, sep_id: sep_word}
        # Next free id; the first five ids belong to the reserved entries.
        self.num_words = 5

    def get_ids_from_sentence(self, sentence):
        """Return ``[bos_id, <word ids...>, eos_id]`` for ``sentence``.

        The sentence is normalized and split on whitespace; each word is
        looked up in lower case and unknown words map to ``unk_id``.
        """
        sentence = normalize_sentence(sentence)
        word_ids = [self.word_to_id.get(word.lower(), unk_id) for word in sentence.split()]
        return [bos_id] + word_ids + [eos_id]

    def tokenized_sentence(self, sentence):
        """Return the token strings corresponding to ``get_ids_from_sentence(sentence)``."""
        sent_ids = self.get_ids_from_sentence(sentence)
        return [self.id_to_word[word_id] for word_id in sent_ids]

    def decode_sentence_from_ids(self, sent_ids):
        """Join the words for ``sent_ids`` with spaces, skipping bos, eos and pad ids.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        skipped = [bos_id, eos_id, pad_id]
        return ' '.join(self.id_to_word[word_id] for word_id in sent_ids if word_id not in skipped)

    def add_words_from_sentence(self, sentence):
        """Add every word of ``sentence`` to the vocabulary and update its count.

        Words are lower-cased after normalization so stored entries always
        match the lower-cased lookup done in ``get_ids_from_sentence``.
        """
        sentence = normalize_sentence(sentence)
        for word in sentence.lower().split():
            if word not in self.word_to_id:
                # New word: give it the next free id.
                self.word_to_id[word] = self.num_words
                self.id_to_word[self.num_words] = word
                self.num_words += 1
            # Reserved words (e.g. "sep") are in word_to_id without a count
            # entry, so start from 0 instead of assuming the key exists.
            self.word_count[word] = self.word_count.get(word, 0) + 1

# Build the vocabulary from the poem lines only (the 'text' column); words
# that occur solely in the 'context' column are not added here.
vocab = Vocabulary()
for src in df['text']:
    vocab.add_words_from_sentence(src.lower())

# num_words includes the five reserved entries.
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 11340


In [8]:
class Poem_dataset(Dataset):
    """Dataset of (source, target) string pairs built from poem lines.

    For every line the source is ``"<context> sep <line>"`` and the target is
    the line itself. Both strings are converted to id lists once, at
    construction time.
    """

    def __init__(self, poems, context,vocab, device):
        """
        Args:
            poems: Sequence of poem-line strings (the targets).
            context: Sequence of context strings, one per entry of ``poems``
                     (same length, same order).
            vocab: Object providing ``get_ids_from_sentence(str)``, used to
                    map the strings to id lists.
            device: Stored on the instance as ``self.device``; not otherwise
                    used by this class.

        Raises:
            ValueError: If ``poems`` and ``context`` differ in length.
        """
        if len(poems) != len(context):
            raise ValueError(
                "poems and context must have the same length, got {} and {}".format(
                    len(poems), len(context)))

        # The literal word 'sep' separates the context from the line; it is
        # expected to be a reserved word of `vocab`.
        self.conversations = [(title + ' sep ' + line, line)
                              for line, title in zip(poems, context)]
        self.vocab = vocab
        self.device = device

        # Pre-tokenize every pair once so __getitem__ is a plain lookup.
        self.tokenized_conversations = [
            (self.vocab.get_ids_from_sentence(src), self.vocab.get_ids_from_sentence(tgt))
            for src, tgt in self.conversations
        ]

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        """Return ``{"conv_ids": (src_ids, tgt_ids), "conv": (src_str, tgt_str)}`` for ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {"conv_ids":self.tokenized_conversations[idx], "conv":self.conversations[idx]}

def collate_fn(data):
    """Create mini-batch tensors from a list of dataset items.

    A custom collate function is needed because the default one cannot merge
    variable-length sequences. Sequences are padded with ``pad_id`` to the
    longest sequence of the mini-batch (dynamic padding).

    Args:
        data: list of dicts {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, tgt_str)}.
            - src_ids: list of src piece ids; variable length.
            - tgt_ids: list of tgt piece ids; variable length.
            - src_str: String of src
            - tgt_str: String of tgt
    Returns: dict { "conv_ids":     (src_ids, tgt_ids),
                    "conv":         (src_str, tgt_str),
                    "conv_tensors": (src_seqs, tgt_seqs)}
            All entries are ordered by decreasing source length.
            src_ids / tgt_ids: tuples of 1-D LongTensors (unpadded).
            src_seqs: torch tensor of shape (src_padded_length, batch_size).
            tgt_seqs: torch tensor of shape (tgt_padded_length, batch_size).
            src_padded_length = length of the longest src sequence from src_ids
            tgt_padded_length = length of the longest tgt sequence from tgt_ids
            Both padded tensors are moved to the module-level ``device``.
    """
    examples = [
        (torch.LongTensor(e["conv_ids"][0]), torch.LongTensor(e["conv_ids"][1]),
         e["conv"][0], e["conv"][1])
        for e in data
    ]
    # Order the batch by decreasing source length.
    examples.sort(key=lambda x: len(x[0]), reverse=True)
    src_ids, tgt_ids, src_str, tgt_str = zip(*examples)

    # batch_first=False keeps the time dimension first: (padded_length, batch_size).
    src_seqs = nn.utils.rnn.pad_sequence(src_ids, padding_value = pad_id,
                                         batch_first = False)
    tgt_seqs = nn.utils.rnn.pad_sequence(tgt_ids, padding_value = pad_id,
                                         batch_first = False)

    return {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, tgt_str), "conv_tensors":(src_seqs.to(device), tgt_seqs.to(device))}

In [9]:
# Create the Dataset and DataLoader over all poem lines

# Poem lines and their matching contexts, taken from df in the same row order.
all_poems = df['text'].tolist()
context = df['context'].tolist()

dataset = Poem_dataset(all_poems, context, vocab, device)

# Small batch size for the inspection cells below; the training cell
# rebuilds the loader with its own batch size.
batch_size = 5

data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

In [10]:

# Show tokenization, id encoding and decoding for the first three source strings.
for src, tgt in dataset.conversations[:3]:
    sentence = src
    word_tokens = vocab.tokenized_sentence(sentence)
    # Automatically adds bos_id and eos_id before and after sentence ids respectively
    word_ids = vocab.get_ids_from_sentence(sentence)
    print(sentence)
    print(word_tokens)
    print(word_ids)
    print(vocab.decode_sentence_from_ids(word_ids))
    print()

# Round-trip one word through the vocabulary. The direct dict lookup raises
# KeyError if the word is not in the vocabulary.
word = "world"
word_id = vocab.word_to_id[word.lower()]
print(f"Word = {word}")
print(f"Word ID = {word_id}")
print(f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}")

The Phoenix and the Turtle sep Let the bird of loudest lay
['<bos>', 'the', 'phoenix', 'and', 'the', 'turtle', 'sep', 'let', 'the', 'bird', 'of', 'loudest', 'lay', '<eos>']
[1, 6, 100, 17, 6, 101, 4, 5, 6, 7, 8, 9, 10, 2]
the phoenix and the turtle sep let the bird of loudest lay

The Phoenix and the Turtle sep On the sole Arabian tree
['<bos>', 'the', 'phoenix', 'and', 'the', 'turtle', 'sep', 'on', 'the', 'sole', 'arabian', 'tree', '<eos>']
[1, 6, 100, 17, 6, 101, 4, 11, 6, 12, 13, 14, 2]
the phoenix and the turtle sep on the sole arabian tree

The Phoenix and the Turtle sep Herald sad and trumpet be,
['<bos>', 'the', 'phoenix', 'and', 'the', 'turtle', 'sep', 'herald', 'sad', 'and', 'trumpet', 'be', '<eos>']
[1, 6, 100, 17, 6, 101, 4, 15, 16, 17, 18, 19, 2]
the phoenix and the turtle sep herald sad and trumpet be

Word = world
Word ID = 392
Word decoded from ID = world


In [11]:
# Test one batch of training data
# The loader shuffles, so the batch shown differs from run to run.
first_batch = next(iter(data_loader))
print(f"Testing first training batch of size {len(first_batch['conv'][0])}")
print(f"List of source strings:")
print_list(first_batch["conv"][0])
print(f"Tokenized source ids:")
print_list(first_batch["conv_ids"][0])
# Padded tensor as produced by collate_fn: (src_padded_length, batch_size).
print(f"Padded source ids as tensor (shape {first_batch['conv_tensors'][0].size()}):")
print(first_batch["conv_tensors"][0])

Testing first training batch of size 5
List of source strings:
Song: to Celia [Come, my Celia, let us prove] sep Fame and rumor are but toys.
Prosopopoia: or Mother Hubbard's Tale sep Yet many eke of them (God wot) are driven
Song: Sweetest love, I do not go sep Destiny may take thy part,
Epithalamion sep The whyles the boyes run up and downe the street,
Three Cantos sep Let us hear John Heydon!

Tokenized source ids:
tensor([   1,  362,   20, 6795,   40,  231, 6795,    5, 2724,  554,    4,  251,
          17, 6800,  220,   27, 5685,   26,    2])
tensor([   1,    3,  221,  428,    3,   36, 3326,    4,  124,  487,  442,    8,
         132,  872,  852,  220, 3318,    2])
tensor([   1,  362, 3461,   96,  235,  242,   41,   91,    4, 6401,  210,  785,
          78, 3796,    2])
tensor([    1, 10020,     4,     6,  2459,     6,  2414,  2460,  1008,    17,
         1362,     6,  2461,     2])
tensor([   1, 2404, 8447,    4,    5, 2724, 3333, 8189, 8599,  171,    2])

Padded source ids as ten

In [12]:
class Erato(nn.Module):
    """GRU encoder-decoder with attention.

    A bidirectional GRU encodes the source ids. A unidirectional GRU decodes
    one token per step; encoder positions are scored with a learned bilinear
    form (the "general" attention of Luong et al., 2015) and the next-token
    logits are computed from the decoder output concatenated with the
    attention context. Encoder and decoder share one embedding table.
    """

    def __init__(self, vocab, emb_dim = 300, hidden_dim = 300, num_layers = 2, dropout=0.1):
        """
        Args:
            vocab: Object exposing ``num_words`` (the vocabulary size).
            emb_dim: Size of the token embeddings.
            hidden_dim: Hidden size of the encoder and decoder GRUs.
            num_layers: Number of GRU layers in encoder and decoder.
            dropout: Dropout rate used on the embeddings and between GRU layers.
        """
        super().__init__()

        self.num_words = vocab.num_words
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # --- encoder ---
        self.encode_emb = nn.Embedding(self.num_words,self.emb_dim)
        self.encode_gru = nn.GRU(self.emb_dim, self.hidden_dim,
                          num_layers=self.num_layers, dropout=dropout,
                          bidirectional=True,batch_first=False)
        # Maps the 2*num_layers final states of the bi-GRU to num_layers states.
        self.encode_l_hidden = nn.Linear(2*self.num_layers,self.num_layers)
        # Maps the concatenated forward/backward outputs back to hidden_dim.
        self.encode_l_output = nn.Linear(2*self.hidden_dim,self.hidden_dim)
        self.dropout_enc = nn.Dropout(dropout)

        # --- decoder ---
        # Same module object as the encoder embedding (shared weights).
        self.decode_emb = self.encode_emb

        self.decode_gru = nn.GRU(self.emb_dim, self.hidden_dim,
                          num_layers=self.num_layers, dropout=dropout,
                          bidirectional=False,batch_first=False)
        # d_l and logsoftmax are not used by encode/decode/compute_loss; they
        # are kept so the module's attributes and state_dict keys stay unchanged.
        self.d_l = nn.Linear(self.hidden_dim,self.num_words)
        self.logsoftmax = nn.LogSoftmax(dim=2)
        self.loss = nn.CrossEntropyLoss(ignore_index=pad_id)
        self.dropout_dec = nn.Dropout(dropout)

        # --- attention ---
        # Softmax over dim 0, the source-position axis of the score tensor.
        self.softmax_att = nn.Softmax(dim=0)
        self.attention_matrix = nn.Linear(self.hidden_dim,self.hidden_dim)
        self.attention_decode_cat = nn.Linear(2*self.hidden_dim,self.num_words)

    def encode(self, source):
        """Encode the source batch using the bidirectional GRU encoder.

        Args:
            source: An integer tensor with shape (max_src_sequence_length,
                batch_size) containing token indices for the source sentences,
                padded with ``pad_id``.

        Returns:
            A tuple with three elements:
                encoder_output: Tensor with shape (max_src_sequence_length,
                    batch_size, hidden_size), obtained by a linear projection
                    of the concatenated forward/backward GRU outputs.
                encoder_mask: A boolean tensor with shape
                    (max_src_sequence_length, batch_size); True at positions
                    holding padding tokens and False elsewhere.
                encoder_hidden: Tensor with shape (num_layers, batch_size,
                    hidden_size), a linear projection of the bi-GRU's final
                    hidden states, used to initialize the decoder.
        """
        # Length of each source sequence; pack_padded_sequence expects the
        # lengths on the CPU.
        source_lengths = torch.sum(source != pad_id, dim=0).cpu()

        emb = self.dropout_enc(self.encode_emb(source))
        emb = nn.utils.rnn.pack_padded_sequence(emb, source_lengths,
                                                enforce_sorted = False)
        encoder_output, encoder_hidden = self.encode_gru(emb)
        encoder_output,_ = nn.utils.rnn.pad_packed_sequence(encoder_output,
                                                   padding_value=pad_id)
        encoder_output = self.encode_l_output(encoder_output)

        # (2*num_layers, batch, hidden) -> (hidden, batch, 2*num_layers) so the
        # linear layer mixes the direction/layer axis, then back to
        # (num_layers, batch, hidden).
        encoder_hidden = self.encode_l_hidden(encoder_hidden.permute(2,1,0))
        encoder_hidden = encoder_hidden.permute(2,1,0).contiguous()

        encoder_mask = (source == pad_id)

        return encoder_output, encoder_mask.type(torch.bool), encoder_hidden

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run the decoder GRU for one decoding step from the last hidden state.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size)
                containing the token indices for the current decoder input.
            last_hidden: Tensor h_{t-1} with shape (num_layers, batch_size,
                hidden_size). For the first decoding step this is the
                encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Positions with a True
                value correspond to padding tokens and are ignored.

        Returns:
            A tuple with three elements:
                logits: A tensor with shape (batch_size, vocab_size)
                    containing unnormalized scores for the next token.
                decoder_hidden: Tensor with the same shape as last_hidden
                    holding the updated decoder state.
                attention_weights: A tensor with shape
                    (max_src_sequence_length, batch_size); the weights sum to
                    1 along the first (source position) dimension.
        """
        emb = self.dropout_dec(self.decode_emb(decoder_input))
        decoder_output, decoder_hidden = self.decode_gru(emb,last_hidden)
        b = decoder_output.squeeze(0)

        # "General" attention (Luong et al., 2015): score = (W h_dec) . h_enc
        encoder_output = encoder_output.masked_fill(encoder_mask.unsqueeze(2),0)
        att = torch.matmul(self.attention_matrix(decoder_output.permute(1,0,2)),
                           encoder_output.permute(1,2,0))
        # (batch, 1, src_len) -> (src_len, batch)
        att = att.squeeze(1).permute(1,0)

        # Padding positions get -inf so they receive zero weight after softmax.
        att = att.masked_fill(encoder_mask, float("-inf"))
        att = self.softmax_att(att)
        # Context vector: attention-weighted sum of the encoder outputs.
        c = att.unsqueeze(2) * encoder_output
        c = torch.sum(c,0)
        logits = self.attention_decode_cat(torch.cat((b,c),1))
        return (logits, decoder_hidden, att)

    def compute_loss(self, source, target):
        """Run the model on the source and compute the loss on the target.

        The decoder is teacher-forced: it reads ``target[:-1]`` one step at a
        time and its logits are compared with ``target[1:]``.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing token indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing token indices for the target sentences.

        Returns:
            A scalar float tensor: the cross-entropy loss averaged over the
            target tokens of the batch, with padding tokens ignored.
        """
        target_length = target.shape[0]
        local_batch_size = target.shape[1]
        encoder_output, encoder_mask, h = self.encode(source)

        # Collect the per-step logits and stack them once at the end.
        step_logits = []
        for t in range(target_length - 1):
            input_decode = target[t,:].unsqueeze(0)
            out,h,_ = self.decode(input_decode, h, encoder_output, encoder_mask)
            step_logits.append(out)

        if step_logits:
            outputs = torch.stack(step_logits, dim=0)
        else:
            # No decoding step at all (target of length 1): keep an empty
            # tensor on the same device/dtype as the model activations.
            outputs = encoder_output.new_zeros((0, local_batch_size, self.num_words))
        return self.loss(outputs.reshape(-1, self.num_words), target[1:].flatten())

In [13]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for the given number of epochs and save the trained
    weights to ``model_file``.

    Args:
        model: Module providing ``compute_loss(source, target)``.
        data_loader: Iterable of batches; each batch is a dict whose
            ``"conv_tensors"`` entry is the ``(source, target)`` tensor pair.
        num_epochs: Number of passes over ``data_loader``.
        model_file: Path the final ``state_dict`` is written to.
        learning_rate: Learning rate of the encoder parameters; all other
            parameters use ``learning_rate * 5.0``.
    """

    decoder_learning_ratio = 5.0
    # Encoder-side modules, matched by name PREFIX. Substring matching with
    # short keys is unsafe here: a key such as 'l1' also matches GRU layer-1
    # weights like 'decode_gru.weight_ih_l1'.
    encoder_parameter_prefixes = ('encode_emb', 'encode_gru', 'encode_l_hidden', 'encode_l_output')

    encoder_params = []
    decoder_params = []
    for name, param in model.named_parameters():
        if name.startswith(encoder_parameter_prefixes):
            encoder_params.append(param)
        else:
            decoder_params.append(param)
    optimizer = torch.optim.AdamW([{'params': encoder_params},
                {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)

    # Maximum gradient norm used for clipping.
    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, target = batch_data["conv_tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(source, target)
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
    # Save the model after training
    torch.save(model.state_dict(), model_file)

In [None]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 10
batch_size = 32
learning_rate = 0.001
# Reloading the data_loader to increase batch_size
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

# Build the model with its default sizes and train it; train() writes the
# final weights to "baseline_model.pt".
baseline_model = Erato(vocab).to(device)
train(baseline_model, data_loader, num_epochs, "baseline_model.pt",learning_rate=learning_rate)

training:   0%|          | 0/10 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/425 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/425 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/425 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/425 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/425 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/425 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/425 [00:00<?, ?batch/s]

In [None]:
def predict_beam(model, sentence, k=5, max_length=100):
    """Make predictions for the given input using beam search.

    Hypotheses are scored by the sum of their token log-probabilities,
    divided by ``len(tokens) ** alpha`` once they are complete. Every finished
    hypothesis permanently takes one of the ``k`` slots, so the live beam
    shrinks as hypotheses end.

    Args:
        model: A sequence-to-sequence model providing ``encode`` and ``decode``.
        sentence: An input sentence, represented as string.
        k: The size of the beam.
        max_length: The maximum number of decoding steps; hypotheses still
            alive after that many steps are closed with ``eos_id``.

    Returns:
        A list of at most k predictions (strings), sorted in descending order
        by final score. Empty when ``k < 1``.
    """
    # You are welcome to tweak alpha
    alpha = 0.
    model.eval()
    if k < 1:
        return []

    # Run on whatever device the model lives on instead of assuming CUDA.
    model_device = next(model.parameters()).device
    sentence_ids = torch.tensor(vocab.get_ids_from_sentence(sentence),
                                dtype=torch.long, device=model_device)
    sentence_ids = sentence_ids.unsqueeze(1)

    finished = []  # (length-normalized score, token id list)
    with torch.no_grad():
        encoder_output, encoder_mask, h = model.encode(sentence_ids)

        # Decoding starts from the first source token (the begin-of-sentence
        # id that get_ids_from_sentence prepends). A single start hypothesis
        # avoids filling the beam with k identical copies.
        start_token = int(sentence_ids[0, 0])
        beam = [(0.0, [start_token], h)]  # (cumulative log-prob, tokens, hidden)

        for _ in range(max_length):
            if not beam:
                break
            # Slots still available for live or newly finished hypotheses.
            width = k - len(finished)
            candidates = []
            for score, tokens, state in beam:
                step_input = torch.tensor([[tokens[-1]]], dtype=torch.long,
                                          device=model_device)
                logits, next_state, _ = model.decode(step_input, state,
                                                     encoder_output, encoder_mask)
                # Normalize over the whole vocabulary before taking the top entries.
                log_probs = nn.functional.log_softmax(logits.squeeze(0), dim=-1)
                top_log_probs, top_ids = torch.topk(log_probs,
                                                    min(width, log_probs.shape[-1]))
                for log_prob, token_id in zip(top_log_probs.tolist(), top_ids.tolist()):
                    candidates.append((score + log_prob, tokens + [token_id], next_state))

            # Sort on the score only, so ties never compare token lists or tensors.
            candidates.sort(key=lambda cand: cand[0], reverse=True)
            beam = []
            for score, tokens, state in candidates[:width]:
                if tokens[-1] == eos_id:
                    finished.append((score / (len(tokens) ** alpha), tokens))
                else:
                    beam.append((score, tokens, state))

        # Hypotheses cut off by max_length are closed explicitly.
        for score, tokens, _ in beam:
            tokens = tokens + [eos_id]
            finished.append((score / (len(tokens) ** alpha), tokens))

    finished.sort(key=lambda item: item[0], reverse=True)
    return [vocab.decode_sentence_from_ids(tokens) for _, tokens in finished]

In [None]:
# Free memory before inference: run the Python garbage collector, then ask
# PyTorch to release its cached CUDA memory.
import gc
import torch 
gc.collect()

torch.cuda.empty_cache()

In [None]:
# Generate candidate lines for a prompt with the model trained above.
# The model was trained on sources of the form "<title> sep <line>".
sentence = "under the sea"
model = baseline_model
predict_beam(model, sentence, k=5, max_length=100)