In [28]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import spacy
import os
import codecs
from io import open
import itertools
import math

In [29]:
# Device selection. CUDA is switched off here; the trailing comment keeps the auto-detect expression.
USE_CUDA = False#torch.cuda.is_available()
# NOTE(review): "cuda:2" hard-codes device index 2 — confirm that GPU exists before enabling CUDA.
device = torch.device("cuda:2" if USE_CUDA else "cpu")
SEED = 23

# Seed Python's and PyTorch's random number generators for reproducibility.
# NOTE(review): NumPy's generator (np.random.rand is used further down) is not seeded here.
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1696b9fb630>

In [30]:
# Directory where model checkpoints are written, so training can resume if it is interrupted
save_dir = os.path.join("preprocessed_data", "save")
# Paths to the data files
train_path = "full_dataset_train.csv"
test_path = "full_dataset_test.csv"
val_path = "full_dataset_val.csv"
# Binary word2vec file, only read when pre_trained is True
gensim_path = "../lt2212-v19-a4/GoogleNews-vectors-negative300.bin"
pre_trained = False # using pre-trained embeddings is not implemented

In [31]:
import pandas as pd 

# Load the training split; the cells below read its "story" and "highlights" columns
train_data = pd.read_csv(train_path) 
# Preview the first 5 rows of the loaded data
train_data.head()


Unnamed: 0,story,highlights
0,teenager three elderly people saudi arabia haj...,virus kills teenager three elderly people on h...
1,jury selection begins monday federal hate crim...,two men are accused of beating a mexican immig...
2,kevin mccarthy whose 65 year long acting caree...,mccarthy died of natural causes in massachuset...
3,know saying best youve got learn best might tr...,ski fish play drums you can learn how at the f...
4,quitter proud november 20 37th anniversary gre...,november 20 marks the 37th great american smok...


In [32]:
# Special tokens: reserved vocabulary indices
PAD_token = 0  # Padding
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token
UNK_token = 3  # Unknown word token

# String forms of the special tokens, used when mapping indices back to text
PAD_str = '<pad>' 
SOS_str = '[START]'
EOS_str = '[END]'
UNK_str = '<unk>'

In [33]:
import spacy
from spacy.lang.en.examples import sentences 


In [34]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [35]:
class Vocab:
    """Word <-> index vocabulary with per-word counts and optional embeddings.

    Indices 0-3 are reserved for the special tokens (padding, start, end,
    unknown); real words are numbered from 4 upwards in order of first
    appearance. The special tokens are only present in ``index2word``.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.embeddings = {}
        self.index2word = self._default_index2word()
        self.num_words = 4  # Count SOS, EOS, PAD, UNK

    @staticmethod
    def _default_index2word():
        """Return a fresh index -> string map holding only the special tokens."""
        # Fix: the unknown-token entry used to be stored reversed
        # (string -> index), so index 3 could not be decoded back to text.
        return {PAD_token: PAD_str, SOS_token: SOS_str, EOS_token: EOS_str, UNK_token: UNK_str}

    def addEmbedding(self, model, word):
        """Cache one vector per space-separated word found in ``word``.

        ``word`` may be a single word or a whole sentence; the callers in this
        notebook pass sentences.
        """
        # Fix: the raw string used to be handed to get_w2v_vectors, which then
        # iterated over its characters, and the resulting dict was stored under
        # the sentence itself instead of one vector per word.
        self.embeddings.update(get_w2v_vectors(model, word.split(' ')))

    def addSentence(self, sentence):
        """Register every space-separated word of ``sentence``."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Assign the next free index to a new word, or bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Drop words seen fewer than ``min_count`` times; only the first call has an effect.

        Surviving words are re-indexed from 4 and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total = len(self.word2index)
        # Guard the ratio so trimming an empty vocabulary does not divide by zero.
        kept_ratio = len(keep_words) / total if total else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total, kept_ratio))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = self._default_index2word()
        self.num_words = 4 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [36]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

def load_gensim_model(path_to_model):
    """Load the binary word2vec file at ``path_to_model`` and return the resulting KeyedVectors."""
    return KeyedVectors.load_word2vec_format(path_to_model, binary=True)

def get_w2v_vectors(w2v_model, words):
    """Map every word in ``words`` to an embedding vector.

    Args:
        w2v_model: object exposing ``word_vec(word)`` that raises ``KeyError``
            for words it does not know.
        words: iterable of word strings.

    Returns:
        dict of word -> vector. Words unknown to the model get a random vector
        whose length is the model's ``vector_size`` when it has one, else 300.
    """
    dim = getattr(w2v_model, "vector_size", 300)
    w2v_vectors = {}
    for word in words:
        try:
            vec = w2v_model.word_vec(word)
        # this exception will occur when a word does not exist in the vocabulary of this model
        except KeyError:
            vec = np.random.rand(dim)
        # Fix: the random fallback used to be computed and then thrown away,
        # so unknown words were silently missing from the result.
        w2v_vectors[word] = vec
    return w2v_vectors

In [37]:
# Using the functions defined above, return a populated vocab object and pairs list for the train data set
def loadPrepareVocab(train_data):
    """Build the vocabulary and the (story, highlights) pairs from the training frame.

    Args:
        train_data: table whose "story" and "highlights" columns support ``.tolist()``.

    Returns:
        (vocab, pairs, max_length): the populated ``Vocab``, the list of
        (story, highlights) tuples, and the number of space-separated words in
        the longest story (0 when there are no stories).
    """
    print("Start preparing training data ...")
    vocab = Vocab("CNN")
    t = train_data["story"].tolist()
    r = train_data["highlights"].tolist()
    # Fix: measure stories in words. The old code picked the story with the
    # most characters and counted its words, which can underestimate the
    # true maximum.
    max_length = max((len(story.split(' ')) for story in t), default=0)
    # Split every line into pairs
    pairs = list(zip(t, r))
    if pre_trained:
        print("Loading gensim model...")
        gensim = load_gensim_model(gensim_path)
        print("Finished loading gensim model.")
    print("Counting word pairs and adding embedding vectors.")
    for pair in pairs:
        vocab.addSentence(pair[0])
        vocab.addSentence(pair[1])
        if pre_trained:
            vocab.addEmbedding(gensim, pair[0])
            vocab.addEmbedding(gensim, pair[1])
    print("Counted words:", vocab.num_words)
    return vocab, pairs, max_length

vocab, pairs, max_length = loadPrepareVocab(train_data)

Start preparing training data ...
Counting word pairs and adding embedding vectors.
Counted words: 120792


In [38]:

# Load the validation split
validation_data = pd.read_csv(val_path) 

# Not the last expression of the cell, so this preview is not displayed
validation_data.head()

def addVal(validation_data):
    """Add every word of the validation stories and highlights to the global ``vocab``."""
    t = validation_data["story"].tolist()
    r = validation_data["highlights"].tolist()
    for words in list(zip(t,r)):     
        vocab.addSentence(words[0])
        vocab.addSentence(words[1]) 
# NOTE(review): re-running this cell counts the validation words again in vocab.word2count
addVal(validation_data)

In [39]:
# Preparation for validation
def prepareData(data):
    """Return the rows of ``data`` as a list of (story, highlights) tuples."""
    print("Start preparing data ...") 
    stories = data["story"].tolist()
    summaries = data["highlights"].tolist()
    return [(story, summary) for story, summary in zip(stories, summaries)]

In [40]:

# Sanity check: print the first two (story, highlight) training pairs
for pair in pairs[:2]:
    print("Story")
    print(pair[0])
    print("Highlight")
    print(pair[1])

Story
teenager three elderly people saudi arabia hajj pilgrimage died h1n1 flu virus saudi health ministry said victims virus 17 year old nigerian female sudanese man indian man moroccan woman 75 years old cases discovered late said dr khaled al marghalani ministry spokesman old others pre existing chronic conditions al marghalani said sudanese man initially went doctor treated h1n1 doctor sent hospital treated h1n1 late said annual pilgrimage mecca saudi arabia required muslims least lives millions people around globe make trek attend year hajj begins wednesday many pilgrims arriving days weeks ahead event coincides flu season arab health ministers met cairo egypt months back looked like several groups might even banned years event flu children 12 adults 65 pregnant women people chronic illnesses saudis ban anybody coming left responsibility pilgrims countries origin effect officials issued guidelines people risk two days ago saudi arabias health minister dr abdullah al rabeeah gather

In [41]:
def indexesFromSentence(lang, sentence):
    """Convert ``sentence`` to a list of vocabulary indices terminated by EOS.

    Words missing from ``lang.word2index`` are mapped to the unknown-word index.
    """
    lookup = lang.word2index
    sentence_indices = [lookup.get(word, UNK_token) for word in sentence.split(' ')]
    sentence_indices.append(EOS_token)
    return sentence_indices

def embeddingsFromSentence(lang, sentence):
    """Return one embedding vector per space-separated word of ``sentence``.

    Args:
        lang: object whose ``embeddings`` dict maps word -> vector.
        sentence: space-separated text.

    Returns:
        list with exactly one vector per word; words without a stored
        embedding get a random vector of length 300.
    """
    sentence_embeddings = []
    for word in sentence.split(' '):
        vec = lang.embeddings.get(word)
        if vec is None:
            vec = np.random.rand(300)
        # Fix: the fallback vector used to be created and then dropped, so the
        # result was shorter than the sentence whenever a word was unknown.
        sentence_embeddings.append(vec)
    return sentence_embeddings

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose the sequences in ``l`` into per-position tuples, filling short ones with ``fillvalue``."""
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as ``l``.

    Args:
        l: iterable of token sequences.
        value: token treated as padding (defaults to the padding index).

    Returns:
        list of lists holding 0 where the token equals ``value`` and 1 elsewhere.
    """
    # Fix: the ``value`` argument used to be ignored; the comparison was
    # always made against the global padding index.
    return [[0 if token == value else 1 for token in seq] for seq in l]

def inputVar(l, vocab):
    """Encode the sentences in ``l`` and return (padded index tensor, lengths tensor).

    Each length counts the sentence's indices including the trailing EOS.
    """
    encoded = []
    seq_lengths = []
    for sentence in l:
        ids = indexesFromSentence(vocab, sentence)
        encoded.append(ids)
        seq_lengths.append(len(ids))
    lengths = torch.tensor(seq_lengths)
    padVar = torch.LongTensor(zeroPadding(encoded))
    return padVar, lengths

def outputVar(l, vocab):
    """Encode the target sentences in ``l``.

    Returns the padded index tensor, a boolean mask that is False at padding
    positions, and the length of the longest encoded target.
    """
    encoded = [indexesFromSentence(vocab, sentence) for sentence in l]
    max_target_len = max(map(len, encoded))
    padded = zeroPadding(encoded)
    mask = torch.BoolTensor(binaryMatrix(padded))
    padVar = torch.LongTensor(padded)
    return padVar, mask, max_target_len

def batch2TrainData(vocab, pair_batch):
    """Turn a batch of (input, target) sentence pairs into training tensors.

    Pairs are ordered by input word count, longest first, before encoding.

    Args:
        vocab: vocabulary used to map words to indices.
        pair_batch: sequence of (input_sentence, target_sentence) tuples.

    Returns:
        (inp, lengths, output, mask, max_target_len) as produced by
        ``inputVar`` and ``outputVar``.
    """
    # Fix: sort a copy instead of reordering the caller's list in place.
    # sorted() is stable like list.sort(), so the resulting order is identical.
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, vocab)
    output, mask, max_target_len = outputVar(output_batch, vocab)
    return inp, lengths, output, mask, max_target_len

In [42]:
# Build one tiny random batch and print its tensors to check the batching helpers
small_batch_size = 2
batches = batch2TrainData(vocab, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[  469,  1054],
        [  932,  2312],
        [ 1539,  1216],
        [  260,   785],
        [  217,   945],
        [ 1812,  4358],
        [34389, 12698],
        [11547, 12699],
        [33483,  3827],
        [ 8013,  3113],
        [33475,  2579],
        [10314,   470],
        [ 2234,   651],
        [  391,   652],
        [ 1754,  1174],
        [11507,   651],
        [ 6531,  1965],
        [   18,  9207],
        [  423,  3622],
        [11547, 15495],
        [ 1577,  1174],
        [  409,  2587],
        [ 1889,  4930],
        [ 4661, 12700],
        [ 4714,  2579],
        [14639,  1054],
        [ 8671,  2312],
        [34390,  1216],
        [   99,   705],
        [ 3245,  5898],
        [ 4981,  1699],
        [ 1447,  1971],
        [ 5102,  1207],
        [ 4502,   122],
        [  289,  1502],
        [ 3469,  1498],
        [ 3431, 12700],
        [  806,  1656],
        [  294,     2],
        [ 1990,     0],
        [ 6531,     0],


Encoder

In [43]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, padded input sequences."""

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """
        Args:
            hidden_size: GRU hidden size; also the expected embedding width.
            embedding: module mapping word indices to vectors.
            n_layers: number of stacked GRU layers.
            dropout: dropout between GRU layers (forced to 0 for a single layer).
        """
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because the input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: index tensor passed to the embedding.
            input_lengths: per-sequence lengths for packing (tensor or list).
            hidden: optional initial GRU hidden state.

        Returns:
            (outputs, hidden): the summed forward/backward GRU outputs and the
            final GRU hidden state.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Fix: pack_padded_sequence expects the lengths on the CPU, but callers
        # move them to the compute device, which fails when that device is a GPU.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

Attention

In [44]:
# Luong attention layer
class Attn(nn.Module):
    """Attention scores between one decoder state and all encoder outputs.

    Supported scoring methods: 'dot', 'general' and 'concat'.
    """

    def __init__(self, method, hidden_size):
        """
        Args:
            method: one of 'dot', 'general', 'concat'.
            hidden_size: feature size of the decoder state and encoder outputs.

        Raises:
            ValueError: if ``method`` is not one of the supported names.
        """
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Fix: torch.FloatTensor(n) returns uninitialized memory, which may
            # hold NaN/inf. Draw the vector from a small uniform range instead.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score = elementwise product summed over the feature dimension."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = decoder state times a linear projection of the encoder output."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score = v . tanh(W [hidden ; encoder_output])."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalized attention weights with an extra middle dimension."""
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

Decoder

In [45]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """Store the hyper-parameters and build the decoder layers."""
        super().__init__()

        # Hyper-parameters kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Dropout between GRU layers only applies when the GRU is stacked
        recurrent_dropout = dropout if n_layers != 1 else 0

        # Layers (creation order kept as-is so parameter ordering is unchanged)
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=recurrent_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run one decoding step (a single word per sequence).

        Returns the softmax distribution over the output vocabulary and the
        updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # Advance the unidirectional GRU by one step
        rnn_output, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights computed from the fresh GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of the encoder outputs -> context vector
        context = torch.bmm(attn_weights, encoder_outputs.transpose(0, 1)).squeeze(1)
        # Luong eq. 5: combine GRU output and context
        merged = torch.cat((rnn_output.squeeze(0), context), 1)
        attentional_state = torch.tanh(self.concat(merged))
        # Luong eq. 6: distribution over the next word
        word_probs = F.softmax(self.out(attentional_state), dim=1)
        return word_probs, hidden

Define mask

In [46]:
def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood over the unmasked positions of one step.

    Args:
        inp: probabilities, one row per sequence and one column per word.
        target: index of the correct word for each row.
        mask: boolean tensor; True marks rows that count towards the loss.

    Returns:
        (loss, n_total): the mean loss as a tensor on ``inp``'s device and the
        number of True entries in ``mask``.
    """
    nTotal = mask.sum()
    picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    # Fix: a probability that underflowed to exactly 0 made log() return -inf,
    # which turns gradients (and then the weights) into NaN. Clamp to the
    # smallest positive normal value of the dtype before taking the log.
    crossEntropy = -torch.log(picked.clamp_min(torch.finfo(picked.dtype).tiny))
    loss = crossEntropy.masked_select(mask).mean()
    # The loss already lives where ``inp`` does; using inp.device removes the
    # dependency on a module-level device variable.
    loss = loss.to(inp.device)
    return loss, nTotal.item()

Train

In [47]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=max_length):
    """Run one optimization step on a single batch and return its average loss.

    The decoder is fed either the ground-truth tokens (teacher forcing) or its
    own predictions, chosen once per call from ``teacher_forcing_ratio``.
    ``embedding`` and ``max_length`` are accepted but not used in the body.

    Returns:
        Sum of the per-step losses weighted by their token counts, divided by
        the total token count.
    """
    # Reset gradients from the previous step
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    # Move the batch to the configured device
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Encode the whole input batch
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Every sequence starts decoding from the SOS token
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]]).to(device)

    # The decoder starts from the leading slices of the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    total_loss = 0
    weighted_losses = []
    token_count = 0

    # Decode one time step at a time
    for step in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Next input is the current ground-truth token
            decoder_input = target_variable[step].view(1, -1)
        else:
            # Next input is the decoder's own most likely token
            _, top_indices = decoder_output.topk(1)
            decoder_input = torch.LongTensor(
                [[top_indices[i][0] for i in range(batch_size)]]
            ).to(device)
        # Accumulate the masked loss for this step
        step_loss, step_tokens = maskNLLLoss(decoder_output, target_variable[step], mask[step])
        total_loss += step_loss
        weighted_losses.append(step_loss.item() * step_tokens)
        token_count += step_tokens

    # Backpropagate through the whole sequence
    total_loss.backward()

    # Clip gradients in place
    nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Update the weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(weighted_losses) / token_count

In [48]:
def train_model(model_name, vocab, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Train for ``n_iteration`` randomly sampled batches, logging and checkpointing periodically.

    Reads the module-level ``hidden_size`` (for the checkpoint directory name)
    and, when ``loadFilename`` is truthy, the module-level ``checkpoint`` (to
    pick the iteration to resume from).
    """
    # Sample every batch up front, one per iteration
    training_batches = []
    for _ in range(n_iteration):
        sampled_pairs = [random.choice(pairs) for _ in range(batch_size)]
        training_batches.append(batch2TrainData(vocab, sampled_pairs))

    # Initializations
    print('Initializing ...')
    print_loss = 0
    start_iteration = checkpoint['iteration'] + 1 if loadFilename else 1

    # Training loop
    print("Training model...")
    for iteration in range(start_iteration, n_iteration + 1):
        input_variable, lengths, target_variable, mask, max_target_len = training_batches[iteration - 1]

        # One optimization step on this batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Report the running average every `print_every` iterations
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Write a checkpoint every `save_every` iterations
        if iteration % save_every == 0:
            layer_tag = '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size)
            directory = os.path.join(save_dir, model_name, corpus_name, layer_tag)
            if not os.path.exists(directory):
                os.makedirs(directory)
            training_state = {
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'vocab_dict': vocab.__dict__,
                'embedding': embedding.state_dict()
            }
            torch.save(training_state, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


In [49]:
# Name under which the model checkpoints are saved
model_name = 'CNN_model_2'
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
# Whether to use pre-trained word embeddings
pre_trained = False # this is not implemented
# Configure model parameters
attn_model = 'dot'
hidden_size = 300
encoder_n_layers = 4
decoder_n_layers = 4
dropout = 0.1
batch_size = 32

In [50]:
# Iteration number of the checkpoint to resume from (only used by the commented-out path below)
checkpoint_iter = 4

#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the in-memory vocabulary with the one stored in the checkpoint
    vocab.__dict__ = checkpoint['vocab_dict']


print('Building encoder and decoder ...')
# One embedding table, passed to both the encoder and the decoder below
embedding = nn.Embedding(vocab.num_words, hidden_size)

if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, vocab.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Finished building models.')

Building encoder and decoder ...
Finished building models.


In [62]:
# Configure training/optimization parameters
clip = 10.0
# Probability of using teacher forcing for a batch; must lie in [0, 1].
# It was 3.0, which behaves exactly like 1.0 because it is compared against
# random.random(), a value that is always below 1.
teacher_forcing_ratio = 1.0
# NOTE(review): the recorded run reports a NaN loss from the first iteration —
# consider a smaller learning rate if that persists.
learning_rate = 0.01
decoder_learning_ratio = 1.5
n_iteration = 20
print_every = 1
save_every = 4

In [63]:
# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any restored optimizer state onto the device the models live on.
# Fix: this used to call .cuda() unconditionally, which fails on a machine
# without CUDA and puts the state on a different device than the CPU models.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)
print('Finished building optimizers.')

Building optimizers ...
Finished building optimizers.


In [64]:
# Run training iterations; 'CNN' is the corpus name used in the checkpoint directory path
print("Starting Training...")
train_model(model_name, vocab, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, 'CNN', loadFilename)


Starting Training...
Initializing ...
Training model...
Iteration: 1; Percent complete: 5.0%; Average loss: nan
Iteration: 2; Percent complete: 10.0%; Average loss: nan
Iteration: 3; Percent complete: 15.0%; Average loss: nan
Iteration: 4; Percent complete: 20.0%; Average loss: nan
Iteration: 5; Percent complete: 25.0%; Average loss: nan


KeyboardInterrupt: 

Test

In [54]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step pick the single most probable next token."""

    def __init__(self, encoder, decoder):
        """Wrap the encoder/decoder pair used for decoding."""
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for one input sequence.

        Returns:
            (all_tokens, all_scores): the chosen token index at every step and
            the probability the decoder assigned to it.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Fix: read the layer count from the wrapped decoder; this used to read
        # a module-level `decoder` variable instead of self.decoder.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [55]:
def evaluate(encoder, decoder, searcher, vocab, sentence, max_length=max_length):
    """Decode ``sentence`` with ``searcher`` and return the predicted words.

    Args:
        encoder, decoder: accepted for interface compatibility; not used in the body.
        searcher: callable taking (input_batch, lengths, max_length) and
            returning (tokens, scores).
        vocab: vocabulary providing ``word2index`` and ``index2word``.
        sentence: space-separated input text.
        max_length: number of decoding steps.

    Returns:
        list of decoded word strings, one per decoding step.
    """
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(vocab, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher. Only the token ids are needed, so skip
    # autograd bookkeeping instead of retaining a graph for every step.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words; an index without an entry falls back to the unknown-word
    # string instead of raising KeyError.
    decoded_words = [vocab.index2word.get(token.item(), UNK_str) for token in tokens]
    return decoded_words

In [56]:
def evaluateInput(input_sentence, encoder, decoder, searcher, vocab):
    """Decode ``input_sentence`` and return the prediction as one space-joined string.

    End-of-sentence and padding markers are removed from the output.
    """
    decoded = evaluate(encoder, decoder, searcher, vocab, input_sentence)
    kept_words = []
    for word in decoded:
        if word != EOS_str and word != PAD_str:
            kept_words.append(word)
    return ' '.join(kept_words)

Evaluate on the test set

In [57]:
import pandas as pd 

# Load the test split; the cells below read its "story" and "highlights" columns
test_data = pd.read_csv(test_path) 
# Preview the first 5 rows of the loaded data
test_data.head()

Unnamed: 0,story,highlights
0,us patient infection mysterious mers virus rec...,a patient with mers in florida has been discha...
1,digital era finding one often involves finding...,siren is a new dating app created for women by...
2,dna explore unknown pushing boundaries explori...,the spaceshiptwo catastrophe comes after an or...
3,nun died setting fire southwestern china first...,group tenzin wangmo called for religious freed...
4,point life wizened age 69 managed run 123 mara...,training is key when preparing for your first ...


In [58]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize the greedy search module that wraps the encoder and decoder
searcher = GreedySearchDecoder(encoder, decoder)

In [59]:
# Test stories and their reference summaries as parallel Python lists
story, summary = test_data["story"].tolist(), test_data["highlights"].tolist()

In [61]:
# Generate a prediction for every test story and print it next to the reference summary.
# NOTE(review): this prints every test example — consider slicing the lists to keep the output short.
for st,su in zip(story,summary):
    pr = evaluateInput(st,encoder, decoder, searcher, vocab)
    

    print("Story: ", st)
    print("Summary: ", su)
    print("---------------------------------------------------------------")
    print("Prediction: ", pr)
    print("---------------------------------------------------------------")

Story:  us patient infection mysterious mers virus recovered florida department health announced monday patient health care provider lives works saudi arabia discharged department said admitted dr p phillips hospital orlando may 9 health care workers household contacts contact patient tested mers cov results come back negative health department said statement broad risk mers cov infection general public threat traveling orlando area mers cov stands middle east respiratory syndrome coronavirus florida patient us citizen said dr anne schuchat director cdcs national center immunization respiratory diseases one three people united states confirmed mers fatal first us diagnosis indiana man traveled saudi arabia also health care provider florida indiana cases linked schuchat said indiana patient extended face face contact shook hands man illinois 40 minute business meeting centers disease control prevention said saturday believed represent first transmission mers within united states officia

KeyboardInterrupt: 