# Testing my WordVec-RNN text generator

In this notebook I am using the code from NLP Week 8 to test the RNN to model.

In [1]:
import torch
import random

import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchtext.vocab as vocab

from torchtext.data.utils import get_tokenizer
from torch.distributions import Categorical

In [2]:
device = 'cpu'

***Load word vectors***
Loading in the word vectors and using the GloVe package 

In [3]:
word_vectors = vocab.GloVe(name="6B",dim=100) 
tokenizer = get_tokenizer("basic_english")
wordvec_embeddings = nn.Embedding.from_pretrained(word_vectors.vectors)
embedding_dim = wordvec_embeddings.weight.shape[1]

# Get dictionaries from word_vectors class and 
# rename to be consistent with previous notebooks
word_to_ix = word_vectors.stoi
ix_to_word = word_vectors.itos

***Setting the hyperparameters ***

In [26]:
hidden_size = 100   # size of hidden state
num_layers = 3      # number of layers in LSTM layer stack
gen_seq_len = 100   # length of LSTM sequence
temperature = 10     # how random do we want our predictions to be
load_path = "/Users/loiskelly/Documents/GitHub/LoisNLPProject/Data - NEW/Beatles Word Vector RNN model 3.pt"

***Defining the network*** 

Defining the network with the same code as the training notebook

In [27]:
class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super(RNN, self).__init__()
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()
    
    def forward(self, input, hidden_state):
        output, hidden_state = self.rnn(input, hidden_state)
        output = self.decoder(output)
        output = self.tanh(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

***Setting up network and optimiser:***

This is from the note books: 

[PyTorch tensors](https://pytorch.org/docs/stable/tensors.html) have been designed to work in almost exactly the same way as [numpy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html).

In [28]:
checkpoint = torch.load(load_path)

# Calculate vocab size
vocab_size = len(word_to_ix)

# Instantiate RNN
rnn = RNN(embedding_dim, embedding_dim, hidden_size, num_layers).to(device)

# Load model weights from checkpoint file 
rnn.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

***Generating a random sequence***

In [29]:
with torch.no_grad():
    hidden_state = None
    
    random_word = random.choice(list(ix_to_word))
    #Pick a random starting word
    random_start = np.array(word_vectors[random_word])
    
    # Convert to PyTorch Tensor
    input = torch.tensor(random_start)
    
    # Change dimensionality of tensor for PyTorch compatibility
    # For more info on this function see: https://stackoverflow.com/questions/57237352/what-does-unsqueeze-do-in-pytorch
    input = input.unsqueeze(0)
    print(input.shape)
    # Iterate over our sequence length
    for i in range(gen_seq_len):
        # Forward pass
            output, hidden_state = rnn(input, hidden_state)

            # Comput distances to all words
            dists = torch.norm(word_vectors.vectors - output[0], dim=1) 
            # Use softmax to convert to probabilities
            probs = F.softmax(1 - dists, dim=0)
            # Multiply probabilities by mask to only sample words from dataset
            probs = probs
            # Covert probabilities to probability distribution
            prob_dist = Categorical(probs)
            # Sample from probability distribution
            word_index  = prob_dist.sample()

            # Get the next word and print
            next_word = ix_to_word[word_index]
            print(next_word, end=' ')
            
            # The word vector for the next word is the next input
            input = word_vectors[next_word].unsqueeze(0)

torch.Size([1, 100])


6,860 tohoshinki shoveler sendo un/cefact 18km a-mei rubberneck broadsides -2.9 cheerleading 2,648 gurnee non-scripta nto i-aa federales evaristti amenábar kucherenko shashthi tenes emelia isshu vironia rohul mehdawi ,25 cigar-smoking córas denjū 1,232 mobikom spinalis abogado ørn makmur naghi setesdal huaraches cherryville 95.25 gerak transhumanism termit chicago-based sll bentalha mfecane 43.73 cappio self-reporting chatte ćirić ichilov 2:58 zhiwu mbango appreciator tungning höyük saltair ronalds argentinas cantieri senftenberg dius beevi burmantofts socata ghostscript antoniychuk mabunda coltish .0217 shujah reppy genetech crevier loathes resham trem milanese bandić datel vibrator feest aonghus sharpnose grrrl delicti construido biehler jorvorskie advincula flechette horler betws-y-coed tubize meretskov 

***Mapping string to indexes***

In [30]:
def map_str_to_wordvec(input_str):
    wordvec_seq = []
    tokens = tokenizer(input_str) 
    for word in tokens:
        ix = word_to_ix.get(word, None)
        if ix is not None:
            wordvec_seq.append(word_vectors[word])
        else:
            print(f'The char {word} is not in the dictionary')
    # Convert list of tensors to one tensor
    return torch.stack(wordvec_seq).to(device)

Creating the index list and converting it to a numpy array.
Here is where I came back and updated the input string for each test. 

In [35]:
input_str = 'Whisper words'
wordvec_seq = map_str_to_wordvec(input_str)
print(f'Our sequence of word_vecs is: {wordvec_seq.shape}')

Our sequence of word_vecs is: torch.Size([2, 100])


***Generate from randomly created starting sequence***

In [36]:
with torch.no_grad():
    hidden_state = None

    print(input_str, end=' ')
    for i in range(wordvec_seq.shape[0]):
                
        # Convert to PyTorch Tensor
        input = wordvec_seq[i,:]

        # Reshape tensor
        input = input.unsqueeze(0)
        
        # Condition the model on starting sequence
        output, hidden_state = rnn(input, hidden_state)
        
    input = output

    # Iterate over our sequence length
    for i in range(gen_seq_len):
        # Forward pass
        output, hidden_state = rnn(input, hidden_state)
        
        # Construct categorical distribution and sample a word
        output = F.softmax(torch.squeeze(output), dim=0)
        dist = Categorical(output / temperature)
        index = dist.sample()
        
        # Print the sampled word
        print(ix_to_word[index.item()], end=' ')
        
        # Next input is current output
        input[0][0] = index.item()

Whisper words 

( three was year but which : in when its he which three after said could - said ; it if by were than over by which ; other said ' some ) her said year last n't other . will said than up $ first had first at ' year as it its but president and : we : _ can other also a over their a and government their are but ( two or state not but only only " " - world of were on be world its they be who out other world not last and 