# Training the text generator with WordVec-RNN 

In this notebook I am using the code from NLP Week 8 to train an RNN to model sequences of word vectors.


In [None]:
import os
import torch
import random

import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchtext.vocab as vocab

from torchtext.data.utils import get_tokenizer
from torch.distributions import Categorical

In [None]:
# Device on which the mask tensor, the training data and the model are placed.
device = "cpu"

**Setting the hyperparameters:**

While working on this notebook I kept coming back to the cell below to change the hidden state size and the number of layers.

In [None]:
hidden_size = 100   # size of the LSTM hidden state
batch_size = 100    # number of start positions sampled from the data per training step
gen_seq_len = 50    # number of words generated for the preview text after each step
step_len = 200      # number of consecutive one-token advances trained on within each step
num_layers = 3      # number of layers in LSTM layer stack
lr = 0.002          # learning rate
num_steps = 50     # max number of training steps
load_chk = False    # load in pre-trained checkpoint for training
save_path = "/Users/loiskelly/Documents/GitHub/LoisNLPProject/Data - NEW/Beatles Word Vector RNN model 3.pt"
# Checkpoint that is read when load_chk is True. It has to be defined even
# while load_chk is False, otherwise switching load_chk on raises a NameError
# in the set-up cell that calls torch.load(load_path).
load_path = "wordvec_rnn_model.pt"

Next I loaded in my single text file and cleaned it up a bit.

In [None]:
def load_single_text_file(path):
    """Read the UTF-8 text file at `path` and return its full contents as one string."""
    with open(path, mode='r', encoding='utf-8') as text_file:
        contents = text_file.read()
    return contents
    
# Location of the single text file that holds the whole training corpus.
data_path = '/Users/loiskelly/Documents/GitHub/LoisNLPProject/all_data/lyrics_cleaned and extra data.txt'
corpus = load_single_text_file(data_path)

# Whitespace-separated words of the corpus, split once and reused below.
corpus_words = corpus.split()
# Sorted list of the distinct whitespace-separated words.
words = sorted(set(corpus_words))
# data_size counts every word occurrence, vocab_size only the distinct words.
data_size = len(corpus_words)
vocab_size = len(words)

***Loading in the  word vectors:***

I chose to stick with the GloVe dictionary.

In [None]:
# GloVe word vectors (name "6B", 100 dimensions) and the tokenizer used on the corpus.
word_vectors = vocab.GloVe(name="6B", dim=100)
tokenizer = get_tokenizer("basic_english")

# Wrap the pre-trained vectors in an embedding layer and read the vector width
# (second dimension of the weight matrix) from it.
wordvec_embeddings = nn.Embedding.from_pretrained(word_vectors.vectors)
embedding_dim = wordvec_embeddings.weight.size(1)

# Expose the word <-> index mappings of word_vectors under the names used
# in the previous notebooks.
word_to_ix, ix_to_word = word_vectors.stoi, word_vectors.itos

Next getting the list of tokens.

In [None]:
# Tokenise the whole corpus and report how many tokens came out.
raw_tokens = tokenizer(corpus)
print(f'There raw data cosists of {len(raw_tokens)} tokens.')

# Keep only the tokens that have an entry in the word-vector vocabulary.
tokens = [tok for tok in raw_tokens if tok in word_to_ix]
print(f'There processed data cosists of {len(tokens)} tokens (that exist in the word vector vocab)')

# Look up one word vector per remaining token; this is the training data.
data = word_vectors.get_vecs_by_tokens(tokens, lower_case_backup=True)

***Masking for data only in the dataset***

In [None]:
# Distinct tokens that occur in the dataset, in sorted order.
data_vocab = sorted(set(tokens))
print(data_vocab)

# Position of each of those tokens in the word-vector vocabulary.
data_indexes = list(map(word_to_ix.__getitem__, data_vocab))
print(data_indexes)

# 0/1 mask over the whole word-vector vocabulary: 1 where the word occurs in
# the dataset, 0 everywhere else.
mask_array = np.zeros(len(word_to_ix), dtype=int)
np.put(mask_array, data_indexes, 1)
print(mask_array)

# Same mask as an int64 tensor on the configured device.
mask_array = torch.tensor(mask_array, dtype=torch.int64, device=device)

***Defining the network***

In [None]:
class RNN(nn.Module):
    """Stacked LSTM whose outputs go through a linear layer and then tanh.

    Args:
        input_size: width of each input vector fed to the LSTM.
        output_size: width of each output vector produced by the linear layer.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.decoder = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.tanh = nn.Tanh()

    def forward(self, input_batch, hidden_state):
        """Run the LSTM on `input_batch`, starting from `hidden_state`.

        Returns:
            A pair `(output, (h, c))`: `output` is the tanh of the linear
            projection of the LSTM outputs; `h` and `c` are the LSTM's final
            hidden and cell states, detached so that gradients do not flow
            back through them into earlier calls.
        """
        lstm_out, (h_n, c_n) = self.rnn(input_batch, hidden_state)
        prediction = self.tanh(self.decoder(lstm_out))
        return prediction, (h_n.detach(), c_n.detach())

***Setting up network and optimiser:***

This is from the notebooks:

[PyTorch tensors](https://pytorch.org/docs/stable/tensors.html) have been designed to work in almost exactly the same way as [numpy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html).

In [None]:
# Create list of indexes that can be valid starting points for training.
# A start index is advanced by one token step_len times and its target is
# always one token further on, so the tail of the data is excluded to keep
# every lookup inside `data`.
index_list = list(range(0, len(data) - step_len - 1))

# Convert data to a torch tensor of shape (num_tokens, 1, embedding_dim).
# torch.as_tensor accepts both arrays and existing tensors without the
# copy-construct UserWarning that torch.tensor(<tensor>) emits, and reshape
# (instead of unsqueeze) keeps this cell idempotent: re-running it does not
# keep adding dimensions to `data`.
data = torch.as_tensor(data).to(device)
data = data.reshape(-1, 1, embedding_dim)

# Create RNN class (input and output are both word vectors of width embedding_dim)
rnn = RNN(embedding_dim, embedding_dim, hidden_size, num_layers).to(device)

# Define loss function and optimiser
loss_fn = nn.CosineSimilarity()
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)

if load_chk:
    # load_path must be set in the hyperparameter cell before enabling load_chk.
    # map_location places the stored tensors on `device` regardless of the
    # device they were saved from.
    checkpoint = torch.load(load_path, map_location=device)
    rnn.load_state_dict(checkpoint['state_dict'])

***Sample data randomly***

In [None]:
def get_training_batch_indicies(index_list, batch_size):
    """Draw `batch_size` distinct entries of `index_list` as input positions.

    Returns:
        A pair of index tensors `(input_batch_indicies, target_batch_indicies)`
        where every target index is the matching input index plus one.
    """
    # random.sample draws without replacement
    sampled_starts = random.sample(index_list, batch_size)
    input_batch_indicies = torch.tensor(np.array(sampled_starts))
    # The target for each position is the element that follows it
    return input_batch_indicies, input_batch_indicies + 1

#### Training the network

In [None]:
# Iterate through the number of steps defined earlier.
# range(1, num_steps + 1) runs exactly `num_steps` steps while keeping the
# printed step counter 1-based (range(1, num_steps) ran one step too few).
for step in range(1, num_steps + 1):

    running_loss = 0
    # Every step starts from a fresh hidden state and a fresh random batch
    hidden_state = None
    rnn.zero_grad()
    train_batch_indicies, target_batch_indicies = get_training_batch_indicies(index_list, batch_size)

    # Cycle through for a set number of consecutive iterations in the data
    for i in range(step_len):
        # Shape the input as (seq_len=1, batch, embedding_dim). A 2-D
        # (batch, embedding_dim) input would be read by nn.LSTM as ONE
        # unbatched sequence of `batch` time steps, chaining unrelated random
        # positions together instead of treating them as independent samples.
        num_samples = len(train_batch_indicies)
        input_batch = data[train_batch_indicies].reshape(1, num_samples, -1)
        target_batch = data[target_batch_indicies].reshape(num_samples, -1)

        # Forward pass
        # The following code is the same as calling rnn.forward(input_batch, hidden_state).
        # hidden_state now holds one state per sample and is carried over to
        # the next iteration, where every sample has moved on by one token.
        output, hidden_state = rnn(input_batch, hidden_state)

        # Compute loss (we take 1 minus the cosine similarity as it measures
        # similarity, not distance). The sequence dimension of length 1 is
        # dropped so predictions and targets are both (batch, embedding_dim).
        # NOTE: the distance of the prediction to every vector in the word
        # vector table used to be computed here on each iteration; its result
        # was never used, so that computation has been removed.
        loss = 1 - loss_fn(output.squeeze(0), target_batch).mean()
        running_loss += loss.item() / step_len
        # print(f'loss {loss}, running loss {running_loss}')

        # Update weights of neural network
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Increment batch indicies by 1
        train_batch_indicies = train_batch_indicies + 1
        target_batch_indicies = target_batch_indicies + 1

    # Print loss
    print('\n'+'-'*75)
    print(f"\nStep: {step} Loss: {running_loss}")

    # Create a dictionary for saving the model and data mappings
    save_dict = {}
    # Add the model weight parameters as a dictionary to our save_dict
    save_dict['state_dict'] = rnn.state_dict()
    # Add the ix_to_word and word_to_ix mappings to our save_dict
    save_dict['ix_to_word'] = ix_to_word
    save_dict['word_to_ix'] = word_to_ix
    # Save the dictionary to a file
    torch.save(save_dict, save_path)

    # Now lets generate a random generated text sample to print out,
    # we will do this without gradient tracking as we are not training
    with torch.no_grad():

        # Take a random index and reset the hidden state of the model.
        # The index is drawn from len(data) (the tokens that survived the
        # vocabulary filter), not from data_size (the whitespace word count of
        # the raw corpus), which can be larger and would index past the end.
        rand_index = int(np.random.randint(len(data)))
        # (1, embedding_dim): a single unbatched time step
        input_vec = data[rand_index].reshape(1, -1)
        hidden_state = None

        # Iterate over our sequence length
        for i in range(gen_seq_len):

            # Forward pass
            output, hidden_state = rnn(input_vec, hidden_state)

            # Compute distances to all words
            dists = torch.norm(word_vectors.vectors - output[0], dim=1)

            # Use softmax to convert to probabilities
            probs = F.softmax(1 - dists, dim=0)
            # Multiply probabilities by mask to only sample words from dataset
            probs = probs * mask_array
            # Convert probabilities to probability distribution
            prob_dist = Categorical(probs)
            # Sample from probability distribution
            word_index = prob_dist.sample()

            # Get the next word and print
            next_word = ix_to_word[word_index.item()]
            print(next_word, end=' ')

            # The word vector for the next word is the next input
            input_vec = word_vectors[next_word].unsqueeze(0)
