# Training a text generator with Word-RNN 

In this notebook I use the code from NLP Week 7 to train an RNN to model sequences of words.

In [1]:
import os
import torch
import random

import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from torch.distributions import Categorical

In [2]:
# Device that the data tensor and the model are moved to
device = "cpu"

**Setting the hyperparameters:**

Whilst working with this notebook I repeatedly returned to the cell below to change the hidden state size and the number of layers.

In [3]:
hidden_size = 512   # size of hidden state
batch_size = 100    # size of the batch used for training
step_len = 200      # number of consecutive iterations through the data in each training step
num_layers = 3      # number of layers in LSTM layer stack
lr = 0.002          # learning rate
num_steps = 100     # max number of training steps
gen_seq_len = 50    # length of generated sequence
load_chk = False    # load in pre-trained checkpoint for training
# Checkpoint file that is (over)written at the end of every training step
save_path = "/Users/loiskelly/Documents/GitHub/LoisNLPProject/all_data/Beatles word RNN model 5.pt"
# Checkpoint file that is read when load_chk is True. It must be defined even
# while load_chk is False, otherwise switching load_chk on raises a NameError.
load_path = "word_rnn_model.pt"

Next I loaded in my single text file and cleaned it up a bit.

In [5]:
data_path = "/Users/loiskelly/Documents/GitHub/LoisNLPProject/Data - NEW/lyrics_cleaned and extra data.txt"

# Read the whole corpus into memory; the context manager guarantees the file
# handle is closed again (a bare open(...).read() leaves it to the garbage collector).
# NOTE(review): assumes the lyrics file is UTF-8 encoded - confirm.
with open(data_path, 'r', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

# Split on whitespace once and reuse the result for both statistics
corpus_words = corpus.split()

# Sorted list of the distinct words; a word's position here becomes its id
words = sorted(set(corpus_words))

# data_size: total number of words in the corpus, vocab_size: number of distinct words
data_size, vocab_size = len(corpus_words), len(words)

Creating two dictionaries: one mapping words to indices, and one mapping indices back to the words they represent.

In [6]:
# Lookup tables between words and their integer ids (position in `words`)
ix_to_word = dict(enumerate(words))
word_to_ix = {word: idx for idx, word in ix_to_word.items()}

# Encode the corpus as a list of word ids, one entry per whitespace-separated word
data = [word_to_ix[token] for token in corpus.split()]

***Defining the network*** 

In [7]:
class RNN(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear decoder.

    Args:
        input_size: number of distinct input ids (rows of the embedding table).
        output_size: number of output classes produced by the decoder.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        embedding_size: width of the embedding vectors. Defaults to
            ``input_size``, which reproduces the original architecture
            (and keeps existing checkpoints loadable); pass a smaller value
            to avoid an ``input_size x input_size`` embedding table.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers, embedding_size=None):
        super().__init__()
        if embedding_size is None:
            embedding_size = input_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input_batch, hidden_state):
        """Run one forward pass.

        Args:
            input_batch: integer tensor of ids; the LSTM is not batch_first,
                so a 2-D input is read as (seq_len, batch).
            hidden_state: ``(h, c)`` tuple from a previous call, or ``None``
                to start from a zero state.

        Returns:
            ``(output, (h, c))`` where ``output`` holds the decoder scores for
            every position and ``h``/``c`` are detached from the graph, so
            gradients never flow back into an earlier call.
        """
        embedding = self.embedding(input_batch)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

***Setting up network and optimiser:***

This note is taken from the course notebooks:

[PyTorch tensors](https://pytorch.org/docs/stable/tensors.html) have been designed to work in almost exactly the same way as [numpy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html).

In [8]:
# Create list of indexes that can be valid starting points for training:
# a start index must leave room for step_len consecutive inputs plus the
# one-word-ahead target.
index_list = list(range(0, len(data) - step_len - 1))

# Convert data to a (num_words, 1) integer tensor on the chosen device.
# as_tensor + reshape keep this cell idempotent: re-running it leaves the
# shape at (num_words, 1) instead of stacking on another dimension each time.
data = torch.as_tensor(data, dtype=torch.long, device=device).reshape(-1, 1)

# Create RNN class (input and output are both over the vocabulary)
rnn = RNN(vocab_size, vocab_size, hidden_size, num_layers).to(device)

# Define loss function and optimiser
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)

# Load in pretrained model if specified
if load_chk:
    # map_location lets a checkpoint saved on another device load onto `device`
    checkpoint = torch.load(load_path, map_location=device)
    rnn.load_state_dict(checkpoint['state_dict'])

***Sample data randomly***

In [9]:
def get_training_batch_indicies(index_list, batch_size):
    """Pick random, distinct start positions for a batch and their targets.

    Args:
        index_list: candidate start positions to sample from.
        batch_size: how many positions to draw (sampled without replacement).

    Returns:
        Tuple ``(inputs, targets)`` of 1-D index tensors where every target
        index is the matching input index plus one.
    """
    sampled_starts = random.sample(index_list, batch_size)
    input_positions = torch.tensor(np.array(sampled_starts))
    # The target for each position is simply the next position in the data
    return input_positions, input_positions + 1
# Sanity check: the batch size must not exceed the number of candidate start positions
print(f"Length of index_list: {len(index_list)}")
print(f"Provided batch size: {batch_size}")


Length of index_list: 37999
Provided batch size: 100


***Training the network***

In [None]:
# Iterate through the number of steps defined earlier.
# range(1, num_steps + 1) so that exactly num_steps steps run (range(1, num_steps)
# stopped one short) while the printed step counter still starts at 1.
for step in range(1, num_steps + 1):

    running_loss = 0
    hidden_state = None
    rnn.zero_grad()
    train_batch_indicies, target_batch_indicies = get_training_batch_indicies(index_list, batch_size)
    # Cycle through for a set number of consecutive iterations in the data
    for i in range(step_len):
        # Extract data batches from indicies.
        # The input is shaped (seq_len=1, batch): the LSTM is not batch_first, so a
        # 1-D batch of ids would be embedded to 2-D and treated as ONE unbatched
        # sequence of batch_size unrelated words. With an explicit sequence axis,
        # each batch element keeps its own hidden state across the iterations.
        input_batch = data[train_batch_indicies].reshape(1, -1)
        target_batch = data[target_batch_indicies].reshape(-1)

        # Forward pass
        # The following code is the same as calling rnn.forward(input_batch, hidden_state)
        output, hidden_state = rnn(input_batch, hidden_state)

        # Compute loss; flatten (1, batch, vocab) scores to (batch, vocab)
        loss = loss_fn(output.reshape(-1, output.size(-1)), target_batch)
        running_loss += loss.item() / step_len

        # Update weights of neural network
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Increment batch coordinates by 1
        train_batch_indicies = train_batch_indicies + 1
        target_batch_indicies = target_batch_indicies + 1

    # Print loss (average over the step_len iterations of this step)
    print('\n'+'-'*75)
    print(f"\nStep: {step} Loss: {running_loss}")

    # Create a dictionary for saving the model and data mappings
    save_dict = {}
    # Add the model weight parameters as a dictionary to our save_dict
    save_dict['state_dict'] = rnn.state_dict()
    # Add the ix_to_word and word_to_ix dicts to our save_dict
    save_dict['ix_to_word'] = ix_to_word
    save_dict['word_to_ix'] = word_to_ix
    # Save the dictionary to a file
    torch.save(save_dict, save_path)

    # Now lets generate a random generated text sample to print out,
    # we will do this without gradient tracking as we are not training
    with torch.no_grad():

        # Take a random index and reset the hidden state of the model.
        # clone() is essential: slicing returns a view of `data`, so writing the
        # sampled word into input_batch below would otherwise overwrite the
        # training corpus itself.
        rand_index = np.random.randint(data_size-1)
        input_batch = data[rand_index : rand_index+1].clone()
        hidden_state = None

        # Iterate over our sequence length
        for i in range(gen_seq_len):
            # Forward pass
            output, hidden_state = rnn(input_batch, hidden_state)

            # Construct categorical distribution over the vocabulary and sample a word
            output = F.softmax(torch.squeeze(output), dim=0)
            dist = Categorical(output)
            index = dist.sample()

            # Print the sampled word
            print(ix_to_word[index.item()], end=' ')

            # Next input is current output
            input_batch[0][0] = index.item()
