In [1]:
from random import shuffle, choice

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.utils.data as datautils
from torchtext import data, datasets

# Activities

1. How many parameters does the first recurrent neural network have? And the LSTM network?

2. Why are exploding/vanishing gradients a more pressing issue in recurrent neural networks? And how does the LSTM architecture help to address the vanishing gradient problem?

3. Repeat the training of the LSTM network, but now using all the words in the two datasets (training and test) as training data. Let the network train for 40 epochs. Then, for each of the 4 possible labels/languages, generate 5 sample names using the trained network. Comment on the performance of the network thus trained.

# Recurrent Neural Networks

In [2]:
def evaluate(net, dataset, loss):
  
    # Compute dataset size
    dataset_size = len(dataset)

    # We first set the network to "evaluation mode". This is useful, for 
    # example, in dropout layers, which should behave differently in training 
    # and evaluation.
    net.eval()

    l = 0
    acc = 0

    # We compute both scores and labels
    for X, y in dataset:
        output = net(X)
        _, label = torch.max(output, dim=1)

        # Compute loss
        l += loss(output, y).item()
            
        # Compute accuracy
        acc += (label == y).double().item()

    # Average
    l = l / dataset_size
    acc = acc / dataset_size
    
    # We reset the network back to training mode
    net.train()
    
    return l, acc

In [3]:
def train_network(net, loss, optimizer, dataset, num_epochs=20):
    
    # We start by initializing two lists, to track the loss and accuracy during
    # training.
    train_losses = []
    train_accuracies = []

    valid_losses = []
    valid_accuracies = []

    for ep in range(num_epochs):
        print('\n- Training epoch: %i -' % ep)

        # We use auxiliary variables to keep track of loss and accuracy within 
        # an epoch
        running_loss = 0.
        running_acc  = 0.

        for X, y in dataset:

            # We zero-out the gradient
            optimizer.zero_grad()

            # Compute output
            output = net(X)

            # Our outputs are *scores*, so we also compute the predicted labels, 
            # since we need them to check the accuracy
            #
            # To that purpose, we compute the class that maximizes the score. 
            # The max function returns both the maximum value, and the 
            # maximizing entry. We care only about the latter, so we ignore the 
            # first output. 
            #
            # Also, recall that the dimensions of the output are 
            # (batch size, n. classes). We take the maximum over the first 
            # dimension
            _, label = torch.max(output, dim=1)

            # Get loss
            l = loss(output, y)

            # Compute gradient
            l.backward()
            
            # Perform optimization step
            optimizer.step()

            # Update total running loss. We account for the number of points 
            # in the batch
            running_loss += l.item()
             
            # Update the accuracy
            running_acc += (label == y.data).double().item()

        train_losses += [running_loss / len(dataset)]
        train_accuracies += [running_acc / len(dataset)]

        # Loss and accuracy in the validation set
        aux_l, aux_a = evaluate(net, valid_set, loss)

        valid_losses += [aux_l]
        valid_accuracies += [aux_a]

        print(f'Training loss: {train_losses[-1]:.4f}')
        print(f'Training accuracy: {train_accuracies[-1]:.1%}')
        print(f'Validation loss: {valid_losses[-1]:.4f}')
        print(f'Validation accuracy: {valid_accuracies[-1]:.1%}')

    return net, train_losses, train_accuracies, valid_losses, valid_accuracies

In [4]:
import string

LABELS = ['English', 'French', 'Portuguese', 'Spanish']
LETTERS = list(string.ascii_letters + " .,;'-") + ['<eos>']

def input_encoding(input_str):
    ''' Receives a string as input and returns, as output, a Pytorch tensor
        containing the one-hot encoding of the provided string.'''

    one_hot_string = torch.zeros(len(input_str), 1, len(LETTERS), dtype=torch.float)
    
    for letter_idx in range(len(input_str)):
        letter = input_str[letter_idx]
        one_hot_string[letter_idx][0][LETTERS.index(letter)] = 1

    return one_hot_string

def label_encoding(output_str):
    ''' Receives a string as input and returns, as output, a Pytorch tensor
        containing the one-hot encoding of the provided label string.'''

    one_hot_label = torch.zeros(1, len(LABELS), dtype=torch.float)
    label_idx = LABELS.index(output_str)
    one_hot_label[0][label_idx] = 1

    return one_hot_label

We are now going to create a custom dataset. We create a subclass of Pytorch's `Dataset` class, and write down the `__init__`, `__len__`, and `__getitem__` methods. 

We thus load the file `names_data.csv` that we just uploaded and then use the data in it to create the dataset.

In [5]:
# We now create our custom class
class NamesDataset(datautils.Dataset):
    def __init__(self, names_file):

        # We load the data from the csv file
        name_data = pd.read_csv(names_file)
        
        # We create a list to store the input and output pairs
        self.samples = []

        # We run through the data in the Dataframe and fill in both lists
        for idx in range(len(name_data)):
            name  = input_encoding(name_data['Name'][idx])
            label = torch.tensor([LABELS.index(name_data['Label'][idx])])

            self.samples += [(name, label)]

        shuffle(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

We can now create a dataset.

In [6]:
train_set = NamesDataset('train_data.csv')
valid_set = NamesDataset('test_data.csv')

We can plot loss and accuracy during training, and see how they evolved.

## Text generation using LSTM

We are now going to perform a slightly more sophisticated task on the same data we used before. In particular, we will now _generate_ names from the different countries that follow the same pattern found in the names in the dataset.

To that purpose, we need to perform a little more ground work. We slightly reformulate the `NamesDataset` class to now have as input a sequence of symbols and a category, and as output another sequence of symbols.

In [7]:
# We now create our custom class
class NamesDataset(datautils.Dataset):
    def __init__(self, names_file):

        # We load the data from the csv file
        name_data = pd.read_csv(names_file)

        # We create a list to store the input and output pairs
        self.samples = []

        # We run through the data in the Dataframe and fill in both lists
        for idx in range(len(name_data)):
            name  = name_data['Name'][idx]
            label = name_data['Label'][idx]

            self.samples += [(name, label)]

        shuffle(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]

        name = item[0]
        label = item[1]
        
        # Since we're using an embedding layer, we no longer 
        # use one-hot encoding, but store only the index        
        input_tensor  = torch.tensor([[LETTERS.index(x)] for x in name], dtype=torch.long)
        label_tensor  = label_encoding(label)
        target_tensor = torch.tensor([LETTERS.index(x) for x in name[1:]] + [LETTERS.index('<eos>')], dtype=torch.long)
        
        item_dict = {"label": label,
                     "name": name,
                     "label_tensor": label_tensor,
                     "input_tensor": input_tensor,
                     "target_tensor": target_tensor}        
        
        return item_dict

In [8]:
train_set = NamesDataset('train_test_data.csv')

In [9]:
class LSTMNetwork(nn.Module):
    def __init__(self, input_size, n_labels, embedding_size, hidden_size, output_size, dropout=0.):
        super().__init__()
        
        # First, an embedding layer is used to convert the one-hot encoding 
        # into a feature vector
        self.i2f_layer = nn.Embedding(input_size, embedding_size)

        # We then create an LSTM layer
        self.f2h_layer = nn.LSTM(embedding_size + n_labels, hidden_size, 1)

        # Then, a linear layer that turns the LSTM hidden state into an 
        # output prediction
        self.h2o_layer = nn.Linear(hidden_size, output_size)
        
        # We include a dropout layer for the embedding
        self.dropout = nn.Dropout(dropout)

        # We add initialization parameters for the hidden state and cell
        self.hidden_init = nn.Parameter(torch.zeros(1, hidden_size))
        self.cell_init   = nn.Parameter(torch.zeros(1, hidden_size))

    def single_pass(self, letter_tensor, label_tensor, hidden, cell):
        # Compute embedding
        f = self.dropout(self.i2f_layer(letter_tensor))
        
        # Peform lstm pass
        o, (h, c) = self.f2h_layer(torch.cat((f, label_tensor), 1), (hidden, cell))
        
        # Compute output
        o = self.h2o_layer(o)

        return o, h, c

    def forward(self, input):
        name = input['input_tensor']
        label = input['label_tensor']

        h = self.hidden_init
        c = self.cell_init

        outputs = []

        for letter in name:
            out, h, c = self.single_pass(letter, label, h, c)
            outputs += [out]

        # We return all outputs
        return torch.cat(outputs)

    def sample(self, label, start_letter, max_length=20):
        ''' We will use this function to generate names given a label.'''
            
        # During sampling, we store no gradient information
        self.eval()

        with torch.no_grad():
            label_tensor = label_encoding(label)
            letter_tensor = torch.tensor([LETTERS.index(start_letter)], dtype=torch.long)
          
            h = self.hidden_init
            c = self.cell_init

            output = [start_letter]

            for i in range(max_length):
                o, h, c = self.single_pass(letter_tensor, label_tensor, h, c)
                
                _, next_idx = torch.max(o, dim=1)
                next_letter = LETTERS[next_idx]

                if next_letter == "<eos>":
                    break
                else:
                    output += [next_letter]
                    letter_tensor = torch.tensor([next_idx], dtype=torch.long)

        self.train()

        return ''.join(output)

We can now run the training routine. Note, however, that the output is now a sequence of distributions, one per symbol, so the computation of the loss must be different. For this reason, we do not use the `train_network` function defined above.

In [10]:
# We create an instance of our LSTM network.
lstm_net = LSTMNetwork(len(LETTERS), len(LABELS), 128, 256, len(LETTERS), dropout=0.3)

# Before training, let's check how well the network generates names
for i in range(10):
    label = np.random.choice(LABELS)
    letter = np.random.choice(LETTERS[:-1]).upper()
    
    print(f'Generated name in {label}:', lstm_net.sample(label, letter))

# We use the cross-entropy loss _per letter_
loss = nn.CrossEntropyLoss()

# ... and Adam as the optimizer
optim = torch.optim.Adam(lstm_net.parameters(), lr=0.001)

train_losses = []

num_epochs = 40

for ep in range(num_epochs):
    print('\n- Training epoch: %i -' % ep)

    # We use auxiliary variables to keep track of loss and accuracy within 
    # an epoch
    running_loss = 0.

    for sample in train_set:
        target = sample['target_tensor']

        # We zero-out the gradient
        optim.zero_grad()

        # We initialize the loss to 0
        l = 0

        # Compute output
        outputs = lstm_net(sample)

        # We now compute the loss for each letter in the input name, given the 
        # target.
        for i in range(len(target)):
            l += loss(outputs[i], target[i])

        # Compute gradient
        l.backward()
            
        # Perform optimization step
        optim.step()

        # Update total running loss. We account for the number of points 
        # in the batch
        running_loss += l.item()
             
    train_losses += [running_loss / len(train_set)]

    print(f'Training loss: {train_losses[-1]:.4f}')

Generated name in Portuguese: PItjjkuuuFoiii.....
Generated name in French: ,yy.....
Generated name in Spanish: CkPouuFKoiiZoii......
Generated name in English: SuuFK
Generated name in English: OuFKoii....
Generated name in Spanish: Q-PstjjkuuuFKoiiZoii.
Generated name in English: OuFKoii....
Generated name in Portuguese: EFoii......
Generated name in Spanish: YlkuuuFKoiiZoii......
Generated name in English: FoppUsjji....

- Training epoch: 0 -
Training loss: 13.7971

- Training epoch: 1 -
Training loss: 12.1272

- Training epoch: 2 -
Training loss: 11.4020

- Training epoch: 3 -
Training loss: 10.8731

- Training epoch: 4 -
Training loss: 10.4619

- Training epoch: 5 -
Training loss: 10.0617

- Training epoch: 6 -
Training loss: 9.6972

- Training epoch: 7 -
Training loss: 9.3869

- Training epoch: 8 -
Training loss: 9.1246

- Training epoch: 9 -
Training loss: 8.8961

- Training epoch: 10 -
Training loss: 8.6407

- Training epoch: 11 -
Training loss: 8.4324

- Training epoch: 12 -
Tr

Now that we trained out network, let us see how well it is able to generate names given a label/language.

In [16]:
for i in range(20):
    label = np.random.choice(LABELS)
    letter = np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

    print(f'Generated name in {label}:', lstm_net.sample(label, letter))

Generated name in English: Newby
Generated name in English: Ellison
Generated name in French: Irelane
Generated name in Portuguese: Xuerra
Generated name in French: Gagnier
Generated name in Portuguese: Lobo
Generated name in Portuguese: D'cruz
Generated name in Spanish: Yeardo
Generated name in French: Kent
Generated name in French: Newell
Generated name in English: Stannard
Generated name in Portuguese: Escarra
Generated name in Portuguese: Lobo
Generated name in Portuguese: Herrero
Generated name in Spanish: Paredes
Generated name in French: Fabian
Generated name in Spanish: Ness
Generated name in English: Macarthur
Generated name in Spanish: Ferrer
Generated name in French: Elvis


In [18]:
for label in LABELS:
    for i in range(10):
        letter = np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
        print(f'Generated name in {label}:', lstm_net.sample(label, letter))

Generated name in English: Newby
Generated name in English: Yarwood
Generated name in English: Upton
Generated name in English: Kenneth
Generated name in English: Harrison
Generated name in English: Godfrey
Generated name in English: Farrel
Generated name in English: Locke
Generated name in English: Zaunton
Generated name in English: Allcott
Generated name in French: Kent
Generated name in French: Elvis
Generated name in French: Zaville
Generated name in French: Martel
Generated name in French: Paris
Generated name in French: Zaville
Generated name in French: Elvis
Generated name in French: Martel
Generated name in French: Newell
Generated name in French: Oliver
Generated name in Portuguese: Barros
Generated name in Portuguese: Lobo
Generated name in Portuguese: Quiros
Generated name in Portuguese: Xuerra
Generated name in Portuguese: Araullo
Generated name in Portuguese: Kentera
Generated name in Portuguese: Quiros
Generated name in Portuguese: Paredes
Generated name in Portuguese: Fa