## Exercise Sheet 1: Recurrent Models

Compare Vanilla Recurrent Neural Networks (RNN) with Long-Short Term Networks (LSTM). Implement a vanilla RNN and LSTM from scratch. 

In [11]:
import os
import math
import sys
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# set seed for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# set paths
data_path = 'data/'
model_path = 'model/'
results_path = 'results/'

# make directories if they don't exist
if not os.path.exists(data_path):
    os.makedirs(data_path)
if not os.path.exists(model_path):
    os.makedirs(model_path)
if not os.path.exists(results_path):
    os.makedirs(results_path)


### Task 1: Toy Problem: Palindrome Numbers

Use a  a recurrent neural network to predict the next digit of the palindrome
at every timestep. This can become difficult for very long sequences since the network has to memorise information from very far away earlier timesteps. Goal is to study the memoization capability of recurrent networks.

In [12]:
class PalindromeDataset(data.Dataset):
    """ Randomly generates palindromes of a given length. 
        The input is the first N-1 digits of the palindrome, the target is the last digit.
        For short palindromes, the number of possible palindromes is limited.
    """
    def __init__(self, seq_length):
        self.seq_length = seq_length

    def __len__(self):
        # Number of possible palindroms can be very big:
        # (10**(seq_length/2) or (10**((seq_length+1)/2)
        # Therefore we return the maximum integer value
        return sys.maxsize

    def __getitem__(self, idx):
        # Keep last digit as target label. Note: one-hot encoding for inputs is
        # more suitable for training, but this also works.
        full_palindrome = self.generate_palindrome()
        # Split palindrome into inputs (N-1 digits) and target (1 digit)
        return full_palindrome[0:-1], int(full_palindrome[-1])

    def generate_palindrome(self):
        # Generates a single, random palindrome number of 'length' digits.
        left = [np.random.randint(0, 10) for _ in range(math.ceil(self.seq_length / 2))]
        left = np.asarray(left, dtype=np.float32)
        right = np.flip(left, 0) if self.seq_length % 2 == 0 else np.flip(left[:-1], 0)
        return np.concatenate((left, right))


In [13]:
def train(model, train_loader, criterion, optimizer, device, epochs=5):
    model.train()
    train_loss = []
    for epoch in range(epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            inputs = torch.nn.functional.one_hot(inputs.to(torch.int64), num_classes=10).to(torch.float32)
            labels = labels.to(torch.long)

            optimizer.zero_grad()

            outputs, _ = model(inputs)
            loss = criterion(outputs[:, -1, :], labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 1000 == 999:
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 1000))
                running_loss = 0.0
        
        # Save the loss for this epoch
        train_loss.append(running_loss / len(train_loader))

    print('Finished Training')

    # Plot the loss curve
    plt.plot(train_loss)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training loss')
    plt.savefig(results_path + 'training_loss.png')
    plt.show()


def test(model, test_loader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            inputs = torch.nn.functional.one_hot(inputs.to(torch.int64), num_classes=10).to(torch.float32)
            labels = labels.to(torch.long)

            outputs, _ = model(inputs)
            _, predicted = torch.max(outputs[:, -1, :], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the test set: %d %%' % (100 * correct / total))


In [15]:
# Create dataset
seq_length = 3
train_data = PalindromeDataset(seq_length)
train_loader = data.DataLoader(train_data, batch_size=8, shuffle=True, num_workers=0)

test_data = PalindromeDataset(seq_length)
test_loader = data.DataLoader(test_data, batch_size=8, shuffle=False, num_workers=0)

# Print some examples
for i in range(5):
    inputs, target = train_data[i]
    print(f'Input: {inputs}, Target: {target}')

RuntimeError: Storage size calculation overflowed with sizes=[9223372036854775807]

In [None]:
rnn = nn.RNN(input_size=10, hidden_size=10, num_layers=1, nonlinearity='relu', batch_first=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)


Number of training samples: 9223372036854775807


In [None]:
# Train the model
train(rnn, train_loader, criterion, optimizer, device, epochs=5)

KeyboardInterrupt: 

In [None]:
class VanillaRNN(nn.Module):

    def __init__(
        self, seq_length, input_dim, num_hidden, num_classes, batch_size, device=None):
        super(VanillaRNN, self).__init__()
        self.seq_length = seq_length
        self.input_dim = input_dim
        self.num_hidden = num_hidden
        self.num_classes = num_classes
        self.batch_size = batch_size

        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.device = device

        # Define the RNN layer
        self.hidden_state = torch.zeros(self.batch_size, self.num_hidden, device=self.device)
        self.W_hh = nn.Parameter(torch.randn(self.num_hidden, self.num_hidden))     # hidden to hidden
        self.W_hx = nn.Parameter(torch.randn(self.input_dim, self.num_hidden))      # input to hidden
        self.B_h = nn.Parameter(torch.zeros(self.num_hidden))                       # hidden bias
        self.B_y = nn.Parameter(torch.zeros(self.num_classes))                      # output bias


    def forward(self, x):
        # Implementation here ...
        pass

### Task 2: Vanilla RNN in PyTorch

### Task 3: Long-Short Term Network (LSTM) in PyTorch 