# Language Models Lab - Getting started with RNNs

Based on lab [notebook](https://github.com/aeau/MAU-AML-labs/blob/develop/2-language-models-lab/0-getting-started-with-RNN.ipynb) provided.

This notebook covers:
1. Defining, training and testing:
    - Recurrent Neural Networks
    - Long Short-Term Memory Networks
    - Gated Recurrent Unit Networks
2. Comparison of the different networks

## Imports & Setup

In [170]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import torch.optim as optim
import plotly.express as px

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# Bare expression: shows the selected device as the notebook cell output.
device

device(type='cuda')

## Defining helper functions  

In [171]:
def get_word_indexes(sentences: list):
    """Map every distinct word in ``sentences`` to a unique integer index.

    Indices are assigned in order of first appearance, so the mapping is
    identical on every run. (Enumerating a ``set`` of strings would follow
    hash order, which changes between interpreter sessions.)

    Args:
        sentences: Iterable of tokenised sentences (each an iterable of words).

    Returns:
        A dict ``{word: index}`` with indices ``0 .. n_unique_words - 1``.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    vocabulary = dict.fromkeys(
        word for sentence in sentences for word in sentence
    )
    return {word: index for index, word in enumerate(vocabulary)}

def prepare_sequence(sequence, to_index):
    """Encode ``sequence`` as a 1-D ``torch.long`` tensor of indices.

    Each item of ``sequence`` is looked up in ``to_index``; an item missing
    from the mapping raises ``KeyError``.
    """
    indices = []
    for token in sequence:
        indices.append(to_index[token])
    return torch.tensor(indices, dtype=torch.long)


## Preparing Data

In [172]:
# Toy POS-tagging corpus: each entry pairs a tokenised sentence with one tag
# per token.
training_data = [
    (["The", "dog", "ate", "the", "apple"], ["DET", "NN", "V", "DET", "NN"]),
    (["Everybody", "read", "that", "book"], ["NN", "V", "DET", "NN"]),
    (
        ["Everybody", "does", "machine", "learning", "nowadays"],
        ["NN", "V", "NN", "NN", "ADV"],
    ),
]

# Vocabulary built from the sentences only; the tag ids are fixed by hand.
word_to_index = get_word_indexes([sentence for sentence, _ in training_data])
tag_to_index = {
    tag: index for index, tag in enumerate(["DET", "NN", "V", "ADV"])
}


## Setting Hyperparameters

In [173]:
# Size of each word-embedding vector (passed as `embedding_dim` to the taggers).
EMBEDDING_DIM = 6
# Size of the recurrent hidden state (passed as `hidden_dim` to the taggers).
HIDDEN_DIM = 12
# Number of distinct words in the training sentences.
VOCAB_SIZE = len(word_to_index)
# Number of distinct tags the taggers can predict.
NUM_CLASSES = len(tag_to_index)
# Number of full passes over `training_data` made by `train`.
NUM_EPOCHS = 300

## Defining train & evaluation functions

In [174]:
def train(model, optimizer, criterion, training_data, epochs):
    """Train ``model`` one sentence at a time and record the loss per epoch.

    Sentences and tags are encoded with the module-level ``word_to_index``
    and ``tag_to_index`` mappings via ``prepare_sequence``.

    Args:
        model: Callable mapping a tensor of word indices to per-token scores.
        optimizer: Optimizer stepped once per sentence.
        criterion: Loss callable invoked as ``criterion(tag_scores, targets)``.
        training_data: Iterable of ``(sentence, tags)`` pairs.
        epochs: Number of passes over ``training_data``.

    Returns:
        A list with one float per epoch: the SUM of the per-sentence losses
        seen during that epoch.
    """
    epoch_loss = []
    for _ in range(epochs):
        running_loss = 0.0
        for sentence, tags in training_data:
            # Get inputs and targets ready for the network.
            sentence_in = prepare_sequence(sentence, word_to_index)
            targets = prepare_sequence(tags, tag_to_index)

            # Reset gradients once per step. The original also called
            # model.zero_grad(), which repeats the same reset for every
            # parameter the optimizer manages.
            optimizer.zero_grad()

            # Forward pass, loss, backward pass, parameter update.
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        epoch_loss.append(running_loss)

    return epoch_loss

def evaluate(model, test_sequence):
    """Print the model's predicted tag ids for one training sentence.

    ``test_sequence`` is an index into the module-level ``training_data``.
    Prints, in order: the tag→id mapping, the sentence tokens, the gold tags,
    the predicted tag ids (highest-scoring class per token) and a separator.
    Returns ``None``.
    """
    # Inference only: no gradients are needed.
    with torch.no_grad():
        encoded = prepare_sequence(training_data[test_sequence][0], word_to_index)
        scores = model(encoded)

        print(tag_to_index)
        print(training_data[test_sequence][0])
        print(training_data[test_sequence][1])

        # One predicted class id per token: the top-scoring entry of each row.
        predicted = [row.topk(1).indices.item() for row in scores]

        print(predicted)
        print("--------------")

## Defining an RNN

In [175]:
class RNNTagger(nn.Module):
    """Sequence tagger: embedding -> vanilla RNN -> linear -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        # Recurrent layer: consumes embeddings, emits one hidden state per step.
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(
            in_features=hidden_dim, out_features=tagset_size
        )

    def forward(self, sentence):
        """Return per-token log-probabilities over tags for ``sentence``."""
        seq_len = len(sentence)
        embedded = self.word_embeddings(sentence)

        # The module expects [sentence_length, batch_size, embedding_dim];
        # the batch size is fixed to 1 here.
        hidden_states, _ = self.rnn(embedded.view(seq_len, 1, -1))

        # Flatten away the singleton batch axis (what squeeze(1) would do)
        # before projecting onto the tag space.
        tag_space = self.hidden2tag(hidden_states.view(seq_len, -1))

        return F.log_softmax(tag_space, dim=1)

## Training RNN

In [176]:
# Build a fresh RNN tagger and fit it with plain SGD on the toy corpus.
model = RNNTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    tagset_size=NUM_CLASSES,
)
# Negative log-likelihood pairs with the log-softmax output of the tagger.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = train(
    model=model,
    optimizer=optimizer,
    criterion=loss_function,
    training_data=training_data,
    epochs=NUM_EPOCHS,
)

### Loss

In [177]:
# Plot the summed per-sentence training loss recorded for each epoch.
fig = px.line(losses, title="Loss").update_layout(
    xaxis_title="Epoch",
    yaxis_title="Loss",
)
fig.show()

In [178]:
# Perplexity is exp(mean negative log-likelihood). Each entry of `losses` is
# the SUM of the per-sentence losses of one epoch, so average over the
# sentences first; exponentiating the raw sum would plot the product of the
# per-sentence perplexities instead.
num_sentences = max(len(training_data), 1)
ppl = [
    torch.exp(torch.tensor(epoch_loss / num_sentences)).item()
    for epoch_loss in losses
]
fig = px.line(ppl, title="Perplexity")
fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Perplexity",)
fig.show()

### Evaluation

In [179]:
# Tag each of the three training sentences with the trained model.
for sentence_number in (0, 1, 2):
    evaluate(model, sentence_number)

{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
[0, 1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'read', 'that', 'book']
['NN', 'V', 'DET', 'NN']
[1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'does', 'machine', 'learning', 'nowadays']
['NN', 'V', 'NN', 'NN', 'ADV']
[1, 2, 1, 1, 3]
--------------


## Defining LSTM

In [180]:
class LSTMTagger(nn.Module):
    """Sequence tagger: embedding -> LSTM -> linear -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        # The LSTM consumes word embeddings and emits hidden states of
        # dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(
            in_features=hidden_dim, out_features=tagset_size
        )

    def forward(self, sentence):
        """Return per-token log-probabilities over tags for ``sentence``."""
        seq_len = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to [sentence_length, 1, embedding_dim]: a batch of one.
        hidden_states, _ = self.lstm(embedded.view(seq_len, 1, -1))
        # Drop the singleton batch axis before the linear projection.
        tag_space = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(tag_space, dim=1)

In [181]:
# Build a fresh LSTM tagger and fit it with plain SGD on the toy corpus.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    tagset_size=NUM_CLASSES,
)
# Negative log-likelihood pairs with the log-softmax output of the tagger.
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    training_data=training_data,
    epochs=NUM_EPOCHS,
)

### Loss

In [182]:
# Plot the summed per-sentence training loss recorded for each epoch.
fig = px.line(losses, title="Loss").update_layout(
    xaxis_title="Epoch",
    yaxis_title="Loss",
)
fig.show()

In [183]:
# Perplexity is exp(mean negative log-likelihood). Each entry of `losses` is
# the SUM of the per-sentence losses of one epoch, so average over the
# sentences first; exponentiating the raw sum would plot the product of the
# per-sentence perplexities instead.
num_sentences = max(len(training_data), 1)
ppl = [
    torch.exp(torch.tensor(epoch_loss / num_sentences)).item()
    for epoch_loss in losses
]
fig = px.line(ppl, title="Perplexity")
fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Perplexity",)
fig.show()

### Evaluation

In [184]:
# Tag each of the three training sentences with the trained model.
for sentence_number in (0, 1, 2):
    evaluate(model, sentence_number)

{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
[0, 1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'read', 'that', 'book']
['NN', 'V', 'DET', 'NN']
[1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'does', 'machine', 'learning', 'nowadays']
['NN', 'V', 'NN', 'NN', 'ADV']
[1, 2, 1, 1, 3]
--------------


## Defining GRU

In [185]:
class GRUTagger(nn.Module):
    """Sequence tagger: embedding -> GRU -> linear -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        # Recurrent layer: consumes embeddings, emits one hidden state per step.
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim)
        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(
            in_features=hidden_dim, out_features=tagset_size
        )

    def forward(self, sentence):
        """Return per-token log-probabilities over tags for ``sentence``."""
        seq_len = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to [sentence_length, 1, embedding_dim]: a batch of one.
        gru_out, _ = self.gru(embedded.view(seq_len, 1, -1))
        # Drop the singleton batch axis before the linear projection.
        tag_space = self.hidden2tag(gru_out.view(seq_len, -1))
        return F.log_softmax(tag_space, dim=1)

## Training GRU

In [186]:
# Build a fresh GRU tagger and fit it with plain SGD on the toy corpus.
model = GRUTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    tagset_size=NUM_CLASSES,
)
# Negative log-likelihood pairs with the log-softmax output of the tagger.
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    training_data=training_data,
    epochs=NUM_EPOCHS,
)

In [187]:
# Plot the summed per-sentence training loss recorded for each epoch.
fig = px.line(losses, title="Loss").update_layout(
    xaxis_title="Epoch",
    yaxis_title="Loss",
)
fig.show()

In [188]:
# Perplexity is exp(mean negative log-likelihood). Each entry of `losses` is
# the SUM of the per-sentence losses of one epoch, so average over the
# sentences first; exponentiating the raw sum would plot the product of the
# per-sentence perplexities instead.
num_sentences = max(len(training_data), 1)
ppl = [
    torch.exp(torch.tensor(epoch_loss / num_sentences)).item()
    for epoch_loss in losses
]
fig = px.line(ppl, title="Perplexity")
fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Perplexity",)
fig.show()

In [189]:
# Tag each of the three training sentences with the trained model.
for sentence_number in (0, 1, 2):
    evaluate(model, sentence_number)


{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
[0, 1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'read', 'that', 'book']
['NN', 'V', 'DET', 'NN']
[1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'does', 'machine', 'learning', 'nowadays']
['NN', 'V', 'NN', 'NN', 'ADV']
[1, 2, 1, 1, 3]
--------------
