# Language Models Lab - Getting started with RNN

Based on lab [notebook](https://github.com/aeau/MAU-AML-labs/blob/develop/2-language-models-lab/0-getting-started-with-RNN.ipynb) provided.

This notebook covers:
1. Defining, training and testing:
    - Recurrent Neural Networks
    - Long Short-Term Memory Networks
    - Gated Recurrent Unit Networks
2. Comparison of the different networks

## Imports & Setup

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import torch.optim as optim
import plotly.express as px
import plotly.graph_objects as go

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Defining helper functions  

In [3]:
def get_word_indexes(sentences: list):
    """Build a word -> integer index vocabulary from tokenised sentences.

    Args:
        sentences: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        dict mapping every distinct token to a unique index in
        ``range(number_of_distinct_tokens)``. Indexes follow the order in
        which tokens are first seen, so the mapping is the same on every run.
    """
    # dict.fromkeys de-duplicates while preserving insertion order. Iterating
    # a set of strings instead would yield a hash-dependent order that can
    # change between interpreter runs, making the indexes irreproducible.
    vocabulary = dict.fromkeys(word for sent in sentences for word in sent)
    return {word: i for i, word in enumerate(vocabulary)}

def prepare_sequence(sequence, to_index):
    """Translate a sequence of symbols into a 1-D ``torch.long`` tensor.

    Every element of ``sequence`` is looked up in ``to_index``; a symbol
    missing from the mapping raises whatever the mapping's lookup raises
    (``KeyError`` for a plain dict).
    """
    lookup = to_index.__getitem__
    return torch.tensor(list(map(lookup, sequence)), dtype=torch.long)


## Preparing Data

In [4]:
# Toy part-of-speech corpus: each entry pairs a tokenised sentence with one
# tag per token.
training_data = [
    ("The dog ate the apple".split(), "DET NN V DET NN".split()),
    ("Everybody read that book".split(), "NN V DET NN".split()),
    ("Everybody does machine learning nowadays".split(), "NN V NN NN ADV".split()),
]

# Vocabulary built from the sentence half of every training pair.
word_to_index = get_word_indexes([pair[0] for pair in training_data])
# Fixed tag inventory; the position in this tuple is the class index.
tag_to_index = {tag: index for index, tag in enumerate(("DET", "NN", "V", "ADV"))}


## Setting Hyperparameters

In [5]:
# Size of each word-embedding vector (passed as `embedding_dim` to the taggers).
EMBEDDING_DIM = 6
# Size of the recurrent layer's hidden state (passed as `hidden_dim`).
HIDDEN_DIM = 12
# Number of distinct words found in the training sentences.
VOCAB_SIZE = len(word_to_index)
# Number of distinct tags, i.e. output classes per token.
NUM_CLASSES = len(tag_to_index)
# Number of full passes over the training data made by train().
NUM_EPOCHS = 100

## Defining train & evaluation functions

In [6]:
def train(model, optimizer, criterion, training_data, epochs):
    """Fit ``model`` on ``training_data`` and return one loss value per epoch.

    Every (sentence, tags) pair is its own optimisation step. Tokens and tags
    are turned into index tensors with ``prepare_sequence`` using the
    module-level ``word_to_index`` and ``tag_to_index`` mappings.

    Args:
        model: callable module mapping an index tensor to per-token scores.
        optimizer: optimiser whose ``zero_grad``/``step`` drive the update.
        criterion: loss callable taking ``(scores, targets)``.
        training_data: iterable of ``(sentence, tags)`` pairs.
        epochs: number of passes over ``training_data``.

    Returns:
        list of length ``epochs``; each entry is the sum of the per-sentence
        loss values observed during that epoch.
    """

    def run_step(words, labels):
        # Single-sentence update; returns the scalar loss as a Python number.
        model.zero_grad()
        token_ids = prepare_sequence(words, word_to_index)
        gold = prepare_sequence(labels, tag_to_index)
        step_loss = criterion(model(token_ids), gold)
        # Gradients are cleared once more through the optimizer right before
        # back-propagation.
        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()
        return step_loss.item()

    history = []
    for _ in range(epochs):
        history.append(sum(run_step(words, labels) for words, labels in training_data))
    return history

def evaluate(model, test_sequence):
    """Print the gold tags and predicted tag indexes for one training sentence.

    ``test_sequence`` is an index into the module-level ``training_data``.
    The function prints, in order: the tag-to-index mapping, the sentence,
    its gold tags, the predicted class index per token (highest score), and
    a separator line. Nothing is returned.
    """
    example = training_data[test_sequence]
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        scores = model(prepare_sequence(example[0], word_to_index))

        for header in (tag_to_index, example[0], example[1]):
            print(header)

        # Top-1 class index for each token's score row.
        predicted = [row.topk(1).indices.item() for row in scores]

        print(predicted)
        print("--------------")

## Defining an RNN

In [7]:
class RNNTagger(nn.Module):
    """Sequence tagger: embedding -> vanilla RNN -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indexes into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent layer consuming the embeddings and emitting one hidden
        # state per time step.
        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        # Projection from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-token log-probabilities over tags for one sentence."""
        embedded = self.word_embeddings(sentence)

        # The RNN expects [sentence_length, batch_size, embedding_dim]; a
        # batch axis of size 1 is inserted for the single sentence.
        hidden_states, _ = self.rnn(embedded.unsqueeze(1))

        # Dropping the size-1 batch axis gives [sentence_length, hidden_dim],
        # the same result as rnn_out.view(len(sentence), -1).
        logits = self.hidden2tag(hidden_states.squeeze(1))

        return F.log_softmax(logits, dim=1)

## Training RNN

In [8]:
# Fresh vanilla-RNN tagger, trained with plain SGD on the toy corpus.
# NLLLoss is paired with the log-softmax output of the tagger's forward().
model = RNNTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    tagset_size=NUM_CLASSES,
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
rnn_losses = train(
    model=model,
    optimizer=optimizer,
    criterion=loss_function,
    training_data=training_data,
    epochs=NUM_EPOCHS,
)

### Loss

In [9]:
# Training-loss curve of the RNN tagger, one point per epoch.
fig = px.line(rnn_losses, title="Loss").update_layout(
    xaxis_title="Epoch", yaxis_title="Loss"
)
fig.show()

In [10]:
# Exponentiate each epoch's loss value to obtain the RNN perplexity curve.
# NOTE(review): each entry of rnn_losses is whatever train() accumulated for
# that epoch; if that is a sum over sentences rather than a mean, exp() of it
# overstates the per-token perplexity. Confirm before quoting absolute values.
rnn_ppl = [torch.exp(torch.tensor(epoch_loss)).item() for epoch_loss in rnn_losses]
# Label spelled "Perplexity" to match the model-comparison plot.
fig = px.line(rnn_ppl, title="Perplexity")
fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Perplexity",
)
fig.show()

### Evaluation

In [11]:
# Tag each of the three training sentences with the trained RNN.
for sentence_index in (0, 1, 2):
    evaluate(model, sentence_index)

{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
[0, 1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'read', 'that', 'book']
['NN', 'V', 'DET', 'NN']
[1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'does', 'machine', 'learning', 'nowadays']
['NN', 'V', 'NN', 'NN', 'ADV']
[1, 2, 1, 1, 3]
--------------


## Defining LSTM

In [12]:
class LSTMTagger(nn.Module):
    """Sequence tagger: embedding -> LSTM -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indexes into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # LSTM over the embeddings; its per-step outputs have size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projection from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-token log-probabilities over tags for one sentence."""
        embedded = self.word_embeddings(sentence)
        # A batch axis of size 1 is inserted for the single sentence:
        # [sentence_length, 1, embedding_dim].
        hidden_states, _ = self.lstm(embedded.unsqueeze(1))
        # Drop the batch axis again before projecting to tag scores.
        logits = self.hidden2tag(hidden_states.squeeze(1))
        return F.log_softmax(logits, dim=1)

In [13]:
# Fresh LSTM tagger, trained with plain SGD on the toy corpus.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    tagset_size=NUM_CLASSES,
)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
lstm_losses = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    training_data=training_data,
    epochs=NUM_EPOCHS,
)

### Loss

In [14]:
# Training-loss curve of the LSTM tagger, one point per epoch.
fig = px.line(lstm_losses, title="Loss").update_layout(
    xaxis_title="Epoch", yaxis_title="Loss"
)
fig.show()

In [15]:
# Exponentiate each epoch's loss value to obtain the LSTM perplexity curve.
# NOTE(review): each entry of lstm_losses is whatever train() accumulated for
# that epoch; if that is a sum over sentences rather than a mean, exp() of it
# overstates the per-token perplexity. Confirm before quoting absolute values.
lstm_ppl = [torch.exp(torch.tensor(epoch_loss)).item() for epoch_loss in lstm_losses]
# Label spelled "Perplexity" to match the model-comparison plot.
fig = px.line(lstm_ppl, title="Perplexity")
fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Perplexity",
)
fig.show()

### Evaluation

In [16]:
# Tag each of the three training sentences with the trained LSTM.
for sentence_index in (0, 1, 2):
    evaluate(model, sentence_index)

{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
[0, 1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'read', 'that', 'book']
['NN', 'V', 'DET', 'NN']
[1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'does', 'machine', 'learning', 'nowadays']
['NN', 'V', 'NN', 'NN', 'ADV']
[1, 2, 1, 1, 1]
--------------


## Defining GRU

In [17]:
class GRUTagger(nn.Module):
    """Sequence tagger: embedding -> GRU -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indexes into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # GRU over the embeddings; its per-step outputs have size hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # Projection from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-token log-probabilities over tags for one sentence."""
        embedded = self.word_embeddings(sentence)
        # A batch axis of size 1 is inserted for the single sentence:
        # [sentence_length, 1, embedding_dim].
        gru_out, _ = self.gru(embedded.unsqueeze(1))
        # Drop the batch axis again before projecting to tag scores.
        logits = self.hidden2tag(gru_out.squeeze(1))
        return F.log_softmax(logits, dim=1)

## Training GRU

In [18]:
# Fresh GRU tagger; the bare `model` expression makes the notebook display
# its layer summary.
model = GRUTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    tagset_size=NUM_CLASSES,
)
model

GRUTagger(
  (word_embeddings): Embedding(13, 6)
  (gru): GRU(6, 12)
  (hidden2tag): Linear(in_features=12, out_features=4, bias=True)
)

In [19]:
# Train the GRU tagger with plain SGD on the toy corpus.
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
gru_losses = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    training_data=training_data,
    epochs=NUM_EPOCHS,
)

In [20]:
# Training-loss curve of the GRU tagger, one point per epoch.
fig = px.line(gru_losses, title="Loss").update_layout(
    xaxis_title="Epoch", yaxis_title="Loss"
)
fig.show()

In [21]:
# Exponentiate each epoch's loss value to obtain the GRU perplexity curve.
# NOTE(review): each entry of gru_losses is whatever train() accumulated for
# that epoch; if that is a sum over sentences rather than a mean, exp() of it
# overstates the per-token perplexity. Confirm before quoting absolute values.
gru_ppl = [torch.exp(torch.tensor(epoch_loss)).item() for epoch_loss in gru_losses]
# Label spelled "Perplexity" to match the model-comparison plot.
fig = px.line(gru_ppl, title="Perplexity")
fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Perplexity",
)
fig.show()

In [22]:
# Tag each of the three training sentences with the trained GRU.
for sentence_index in (0, 1, 2):
    evaluate(model, sentence_index)


{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
[0, 1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'read', 'that', 'book']
['NN', 'V', 'DET', 'NN']
[1, 2, 0, 1]
--------------
{'DET': 0, 'NN': 1, 'V': 2, 'ADV': 3}
['Everybody', 'does', 'machine', 'learning', 'nowadays']
['NN', 'V', 'NN', 'NN', 'ADV']
[1, 2, 1, 1, 3]
--------------


In [29]:
# Overlay the perplexity curves of the three taggers on one figure.
fig = go.Figure()
# Epochs are numbered from 1 on the x-axis.
epochs_index = list(range(1, len(gru_ppl) + 1))

# Traces are added in the order GRU, LSTM, RNN.
for trace_name, series in (("GRU", gru_ppl), ("LSTM", lstm_ppl), ("RNN", rnn_ppl)):
    fig.add_trace(go.Scatter(x=epochs_index, y=series, name=trace_name))

fig.update_layout(title='Perplexity', xaxis_title="Epochs")

fig.show()

In [30]:
# Overlay the training-loss curves of the three taggers on one figure.
fig = go.Figure()
# Epochs are numbered from 1. The axis is sized from the loss series being
# plotted, so this cell does not depend on the perplexity cells having run.
epochs_index = list(range(1, len(gru_losses) + 1))

fig.add_trace(go.Scatter(
    y=gru_losses,
    x=epochs_index,
    name='GRU'
))

fig.add_trace(go.Scatter(
    y=lstm_losses,
    x=epochs_index,
    name='LSTM'
))

fig.add_trace(go.Scatter(
    y=rnn_losses,
    x=epochs_index,
    name='RNN'
))

fig.update_layout(
    title='losses',
    xaxis_title="Epochs")

fig.show()