<a href="https://colab.research.google.com/github/JohnnyPeng123/NLP-USYD/blob/master/Lab08%20-%20Johnny's%20Answer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab08


# Exercise

You are required to modify the below example code that can be working with beam search (k > 1)

## Neural Language Model

Now, let's see how to build a language model for generating natural language text by implement and training state-of-the-art Recurrent Neural Network. The objective of this model is to generate new text, given that some input text is present. Lets start building the architecture.

In [0]:
import numpy as np 

from numpy import array
from numpy import argmax
from numpy import log

Lets use a popular nursery rhyme — “Cat and Her Kittens” as our corpus. A corpus is defined as the collection of text documents.



In [0]:
import re

# Pad sequences to the max length
def pad_sequences_pre(input_sequences, maxlen):
    output = []
    for inp in input_sequences:
        if len(inp)< maxlen:
            output.append([0]*(maxlen-len(inp)) + inp)
        else:
            output.append(inp[:maxlen])
    return output

# Prepare the data
def dataset_preparation(data):
    corpus = data.lower().split("\n")
    normalized_text=[]
    for string in corpus:
        tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
        normalized_text.append(tokens)
    tokenized_sentences=[sentence.strip().split(" ") for sentence in normalized_text]

    word_list_dict ={}
    for sent in tokenized_sentences:
        for word in sent:
            if word != "":
                word_list_dict[word] = 1
    word_list = list(word_list_dict.keys())
    word_to_index = {word:word_list.index(word) for word in word_list}

    total_words = len(word_list)+1

    # create input sequences using list of tokens
    input_sequences = []
    for line in tokenized_sentences:
        token_list = []
        for word in line:
            if word!="":
                token_list.append(word_to_index[word])
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    # pad sequences 
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences_pre(input_sequences, maxlen=max_sequence_len))

    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

    return predictors, np.array(label), max_sequence_len, total_words, word_list, word_to_index

data = '''The cat and her kittens
They put on their mittens
To eat a christmas pie
The poor little kittens
They lost their mittens
And then they began to cry.

O mother dear, we sadly fear
We cannot go to-day,
For we have lost our mittens
If it be so, ye shall not go
For ye are naughty kittens'''

predictors, label, max_sequence_len, total_words, word_list, word_to_index = dataset_preparation(data)

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# Define the model
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim_1, hidden_dim_2, total_words):
        super(LSTMTagger, self).__init__()
        self.hidden_dim_1 = hidden_dim_1
        self.hidden_dim_2 = hidden_dim_2
        self.word_embeddings = nn.Embedding(total_words, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim_1, batch_first=True)  
        self.lstm2 = nn.LSTM(hidden_dim_1, hidden_dim_2, batch_first=True)  
        self.hidden2tag = nn.Linear(hidden_dim_2, total_words)


    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out_1, _ = self.lstm1(embeds)
        lstm_out_2, _ = self.lstm2(lstm_out_1)
        tag_space = self.hidden2tag(lstm_out_2[:,-1,:])                    
        tag_scores = F.log_softmax(tag_space, dim=1)      
        return tag_scores

# Parameter setting
EMBEDDING_DIM = 10
HIDDEN_DIM_1 = 150
HIDDEN_DIM_2 = 100
batch_size=predictors.shape[0]

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, total_words).cuda()
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


sentence =torch.from_numpy(predictors).cuda().to(torch.int64)
targets = torch.from_numpy(label).cuda().to(torch.int64)


# Training
for epoch in range(100):  

    model.train()
    model.zero_grad()       
    tag_scores = model(sentence)
    loss = loss_function(tag_scores, targets)
    loss.backward()
    optimizer.step()


    if epoch % 10 == 9:
        model.eval()
        _, predicted = torch.max(tag_scores, 1)
        prediction = predicted.view(-1).cpu().numpy()
        t = targets.view(-1).cpu().numpy()
        acc = accuracy_score(prediction,t)
        print('Epoch: %d, training loss: %.4f, training acc: %.2f%%'%(epoch+1,loss.item(),100*acc))



Epoch: 10, training loss: 3.6694, training acc: 6.25%
Epoch: 20, training loss: 3.4622, training acc: 12.50%
Epoch: 30, training loss: 3.0190, training acc: 20.83%
Epoch: 40, training loss: 2.6104, training acc: 27.08%
Epoch: 50, training loss: 2.2231, training acc: 47.92%
Epoch: 60, training loss: 1.8765, training acc: 60.42%
Epoch: 70, training loss: 1.6554, training acc: 58.33%
Epoch: 80, training loss: 1.3766, training acc: 83.33%
Epoch: 90, training loss: 1.1904, training acc: 89.58%
Epoch: 100, training loss: 1.0295, training acc: 91.67%


The code below only works with k=1, it does not store the candidates. You need to modify the code to make it working with k > 1.

In [34]:
# convert index to word
def ind_to_word(predicted_ind):
    for word, index in word_to_index.items():
        if index == predicted_ind:
            return word
    return ""    


# get the top k most predicted results
def get_topK(predicted, k=1):
    
    top_k = np.argsort(predicted[0])[-k:]

    return [(id, predicted[0][id]) for id in top_k]


# generate text, currently only works with k=1 

# To-Do: modify this function
###

def generate_text(seed_text, next_words, max_sequence_len, k=1):

    seed_candidates = [(seed_text, .0)]
    
    for _ in range(next_words):
        all_candidates = []

        for l in range(len(seed_candidates)):
          seed_text, score = seed_candidates[l]
          token_list = [word_to_index[word] for word in seed_text.split()]
          token_list = pad_sequences_pre([token_list], maxlen=max_sequence_len-1)
          
          seed_input = torch.from_numpy(np.array(token_list)).cuda().to(torch.int64)
          predicted = model(seed_input).cpu().detach().numpy()
          
          predicitons = []
          for i in range(1,k+1):
            id, log_p = get_topK(predicted, k)[i-1]
            predicitons.append((id, log_p))
          
          for j in range(len(predicitons)):
            candidate = [seed_text + ' ' + ind_to_word(predicitons[j][0]), score - predicitons[j][1]]  #we are summing up the negative log, so we need to find the minimum score(which is the highest prob)
            all_candidates.append(candidate)
        
        # order all candidates by score
        ordered = sorted(all_candidates, key=lambda tup:tup[1])
        print(ordered)
        seed_candidates = ordered[:1]
    
    return seed_candidates

# you can add more function if you want to

print(generate_text("we naughty", 3, max_sequence_len, k=3))

# Please note that it can happen that k=1 and k=3 have the same output because this is only a small dataset.


[['we naughty her', 1.9356861114501953], ['we naughty lost', 2.0805208683013916], ['we naughty are', 2.538965940475464]]
[['we naughty her so', 3.3954014778137207], ['we naughty her kittens', 3.825654983520508], ['we naughty her mittens', 3.9835870265960693]]
[['we naughty her so mittens', 4.440878868103027], ['we naughty her so our', 4.935371398925781], ['we naughty her so sadly', 5.779910564422607]]
[['we naughty her so mittens', 4.440878868103027]]


**Sample Output** (Your output would be different, it is based on the trained model)


```
we naughty lost their mittens
```

