<a href="https://colab.research.google.com/github/JohnnyPeng123/NLP-USYD/blob/master/Lab06%20-%20Johnny's%20Answer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab06

In [2]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

nltk.download('treebank')
from nltk.corpus import treebank

import numpy as np
from sklearn.model_selection import train_test_split
 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [3]:
# Retrieve tagged sentences from treebank corpus
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))
#tagged_words(): list of (str,str) tuple

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [4]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    #The zip() function returns a zip object, which is an iterator of tuples where the first item in each passed iterator is paired together, 
    #and then the second item in each passed iterator are paired together etc.
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
print(sentences[5])
print(sentence_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


In [0]:
(train_sentences, 
 test_sentences, 
 train_tags, 
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

### Making vocabs with special tokens

*PAD: Padding*
*OOV: Out Of Vocabulary*

In [0]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())

for ts in train_tags:
    for t in ts:
        tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 2 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding
tag2index['-OOV-'] = 1  # The special value used for OOVs

def tag_to_index(tag):
    if tag in tag2index:
        return tag2index[tag]
    else:
        return tag2index['-OOV-']

In [7]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag_to_index(t) for t in s])

for s in test_tags:
    test_tags_y.append([tag_to_index(t) for t in s])

print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[9897, 7075, 1464, 4207, 8312, 2043, 6184, 6852, 6118, 9823, 8929, 6469, 10075, 7075, 4306, 9839, 8222, 507, 6133, 7357, 6680, 2583, 1464, 4119, 5978, 196, 7018, 4628, 9762, 1667]
[1511, 2760, 5580, 2415, 8425, 3567, 1511, 5615, 3171, 1856, 719, 8961, 9625, 6800, 250, 1, 4050, 1667]
[42, 18, 29, 36, 44, 44, 44, 44, 12, 44, 44, 31, 44, 19, 45, 43, 20, 6, 7, 6, 20, 18, 29, 36, 2, 42, 6, 6, 20, 4]
[42, 20, 10, 30, 34, 17, 42, 5, 6, 20, 20, 11, 7, 35, 6, 6, 20, 4]


### Getting max length of sequence

In [8]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH) 

271


### Add PAD by using torch pad_sequence
Due to the limitation of Pytorch pad_sequence, we can't assign the max_length, the max_length is calculated by pad_sequence itself. You can try to add padding manually by using for loop.

In [0]:
import torch
new_train_sentences_X = [torch.from_numpy(np.array(l)) for l in train_sentences_X]
new_test_sentences_X = [torch.from_numpy(np.array(l)) for l in test_sentences_X]
new_train_tags_y = [torch.from_numpy(np.array(l)) for l in train_tags_y]
new_test_tags_y = [torch.from_numpy(np.array(l)) for l in test_tags_y]

In [0]:
from torch.nn.utils.rnn import pad_sequence
after_pad = pad_sequence(new_train_sentences_X+new_test_sentences_X+new_train_tags_y+new_test_tags_y,batch_first=True)
train_sentences_X_pad = after_pad[:len(new_train_sentences_X)]
test_sentences_X_pad = after_pad[len(new_train_sentences_X):len(new_train_sentences_X)+len(new_test_sentences_X)]
train_tags_y_pad = after_pad[len(new_train_sentences_X)+len(new_test_sentences_X):-len(new_test_tags_y)]
test_tags_y_pad = after_pad[-len(new_test_tags_y):]

### Build Dataset and Dataloader for training data

In [0]:
#More detailed info about the TensorDataset, https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataset.html#TensorDataset
from torch.utils.data import TensorDataset
train_data = TensorDataset(train_sentences_X_pad, train_tags_y_pad)

batch_size = 128
#More detailed info about the dataLoader, https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataloader.html
from torch.utils.data import DataLoader
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=False)

## Pytorch Model (Bidirectional LSTM)

In [12]:
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,batch_first=True, bidirectional=True)  
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)


    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out) 
        tag_scores = F.log_softmax(tag_space, dim=-1)     
        return tag_scores


EMBEDDING_DIM = 128
HIDDEN_DIM = 256

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2index), len(tag2index)).cuda()
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


for epoch in range(40):  
    loss_now = 0.0
    acc = 0

    for sentence,targets in train_loader:
        sentence = sentence.cuda()
        targets = targets.cuda()

        model.zero_grad()
        model.train()       
        tag_scores = model(sentence)

        # loss = loss_function(tag_scores, targets)
        loss = loss_function(tag_scores.view(-1,tag_scores.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()

        loss_now+=loss.item()

        model.eval()
        tag_scores = model(sentence)
        _, predicted = torch.max(tag_scores, -1)
        prediction = predicted.view(-1).cpu().numpy()
        t = targets.view(-1).cpu().numpy()
        # Note: The training accuracy here is calculated with "PAD", which means most of pos tag will be "0".
        acc = acc+accuracy_score(prediction,t)*len(prediction) 
    print('Epoch: %d, training loss: %.4f, training acc: %.2f%%'%(epoch+1,loss_now,100*acc/len(train_sentences_X)/MAX_LENGTH))



Epoch: 1, training loss: 18.7865, training acc: 91.00%
Epoch: 2, training loss: 7.3675, training acc: 92.88%
Epoch: 3, training loss: 6.0638, training acc: 94.27%
Epoch: 4, training loss: 5.0721, training acc: 94.93%
Epoch: 5, training loss: 4.2304, training acc: 95.49%
Epoch: 6, training loss: 3.5371, training acc: 96.30%
Epoch: 7, training loss: 2.9649, training acc: 96.86%
Epoch: 8, training loss: 2.5145, training acc: 97.32%
Epoch: 9, training loss: 2.1576, training acc: 97.68%
Epoch: 10, training loss: 1.8668, training acc: 98.00%
Epoch: 11, training loss: 1.6254, training acc: 98.27%
Epoch: 12, training loss: 1.4224, training acc: 98.50%
Epoch: 13, training loss: 1.2502, training acc: 98.69%
Epoch: 14, training loss: 1.1030, training acc: 98.85%
Epoch: 15, training loss: 0.9765, training acc: 98.99%
Epoch: 16, training loss: 0.8672, training acc: 99.11%
Epoch: 17, training loss: 0.7721, training acc: 99.21%
Epoch: 18, training loss: 0.6890, training acc: 99.31%
Epoch: 19, trainin

## Test with the test set

In [13]:
model.eval()
sentence = test_sentences_X_pad.cuda()
tag_scores = model(sentence)
_, predicted = torch.max(tag_scores, -1)
predicted = predicted.cpu().numpy()

# cut off the PAD part
test_len_list = [len(s) for s in test_sentences_X]
actual_predicted_list= []
for i in range(predicted.shape[0]):
    actual_predicted_list+=list(predicted[i])[:test_len_list[i]]

# get actual tag list
actual_tags = sum(test_tags_y, [])

print('Test Accuracy: %.2f%%'%(accuracy_score(actual_predicted_list,actual_tags)*100))

Test Accuracy: 91.14%


# Exercise
In this exercise, you are to classify part-of-speech(pos) tags on user-defined sentences using the Bi-LSTM model trained right before the exercise. You should complete the below "Prediction Result to PoS Tags" section


## Testing with the sentence

### Preprocess and Predcition

In [0]:
test_samples = [
    word_tokenize("This race is awesome, I want to race too."),
    word_tokenize("That race is silly, I do not want to race.")
]

# Converting sentence (tokens) word to index
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)

# manually add PAD
test_samples_X_pad = []
for l in test_samples_X:
    test_samples_X_pad.append(l+[0]*(MAX_LENGTH-len(l)))

index2tag = {i: t for t, i in tag2index.items()}

model.eval()
sentence = torch.from_numpy(np.array(test_samples_X_pad)).cuda()
predictions = model(sentence)
_, predictions = torch.max(predictions, -1)
predictions = predictions.cpu().numpy()

### Prediction Result to PoS Tags [Complete this part]

In [21]:
#decode the result to have actual tags
def decode_result(predictions, test_samples_X, index2tag):
    token_sequences = []
    ## write your codes here
    # cut off the PAD part
    test_len_list = [len(s) for s in test_samples_X]
    actual_predicted_list= []
    for j in range(len(predictions)):
      token_sequences_temp = []
      actual_predicted_list.append(list(predictions[j])[:test_len_list[j]])
      for i in range(len(actual_predicted_list[j])):
         token_sequences_temp.append(sorted(index2tag.items())[actual_predicted_list[j][i]][1])
      token_sequences.append(token_sequences_temp)
    return token_sequences

print(test_samples)
print(decode_result(predictions, test_samples_X, index2tag))

[['This', 'race', 'is', 'awesome', ',', 'I', 'want', 'to', 'race', 'too', '.'], ['That', 'race', 'is', 'silly', ',', 'I', 'do', 'not', 'want', 'to', 'race', '.']]
[['DT', 'NN', 'VBZ', 'VBG', ',', 'PRP', 'VBP', 'TO', 'VB', 'RB', '.'], ['DT', 'NN', 'VBZ', 'VBG', ',', 'PRP', 'VBP', 'RB', 'VB', 'TO', 'CD', '.']]
