# Part-of-Speech Tagging

- using LSTM
- Very simple example to understand concept

# 1)- Importing key modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
! pip install jdc



In [0]:
# For data processing and maths
import numpy as np
import pandas as pd
import time
import math
import os
import jdc
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [0]:
# for deep learning and neural network
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
! pip install version_information



In [6]:
# first install: pip install version_information
%reload_ext version_information
%version_information pandas,torch,numpy,seaborn, matplotlib

Software,Version
Python,3.6.8 64bit [GCC 8.0.1 20180414 (experimental) [trunk revision 259383]
IPython,5.5.0
OS,Linux 4.14.137+ x86_64 with Ubuntu 18.04 bionic
pandas,0.24.2
torch,1.3.0+cu100
numpy,1.16.5
seaborn,0.9.0
matplotlib,3.0.3
Mon Oct 21 17:50:37 2019 UTC,Mon Oct 21 17:50:37 2019 UTC


# 2)- Example data & its preprocessing

In [0]:
training_data = [
    ("The dog ate the apple.".split(), ["Determiner", "Noun", "Verb", "Determiner", "Noun"]),
    ("Everybody read that book.".split(), ["Noun", "Verb", "Determiner", "Noun"])
]
training_sentences = [training_data[x][0] for x in range(len(training_data))]

In [8]:
training_sentences

[['The', 'dog', 'ate', 'the', 'apple.'],
 ['Everybody', 'read', 'that', 'book.']]

In [9]:
type(training_sentences)

list

### 2.a)-Clean the data

Make all words lower case and remove punctuation.

In [0]:
training_data_clean = []
for sentence, tags in training_data:
    clean_sentence = [x.lower().split('.')[0] for x in sentence]
    training_data_clean += [(clean_sentence, tags)]
training_sentences_clean = [training_data_clean[x][0] for x in range(len(training_data_clean))]

In [11]:
training_sentences_clean 

[['the', 'dog', 'ate', 'the', 'apple'], ['everybody', 'read', 'that', 'book']]

### 2.b)-Create vocabulary

Using all words in each sentence of the training data, create a vocabulary.

In [12]:
words = []
for sentence in training_sentences_clean:
    words += sentence
vocab = list(set(words))
print(vocab)

['everybody', 'that', 'the', 'ate', 'apple', 'dog', 'read', 'book']


In [13]:
len(vocab)

8

### 2.c)-Create mapping dictionaries

Using dictionaries to convert words to integers.

In [14]:
word_to_ix = {word: i for i, word in enumerate(vocab)}
print('word_to_ix: {}'.format(word_to_ix))

word_to_ix: {'everybody': 0, 'that': 1, 'the': 2, 'ate': 3, 'apple': 4, 'dog': 5, 'read': 6, 'book': 7}


### 2.d)-Map the parts-of-speech tags to integers

In [0]:
# Tags to integers
tag_to_ix = {"Determiner": 0, "Noun": 1, "Verb": 2}

In [0]:
# Integers to tags
ix_to_tag = {0: "Determiner", 1: "Noun", 2: "Verb"}

### 2.e)-Set Hyperparameters

In [0]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
LEARNING_RATE = 0.1
NUM_EPOCHS = 300

# 3)-Create the model

- LSTMTagger class.

Inherits nn.Module from PyTorch.

- Inputs:

Embedding dimension.<br>
Number of hidden dimensions.<br>
Vocabulary size.<br>
Tag set size.<br>

In [0]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # LSTM: Inputs are embeddings, outputs are hidden states
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Linear layer maps hidden space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

**function to initialize the hidden states**

In [0]:
%%add_to LSTMTagger
def init_hidden(self):
    """
    Initialize the hidden state. The axes correspond to (num_layers, minibatch_size, hidden_dim).
    """
    return (torch.zeros(1, 1, self.hidden_dim),
            torch.zeros(1, 1, self.hidden_dim))

**a function to make a forward pass through the recurrent LSTM network**


It will return the predict tag values given an input sentence.

In [0]:
%%add_to LSTMTagger
def forward(self, sentence):
    """
    Make a forward pass through the LSTM.
    
    :param sentence: The input sentence.
    :type sentence: list
    :return: A Tensor of tag scores.
    :rtype: Tensor
    """
    embeds = self.word_embeddings(sentence)
    lstm_out, self.hidden = self.lstm(
        embeds.view(len(sentence), 1, -1), self.hidden)
    tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
    tag_scores = F.log_softmax(tag_space, dim=1)
    return tag_scores

**Helper Function**

Map either words or tags to integers, using the previously defined dictionaries (tag_to_ix, ix_to_tag).

In [0]:
def prepare_sequence(seq, to_ix):
    """
    Convert words or tags to intigers and return a Pytorch tensor.
    :param seq: Sequence of words.
    :type seq: list
    :param to_ix: Dictionary mapping words or tags to intigers.
    :return: A Pytorch tensor of indices.
    :rtype: Tensor
    """
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# 4)-Build the Model

In [0]:
#LSTM Pytorch model using the hyperparameters defined above

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))

**Define loss function**

In [0]:
# we will be using a negative log likelihood function, which is useful in classification problems.

loss_function = nn.NLLLoss()

**optimizer**

using stochastic gradient descent.

In [0]:
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

### 4.a)- Model before training

Let's run the model before any training has been done and store the scores to a list. We will then compare these scores with the scores after training.

In [0]:
store_initial_probabilities = []
store_initial_predictions = []
with torch.no_grad():
    for sentence in training_sentences_clean:
        inputs = prepare_sequence(sentence, word_to_ix)
        tag_scores = model(inputs)
        tag_probabilities = tag_scores.exp()
        max_values, max_indices = torch.max(tag_probabilities, 1)
        initial_prediction = [ix_to_tag[x] for x in max_indices.numpy()]
        store_initial_predictions.append(initial_prediction)
        store_initial_probabilities.append(tag_probabilities)

### 4.b)-Train the model

In [0]:
for epoch in range(NUM_EPOCHS):
    for sentence, tags in training_data_clean:
        # Set gradients equal to zero after each intance
        model.zero_grad()
        
        # Initialize hidden state of LSTM after each intance
        model.hidden = model.init_hidden()
        
        # Turn inputs into tensors of word indices
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        
        # Run forward pass
        tag_scores = model(sentence_in)
        
        # Compute the loss, gradients, and update the parameters
        loss = loss_function(tag_scores, targets)
        
        # Perform backward pass
        loss.backward()
        
        # Update model parameters
        optimizer.step()

# 5)- Plot results

Both before training and after training

In [27]:
# Print out the scores after training the model
store_initial_probabilities.reverse()
store_initial_predictions.reverse()
with torch.no_grad():
    for sentence in training_sentences_clean:
        inputs = prepare_sequence(sentence, word_to_ix)
        tag_scores = model(inputs)
        tag_probabilities = tag_scores.exp()
        max_values, max_indices = torch.max(tag_probabilities, 1)
        predictions = [ix_to_tag[x] for x in max_indices.numpy()]
        
        print('Before training:')
        print(' - initial probabilities: {}'.format(store_initial_probabilities.pop()))
        print(' - sentence: {}'.format(' '.join(sentence)))
        print(' - predicition: {}'.format(store_initial_predictions.pop()))
        print('After training:')
        print(' - final probabilities: {}'.format(tag_probabilities))
        print(' - sentence: {}'.format(' '.join(sentence)))
        print(' - prediction: {}'.format(predictions))
        print('')

Before training:
 - initial probabilities: tensor([[0.4357, 0.2607, 0.3036],
        [0.4626, 0.2501, 0.2873],
        [0.4421, 0.2604, 0.2976],
        [0.4406, 0.2551, 0.3043],
        [0.4642, 0.2479, 0.2879]])
 - sentence: the dog ate the apple
 - predicition: ['Determiner', 'Determiner', 'Determiner', 'Determiner', 'Determiner']
After training:
 - final probabilities: tensor([[0.9231, 0.0709, 0.0060],
        [0.0210, 0.9477, 0.0313],
        [0.0254, 0.0205, 0.9541],
        [0.9766, 0.0180, 0.0054],
        [0.0166, 0.9769, 0.0065]])
 - sentence: the dog ate the apple
 - prediction: ['Determiner', 'Noun', 'Verb', 'Determiner', 'Noun']

Before training:
 - initial probabilities: tensor([[0.3742, 0.2619, 0.3639],
        [0.3972, 0.2580, 0.3447],
        [0.3657, 0.2808, 0.3535],
        [0.4013, 0.2654, 0.3333]])
 - sentence: everybody read that book
 - predicition: ['Determiner', 'Determiner', 'Determiner', 'Determiner']
After training:
 - final probabilities: tensor([[0.0036, 0

**Making more sense of score**

In [28]:
print('Let us take the 1st sentence of our dataset: {}'.format(' '.join(training_sentences[0])))
print('For the word "{}" the list of possible parts-of-speech are: {}'.format(training_sentences[0][0], [x for x in ix_to_tag.values()]))

Let us take the 1st sentence of our dataset: The dog ate the apple.
For the word "The" the list of possible parts-of-speech are: ['Determiner', 'Noun', 'Verb']


In [29]:
model.eval()

LSTMTagger(
  (word_embeddings): Embedding(8, 6)
  (lstm): LSTM(6, 6)
  (hidden2tag): Linear(in_features=6, out_features=3, bias=True)
)

In [30]:
inputs = prepare_sequence(training_sentences_clean[0], word_to_ix)
tag_scores = model(inputs)
tag_probabilities = tag_scores.exp()
max_values, max_indices = torch.max(tag_probabilities, 1)
predictions = [ix_to_tag[x] for x in max_indices.numpy()]
print('sentence: {}'.format(' '.join(training_sentences[0])))
print('parts-of-speach: {}'.format(predictions))

sentence: The dog ate the apple.
parts-of-speach: ['Determiner', 'Noun', 'Verb', 'Determiner', 'Noun']


**What about second sentence**

In [31]:
inputs = prepare_sequence(training_sentences_clean[1], word_to_ix)
tag_scores = model(inputs)
tag_probabilities = tag_scores.exp()
max_values, max_indices = torch.max(tag_probabilities, 1)
predictions = [ix_to_tag[x] for x in max_indices.numpy()]
print('sentence: {}'.format(' '.join(training_sentences[1])))
print('parts-of-speach: {}'.format(predictions))

sentence: Everybody read that book.
parts-of-speach: ['Noun', 'Verb', 'Determiner', 'Noun']
