<a href="https://colab.research.google.com/github/Muktargetu/AI/blob/main/POS_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk.corpus
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('universal_tagset')


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [2]:
treebank_corpus = nltk.corpus.treebank.tagged_sents(tagset='universal')
brown_corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
conll_corpus = nltk.corpus.conll2000.tagged_sents(tagset='universal')


In [3]:
tagged_sentences = treebank_corpus  + brown_corpus + conll_corpus

In [4]:
tagged_sentences[8]

[('We', 'PRON'),
 ("'re", 'VERB'),
 ('talking', 'VERB'),
 ('about', 'ADP'),
 ('years', 'NOUN'),
 ('ago', 'ADP'),
 ('before', 'ADP'),
 ('anyone', 'NOUN'),
 ('heard', 'VERB'),
 ('of', 'ADP'),
 ('asbestos', 'NOUN'),
 ('having', 'VERB'),
 ('any', 'DET'),
 ('questionable', 'ADJ'),
 ('properties', 'NOUN'),
 ('.', '.')]

In [5]:
X = [] # store input sequence
Y = [] # store output sequence
for sentence in tagged_sentences:
  X_sentence = []
  Y_sentence = []


In [6]:
  for entity in sentence:
    X_sentence.append(entity[0]) # entity[0] contains the word
    Y_sentence.append(entity[1]) # entity[1] contains corresponding tag

  X.append(X_sentence)
  Y.append(Y_sentence)


In [7]:
num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print("Total number of tagged sentences: {}".format(len(X)))
print("Vocabulary size: {}".format(num_words))
print("Total number of tags: {}".format(num_tags))


Total number of tagged sentences: 1
Vocabulary size: 25
Total number of tags: 8


In [8]:
print('sample X: ', X[0], '\n')
print('sample Y: ', Y[0], '\n')


sample X:  ['In', 'Los', 'Angeles', ',', 'for', 'example', ',', 'Central', 'has', 'had', 'a', 'strong', 'market', 'position', 'while', 'Unilab', "'s", 'presence', 'has', 'been', 'less', 'prominent', ',', 'according', 'to', 'Mr.', 'Harlow', '.'] 

sample Y:  ['ADP', 'NOUN', 'NOUN', '.', 'ADP', 'NOUN', '.', 'NOUN', 'VERB', 'VERB', 'DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'PRT', 'NOUN', 'VERB', 'VERB', 'ADV', 'ADJ', '.', 'VERB', 'PRT', 'NOUN', 'NOUN', '.'] 



In [9]:
print("Length of first input sequence : {}".format(len(X[0])))
print("Length of first output sequence : {}".format(len(Y[0])))


Length of first input sequence : 28
Length of first output sequence : 28


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

# encode X
word_tokenizer = Tokenizer()              # instantiate tokenizer
word_tokenizer.fit_on_texts(X)            # fit tokenizer on data

# use the tokenizer to encode input sequence
X_encoded = word_tokenizer.texts_to_sequences(X)

# encode Y
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(Y)
Y_encoded = tag_tokenizer.texts_to_sequences(Y)


In [11]:
print("** Raw data point **", "\n", "-"*100, "\n")
print('X: ', X[0], '\n')
print('Y: ', Y[0], '\n')
print()
print("** Encoded data point **", "\n", "-"*100, "\n")
print('X: ', X_encoded[0], '\n')
print('Y: ', Y_encoded[0], '\n')


** Raw data point ** 
 ---------------------------------------------------------------------------------------------------- 

X:  ['In', 'Los', 'Angeles', ',', 'for', 'example', ',', 'Central', 'has', 'had', 'a', 'strong', 'market', 'position', 'while', 'Unilab', "'s", 'presence', 'has', 'been', 'less', 'prominent', ',', 'according', 'to', 'Mr.', 'Harlow', '.'] 

Y:  ['ADP', 'NOUN', 'NOUN', '.', 'ADP', 'NOUN', '.', 'NOUN', 'VERB', 'VERB', 'DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'PRT', 'NOUN', 'VERB', 'VERB', 'ADV', 'ADJ', '.', 'VERB', 'PRT', 'NOUN', 'NOUN', '.'] 


** Encoded data point ** 
 ---------------------------------------------------------------------------------------------------- 

X:  [3, 4, 5, 1, 6, 7, 1, 8, 2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 2, 18, 19, 20, 1, 21, 22, 23, 24, 25] 

Y:  [4, 1, 1, 3, 4, 1, 3, 1, 2, 2, 7, 5, 1, 1, 4, 1, 6, 1, 2, 2, 8, 5, 3, 2, 6, 1, 1, 3] 



In [12]:
from tensorflow.keras.utils import pad_sequences

# Pad each sequence to MAX_SEQ_LENGTH using KERAS’ pad_sequences() function.
# Sentences longer than MAX_SEQ_LENGTH are truncated.
# Sentences shorter than MAX_SEQ_LENGTH are padded with zeroes.
# Truncation and padding can either be ‘pre’ or ‘post’.
# For padding we are using ‘pre’ padding type, i.e. add zeroes on the left side.
# For truncation, we are using ‘post’, i.e. truncate a sentence from right side.
# sequences greater than 100 in length will be truncated

MAX_SEQ_LENGTH = 100
X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH,
                                                  padding="pre", truncating="post")
Y_padded = pad_sequences(Y_encoded, maxlen=MAX_SEQ_LENGTH,
                                                   padding="pre", truncating="post")
# print the first sequence
print(X_padded[0], "\n"*3)
print(Y_padded[0])


[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  3  4  5  1  6  7  1  8  2  9 10 11 12 13 14 15 16 17  2 18 19 20  1 21
 22 23 24 25] 



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 1
 1 3 4 1 3 1 2 2 7 5 1 1 4 1 6 1 2 2 8 5 3 2 6 1 1 3]
