# POS

In [12]:
import nltk
from nltk import word_tokenize
from nltk.corpus import brown

from nltk.tag import (
    UnigramTagger,
    BigramTagger,
    TrigramTagger
)

In [5]:
# Tag a sample sentence with NLTK's default English POS tagger.
s ="This is belonging to her"
nltk.pos_tag(word_tokenize(s))
# NOTE: recent NLTK releases (3.1+) back pos_tag with the averaged perceptron
# tagger; only older releases used the Maximum Entropy (MaxEnt) tagger.
# TODO confirm against the installed NLTK version.

[('This', 'DT'),
 ('is', 'VBZ'),
 ('belonging', 'VBG'),
 ('to', 'TO'),
 ('her', 'PRP$')]

In [6]:
# Print the Penn Treebank tagset entry for PRP$, the tag assigned to "her" above.
nltk.help.upenn_tagset("PRP$")

PRP$: pronoun, possessive
    her his mine my our ours their thy your


In [8]:
# Count how often each POS tag occurs in the "news" category of the Brown corpus.
tags = [pos for _word, pos in brown.tagged_words(categories="news")]
fd = nltk.FreqDist(tags)

In [10]:
# Show the four most frequent tags together with their counts.
fd.most_common(4)

[('NN', 13162), ('IN', 10616), ('AT', 8893), ('NP', 6866)]

In [11]:
# Baseline: tag every token as NN, the most frequent tag in the counts above.
# The resulting accuracy is therefore just the share of tokens whose gold tag is NN.
brown_tagged_sents = brown.tagged_sents(categories="news")
default_tagger = nltk.DefaultTagger("NN")
default_tagger.accuracy(brown_tagged_sents)

0.13089484257215028

#  SequentialBackoffTagger

1. SequentialBackoffTagger is the base class for both "DefaultTagger" and "NgramTagger".
2. This helps when a tagger does not know how to tag a token: it can "ask the backoff tagger to do the work."
3. Often "DefaultTagger" is used as the backoff tagger.

# NGram tagger

In [13]:
# Train on the first 90% of the tagged sentences; hold out the last 10% for evaluation.
size = int(0.9 * len(brown_tagged_sents))
train_data, test_data = brown_tagged_sents[:size], brown_tagged_sents[size:]

In [14]:
# Train a unigram tagger on the training split; it defers to default_tagger
# (always NN) through the backoff chain. Score it on the held-out split.
unigram_tagger = UnigramTagger(
    train=train_data,
    backoff=default_tagger,
)
unigram_tagger.accuracy(test_data)

0.8361407355726104

In [15]:
# Train a bigram tagger on the same split, backing off to the unigram tagger
# (which in turn backs off to the NN default). Score it on the held-out split.
bigram_tagger = BigramTagger(
    train=train_data,
    backoff=unigram_tagger,
)
bigram_tagger.accuracy(test_data)

0.8452108043456593

In [16]:
# Train a trigram tagger, backing off to the bigram tagger and its chain.
# Note: in the recorded output this scores slightly below the bigram tagger.
trigram_tagger = TrigramTagger(
    train=train_data,
    backoff=bigram_tagger,
)
trigram_tagger.accuracy(test_data)

0.843317053722715

# Hidden Markov Model

In [3]:
from nltk.corpus import treebank
import nltk

In [11]:
# Load the (word, tag) pairs from NLTK's treebank corpus.
wsj = treebank.tagged_words()

In [29]:
# Tag frequencies for the treebank data.
# NOTE: this rebinds `tags`, which earlier held the Brown "news" tags.
tags = [pos for _word, pos in wsj]
wsj_tags = nltk.FreqDist(tags)

In [13]:
# Total number of tokens tagged MD.
wsj_tags['MD']

927

In [14]:
# Conditional frequency distribution over the (word, tag) pairs:
# each word is a condition, and its samples are the tags it occurs with.
wsj_cdf = nltk.ConditionalFreqDist(wsj)

In [16]:
# Tabulate the tag counts recorded for the word "will".
wsj_cdf.tabulate(conditions=["will"])

      MD  NN 
will 280   1 


In [19]:
# Exploratory: list the attributes and methods available on the ConditionalFreqDist.
dir(wsj_cdf)

['N',
 '__add__',
 '__and__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__missing__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__weakref__',
 'clear',
 'conditions',
 'copy',
 'deepcopy',
 'default_factory',
 'fromkeys',
 'get',
 'items',
 'keys',
 'plot',
 'pop',
 'popitem',
 'setdefault',
 'tabulate',
 'update',
 'values']

In [21]:
# Tag counts for the word "will", returned as a FreqDist.
wsj_cdf["will"]

FreqDist({'MD': 280, 'NN': 1})

In [23]:
# Count of MD tokens again: the denominator of the ratios computed in the cells below.
wsj_tags["MD"]

927

In [27]:
# Emission estimate P(word = "will" | tag = MD) = count("will" tagged MD) / count(MD).
# Read both counts from the distributions instead of hand-copying them, so the
# ratio stays in sync with the data when the notebook is re-run.
wsj_cdf["will"]["MD"] / wsj_tags["MD"]

0.30204962243797195

In [37]:
# nltk.bigrams returns a one-shot generator; materialise it as a list so that
# cells consuming bigram_tags can be re-run without silently seeing an empty
# sequence.
# NOTE(review): `tags` is one flat stream of corpus tags, so the pairs also
# span sentence boundaries.
bigram_tags = list(nltk.bigrams(tags))

In [38]:
# Tag-transition counts: the condition is a tag, the samples are the tags that
# immediately follow it in the bigram pairs.
wsj_trans = nltk.ConditionalFreqDist(bigram_tags)

In [40]:
# Number of times an MD tag is immediately followed by a VB tag.
wsj_trans["MD"]["VB"]

756

In [41]:
# Transition estimate P(next tag = VB | tag = MD) = count(MD followed by VB) / count(MD).
# Read both counts from the distributions instead of hand-copying them, so the
# ratio stays in sync with the data when the notebook is re-run.
wsj_trans["MD"]["VB"] / wsj_tags["MD"]

0.8155339805825242