# POS tagging examples

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import brown

#POS tagger require tokenized text
text = word_tokenize("And now for something completely different")
print(text)

['And', 'now', 'for', 'something', 'completely', 'different']


In [2]:
#Use NLTK POS tagger
tags=nltk.pos_tag(text)

#print the list of tagged words (tuples)
print(tags)

#this should print something like:
#[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]


[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]


In [3]:
#Try some homonyms (same spelling, different meaning)
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
print(nltk.pos_tag(text))

#notice that 'refuse' is tagged both as verb (VBP) and as noun (NN) depending on its role on the sentence

[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]


In [4]:
#Tagged Tokens
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)

('fly', 'NN')


In [5]:
#List the tagset from PennTreeBank
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [6]:
#List the tagset from Brown
nltk.help.brown_tagset()

(: opening parenthesis
    (
): closing parenthesis
    )
*: negator
    not n't
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ? ; ! :
:: colon
    :
ABL: determiner/pronoun, pre-qualifier
    quite such rather
ABN: determiner/pronoun, pre-quantifier
    all half many nary
ABX: determiner/pronoun, double conjunction or pre-quantifier
    both
AP: determiner/pronoun, post-determiner
    many other next more last former little several enough most least only
    very few fewer past same Last latter less single plenty 'nough lesser
    certain various manye next-to-last particular final previous present
    nuf
AP$: determiner/pronoun, post-determiner, genitive
    other's
AP+AP: determiner/pronoun, post-determiner, hyphenated pair
    many-much
AT: article
    the an no a every th' ever' ye
BE: verb 'to be', infinitive or imperative
    be
BED: verb 'to be', past tense, 2nd person singular or all persons plural
    were
BED*: verb 'to be', past tense, 2nd person singular or 

In [7]:
#Check the different naming conventions for tags

#print tags using PennTreeBank tags
print(nltk.corpus.treebank.tagged_words())

#print tags using the Brown corpus
print(nltk.corpus.brown.tagged_words())

#print tags using the universal convention
print(nltk.corpus.brown.tagged_words(tagset='universal'))

[(u'Pierre', u'NNP'), (u'Vinken', u'NNP'), ...]
[(u'The', u'AT'), (u'Fulton', u'NP-TL'), ...]
[(u'The', u'DET'), (u'Fulton', u'NOUN'), ...]


#### The text.similar() method takes a word w, finds all contexts w1w w2 then finds all words w' that appear in the same context, i.e. w1w'w2

This is the rationale behind POS tags, identify words that have similar roles in a sentence

In [8]:
#build an NLTK Text object from all the words in the Brown corpus
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

#if we look for words in similar context than a noun we should obtain other nouns
#this could take a minute or two
text.similar('car')

man time way house room day world life hand head job work wife country
state place family word year moment


# Test different POS taggers

### Default tagger

In [9]:
#import sentences and tagged sentences from Brown corpus

#sents without tags
brown_sents = brown.sents(categories='news')

#sents with tags
brown_tagged_sents = brown.tagged_sents(categories='news')

In [10]:
#deault tagger
default_tagger = nltk.DefaultTagger('NN')

In [11]:
#apply the default tagger over a tokenized sentence
default_tagger.tag("This is just a test sentence".split())

[('This', 'NN'),
 ('is', 'NN'),
 ('just', 'NN'),
 ('a', 'NN'),
 ('test', 'NN'),
 ('sentence', 'NN')]

In [12]:
#evaluate the tagger, using the tagged sentences as gold standard
default_tagger.evaluate(brown_tagged_sents)

#we should expect low results

0.13089484257215028

### Regexp tagger

In [13]:
#define the set of matching patterns
patterns = [
     (r'.*ing$', 'VBG'),               # gerunds
     (r'.*ed$', 'VBD'),                # simple past
     (r'.*es$', 'VBZ'),                # 3rd singular present
     (r'.*ould$', 'MD'),               # modals
     (r'.*\'s$', 'NN$'),               # possessive nouns
     (r'.*s$', 'NNS'),                 # plural nouns
     (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'.*', 'NN')                     # nouns (default)
]

In [14]:
#create a regexp tagger using the pattenrs
regexp_tagger = nltk.RegexpTagger(patterns)

#use if in a Brwon sentence
print(brown_sents[3])
regexp_tagger.tag(brown_sents[3])

#this should tag many things as noun incorrectly

[u'``', u'Only', u'a', u'relative', u'handful', u'of', u'such', u'reports', u'was', u'received', u"''", u',', u'the', u'jury', u'said', u',', u'``', u'considering', u'the', u'widespread', u'interest', u'in', u'the', u'election', u',', u'the', u'number', u'of', u'voters', u'and', u'the', u'size', u'of', u'this', u'city', u"''", u'.']


[(u'``', 'NN'),
 (u'Only', 'NN'),
 (u'a', 'NN'),
 (u'relative', 'NN'),
 (u'handful', 'NN'),
 (u'of', 'NN'),
 (u'such', 'NN'),
 (u'reports', 'NNS'),
 (u'was', 'NNS'),
 (u'received', 'VBD'),
 (u"''", 'NN'),
 (u',', 'NN'),
 (u'the', 'NN'),
 (u'jury', 'NN'),
 (u'said', 'NN'),
 (u',', 'NN'),
 (u'``', 'NN'),
 (u'considering', 'VBG'),
 (u'the', 'NN'),
 (u'widespread', 'NN'),
 (u'interest', 'NN'),
 (u'in', 'NN'),
 (u'the', 'NN'),
 (u'election', 'NN'),
 (u',', 'NN'),
 (u'the', 'NN'),
 (u'number', 'NN'),
 (u'of', 'NN'),
 (u'voters', 'NNS'),
 (u'and', 'NN'),
 (u'the', 'NN'),
 (u'size', 'NN'),
 (u'of', 'NN'),
 (u'this', 'NNS'),
 (u'city', 'NN'),
 (u"''", 'NN'),
 (u'.', 'NN')]

In [15]:
#evaluate it using the same gold standard
regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245

### Lookup tagger

A lot of high-frequency words do not have the NN tag. Let's find the hundred most frequent words and store their most likely tag. We can then use this information as the model for a "lookup tagger" (an NLTK UnigramTagger):

In [16]:
# get the most frequent word from the news category
fd = nltk.FreqDist(brown.words(categories='news'))

# get the conditional freq of the tags for each word
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

#get the top 100 words
most_freq_words = fd.most_common(100)

#for each word, get the most used tag
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)

#train a unigram tagger with the top 100 words and categories
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

#tag a sentence
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)

# we should get tags only for highly common words(a, the, etc.)
# for the rest of the words, the tagger will fail (tag None)

[(u'``', u'``'),
 (u'Only', None),
 (u'a', u'AT'),
 (u'relative', None),
 (u'handful', None),
 (u'of', u'IN'),
 (u'such', None),
 (u'reports', None),
 (u'was', u'BEDZ'),
 (u'received', None),
 (u"''", u"''"),
 (u',', u','),
 (u'the', u'AT'),
 (u'jury', None),
 (u'said', u'VBD'),
 (u',', u','),
 (u'``', u'``'),
 (u'considering', None),
 (u'the', u'AT'),
 (u'widespread', None),
 (u'interest', None),
 (u'in', u'IN'),
 (u'the', u'AT'),
 (u'election', None),
 (u',', u','),
 (u'the', u'AT'),
 (u'number', None),
 (u'of', u'IN'),
 (u'voters', None),
 (u'and', u'CC'),
 (u'the', u'AT'),
 (u'size', None),
 (u'of', u'IN'),
 (u'this', u'DT'),
 (u'city', None),
 (u"''", u"''"),
 (u'.', u'.')]

In [17]:
#evaluate the tagger performance
baseline_tagger.evaluate(brown_tagged_sents)

0.45578495136941344

## N-grams taggers

### Unigram tagger

Sames as a lookup tagger

In [18]:
# create a tagger using the tagged sents from Brown
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

#tag one of the sentences from the corpus
unigram_tagger.tag(brown_sents[2007])

#it should be able to get all the tags

[(u'Various', u'JJ'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'apartments', u'NNS'),
 (u'are', u'BER'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'terrace', u'NN'),
 (u'type', u'NN'),
 (u',', u','),
 (u'being', u'BEG'),
 (u'on', u'IN'),
 (u'the', u'AT'),
 (u'ground', u'NN'),
 (u'floor', u'NN'),
 (u'so', u'QL'),
 (u'that', u'CS'),
 (u'entrance', u'NN'),
 (u'is', u'BEZ'),
 (u'direct', u'JJ'),
 (u'.', u'.')]

In [19]:
# evaluate the unigram tagger
unigram_tagger.evaluate(brown_tagged_sents)

# we should get a high score
# this is because we are evaluating it using the same data used in the trainig...

0.9349006503968017

#### Split training and testing 

In [20]:
size = int(len(brown_tagged_sents) * 0.9)

# 90% of the data for training
train_sents = brown_tagged_sents[:size]

# 10% of the data for testing
test_sents = brown_tagged_sents[size:]

#train using only the training data
unigram_tagger = nltk.UnigramTagger(train_sents)

#tag one of the sentences from the test sents
unigram_tagger.tag(brown_sents[-1])

# now we should get some None tags

[(u'This', u'DT'),
 (u'is', u'BEZ'),
 (u'one', u'CD'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'main', u'JJS'),
 (u'reasons', u'NNS'),
 (u'for', u'IN'),
 (u'National', u'JJ-TL'),
 (u'Library', u'NN-TL'),
 (u'Week', u'NN-TL'),
 (u',', u','),
 (u'April', u'NP'),
 (u'16-22', None),
 (u',', u','),
 (u'and', u'CC'),
 (u'for', u'IN'),
 (u'its', u'PP$'),
 (u'theme', u'NN'),
 (u':', u':'),
 (u'``', u'``'),
 (u'For', u'IN'),
 (u'a', u'AT'),
 (u'richer', None),
 (u',', u','),
 (u'fuller', None),
 (u'life', u'NN'),
 (u',', u','),
 (u'read', u'VBN'),
 (u"''", u"''"),
 (u'!', u'.'),
 (u'!', u'.')]

In [21]:
# evaluate
# the score now should be lower
unigram_tagger.evaluate(test_sents)

0.8120203329014253

### Bigram Taggers

In [22]:
#create a bigram tagger using the training data
bigram_tagger = nltk.BigramTagger(train_sents)

#tag a sentence belonging to the training data
#should work perfectly
bigram_tagger.tag(brown_sents[2007])

[(u'Various', u'JJ'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'apartments', u'NNS'),
 (u'are', u'BER'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'terrace', u'NN'),
 (u'type', u'NN'),
 (u',', u','),
 (u'being', u'BEG'),
 (u'on', u'IN'),
 (u'the', u'AT'),
 (u'ground', u'NN'),
 (u'floor', u'NN'),
 (u'so', u'CS'),
 (u'that', u'CS'),
 (u'entrance', u'NN'),
 (u'is', u'BEZ'),
 (u'direct', u'JJ'),
 (u'.', u'.')]

In [23]:
#tag a sentence outside the training data
#some None tags should appear
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

#once we reach a None tag, everything that comes after fails

[(u'The', u'AT'),
 (u'population', u'NN'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'Congo', u'NP'),
 (u'is', u'BEZ'),
 (u'13.5', None),
 (u'million', None),
 (u',', None),
 (u'divided', None),
 (u'into', None),
 (u'at', None),
 (u'least', None),
 (u'seven', None),
 (u'major', None),
 (u'``', None),
 (u'culture', None),
 (u'clusters', None),
 (u"''", None),
 (u'and', None),
 (u'innumerable', None),
 (u'tribes', None),
 (u'speaking', None),
 (u'400', None),
 (u'separate', None),
 (u'dialects', None),
 (u'.', None)]

In [24]:
# evaluate the bigram tagger
bigram_tagger.evaluate(test_sents)

0.10276088906608193

## Combining taggers

In [25]:
#by default everything is a noun
#new and unknown words are usually nouns
t0 = nltk.DefaultTagger('NN')

#Create a Unigram tagger
#if it fails (None tag), it will try to use the DefaultTagger
t1 = nltk.UnigramTagger(train_sents, backoff=t0)

#Create a Bigram tagger
#if it fails (None tag), it will try to use the UnigramTagger
t2 = nltk.BigramTagger(train_sents, backoff=t1)

#Tag an unseen sentence
t2.tag(unseen_sent)

[(u'The', u'AT'),
 (u'population', 'NN'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'Congo', u'NP'),
 (u'is', u'BEZ'),
 (u'13.5', 'NN'),
 (u'million', u'CD'),
 (u',', u','),
 (u'divided', u'VBN'),
 (u'into', u'IN'),
 (u'at', u'IN'),
 (u'least', u'AP'),
 (u'seven', u'CD'),
 (u'major', u'JJ'),
 (u'``', u'``'),
 (u'culture', 'NN'),
 (u'clusters', u'NNS'),
 (u"''", u"''"),
 (u'and', u'CC'),
 (u'innumerable', 'NN'),
 (u'tribes', 'NN'),
 (u'speaking', u'VBG'),
 (u'400', u'CD'),
 (u'separate', u'JJ'),
 (u'dialects', 'NN'),
 (u'.', u'.')]

In [26]:
#Evaluate our chained tagger
t2.evaluate(test_sents)

#results should be high, having a balance between precision and recall

0.844911791089405

## Transformation-Based Tagging

In [27]:
from nltk.tbl import demo as brill_tagger
brill_tagger.demo()

# Depending on your NLTK version, you might need to use the follwoing instead
#nltk.tag.brill.demo()

Loading tagged data from treebank... 
Read testing data (200 sents/5251 wds)
Read training data (800 sents/19933 wds)
Read baseline data (800 sents/19933 wds) [reused the training set]
Trained baseline tagger
    Accuracy on test set: 0.8345
Training tbl tagger...
TBL train (fast) (seqs: 800; tokens: 19933; tpls: 24; min score: 3; min acc: None)
Finding initial useful rules...
    Found 12960 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
  23  23   0   0  | POS->VBZ if Pos:PRP@[-2,-1]
  16  17   1   0  | NN->VB if Pos:-NONE-@[-2] & Pos:TO@[-1]
  15  16   1   0  | VBN->VBD if Pos:PRP@[-1]
  12  12   0   0  | VBP->VB if Pos:MD@[-2,-1]
