# POS Tagging

In [202]:
import nltk
from nltk.corpus import brown
import matplotlib.pyplot as plt
%matplotlib inline
from nltk import pos_tag

In [2]:
text = 'Mohammad loves UAE, and he want to become the greatest data scientist.'

In [3]:
text_tokens = nltk.word_tokenize(text)

In [4]:
nltk.pos_tag(text_tokens)

[('Mohammad', 'NNP'),
 ('loves', 'VBZ'),
 ('UAE', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('he', 'PRP'),
 ('want', 'VBP'),
 ('to', 'TO'),
 ('become', 'VB'),
 ('the', 'DT'),
 ('greatest', 'JJS'),
 ('data', 'NNS'),
 ('scientist', 'NN'),
 ('.', '.')]

In [None]:
nltk.help.upenn_tagset('NNS')

In [None]:
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

In [None]:
text = nltk.Text(word.lower() for word in brown.words())

In [None]:
text.similar('over')

In [None]:
text.similar('sex')

### Representing Tagged Tokens

In [None]:
tagged_tokens = nltk.tag.str2tuple('fly/NN')
tagged_tokens

In [None]:
tagged_tokens[0]

In [None]:
nltk.corpus.brown.tagged_words()

In [None]:
nltk.corpus.treebank.tagged_words()[:10]

In [None]:
    nltk.corpus.treebank.tagged_words(tagset='universal')

In [None]:
brown.readme()

In [None]:
print(brown.tagged_sents()[0])

### tagging

In [None]:
brown_news_tagging = brown.tagged_words(categories='news') 
tag_fd = nltk.FreqDist([tag for (word,tag) in brown_news_tagging])

In [None]:
tag_fd

In [None]:
plt.figure(figsize=(12,6))
tag_fd.plot(cumulative=True)

In [None]:
word_tag_pairs = nltk.bigrams(brown_news_tagging)
li = list(word_tag_pairs)

In [None]:
li[0][1][1]

In [None]:
print(list(nltk.FreqDist(a[1] for (a,b) in li if b[1]=='NP')))

### verbs

In [None]:
wsj = nltk.corpus.treebank.tagged_words()
wsj

In [None]:
fd = nltk.FreqDist([w for (w,tag) in wsj if tag.startswith('V')])

In [None]:
fd.most_common(7)

In [None]:
cfd1 = nltk.ConditionalFreqDist(wsj)

In [None]:
cfd1['can']

In [None]:
cfd1['cut']

In [None]:
cfd2 = nltk.ConditionalFreqDist((tag,word) for (word,tag) in wsj)

In [None]:
cfd2['NNP']

In [None]:
past_participle = list(cfd2['VBN'])

In [None]:
wsj_bigrams = list(nltk.bigrams(wsj))
wsj_bigrams[:5]

In [None]:
[a for (a,b) in wsj_bigrams if b[0] in past_participle ][:10]

### Adjectives and Adverbs

In [5]:
brown_learned = brown.words(categories='learned')
brown_learned

['1', '.', 'Introduction', 'It', 'has', 'recently', ...]

In [6]:
sorted(set(b for (a,b) in nltk.bigrams(brown_learned) if a=='often'))[:7]

[',', '.', 'accomplished', 'analytically', 'appear', 'apt', 'associated']

In [7]:
brown_learned_tagged = brown.tagged_words(categories='learned')
tags = [b[1] for (a,b) in nltk.bigrams(brown_learned_tagged) if a[0]=='often']

In [8]:
fd = nltk.FreqDist(tags)
fd.tabulate()

VBN  VB VBD  JJ  IN  QL   ,  CS  RB  AP VBG  RP VBZ QLP BEN WRB   .  TO  HV 
 15  10   8   5   4   3   3   3   3   1   1   1   1   1   1   1   1   1   1 


In [12]:
def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print(w1,w2,w3)

In [13]:
for tagged_sent in brown.tagged_sents(categories='romance'):
        process(tagged_sent)      

cared to see
time to pay
determined to hold
beginning to fold
wanted to smoke
seem to get
trying to find
like to keep
seem to snap
like to think
beginning to find
beginning to look
going to last
going to prove
hoped to die
gone to live
stayed to get
turned to go
going to see
going to laugh
tried to bite
seem to rise
come to see
got to know
seem to take
beginning to creep
seemed to rain
like to hear
come to make
started to move
bent to pick
permitted to operate
beginning to get
seemed to think
tried to make
wanted to present
expected to stay
wish to start
got to run
like to talk
disappointed to find
tried to reason
trying to close
want to help
surprised to see
trying to find
neglected to play
wanted to call
like to offer
want to say
wished to see
overheard to say
like to get
expected to perform
going to bring
seek to storm
used to defend
shocked to find
hesitate to speak
beginning to study
grow to devote
wish to turn
going to fail
wished to change
wanted to take
wanted to bring
like to 

In [9]:
brown_news_tagged = brown.tagged_words(categories='news')
data = nltk.ConditionalFreqDist((word.lower(),tag)
                               for (word,tag) in brown_news_tagged)

In [32]:
li=[]
for word in data.conditions():
    if len(data[word])>3:
        tags = data[word].keys()
        s=(word,' '.join(tags))
        li.append(s)
li[:7]

[('no', 'AT RB AT-HL AT-TL'),
 ('that', 'CS WPS DT QL WPO'),
 ('place', 'NN VB NN-TL NP'),
 ('in', 'IN RP IN-HL IN-TL'),
 ('for', 'IN IN-TL IN-HL CS RB'),
 ('by', 'IN IN-HL IN-TL RB'),
 ('to', 'TO IN IN-HL TO-HL IN-TL TO-TL NPS')]

### Mapping Words to Properties Using Python Dictionaries

In [22]:
pos = {}
pos

{}

In [23]:
pos['colorless']='ADJ'
pos['ideas']='N'
pos['sleep']='V'
pos['furiously']='ADJ'
pos

{'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADJ'}

In [24]:
pos['ideas']

'N'

In [25]:
pos['green']

KeyError: 'green'

In [26]:
print(list(pos))
print(sorted(pos))

['colorless', 'ideas', 'sleep', 'furiously']
['colorless', 'furiously', 'ideas', 'sleep']


In [27]:
[w for w in pos if w.endswith('s')]

['colorless', 'ideas']

In [28]:
for word in pos:
    print(word,':',pos[word])

colorless : ADJ
ideas : N
sleep : V
furiously : ADJ


In [29]:
print(pos.keys())
print(pos.values())
print(pos.items())

dict_keys(['colorless', 'ideas', 'sleep', 'furiously'])
dict_values(['ADJ', 'N', 'V', 'ADJ'])
dict_items([('colorless', 'ADJ'), ('ideas', 'N'), ('sleep', 'V'), ('furiously', 'ADJ')])


In [32]:
print('before modifying the dict',pos)
pos['sleep']=['N','V']
print('After',pos)

before modifying the dict {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['N', 'V'], 'furiously': 'ADJ'}
After {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['N', 'V'], 'furiously': 'ADJ'}


#### Defining Dictionaries

In [39]:
import nltk
pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')

In [40]:
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}

In [46]:
frequency = nltk.defaultdict(int)
frequency['horse']=14

In [49]:
frequency['dog']
frequency
# default vale is zero

defaultdict(int, {'horse': 14, 'dog': 0})

In [51]:
dict_list = nltk.defaultdict(list)
dict_list['components']=['ready']
dict_list['materials']
dict_list

defaultdict(list, {'components': ['ready'], 'materials': []})

#### specifying a certain value

In [53]:
pos = nltk.defaultdict(lambda: 'N')
pos['wnet']=['V']
pos['the']=['DET']
pos['computer']
pos

defaultdict(<function __main__.<lambda>()>,
            {'wnet': ['V'], 'the': ['DET'], 'computer': 'N'})

In [60]:
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = list(vocab)[:1000]
mapping = nltk.defaultdict(lambda:'UNK')

In [71]:
for word in v1000:
    mapping[word]=word
list(mapping)[10:15]

['of', 'said', ",'", 'Alice', 'in']

In [79]:
alice2 = [mapping[v] for v in alice]
print(alice2[:40])

['[', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'by', 'UNK', 'UNK', 'UNK', 'UNK', 'CHAPTER', 'I', '.', 'Down', 'the', 'Rabbit', '-', 'UNK', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing']


#### Incrementally Updating a Dictionary

In [80]:
counts = nltk.defaultdict(int)
from nltk.corpus import brown

In [86]:
for (word,tag) in brown.tagged_words(categories='news'):
    counts[tag] = counts[tag]+1

In [91]:
counts['NN']

26324

In [96]:
from operator import itemgetter
sorted(counts.items(), key=itemgetter(1), reverse=True)[:5]

[('NN', 26324), ('IN', 21232), ('AT', 17786), ('NP', 13732), (',', 10266)]

In [97]:
s='Mohammad'
itemgetter(1)(s)

'o'

In [100]:
last_letters = nltk.defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    key = word[-2:]
    last_letters[key].append(word)
    

In [104]:
last_letters['ly'][:5]

['abactinally', 'abandonedly', 'abasedly', 'abashedly', 'abashlessly']

In [105]:
sorted('ifsif')

['f', 'f', 'i', 'i', 's']

In [106]:
anagrams = nltk.defaultdict(list)
for word in words:
    key = ''.join(sorted(word))
    anagrams[key].append(word)

In [114]:
anagrams['ent']

['net', 'ten']

#### Inverting a Dictionary

In [134]:
pos = {'colorful': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV','walk':'V'}
pos2 = nltk.defaultdict(list)
for (key,value) in pos.items():
    pos2[value].append(key)
    

In [135]:
pos2

defaultdict(list,
            {'ADJ': ['colorful'],
             'N': ['ideas'],
             'V': ['sleep', 'walk'],
             'ADV': ['furiously']})

### Automatic Tagging

In [206]:
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
print(brown_tagged_sents[0])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


In [207]:
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
fd= nltk.FreqDist(tags)
print(fd.most_common(5))
fd.max()

[('NN', 13162), ('IN', 10616), ('AT', 8893), ('NP', 6866), (',', 5133)]


'NN'

In [208]:
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.tag(tokens))

[('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('green', 'NN'), ('eggs', 'NN'), ('and', 'NN'), ('ham', 'NN'), (',', 'NN'), ('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('them', 'NN'), ('Sam', 'NN'), ('I', 'NN'), ('am', 'NN'), ('!', 'NN')]


In [209]:
default_tagger.evaluate(brown_tagged_sents)
# number of nouns in news categories divide by number of all tags inside news one

0.13089484257215028

### The Regular Expression Tagger

In [210]:
patterns = [ (r'.*ing$', 'VBG'), # gerunds 
             (r'.*ed$', 'VBD'), # simple past
             (r'.*es$', 'VBZ'), # 3rd singular present
             (r'.*ould$', 'MD'), # modals
             (r'.*\'s$', 'NN$'), # possessive nouns
             (r'.*s$', 'NNS'), # plural nouns
             (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
             (r'.*', 'NN') # nouns (default)
]

In [211]:
regexp_tagger  = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown_sents[0]))

[('The', 'NN'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'NN'), ('investigation', 'NN'), ('of', 'NN'), ("Atlanta's", 'NN$'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'), ('that', 'NN'), ('any', 'NN'), ('irregularities', 'VBZ'), ('took', 'NN'), ('place', 'NN'), ('.', 'NN')]


In [212]:
regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245

#### The Lookup Tagger

In [318]:
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news',tagset='universal'))

In [319]:
most_freq = list(fd.keys())[:100]
likely_tags = dict((word,cfd[word].max()) for word in most_freq)

In [320]:
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)

0.03989895976291346

In [321]:
sent = brown.sents(categories='news')[5]

In [322]:
print(baseline_tagger.tag(sent))

[('It', 'PRON'), ('recommended', 'VERB'), ('that', 'ADP'), ('Fulton', 'NOUN'), ('legislators', 'NOUN'), ('act', 'NOUN'), ('``', '.'), ('to', 'PRT'), ('have', 'VERB'), ('these', None), ('laws', 'NOUN'), ('studied', None), ('and', 'CONJ'), ('revised', None), ('to', 'PRT'), ('the', 'DET'), ('end', None), ('of', 'ADP'), ('modernizing', None), ('and', 'CONJ'), ('improving', None), ('them', None), ("''", '.'), ('.', '.')]


In [323]:
baseline_tagger = nltk.UnigramTagger(model=likely_tags , backoff=nltk.DefaultTagger('NN'))
# make default tag is NN

In [324]:
sent = brown.sents(categories='news')[5]
print(baseline_tagger.tag(sent))

[('It', 'PRON'), ('recommended', 'VERB'), ('that', 'ADP'), ('Fulton', 'NOUN'), ('legislators', 'NOUN'), ('act', 'NOUN'), ('``', '.'), ('to', 'PRT'), ('have', 'VERB'), ('these', 'NN'), ('laws', 'NOUN'), ('studied', 'NN'), ('and', 'CONJ'), ('revised', 'NN'), ('to', 'PRT'), ('the', 'DET'), ('end', 'NN'), ('of', 'ADP'), ('modernizing', 'NN'), ('and', 'CONJ'), ('improving', 'NN'), ('them', 'NN'), ("''", '.'), ('.', '.')]


## N-Gram Tagging

#### unigram

In [362]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents  = brown.sents(categories='news')

In [363]:
uni_tag = nltk.UnigramTagger(brown_tagged_sents)

In [364]:
print(uni_tag.tag(brown_sents[20]))

[('The', 'AT'), ('jury', 'NN'), ('said', 'VBD'), ('it', 'PPS'), ('found', 'VBD'), ('the', 'AT'), ('court', 'NN'), ('``', '``'), ('has', 'HVZ'), ('incorporated', 'VBN'), ('into', 'IN'), ('its', 'PP$'), ('operating', 'VBG'), ('procedures', 'NNS'), ('the', 'AT'), ('recommendations', 'NNS'), ("''", "''"), ('of', 'IN'), ('two', 'CD'), ('previous', 'JJ'), ('grand', 'JJ'), ('juries', 'NNS'), (',', ','), ('the', 'AT'), ('Atlanta', 'NP'), ('Bar', 'NN-TL'), ('Association', 'NN-TL'), ('and', 'CC'), ('an', 'AT'), ('interim', 'NN'), ('citizens', 'NNS'), ('committee', 'NN'), ('.', '.')]


In [349]:
size = int(len(brown_tagged_sents)*0.9)
size

4160

In [367]:
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
uni_tag = nltk.UnigramTagger(train_sents)

In [375]:
uni_tag.evaluate(test_sents)

0.8121200039868434

#### bigrams

In [376]:
bigram_tagger = nltk.BigramTagger(train_sents)

In [383]:
print(bigram_tagger.tag(brown_sents[2007]))

[('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ('apartments', 'NNS'), ('are', 'BER'), ('of', 'IN'), ('the', 'AT'), ('terrace', 'NN'), ('type', 'NN'), (',', ','), ('being', 'BEG'), ('on', 'IN'), ('the', 'AT'), ('ground', 'NN'), ('floor', 'NN'), ('so', 'CS'), ('that', 'CS'), ('entrance', 'NN'), ('is', 'BEZ'), ('direct', 'JJ'), ('.', '.')]


In [380]:
unseen_sent = brown_sents[4203]

In [382]:
print(bigram_tagger.tag(unseen_sent))

[('The', 'AT'), ('population', 'NN'), ('of', 'IN'), ('the', 'AT'), ('Congo', 'NP'), ('is', 'BEZ'), ('13.5', None), ('million', None), (',', None), ('divided', None), ('into', None), ('at', None), ('least', None), ('seven', None), ('major', None), ('``', None), ('culture', None), ('clusters', None), ("''", None), ('and', None), ('innumerable', None), ('tribes', None), ('speaking', None), ('400', None), ('separate', None), ('dialects', None), ('.', None)]


In [384]:
bigram_tagger.evaluate(test_sents)

0.10206319146815508

In [388]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents,backoff=t0)
t2 = nltk.BigramTagger(train_sents,backoff=t1)
t3 = nltk.TrigramTagger(train_sents,backoff=t2)
t3.evaluate(test_sents)

0.843317053722715

In [389]:
t2.evaluate(test_sents)

0.8452108043456593

## Storing Taggers

In [394]:
from pickle import dump
output = open('t2.pkl','wb')

In [395]:
dump(t2,output,-1)

In [396]:
output.close()

In [397]:
from pickle import load
input = open('t2.pkl','rb')
tagger = load(input)
input.close()

In [402]:
text = """The board's action shows what free enterprise is up against in our complex maze of regulatory laws ."""
tokens = text.split(' ')

In [405]:
print(tagger.tag(tokens))

[('The', 'AT'), ("board's", 'NN$'), ('action', 'NN'), ('shows', 'NNS'), ('what', 'WDT'), ('free', 'JJ'), ('enterprise', 'NN'), ('is', 'BEZ'), ('up', 'RP'), ('against', 'IN'), ('in', 'IN'), ('our', 'PP$'), ('complex', 'JJ'), ('maze', 'NN'), ('of', 'IN'), ('regulatory', 'NN'), ('laws', 'NNS'), ('.', '.')]


In [408]:
tri=(nltk.trigrams(tokens))

#### confusion matrix.

In [423]:
test_tags = [tag for sent in brown.sents(categories='editorial') for (word, tag) in t2.tag(sent)]

In [427]:
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]

In [434]:
print(nltk.ConfusionMatrix(gold_tags, test_tags))

           |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

##  Automatic Tagging

In [436]:
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'

In [437]:
tokens = nltk.word_tokenize(raw)
print(tokens)

['I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', ',', 'I', 'do', 'not', 'like', 'them', 'Sam', 'I', 'am', '!']


In [468]:
default_tagger = nltk.DefaultTagger('NN')

In [469]:
default_tagger.tag(tokens)
print(default_tagger.tag(tokens))

[('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('green', 'NN'), ('eggs', 'NN'), ('and', 'NN'), ('ham', 'NN'), (',', 'NN'), ('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('them', 'NN'), ('Sam', 'NN'), ('I', 'NN'), ('am', 'NN'), ('!', 'NN')]


In [470]:
default_tagger.evaluate(brown_tagged_sents)


0.13089484257215028