In [1]:
import nltk
from nltk.corpus import treebank

train_data = treebank.tagged_sents()[:3500]
test_data = treebank.tagged_sents()[3500:]
print(train_data[0])
print(len(treebank.tagged_sents()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
3914


In [118]:
# nltk.download('help/tagsets/PY3/upenn_tagset.pickle')
# nltk.help.upenn_tagset()

In [114]:
from nltk.tag import hmm
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)
print(tagger)

<HiddenMarkovModelTagger 46 states and 11668 output symbols>


In [104]:
nodes = tagger._states
transitions = tagger._transitions_matrix()
words = tagger._symbols
priors = tagger._priors 
posteriors = tagger._outputs

triples = [[nodes[j], nodes[i], transitions[i][j]]  for i in range(len(nodes)) for j in range(len(nodes))]
triples.sort(key=lambda x: x[2], reverse=True)
print('Most probable transitions:')
for triple in triples[:20]:
    print('{} ---> {} (p = {:1.4f})'.format(*triple))

Most probable transitions:
SYM ---> NNP (p = 0.0000)
# ---> CD (p = 0.0000)
$ ---> CD (p = -0.0166)
PDT ---> DT (p = -0.2016)
. ---> '' (p = -0.2224)
WDT ---> -NONE- (p = -0.2321)
MD ---> VB (p = -0.2987)
WP ---> -NONE- (p = -0.3735)
RBS ---> JJ (p = -0.4436)
WP$ ---> NN (p = -0.7370)
EX ---> VBZ (p = -0.7655)
TO ---> VB (p = -0.7941)
VBN ---> -NONE- (p = -0.8513)
FW ---> VBZ (p = -1.0000)
DT ---> NN (p = -1.0906)
LS ---> -RRB- (p = -1.1155)
JJ ---> NN (p = -1.1763)
PRP$ ---> NN (p = -1.2082)
POS ---> NN (p = -1.3209)
NNP ---> NNP (p = -1.3766)


In [105]:
print('Most probable nouns:')
nouns = [[word, posteriors['NN'].prob(word)] for word in words if word.isalpha()]
nouns.sort(key=lambda x: x[1], reverse=True)
for noun in nouns[:20]:
    print('{} (p = {:1.4f})'.format(*noun))

Most probable nouns:
company (p = 0.0181)
year (p = 0.0159)
market (p = 0.0141)
trading (p = 0.0115)
stock (p = 0.0106)
program (p = 0.0102)
president (p = 0.0095)
share (p = 0.0079)
business (p = 0.0058)
government (p = 0.0057)
price (p = 0.0053)
index (p = 0.0051)
time (p = 0.0049)
money (p = 0.0045)
issue (p = 0.0045)
yesterday (p = 0.0044)
interest (p = 0.0043)
investment (p = 0.0043)
week (p = 0.0039)
number (p = 0.0036)


In [106]:
print('Most probable verbs:')
nouns = [[word, posteriors['VB'].prob(word)] for word in words if word.isalpha()]
nouns.sort(key=lambda x: x[1], reverse=True)
for noun in nouns[:20]:
    print('{} (p = {:1.4f})'.format(*noun))

Most probable nouns:
be (p = 0.1356)
have (p = 0.0410)
make (p = 0.0257)
buy (p = 0.0188)
take (p = 0.0157)
get (p = 0.0157)
help (p = 0.0140)
do (p = 0.0135)
sell (p = 0.0126)
yield (p = 0.0122)
pay (p = 0.0118)
see (p = 0.0100)
go (p = 0.0078)
say (p = 0.0078)
raise (p = 0.0074)
keep (p = 0.0061)
give (p = 0.0061)
become (p = 0.0057)
remain (p = 0.0052)
want (p = 0.0052)


In [112]:
print('Most probable POS tags for "talks":')
word = [[tag, posteriors[tag].prob('talks')] for tag in nodes]
word.sort(key=lambda x: x[1], reverse=True)
for w in word[:20]:
    if w[1] > 0:
        print('{} (p = {:1.4f})'.format(*w))

Most probable POS tags for "talks":
NNS (p = 0.0026)
VBZ (p = 0.0010)


In [99]:
print(tagger.tag("Today is a good day .".split()))

print(tagger.tag("Joe met Joanne in Delhi .".split()))

print(tagger.tag("Chicago is the birthplace of Ginny".split()))

print(tagger.tag("The chief talks".split()))

[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), ('.', '.')]
[('Joe', 'NNP'), ('met', 'VBD'), ('Joanne', 'NNP'), ('in', 'IN'), ('Delhi', 'NNP'), ('.', 'NNP')]
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('birthplace', 'NNP'), ('of', 'NNP'), ('Ginny', 'NNP')]
[('The', 'DT'), ('chief', 'NN'), ('talks', 'NNS')]


In [4]:
tagger.evaluate(test_data)

0.41872398102430053

In [100]:
from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)

0.8777229160615742

In [113]:
print(tnt_pos_tagger.tag("Today is a good day .".split()))

print(tnt_pos_tagger.tag("Joe met Joanne in Delhi .".split()))

print(tnt_pos_tagger.tag("Chicago is the birthplace of Ginny".split()))

print(tnt_pos_tagger.tag("The chief talks".split()))

[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), ('.', '.')]
[('Joe', 'NNP'), ('met', 'VBD'), ('Joanne', 'NNP'), ('in', 'IN'), ('Delhi', 'Unk'), ('.', '.')]
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('birthplace', 'Unk'), ('of', 'IN'), ('Ginny', 'Unk')]
[('The', 'DT'), ('chief', 'JJ'), ('talks', 'NNS')]
