# Ejemplos POS Tagging

1.- Aprendizaje y evaluación de etiquetadores basados en unigramas y bigramas

In [1]:
import pprint
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank

In [2]:
talla = int(len(treebank.tagged_sents()) * 0.9)
train_sents = treebank.tagged_sents(tagset = 'universal')[:talla]

In [3]:
unigram_tagger = UnigramTagger(train_sents)

In [4]:
bigram_tagger = BigramTagger(train_sents)

In [5]:
test_sents = treebank.tagged_sents()[talla:]
okUni = round(unigram_tagger.evaluate(test_sents) * 100, 2);
okBi = round(bigram_tagger.evaluate(test_sents) * 100, 2);
print("Tasas de acierto:")
print("Unigramas: " + str(okUni) + "%")
print("Bigramas: " + str(okBi) + "%")

Tasas de acierto:
Unigramas: 3.91%
Bigramas: 0.11%


In [6]:
sent1 = treebank.sents()[talla+1]
print (sent1)
pprint.pprint(unigram_tagger.tag(sent1))
pprint.pprint(bigram_tagger.tag(sent1))

['In', 'early', 'trading', 'in', 'Tokyo', 'Thursday', ',', 'the', 'Nikkei', 'index', 'fell', '63.79', 'points', 'to', '35500.64', '.']
[('In', 'IN'),
 ('early', 'JJ'),
 ('trading', 'NN'),
 ('in', 'IN'),
 ('Tokyo', 'NNP'),
 ('Thursday', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'NN'),
 ('index', 'NN'),
 ('fell', 'VBD'),
 ('63.79', 'NN'),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35500.64', 'NN'),
 ('.', '.')]


NameError: name 'bigram_tagger' is not defined

2.- Definición y evaluación del etiquetador por defecto, usando la etiqueta más frecuente

In [20]:
import pprint
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.probability import FreqDist
tags = [tag for (word, tag) in treebank.tagged_words()]
masFTag = FreqDist(tags).max()
default_tagger = DefaultTagger(masFTag)

In [21]:
test_sents = treebank.tagged_sents()[3000:]
print(default_tagger.evaluate(test_sents))

0.14331966328512843


In [22]:
sent1 = treebank.sents()[talla+1]
print(sent1)
tag = default_tagger.tag(sent1)
pprint.pprint(tag)

['First', 'of', 'America', 'said', '0', 'some', 'of', 'the', 'managers', 'will', 'take', 'other', 'jobs', 'with', 'First', 'of', 'America', '.']
[('First', 'NN'),
 ('of', 'NN'),
 ('America', 'NN'),
 ('said', 'NN'),
 ('0', 'NN'),
 ('some', 'NN'),
 ('of', 'NN'),
 ('the', 'NN'),
 ('managers', 'NN'),
 ('will', 'NN'),
 ('take', 'NN'),
 ('other', 'NN'),
 ('jobs', 'NN'),
 ('with', 'NN'),
 ('First', 'NN'),
 ('of', 'NN'),
 ('America', 'NN'),
 ('.', 'NN')]


3.- Definición y evaluación de un etiquetador usando backoff

In [7]:
import pprint
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger
from nltk.probability import FreqDist
tags = [tag for (word, tag) in treebank.tagged_words()]
masFTag = FreqDist(tags).max()
default_tagger = DefaultTagger(masFTag)

talla = 3000
train_sents = treebank.tagged_sents()[:talla]
unigram_tagger = UnigramTagger(train_sents, backoff = default_tagger)
backoff_tagger = BigramTagger(train_sents, backoff= unigram_tagger)

In [8]:
test_sents = treebank.tagged_sents()[talla:]
okTriBO = round(backoff_tagger.evaluate(test_sents) * 100, 2);
print("Bigramas - backoff: " + str(okTriBO) + "%")

Bigramas - backoff: 88.1%


In [9]:
sent1 = treebank.sents()[talla+1]
print(sent1)
#tag = backoff_tagger.tag(sent1)
#pprint.pprint(tag)

['In', 'early', 'trading', 'in', 'Tokyo', 'Thursday', ',', 'the', 'Nikkei', 'index', 'fell', '63.79', 'points', 'to', '35500.64', '.']
