In [1]:
import nltk
from collections import defaultdict

In [2]:
corpus = nltk.corpus.gutenberg
bible = 'bible-kjv.txt'
bible_words = corpus.words(bible)

In [3]:
bible_pos = nltk.pos_tag(bible_words)

In [4]:
# create a set as the default entry value
tags = defaultdict(set)
for word, tag in bible_pos:
    tags[word].add(tag)

# look at words with more than 8 tags:
for word, tag_list in tags.items():
    if len(tag_list) > 8:
        print("{}: {}".format(word, tag_list))

unto: {'RBR', 'VBP', 'IN', 'VBZ', 'VBD', 'RP', 'PRP$', 'NNS', 'VB', 'MD', 'JJ', 'CC', 'NN', 'RB', 'NNP'}
forth: {'VBP', 'IN', 'VBZ', 'VBD', 'RP', 'JJS', 'NNS', 'VB', 'JJ', 'NN', 'VBN', 'PDT', 'RB'}
hath: {'VBD', 'VBP', 'VBZ', 'IN', 'VB', 'MD', 'JJ', 'PRP', 'NN', 'PDT', 'RB'}
wherein: {'VBP', 'IN', 'VBZ', 'VBD', 'WRB', 'WDT', "''", 'WP', 'NNP', 'VB', 'JJ', 'CC', 'JJR', 'NN', 'RB', 'EX'}
behold: {'VBD', 'VBP', 'VB', 'JJ', 'NN', 'VBN', 'CC', 'RB', 'UH'}
till: {'VBP', 'IN', 'VBZ', 'VB', 'JJ', 'RB', 'NN', 'CC', 'EX'}
evil: {'VBP', 'VBD', 'VBZ', 'EX', 'NNS', 'VB', 'FW', 'JJ', 'NN', 'VBN', 'CC', 'RB'}
goeth: {'VBD', 'VBP', 'VBZ', 'VBG', 'NNS', 'VB', 'JJ', 'NN', 'RB'}
thou: {'VBD', 'VBP', 'VBZ', 'IN', 'RP', "''", 'EX', 'NNS', 'VB', 'NNP', 'MD', 'JJ', 'JJR', 'PRP', 'NN', 'VBN', 'CC', 'RB'}
eat: {'VBP', 'IN', 'VBZ', 'VBD', 'VB', 'JJ', 'NN', 'RB', 'NNP'}
shalt: {'VBD', 'VBP', 'VBZ', 'NNS', 'VB', 'FW', 'JJ', 'JJR', 'PRP', 'NN', 'VBN', 'RB', 'MD'}
thereof: {'VBD', 'VBP', 'VBZ', 'WDT', 'RP', 'EX', '

In [5]:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
style = "en-ptb"  # penn treebank POS

simplify = lambda tag : nltk.map_tag(style, "universal", tag)

def simple_pos(text):
    pos = nltk.pos_tag(text)
    return [(word, simplify(tag)) for word, tag in pos]

# create a set as the default entry value
tags = defaultdict(set)
for word, tag in simple_pos(bible_words):
    tags[word].add(tag)

# look at words with more than 8 tags:
for word, tag_list in tags.items():
    if len(tag_list) > 8:
        print("{}: {}".format(word, tag_list))

wherein: {'CONJ', 'VERB', 'ADJ', 'NOUN', 'DET', 'ADP', 'ADV', 'PRON', '.'}
thou: {'CONJ', 'PRT', 'VERB', 'ADJ', 'NOUN', 'ADP', 'ADV', 'DET', 'PRON', '.'}
ye: {'CONJ', 'PRT', 'VERB', 'ADJ', 'NOUN', 'ADV', 'DET', 'NUM', 'ADP', 'X', 'PRON', '.'}
doth: {'CONJ', 'PRT', 'VERB', 'ADJ', 'NOUN', 'DET', 'ADP', 'ADV', 'X'}
thee: {'CONJ', 'PRT', 'VERB', 'ADJ', 'NOUN', 'ADV', 'ADP', 'DET', 'PRON'}


In [7]:
corpus = nltk.corpus.brown
corpus.tagged_words()[:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [8]:
corpus.tagged_words(tagset = "universal")[:10]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP')]