In [1]:
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.util import pad_sequence, everygrams
from nltk.lm.preprocessing import pad_both_ends, flatten, padded_everygram_pipeline
from nltk.lm import MLE

In [2]:
# Load every .txt file in the working directory as a plaintext corpus.
# The pattern is a raw string: "\." inside a normal string literal is an
# invalid escape sequence and triggers a warning on modern Python versions.
# The regular expression itself is unchanged.
my_corpus = PlaintextCorpusReader("./", r".*\.txt")

# Preview only the first few tokenized sentences rather than printing the
# whole play, which floods the notebook output.
for sent in my_corpus.sents(fileids="hamlet.txt")[:10]:
    print(sent)

['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Hamlet', ',', 'by', 'William', 'Shakespeare']
['This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.']
['You', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're', '-', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included', 'with', 'this', 'eBook', 'or', 'online', 'at', 'www', '.', 'gutenberg', '.', 'org']
['Title', ':', 'Hamlet']
['Author', ':', 'William', 'Shakespeare']
['Editor', ':', 'Charles', 'Kean']
['Release', 'Date', ':', 'January', '10', ',', '2009', '[', 'EBook', '#', '27761', ']']
['Language', ':', 'English']
['Character', 'set', 'encoding', ':', 'UTF', '-', '8']
['***', 'START', 'OF', 'THIS', 'PROJECT', 'GUTENBERG', 'EBOOK', 'HAMLET', '***']
['Produced', 'by', 'David', 'Starner', ',', 'Curtis', 'Weyant', 'and', 'the', 'Online', 'Distributed', 'Proofreading', 'Team', 'at', 'htt

In [3]:
# Take one sentence (index 1104, the "To be, or not to be" line) and pad it
# with a single boundary marker on each side (n=2).
# NOTE(review): the variable name says "trigrams" and max_len=3 is used below,
# but the padding is n=2 -- confirm whether n=3 was intended.
soliloquy = my_corpus.sents(fileids="hamlet.txt")[1104]
padded_trigrams = list(pad_both_ends(soliloquy, n=2))

# Enumerate every n-gram of length 1 to 3 over the padded sentence.
list(everygrams(padded_trigrams, max_len=3))

[('<s>',),
 ('<s>', '_Ham'),
 ('<s>', '_Ham', '.'),
 ('_Ham',),
 ('_Ham', '.'),
 ('_Ham', '.', '_'),
 ('.',),
 ('.', '_'),
 ('.', '_', 'To'),
 ('_',),
 ('_', 'To'),
 ('_', 'To', 'be'),
 ('To',),
 ('To', 'be'),
 ('To', 'be', ','),
 ('be',),
 ('be', ','),
 ('be', ',', 'or'),
 (',',),
 (',', 'or'),
 (',', 'or', 'not'),
 ('or',),
 ('or', 'not'),
 ('or', 'not', 'to'),
 ('not',),
 ('not', 'to'),
 ('not', 'to', 'be'),
 ('to',),
 ('to', 'be'),
 ('to', 'be', ','),
 ('be',),
 ('be', ','),
 ('be', ',', 'that'),
 (',',),
 (',', 'that'),
 (',', 'that', 'is'),
 ('that',),
 ('that', 'is'),
 ('that', 'is', 'the'),
 ('is',),
 ('is', 'the'),
 ('is', 'the', 'question'),
 ('the',),
 ('the', 'question'),
 ('the', 'question', ':['),
 ('question',),
 ('question', ':['),
 ('question', ':[', '8'),
 (':[',),
 (':[', '8'),
 (':[', '8', ']'),
 ('8',),
 ('8', ']'),
 ('8', ']', 'Whether'),
 (']',),
 (']', 'Whether'),
 (']', 'Whether', "'"),
 ('Whether',),
 ('Whether', "'"),
 ('Whether', "'", 'tis'),
 ("'",),
 ("'",

In [4]:
# Pad each Hamlet sentence with one boundary marker per side (n=2), then
# chain all padded sentences into a single flat list of tokens.
padded_sents = (
    pad_both_ends(sent, n=2) for sent in my_corpus.sents(fileids="hamlet.txt")
)
list(flatten(padded_sents))

['<s>',
 'The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Hamlet',
 ',',
 'by',
 'William',
 'Shakespeare',
 '</s>',
 '<s>',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.',
 '</s>',
 '<s>',
 'You',
 'may',
 'copy',
 'it',
 ',',
 'give',
 'it',
 'away',
 'or',
 're',
 '-',
 'use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'Project',
 'Gutenberg',
 'License',
 'included',
 'with',
 'this',
 'eBook',
 'or',
 'online',
 'at',
 'www',
 '.',
 'gutenberg',
 '.',
 'org',
 '</s>',
 '<s>',
 'Title',
 ':',
 'Hamlet',
 '</s>',
 '<s>',
 'Author',
 ':',
 'William',
 'Shakespeare',
 '</s>',
 '<s>',
 'Editor',
 ':',
 'Charles',
 'Kean',
 '</s>',
 '<s>',
 'Release',
 'Date',
 ':',
 'January',
 '10',
 ',',
 '2009',
 '[',
 'EBook',
 '#',
 '27761',
 ']',
 '</s>',
 '<s>',
 'Language',
 ':',
 'English',
 '</s>',
 '<s>',
 'Character',
 'set',
 'encoding',
 ':',
 'UTF',


In [5]:
# Build the training n-grams (orders 1 to 3) and the vocabulary stream from the
# tokenized Hamlet sentences.
# NOTE(review): per the NLTK docs both results are presumably one-shot lazy
# iterators -- re-run this cell before fitting a model again; confirm.
hamlet_sents = my_corpus.sents(fileids="hamlet.txt")
train, vocab = padded_everygram_pipeline(3, hamlet_sents)

In [6]:
# Create an untrained maximum-likelihood model of order 3 (trigrams).
ngram_order = 3
lm = MLE(ngram_order)

# Nothing has been fitted yet, so the vocabulary is still empty.
len(lm.vocab)

0

In [7]:
# Train the model on the prepared n-grams and vocabulary stream.
lm.fit(train, vocab)

# Show the vocabulary summary, then its size.
fitted_vocab = lm.vocab
print(fitted_vocab)
len(fitted_vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 6738 items>


6738

In [8]:
# Map every token of sentence 1104 through the fitted vocabulary.
soliloquy_sent = my_corpus.sents(fileids="hamlet.txt")[1104]
lm.vocab.lookup(soliloquy_sent)

('_Ham',
 '.',
 '_',
 'To',
 'be',
 ',',
 'or',
 'not',
 'to',
 'be',
 ',',
 'that',
 'is',
 'the',
 'question',
 ':[',
 '8',
 ']',
 'Whether',
 "'",
 'tis',
 'nobler',
 'in',
 'the',
 'mind',
 'to',
 'suffer',
 'The',
 'slings',
 'and',
 'arrows',
 'of',
 'outrageous',
 'fortune',
 ',',
 'Or',
 'to',
 'take',
 'arms',
 'against',
 'a',
 'sea',
 'of',
 'troubles',
 ',[',
 '9',
 ']',
 'And',
 ',',
 'by',
 'opposing',
 'end',
 'them',
 '?--',
 'To',
 'die',
 ',--',
 'to',
 'sleep',
 ',',
 'No',
 'more',
 ';--',
 'and',
 'by',
 'a',
 'sleep',
 ',',
 'to',
 'say',
 'we',
 'end',
 'The',
 'heart',
 '-',
 'ache',
 ',',
 'and',
 'the',
 'thousand',
 'natural',
 'shocks',
 'That',
 'flesh',
 'is',
 'heir',
 'to',
 ':',
 "'",
 'tis',
 'a',
 'consummation',
 'Devoutly',
 'to',
 'be',
 'wished',
 '.')

In [9]:
# Look up a phrase containing a word that is not in the vocabulary; such words
# come back as the vocabulary's unknown label.
unseen_phrase = ["aliens", "from", "Mars"]
lm.vocab.lookup(unseen_phrase)

('<UNK>', 'from', 'Mars')

In [10]:
# Summarize the n-gram counter collected during fitting.
ngram_counts = lm.counts
print(ngram_counts)

# Number of times "be" was seen after the one-word context "to".
ngram_counts[["to"]]["be"]

<NgramCounter with 3 ngram orders and 178020 ngrams>


43

In [11]:
# Score "be" under progressively longer contexts: no context, one preceding
# word, and two preceding words. None is the default context of score().
for context in (None, ["to"], ["not", "to"]):
    print(lm.score("be", context))

0.003209677419354839
0.05702917771883289
0.2727272727272727


In [12]:
# Same three contexts as the score() cell, reported as base-2 log scores.
# None is the default context of logscore().
for context in (None, ["to"], ["not", "to"]):
    print(lm.logscore("be", context))

-8.283355974505314
-4.132155958566567
-1.8744691179161412


In [13]:
# A tiny held-out sample of bigrams to evaluate the model on.
test = [("to", "be"), ("or", "not"), ("to", "be")]

# Print the entropy first, then the perplexity of the sample.
for metric in (lm.entropy, lm.perplexity):
    print(metric(test))

4.9951370352801066
31.892317578121798


In [15]:
# Sample six words that continue the seed text "to be".
# Generation is random, so a fixed random_seed is passed to make the sampled
# words reproducible on Restart Kernel -> Run All.
lm.generate(6, text_seed=["to", "be"], random_seed=42)

['a', 'sight', 'indeed', ',', 'if', 'you']