In [184]:
import numpy as np
import re

import nltk
# if you haven't downloaded punkt before, you only need to run the line below once 
#nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize

from nltk.util import bigrams
from nltk.lm.preprocessing import padded_everygram_pipeline

import requests

In [203]:


# You will need to leverage the requests package
r = requests.get(r'https://www.gutenberg.org/files/84/84-0.txt')
Frank_stein = r.text

# Remove unwanted new line and tab characters from the text
for char in ["\n", "\r", "\t"]:
    Frank_stein = Frank_stein.replace(char, " ")

# Check
print(Frank_stein[:148])

ï»¿The Project Gutenberg eBook of Frankenstein, by Mary Wollstonecraft Shelley    This eBook is for the use of anyone anywhere in the United States 


In [204]:
# remove the metadata at the beginning - this is slightly different for each book
Frank_stein = Frank_stein[1530:]

In [205]:
print(Frank_stein[:60])

 You will rejoice to hear that no disaster has accompanied t


In [206]:
###Creating an N-gram Model
# 2 is for bigrams
n = 2
#specify the text you want to use
text = Frank_stein

In [207]:
# step 1: tokenize the text into sentences
sentences = nltk.sent_tokenize(text)

# step 2: tokenize each sentence into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# step 3: convert each word to lowercase
tokenized_text = [[word.lower() for word in sent] for sent in tokenized_sentences]

#notice the sentence breaks and what the first 10 items of the tokenized text
print(tokenized_text[0])

['you', 'will', 'rejoice', 'to', 'hear', 'that', 'no', 'disaster', 'has', 'accompanied', 'the', 'commencement', 'of', 'an', 'enterprise', 'which', 'you', 'have', 'regarded', 'with', 'such', 'evil', 'forebodings', '.']


In [208]:
#Why tokenize sentences and words? We want to be able to retain sentence boundaries to encode that, too.
print(text[:10])

 You will 


In [209]:
# we imported this function from nltk
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

In [210]:
from nltk.lm import MLE
# we imported this function from nltk linear models (lm) 
# it is for Maximum Likelihood Estimation

# MLE is the model we will use
lm = MLE(n)

In [211]:
# currently the vocab length is 0: it has no prior knowledge
len(lm.vocab)

0

In [212]:
# fit the model 
# training data is the bigrams and unigrams 
# the vocab is all the sentence tokens in the corpus 

lm.fit(train_data, padded_sents)
len(lm.vocab)

7872

In [213]:
# inspect the model's vocabulary. 
# be sure that a sentence you know exists (from tokenized_text) is in the 
print(lm.vocab.lookup(tokenized_text[0]))

('you', 'will', 'rejoice', 'to', 'hear', 'that', 'no', 'disaster', 'has', 'accompanied', 'the', 'commencement', 'of', 'an', 'enterprise', 'which', 'you', 'have', 'regarded', 'with', 'such', 'evil', 'forebodings', '.')


In [214]:
# see what happens when we include a word that is not in the vocab. 
print(lm.vocab.lookup('you will rejoice to chocolate that'.split()))

('you', 'will', 'rejoice', 'to', '<UNK>', 'that')


In [215]:
# how many times does 'with' appear in the model?
print(lm.counts['with'])

# what is the probability of 'with' appearing? 
# this is technically the relative frequency of daisy appearing 
lm.score('with')

711


0.007567613594031058

In [216]:
# what is the score of 'UNK'? 

lm.score("<UNK>")

0.0

In [217]:
##Does the relative frequency of 'UNK' change your assumption about how the model behaves?

##How should we change our model to account for the fact the <UNK> words are not accounted for by the model?


In [218]:
#There is a certain amount of randomness encoded into n-gram models. This prevents a model from becoming entirely deterministic. Maximum Likelihood Estimation without some degree of randomness will only produce the most likely result every time. Setting Random Seed means we will get the same result every time.
# generate a 20 word sentence starting with the word, 'I'

print(lm.generate(20, text_seed= 'The', random_seed=200))

[',', 'and', 'she', 'busied', 'myself', 'and', 'a', 'little', 'chance', ',', 'but', 'it', 'appeared', 'to', 'five', 'persons', ',', 'and', 'i', 'remembered']


In [219]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(lm, num_words, text_seed, random_seed=200):
    """
    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    """
    content = []
    for token in lm.generate(num_words, text_seed=text_seed, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [220]:
# Now generate sentences that look much nicer. 
generate_sent(lm, 20, text_seed='I', random_seed = 200)

', and she busied myself and a little chance, but it appeared to five persons, and i remembered'