# N-Gram Language Modeling

In [27]:
from nltk.corpus import shakespeare
from nltk.tokenize import word_tokenize

In [28]:
import nltk
nltk.download('shakespeare')

[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Package shakespeare is already up-to-date!


True

In [30]:
print(nltk.corpus.shakespeare.fileids())

['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', 'macbeth.xml', 'merchant.xml', 'othello.xml', 'r_and_j.xml']


In [31]:
# Loading the corpus
corpus = shakespeare.words('hamlet.xml')

# Case folding and getting vocab
lower_case_corpus = [w.lower() for w in corpus]
vocab = set(lower_case_corpus)

print('CORPUS EXAMPLE: ' + str(lower_case_corpus[:30]) + '\n\n')
print('VOCAB EXAMPLE: ' + str(list(vocab)[:10]))

CORPUS EXAMPLE: ['the', 'tragedy', 'of', 'hamlet', ',', 'prince', 'of', 'denmark', 'dramatis', 'personae', 'claudius', ',', 'king', 'of', 'denmark', '.', 'hamlet', ',', 'son', 'to', 'the', 'late', ',', 'and', 'nephew', 'to', 'the', 'present', 'king', '.']


VOCAB EXAMPLE: ['often', 'trappings', 'worser', 'shows', 'truepenny', 'screen', 'bearers', 'something', 'ago', 'persuade']


In [32]:
print('Total words in Corpus: ' + str(len(lower_case_corpus)))
print('Vocab of the Corpus: ' + str(len(vocab)))

Total words in Corpus: 40379
Vocab of the Corpus: 4568


In [35]:
bigram_counts = {}
trigram_counts = {}

# Sliding through corpus to get bigram and trigram counts
for i in range(len(lower_case_corpus) - 2):
    # Getting bigram and trigram at each slide
    bigram = (lower_case_corpus[i], lower_case_corpus[i+1])
    trigram = (lower_case_corpus[i], lower_case_corpus[i+1], lower_case_corpus[i+2])
    
    # Keeping track of the bigram counts
    if bigram in bigram_counts.keys():
        bigram_counts[bigram] += 1
    else:
        bigram_counts[bigram] = 1
    
    # Keeping track of trigram counts
    if trigram in trigram_counts.keys():
        trigram_counts[trigram] += 1
    else:
        trigram_counts[trigram] = 1

print("Example, count for bigram ('prince', 'of') is: " + str(bigram_counts[('prince', 'of')]))

Example, count for bigram ('prince', 'of') is: 2


In [38]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
# Function takes sentence as input and suggests possible words that comes after the sentence  
def suggest_next_word(input_, bigram_counts, trigram_counts, vocab):
    # Consider the last bigram of sentence
    tokenized_input = word_tokenize(input_.lower())
    last_bigram = tokenized_input[-2:]
    
    # Calculating probability for each word in vocab
    vocab_probabilities = {}
    for vocab_word in vocab:
        test_trigram = (last_bigram[0], last_bigram[1], vocab_word)
        test_bigram = (last_bigram[0], last_bigram[1])

        test_trigram_count = trigram_counts.get(test_trigram, 0)
        test_bigram_count = bigram_counts.get(test_bigram, 0)
        
        probability = test_trigram_count / test_bigram_count
        vocab_probabilities[vocab_word] = probability
    
    # Sorting the vocab probability in descending order to get top probable words
    top_suggestions = sorted(vocab_probabilities.items(), key=lambda x: x[1], reverse=True)[:3]
    return top_suggestions

In [40]:
suggest_next_word('I am the king', bigram_counts, trigram_counts, vocab)

[("'", 0.14583333333333334), (',', 0.14583333333333334), ('.', 0.125)]

In [41]:
suggest_next_word('I am the king of', bigram_counts, trigram_counts, vocab)

[('denmark', 0.5), ('shreds', 0.25), ('infinite', 0.25)]

In [42]:
suggest_next_word('I am the king of denmark', bigram_counts, trigram_counts, vocab)

[(',', 0.2), ('.', 0.2), ('goes', 0.1)]

In [44]:
suggest_next_word('I am the king of denmark,', bigram_counts, trigram_counts, vocab)

[('and', 0.6666666666666666), ('to', 0.3333333333333333), ('often', 0.0)]