In [1]:
!pip install nltk



In [5]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

# Corpus used to build the n-gram counts
nltk.download('brown')
# Tokenizer data needed by word_tokenize. Newer NLTK releases look for
# 'punkt_tab' while older ones look for 'punkt', so fetch both; without
# them word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vtung\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


In [12]:
# Load the Brown corpus as one flat sequence of word tokens
corpus = brown.words()

# Lowercase every token, then deduplicate to obtain the vocabulary
lowercase_corpus = list(map(str.lower, corpus))  # -> lower-case corpus
vocab = set(lowercase_corpus)  # -> distinct words from corpus

# Preview the first tokens and an (arbitrarily ordered) sample of the vocabulary
print(f"CORPUS: {lowercase_corpus[:20]}\n")
print(f"VOCAB: {list(vocab)[:10]}")

CORPUS: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']

VOCAB: ['allied', 'praying', 'insinuates', 'actors', "kirov's", 'exasperating', 'tripoli', '269', '3%', 'beers']


In [13]:
# Corpus size (tokens) versus vocabulary size (distinct lowercase words)
print(
    f"Total words in corpus: {len(lowercase_corpus)}",
    f"Total vocabs in corpus: {len(vocab)}",
    sep="\n",
)

Total words in corpus: 1161192
Total vocabs in corpus: 49815


In [19]:
bigram_counts = {}
trigram_counts = {}

# Walk every window of three consecutive tokens. Each window contributes its
# trigram and its leading bigram, so the very last bigram of the corpus
# (which has no following word) is not counted.
for first, second, third in zip(lowercase_corpus, lowercase_corpus[1:], lowercase_corpus[2:]):
    bigram = (first, second)
    trigram = (first, second, third)

    # dict.get supplies 0 for n-grams seen for the first time
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
    trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

# Spot-check the counts on n-grams taken from the corpus' first sentence
print(f"Count for bigram ('grand', 'jury') is: {bigram_counts[('grand', 'jury')]}")
print(f"Count for trigram ('grand', 'jury', 'said') is: {trigram_counts[('grand', 'jury', 'said')]}")

Count for bigram ('grand', 'jury') is: 10
Count for trigram ('grand', 'jury', 'said') is: 1


In [26]:
def suggestion_model(_input, bigram_counts, trigram_counts, vocab, top_n=3):
    """Suggest the most probable next words for ``_input`` with a trigram model.

    The last two tokens of the lowercased, tokenized input form the context.
    Every word in ``vocab`` is scored with add-one smoothing:

        (count(w1, w2, word) + 1) / (count(w1, w2) + len(vocab))

    Args:
        _input: Text typed so far.
        bigram_counts: Mapping from ``(w1, w2)`` tuples to occurrence counts.
        trigram_counts: Mapping from ``(w1, w2, w3)`` tuples to occurrence counts.
        vocab: Collection of candidate next words.
        top_n: Maximum number of suggestions to return (default 3).

    Returns:
        A list of ``(word, probability)`` tuples sorted by descending
        probability, ties broken alphabetically so the result does not depend
        on the iteration order of ``vocab``. An empty list is returned when the
        input has fewer than two tokens, since there is no bigram context.
    """
    # Consider the last bigram of the sentence
    tokenized_input = word_tokenize(_input.lower())
    if len(tokenized_input) < 2:
        # Not enough context for a trigram lookup
        return []
    first_word, second_word = tokenized_input[-2:]  # -> last two words of input

    # The context count and the smoothing denominator are the same for every
    # candidate word, so compute them once outside the loop.
    context_count = bigram_counts.get((first_word, second_word), 0)  # -> 0 if unseen
    denominator = context_count + len(vocab)

    # Prob for each word in vocab
    vocab_probs = {}
    for vocab_word in vocab:
        trigram_count = trigram_counts.get((first_word, second_word, vocab_word), 0)  # -> 0 if unseen
        vocab_probs[vocab_word] = (trigram_count + 1) / denominator  # Add-one smoothing

    # Highest probability first; equal probabilities fall back to word order
    ranked = sorted(vocab_probs.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:top_n]

In [27]:
# Top suggestions for the word following "an investigation of"
suggestion_model(
    "an investigation of",
    bigram_counts=bigram_counts,
    trigram_counts=trigram_counts,
    vocab=vocab,
)

[('the', 0.7333333333333333),
 ('a', 0.06666666666666667),
 ("atlanta's", 0.06666666666666667)]