In [1]:
import nltk
from nltk.util import ngrams
from collections import Counter

In [2]:
from nltk.corpus import gutenberg
print(gutenberg.fileids())

nltk.download('gutenberg')
text = gutenberg.raw("shakespeare-caesar.txt")

words = nltk.word_tokenize(text.lower())
words = [word for word in words if word.isalpha()]

trigrams = list(ngrams(words, 3))
trigram_counts = Counter(trigrams)

bigrams = list(ngrams(words, 2))
bigram_counts = Counter(bigrams)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
def mlf_bigram(bigram):
    if bigram in bigram_counts:
        return bigram_counts[bigram] / sum(bigram_counts.values())
    else:
        return 0
    
def mlf_trigram(trigram):
    bigram = trigram[:2]
    if trigram in trigram_counts:
        return trigram_counts[trigram] / bigram_counts[bigram]
    elif bigram in bigram_counts:
        return bigram_counts[bigram] / sum(bigram_counts.values())
    else:
        return 0

print("Top 10 bigram probabilities:")

for bigram in list(bigram_counts.items())[:10]:
    print(bigram, "->", mlf_bigram(bigram))
    
print("\nTop 50 trigram probabilities:")


mlf_results = {trigram: mlf_trigram(trigram) for trigram in trigram_counts}
for trigram, prob in list(mlf_results.items())[:50]:
    print(trigram, "----", prob)

print("\nHere are some bigram results with probabilities:")

for bigram, count in list(bigram_counts.items())[:10]:
    print(bigram, "->", count / sum(bigram_counts.values()))

Top 10 bigram probabilities:
(('the', 'tragedie'), 2) -> 0
(('tragedie', 'of'), 2) -> 0
(('of', 'julius'), 1) -> 0
(('julius', 'caesar'), 1) -> 0
(('caesar', 'by'), 1) -> 0
(('by', 'william'), 1) -> 0
(('william', 'shakespeare'), 1) -> 0
(('shakespeare', 'actus'), 1) -> 0
(('actus', 'primus'), 1) -> 0
(('primus', 'scoena'), 1) -> 0

Top 50 trigram probabilities:
('the', 'tragedie', 'of') ---- 1.0
('tragedie', 'of', 'julius') ---- 0.5
('of', 'julius', 'caesar') ---- 1.0
('julius', 'caesar', 'by') ---- 1.0
('caesar', 'by', 'william') ---- 1.0
('by', 'william', 'shakespeare') ---- 1.0
('william', 'shakespeare', 'actus') ---- 1.0
('shakespeare', 'actus', 'primus') ---- 1.0
('actus', 'primus', 'scoena') ---- 1.0
('primus', 'scoena', 'prima') ---- 1.0
('scoena', 'prima', 'enter') ---- 1.0
('prima', 'enter', 'flauius') ---- 1.0
('enter', 'flauius', 'murellus') ---- 1.0
('flauius', 'murellus', 'and') ---- 1.0
('murellus', 'and', 'certaine') ---- 0.5
('and', 'certaine', 'commoners') ---- 1.0
('

In [4]:
unigrams = list(ngrams(words, 1))
unigram_counts = Counter(unigrams)
def mlf_unigram(unigram):
    if unigram in unigram_counts:
        return unigram_counts[unigram] / sum(unigram_counts.values())
    else:
        return 0

def interpolate(trigram, bigram, unigram, alpha1=0.4, alpha2=0.3, alpha3=0.3):
    trigram_prob = mlf_trigram(trigram)
    bigram_prob = mlf_bigram(bigram)
    unigram_prob = mlf_unigram(unigram)

    return alpha1 * trigram_prob + alpha2 * bigram_prob + alpha3 * unigram_prob

def backoff(trigram, bigram, unigram):
    trigram_prob = mlf_trigram(trigram)
    if trigram_prob > 0:
        return trigram_prob
    else:
        bigram_prob = mlf_bigram(bigram)
        if bigram_prob > 0:
            return bigram_prob
        else:
            return mlf_unigram(unigram)


In [10]:
import random
def predict_next_word(context, ngram_counts, ngram_type='bigram'):
    if ngram_type == 'trigram' and len(context) == 2:
        trigram = tuple(context[-2:])
        trigram_prob = mlf_trigram(trigram)
        if trigram_prob > 0:
            return get_next_word_from_ngram(ngram_counts, trigram, 2)
    if ngram_type == 'bigram' and len(context) == 1:
        bigram = tuple(context[-1:])
        bigram_prob = mlf_bigram(bigram)
        if bigram_prob > 0:
            return get_next_word_from_ngram(ngram_counts, bigram, 1)
    return random.choice(words)

def get_next_word_from_ngram(ngram_counts, ngram, ngram_size):
    possible_next_words = [ngram[ngram_size] for ngram in ngram_counts if ngram[:ngram_size] == tuple(ngram)]
    return random.choice(possible_next_words) if possible_next_words else random.choice(words)

def generate_sentence(prompt, n_words=10):
    prompt_words = prompt.split()
    generated_words = prompt_words[:]
    
    for _ in range(n_words):
        context = generated_words[-2:]  
        next_word = predict_next_word(context, trigram_counts, ngram_type='trigram')  
        generated_words.append(next_word)
    
    return ' '.join(generated_words)

prompt = "friends are always"
generated_sentence = generate_sentence(prompt, n_words=10)
print("Generated Sentence:", generated_sentence)

Generated Sentence: friends are always there gentle our you a things will intermit sorry for
