In [1]:
import nltk
from nltk.util import ngrams
from nltk import trigrams, bigrams
from nltk.corpus import reuters
from collections import defaultdict

# Fetch the NLTK resources used below: the Reuters corpus (read through
# nltk.corpus.reuters) and the 'punkt' package (presumably the tokenizer
# models behind nltk.word_tokenize — verify for the installed NLTK version).
# Packages that are already installed are reported as up-to-date.
nltk.download('reuters')
nltk.download('punkt')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Build one flat token list for the whole Reuters corpus: the corpus words are
# joined with single spaces and the resulting text is tokenized again.
# NOTE(review): reuters.words() already yields tokens, so this join +
# re-tokenize round trip may be redundant — confirm before simplifying, since
# it can change the token stream and therefore every model built from it.
words = nltk.word_tokenize(' '.join(reuters.words()))

# Every run of three consecutive tokens (trigrams are n-grams with n = 3).
tri_grams = list(ngrams(words, 3))

In [4]:
# Count how often each word follows every two-word context.
# Layout: trigram_model[(w1, w2)][w3] -> number of occurrences of "w1 w2 w3".
trigram_model = defaultdict(lambda: defaultdict(int))
for first, second, third in tri_grams:
    trigram_model[first, second][third] += 1

In [5]:
# Convert the raw follower counts into relative frequencies: each count is
# divided by the total number of followers seen for its context, so the values
# under one context sum to 1.
for followers in trigram_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

In [7]:
def predict_next_word(trigram_model, w1, w2):
    """Return the most probable word to follow the two-word context (w1, w2).

    Args:
        trigram_model: Mapping from a ``(w1, w2)`` tuple to a mapping of
            candidate next words to their scores (counts or probabilities).
        w1: First word of the context.
        w2: Second word of the context.

    Returns:
        The highest-scoring next word (the first one in iteration order on
        ties), or ``False`` when the context is unknown or has no followers.
    """
    # Look the context up with .get() instead of indexing: indexing a
    # defaultdict would insert an empty entry for every unseen context,
    # silently growing the model on each failed lookup (and indexing a plain
    # dict would raise KeyError).
    followers = trigram_model.get((w1, w2))
    if not followers:
        return False
    return max(followers, key=followers.get)

In [9]:
# Show the word the trigram model ranks highest after the context "the stock".
print("Next Word:", predict_next_word(trigram_model, w1='the', w2='stock'))

Next Word: of


In [10]:
def generate_sentence_trigram(trigram_model, w1, w2, num_words=10):
    """Extend the seed words (w1, w2) into a space-joined string of words.

    At each step the last two words produced so far are passed to
    ``predict_next_word`` and the returned word is appended.

    Args:
        trigram_model: Model handed through to ``predict_next_word``.
        w1: First seed word.
        w2: Second seed word.
        num_words: Maximum number of words to append after the seed.

    Returns:
        The seed words followed by the generated words, separated by single
        spaces. Generation stops early as soon as the predictor returns a
        falsy value.
    """
    generated = [w1, w2]
    for _step in range(num_words):
        # The prediction context is always the two most recent words.
        candidate = predict_next_word(trigram_model, generated[-2], generated[-1])
        if not candidate:
            break
        generated.append(candidate)
    return ' '.join(generated)

# Demo: extend the seed "the stock" with the trigram model (default length).
print(generate_sentence_trigram(trigram_model, w1='the', w2='stock'))

the stock of the company ' s & lt ; BP >


In [12]:
# Every pair of adjacent tokens in the corpus (bigrams are n-grams with n = 2).
bi_grams = list(ngrams(words, 2))

# bigram_model[w1][w2] first holds the number of times w2 directly follows w1.
bigram_model = defaultdict(lambda: defaultdict(int))
for first, second in bi_grams:
    bigram_model[first][second] += 1

# Normalize each word's follower counts so they sum to 1.
for followers in bigram_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

def predict_next_word_bigram(bigram_model, w1):
    """Return the most probable word to follow the single word ``w1``.

    Args:
        bigram_model: Mapping from a word to a mapping of candidate next
            words to their scores (counts or probabilities).
        w1: The preceding word.

    Returns:
        The highest-scoring next word (the first one in iteration order on
        ties), or ``False`` when ``w1`` is unknown or has no followers.
    """
    # Look the word up with .get() instead of indexing: indexing a defaultdict
    # would insert an empty entry for every unseen word, silently growing the
    # model on each failed lookup (and indexing a plain dict would raise
    # KeyError).
    followers = bigram_model.get(w1)
    if not followers:
        return False
    return max(followers, key=followers.get)

def generate_sentence_bigram(bigram_model, w1, num_words=10):
    """Extend the seed word ``w1`` into a space-joined string of words.

    At each step the most recent word is passed to
    ``predict_next_word_bigram`` and the returned word is appended.

    Args:
        bigram_model: Model handed through to ``predict_next_word_bigram``.
        w1: Seed word.
        num_words: Maximum number of words to append after the seed.

    Returns:
        The seed word followed by the generated words, separated by single
        spaces. Generation stops early as soon as the predictor returns a
        falsy value.
    """
    generated = [w1]
    for _step in range(num_words):
        # Only the single most recent word is used as context.
        candidate = predict_next_word_bigram(bigram_model, generated[-1])
        if not candidate:
            break
        generated.append(candidate)
    return ' '.join(generated)

# Demo: extend the seed "the" by up to 10 words with the bigram model.
print(generate_sentence_bigram(bigram_model, w1='the', num_words=10))

the U . The company said . The company said .


In [13]:
# Every run of four consecutive tokens in the corpus.
quad_grams = list(ngrams(words, 4))

# fourgram_model[(w1, w2, w3)][w4] first holds the number of times w4 follows
# the three-word context (w1, w2, w3).
fourgram_model = defaultdict(lambda: defaultdict(int))
for gram in quad_grams:
    # Each gram is a 4-tuple: the first three words form the context key.
    fourgram_model[gram[:-1]][gram[-1]] += 1

# Normalize each context's follower counts so they sum to 1.
for followers in fourgram_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

def predict_next_word_fourgram(fourgram_model, w1, w2, w3):
    """Return the most probable word to follow the context (w1, w2, w3).

    Args:
        fourgram_model: Mapping from a ``(w1, w2, w3)`` tuple to a mapping of
            candidate next words to their scores (counts or probabilities).
        w1: First word of the context.
        w2: Second word of the context.
        w3: Third word of the context.

    Returns:
        The highest-scoring next word (the first one in iteration order on
        ties), or ``False`` when the context is unknown or has no followers.
    """
    # Look the context up with .get() instead of indexing: indexing a
    # defaultdict would insert an empty entry for every unseen context,
    # silently growing the model on each failed lookup (and indexing a plain
    # dict would raise KeyError).
    followers = fourgram_model.get((w1, w2, w3))
    if not followers:
        return False
    return max(followers, key=followers.get)

def generate_sentence_fourgram(fourgram_model, w1, w2, w3, num_words=10):
    """Extend the seed words (w1, w2, w3) into a space-joined string of words.

    At each step the last three words produced so far are passed to
    ``predict_next_word_fourgram`` and the returned word is appended.

    Args:
        fourgram_model: Model handed through to ``predict_next_word_fourgram``.
        w1: First seed word.
        w2: Second seed word.
        w3: Third seed word.
        num_words: Maximum number of words to append after the seed.

    Returns:
        The seed words followed by the generated words, separated by single
        spaces. Generation stops early as soon as the predictor returns a
        falsy value.
    """
    generated = [w1, w2, w3]
    for _step in range(num_words):
        # The list always holds at least the three seed words, so the slice
        # yields exactly the three most recent words.
        candidate = predict_next_word_fourgram(fourgram_model, *generated[-3:])
        if not candidate:
            break
        generated.append(candidate)
    return ' '.join(generated)

# Demo: extend the seed "the stock market" by up to 10 words with the 4-gram model.
print(generate_sentence_fourgram(fourgram_model, w1='the', w2='stock', w3='market', num_words=10))

the stock market collapse -- prices fell nearly 15 pct -- means that


In [16]:
# Every run of five consecutive tokens in the corpus.
five_grams = list(ngrams(words, 5))

# fivegram_model[(w1, w2, w3, w4)][w5] first holds the number of times w5
# follows the four-word context (w1, w2, w3, w4).
fivegram_model = defaultdict(lambda: defaultdict(int))
for gram in five_grams:
    # Each gram is a 5-tuple: the first four words form the context key.
    fivegram_model[gram[:-1]][gram[-1]] += 1

# Normalize each context's follower counts so they sum to 1.
for followers in fivegram_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

def predict_next_word_fivegram(fivegram_model, w1, w2, w3, w4):
    """Return the most probable word to follow the context (w1, w2, w3, w4).

    Args:
        fivegram_model: Mapping from a ``(w1, w2, w3, w4)`` tuple to a mapping
            of candidate next words to their scores (counts or probabilities).
        w1: First word of the context.
        w2: Second word of the context.
        w3: Third word of the context.
        w4: Fourth word of the context.

    Returns:
        The highest-scoring next word (the first one in iteration order on
        ties), or ``False`` when the context is unknown or has no followers.
    """
    # Look the context up with .get() instead of indexing: indexing a
    # defaultdict would insert an empty entry for every unseen context,
    # silently growing the model on each failed lookup (and indexing a plain
    # dict would raise KeyError).
    followers = fivegram_model.get((w1, w2, w3, w4))
    if not followers:
        return False
    return max(followers, key=followers.get)

def generate_sentence_fivegram(fivegram_model, w1, w2, w3, w4, num_words=10):
    """Extend the seed words (w1, w2, w3, w4) into a space-joined string.

    At each step the last four words produced so far are passed to
    ``predict_next_word_fivegram`` and the returned word is appended.

    Args:
        fivegram_model: Model handed through to ``predict_next_word_fivegram``.
        w1: First seed word.
        w2: Second seed word.
        w3: Third seed word.
        w4: Fourth seed word.
        num_words: Maximum number of words to append after the seed.

    Returns:
        The seed words followed by the generated words, separated by single
        spaces. Generation stops early as soon as the predictor returns a
        falsy value.
    """
    generated = [w1, w2, w3, w4]
    for _step in range(num_words):
        # The list always holds at least the four seed words, so the slice
        # yields exactly the four most recent words.
        candidate = predict_next_word_fivegram(fivegram_model, *generated[-4:])
        if not candidate:
            break
        generated.append(candidate)
    return ' '.join(generated)

# Test the 5-gram-based sentence generation

In [11]:
# Demo: extend the seed "the stock market collapse" by up to 10 words with the 5-gram model.
print(generate_sentence_fivegram(fivegram_model, w1='the', w2='stock', w3='market', w4='collapse', num_words=10))

the stock market collapse as companies have shifted funds away from financial investments to


Observations


    Bigram Model (n=2): "the U . The company said . The company said ."

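Observation: The bigram model generates short, repetitive phrases. It lacks context and quickly falls into a loop of common phrases ("The company said .") with little semantic coherence. Because each prediction is conditioned on only the single preceding word, and generation always picks the most likely next word, the output starts repeating as soon as a word recurs, so the model cannot maintain a meaningful or coherent sentence over a long sequence.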

    Trigram Model (n=3): "the stock of the company ' s & lt ; BP >"

Observation: The trigram model improves on the bigram model by generating slightly more meaningful phrases. However, it is still prone to generating awkward or incomplete phrases (e.g., "& lt ; BP >", which appears to be a tokenized remnant of markup in the corpus text). While some local word sequences are coherent, the sentence as a whole may not make complete sense. Conditioning on the two preceding words allows for better prediction than the bigram model, but longer sentences still suffer from inconsistency.

    Fourgram Model (n=4): "the stock market collapse -- prices fell nearly 15 pct -- means that"

Observation: The fourgram model produces more coherent and contextually relevant sentences. The sentence structure is more complete, and the information conveyed makes logical sense ("collapse -- prices fell nearly 15 pct"). With the three preceding words as context, the model can maintain better sentence flow, though it may still end abruptly or lack sufficient grammatical closure.

    Fivegram Model (n=5): "the stock market collapse as companies have shifted funds away from financial investments to"

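Observation: The fivegram model produces even more coherent sentences, and the output begins to resemble natural language more closely. The phrase "the stock market collapse as companies have shifted funds away from financial investments" is well-structured and contextually meaningful. However, the sentence may still end abruptly without completing its thought ("to" without the next word), because generation stops after 10 words have been added to the seed. Note that much of this fluency likely comes from memorization rather than generalization: with four words of context, many contexts occur only once in the corpus, so always choosing the most likely next word tends to reproduce a passage of the training text verbatim.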
