In [1]:
import nltk
import random
from nltk import trigrams
from collections import Counter, defaultdict
import pandas as pd

# Download the tokenizer data and the Gutenberg corpus used below.
for resource in ("punkt", "gutenberg"):
    nltk.download(resource)

from nltk.corpus import gutenberg

# Only the first 1,000 sentences of "Emma" are used.
corpus = gutenberg.sents("austen-emma.txt")[:1000]

# Lower-case every sentence and keep purely alphabetic tokens only. Doing
# this once lets both the flat word list and the trigram table share it.
_clean_sentences = [
    [token.lower() for token in sentence if token.isalpha()]
    for sentence in corpus
]

# All cleaned words in reading order; also serves as the fallback word pool.
flat_corpus = [token for tokens in _clean_sentences for token in tokens]

# Corpus-wide trigram frequencies. These are counted over the flattened
# word list, so (unlike n_gram below) they include trigrams that straddle
# sentence boundaries.
trigram_counts = Counter(trigrams(flat_corpus))
total_trigrams = sum(trigram_counts.values())

# Map each (w1, w2) bigram to every word that followed it inside a single
# sentence. Followers are appended once per occurrence, so repeated
# followers appear repeatedly in the list.
n_gram = defaultdict(list)
for tokens in _clean_sentences:
    for first, second, follower in zip(tokens, tokens[1:], tokens[2:]):
        n_gram[(first, second)].append(follower)

# Predict next word based on trigram probabilities
def predict_trigram(sequence):
    """Sample a next word for ``sequence`` from the trigram table.

    The last two items of ``sequence`` are lower-cased and looked up in the
    module-level ``n_gram`` mapping (bigram -> list of observed followers).
    Followers are stored once per occurrence, so ``random.choice`` over that
    list samples in proportion to how often each follower was seen.

    Args:
        sequence: Sequence of word strings giving the context so far.

    Returns:
        A word drawn from the followers of the trailing bigram, or a word
        drawn at random from ``flat_corpus`` when ``sequence`` has fewer
        than two words or the bigram has no recorded followers.
    """
    if len(sequence) >= 2:
        # The table is keyed on lower-cased words, so normalise the context
        # the same way; otherwise "She was" could never match.
        context = tuple(word.lower() for word in sequence[-2:])
        # .get() never inserts a key, and an empty follower list is treated
        # as a miss instead of letting random.choice raise IndexError.
        followers = n_gram.get(context)
        if followers:
            return random.choice(followers)

    # Fallback: no usable trigram context, pick any corpus word.
    return random.choice(flat_corpus)

# Generate sentence from trigrams
def generate_sentence(num_words=15):
    """Generate a space-joined word sequence from the trigram table.

    A random bigram key of the module-level ``n_gram`` mapping seeds the
    sequence; further words are then appended one at a time using
    ``predict_trigram`` on the words produced so far.

    Args:
        num_words: Number of words to produce. Values of zero or below
            yield an empty string.

    Returns:
        A string of exactly ``num_words`` words separated by single spaces
        (``""`` when ``num_words <= 0``).
    """
    if num_words <= 0:
        return ""

    seed_bigram = random.choice(list(n_gram))
    # Trim the two-word seed so that num_words == 1 is honoured too.
    words = list(seed_bigram)[:num_words]
    while len(words) < num_words:
        words.append(predict_trigram(words))
    return " ".join(words)

# Sample sequences
sample_sequences = [
    ["she", "was", "so"],
    ["he", "had", "not"],
    ["they", "were", "about"],
    ["it", "is", "a"],
    ["i", "think", "i"],
    ["you", "should", "have"],
    ["we", "are", "going"],
    ["this", "was", "tremendously"],
    ["do", "you", "love"],
    ["how", "can", "model"],
]

# One row per context: the joined context words next to the word the
# trigram model sampled for it.
results = [
    {
        "Sequence": " ".join(context),
        "Trigram Prediction": predict_trigram(context),
    }
    for context in sample_sequences
]

# Tabulate the predictions and show them.
results_df = pd.DataFrame(results)
print(results_df)

# Print one randomly generated sentence of the default length.
print(generate_sentence())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


                Sequence Trigram Prediction
0             she was so             little
1             he had not               gone
2        they were about                the
3                it is a               very
4              i think i              ought
5        you should have           realised
6           we are going               want
7  this was tremendously                yet
8            do you love        considering
9          how can model               same
figure pieces in her silence but beginning to apprehend the bewitching flattery of that before
