## **1. Import Libraries**

In [1]:
import nltk
from nltk.corpus import gutenberg
from nltk import word_tokenize
from collections import Counter, defaultdict
import random
import string

In [5]:
# Fetch the NLTK resources used by the rest of the notebook. Each call returns
# a status flag; the last call's value is what the cell displays.
# 'punkt' / 'punkt_tab': tokenizer data (presumably what word_tokenize needs — confirm
# against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')
# 'gutenberg': the corpus package read through nltk.corpus.gutenberg.
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

## **2. Load and Preprocess the Corpus**

In [6]:
# Read the complete, unprocessed text of the selected Gutenberg corpus file.
corpus_file_id = 'austen-emma.txt'
raw_text = gutenberg.raw(corpus_file_id)

In [7]:
# Lower-case the corpus, tokenize it, and keep only tokens made up entirely of
# letters (anything containing punctuation or digits is discarded).
tokens = [
    token
    for token in word_tokenize(raw_text.lower())
    if token.isalpha()
]

## **3. Build N-gram Models**

In [8]:
# Unigram: how often each individual token occurs.
unigram_counts = Counter(tokens)

# Bigram: how often each pair of adjacent tokens occurs.
bigrams = list(nltk.ngrams(tokens, 2))
bigram_counts = Counter(bigrams)

# Trigram: how often each run of three adjacent tokens occurs.
trigrams = list(nltk.ngrams(tokens, 3))
trigram_counts = Counter(trigrams)

## **4. Estimate Probabilities**

In [9]:
# Maximum-likelihood estimates built from the raw n-gram counts.
#
# NOTE: the corpus is not padded with end markers, so the very last token (and
# the very last bigram) is counted as a context once more than it is ever
# followed by something. For that single context the conditional probabilities
# sum to slightly less than 1; every other context sums to 1.

# For unigrams: P(w) = count(w) / total_words
total_unigrams = sum(unigram_counts.values())
unigram_prob = {word: count / total_unigrams for word, count in unigram_counts.items()}

# For bigrams: P(w2|w1) = count(w1, w2) / count(w1)
# Shape: bigram_prob[w1][w2] -> probability
bigram_prob = defaultdict(dict)
for (w1, w2), count in bigram_counts.items():
    bigram_prob[w1][w2] = count / unigram_counts[w1]

# For trigrams: P(w3|w1,w2) = count(w1, w2, w3) / count(w1, w2)
# Shape: trigram_prob[(w1, w2)][w3] -> probability
# The context counts are exactly the bigram counts computed earlier, so they are
# reused instead of scanning the whole corpus a second time. `bigram_pairs` is
# kept as an alias so any later cell that refers to it keeps working.
bigram_pairs = bigram_counts
trigram_prob = defaultdict(dict)
for (w1, w2, w3), count in trigram_counts.items():
    trigram_prob[(w1, w2)][w3] = count / bigram_pairs[(w1, w2)]

## **5. Implement Word Prediction**

In [10]:
def predict_next_word(input_text, top_k=3, bigram_model=None, trigram_model=None):
    """Suggest the most probable next words for ``input_text``.

    The input is normalised in the same spirit as the training tokens:
    lower-cased, split on whitespace, surrounding punctuation stripped, and
    anything that is not purely alphabetic dropped. Without this, an input such
    as "I am," could never match a model built from alphabetic tokens only.

    Lookup order:
      1. If at least two words remain, the last two are looked up in the
         trigram model.
      2. If that context is unknown (or only one word remains), the last word
         is looked up in the bigram model (back-off).

    Args:
        input_text: Free text typed by the user; only its last one or two
            usable words are considered.
        top_k: Maximum number of suggestions to return.
        bigram_model: Mapping ``w1 -> {w2: probability}``. Defaults to the
            notebook-level ``bigram_prob``.
        trigram_model: Mapping ``(w1, w2) -> {w3: probability}``. Defaults to
            the notebook-level ``trigram_prob``.

    Returns:
        A list of up to ``top_k`` ``(word, probability)`` tuples ordered by
        descending probability, or ``[("No prediction", 0.0)]`` when the input
        contains no usable word or no model knows the context.
    """
    # Resolve the defaults lazily so the globals are only needed when the
    # caller does not supply explicit models.
    if bigram_model is None:
        bigram_model = bigram_prob
    if trigram_model is None:
        trigram_model = trigram_prob

    stripped = (w.strip(string.punctuation) for w in input_text.lower().split())
    words = [w for w in stripped if w.isalpha()]

    candidates = None
    if len(words) >= 2:
        # .get() avoids inserting empty entries into a defaultdict-based model.
        candidates = trigram_model.get((words[-2], words[-1]))
    if not candidates and words:
        # Back off to the bigram model using only the most recent word.
        candidates = bigram_model.get(words[-1])
    if not candidates:
        return [("No prediction", 0.0)]

    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]

## **6. Evaluate Results**

In [11]:
# Interactive demo: keep asking for a prompt and print the suggested next
# words until the user types 'exit'.
# NOTE: this cell blocks on input(), so it needs an interactive session.
while True:
    try:
        # Strip surrounding whitespace so that e.g. " exit " is still
        # recognised as the quit command.
        user_input = input("Enter one or two words (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input is available, or the user interrupted the prompt:
        # leave the loop quietly instead of ending the cell with a traceback.
        break
    if user_input.lower() == 'exit':
        break
    predictions = predict_next_word(user_input)
    print("Predicted next words:")
    for word, prob in predictions:
        print(f"{word} (Prob: {prob:.4f})")

Enter one or two words (or 'exit' to quit): I am
Predicted next words:
sure (Prob: 0.2716)
not (Prob: 0.0812)
very (Prob: 0.0711)
Enter one or two words (or 'exit' to quit): How are
Predicted next words:
you (Prob: 0.5000)
they (Prob: 0.5000)
Enter one or two words (or 'exit' to quit): He is not
Predicted next words:
a (Prob: 0.0887)
the (Prob: 0.0645)
it (Prob: 0.0565)
Enter one or two words (or 'exit' to quit): exit
