In [1]:
# Import Necessary Libraries

import nltk
from nltk import ngrams
from collections import defaultdict
import random

In [2]:
# Sample text data: a short English story kept as one multi-line string.
# It is the only corpus in this notebook; later cells tokenise it and count
# N-grams from the resulting word list.
text = """
Once upon a luminous, starry night in the quaint, enigmatic town of Serendipity, 
a curious young explorer named Amelia embarked on an extraordinary adventure. 
With her trusty magnifying glass in hand and an indomitable spirit, she embarked on a quest to discover the elusive Elysian treasure hidden deep within the labyrinthine forest. 
As she ventured through the verdant woods, Amelia encountered an eccentric, talking squirrel named Percival, who spoke in riddles and guided her toward the treasure's whereabouts. 
The forest was resplendent with bioluminescent flora, illuminating her path with a kaleidoscope of colors. 
Amelia soon reached a precipice overlooking an awe-inspiring, cerulean waterfall, its cascading waters echoing a melodious serenade. 
Beside the waterfall stood a colossal, moss-covered stone with cryptic inscriptions. 
With Percival's guidance, she deciphered the ancient runes and uncovered the entrance to the treasure trove. 
Inside, she discovered an opulent chest adorned with intricate, golden filigree. 
Upon opening it, a symphony of shimmering jewels, radiant gemstones, and glistening artifacts greeted her with an ethereal glow. 
The Elysian treasure was hers, a testament to her dauntless courage and insatiable curiosity. 
Amelia's return to Serendipity was celebrated with jubilant revelry, and her remarkable journey became a legend, inspiring others to embark on their own adventures in the wondrous realm of imagination and discovery.
"""


In [3]:
# Tokenise the sample text and clean it in a single pass:
#   - keep only tokens made up entirely of alphanumeric characters, which
#     discards punctuation tokens (and any token containing a non-alphanumeric
#     character),
#   - lowercase every token that is kept.
words = [
    token.lower()
    for token in filter(str.isalnum, nltk.word_tokenize(text))
]

# Show the cleaned token list as the cell output
words

['once',
 'upon',
 'a',
 'luminous',
 'starry',
 'night',
 'in',
 'the',
 'quaint',
 'enigmatic',
 'town',
 'of',
 'serendipity',
 'a',
 'curious',
 'young',
 'explorer',
 'named',
 'amelia',
 'embarked',
 'on',
 'an',
 'extraordinary',
 'adventure',
 'with',
 'her',
 'trusty',
 'magnifying',
 'glass',
 'in',
 'hand',
 'and',
 'an',
 'indomitable',
 'spirit',
 'she',
 'embarked',
 'on',
 'a',
 'quest',
 'to',
 'discover',
 'the',
 'elusive',
 'elysian',
 'treasure',
 'hidden',
 'deep',
 'within',
 'the',
 'labyrinthine',
 'forest',
 'as',
 'she',
 'ventured',
 'through',
 'the',
 'verdant',
 'woods',
 'amelia',
 'encountered',
 'an',
 'eccentric',
 'talking',
 'squirrel',
 'named',
 'percival',
 'who',
 'spoke',
 'in',
 'riddles',
 'and',
 'guided',
 'her',
 'toward',
 'the',
 'treasure',
 'whereabouts',
 'the',
 'forest',
 'was',
 'resplendent',
 'with',
 'bioluminescent',
 'flora',
 'illuminating',
 'her',
 'path',
 'with',
 'a',
 'kaleidoscope',
 'of',
 'colors',
 'amelia',
 'soon',

In [4]:
# Order of the N-gram model; 2 means bigrams (pairs of consecutive words)
N = 2

# Materialise every run of N consecutive tokens as a tuple
ngrams_list = list(ngrams(words, N))

# Tally how many times each distinct N-gram tuple occurs.
# defaultdict(int) makes an unseen key start from 0.
ngram_freq = defaultdict(int)
for gram in ngrams_list:
    ngram_freq[gram] = ngram_freq[gram] + 1

In [5]:
# Next-word prediction from the N-gram frequency table
def predict_next_word(prefix):
    """Return the most frequent word that follows ``prefix`` in ``ngram_freq``.

    The module-level ``ngram_freq`` mapping (N-gram tuple -> count) is scanned
    for every N-gram whose first ``N - 1`` words equal ``prefix``; the last
    word of the highest-count match is returned. When several matches share
    the highest count, the one that appears first in ``ngram_freq`` wins.

    Parameters
    ----------
    prefix : tuple, list or str
        The context words. A tuple is used as-is. Any other iterable of words
        is converted to a tuple, and a plain string is split on whitespace
        first, so ``("the",)``, ``["the"]`` and ``"the"`` are equivalent.
        No case folding is applied here; the caller is expected to pass
        words in the same case as the keys of ``ngram_freq``.

    Returns
    -------
    str
        The predicted next word, or the literal string
        ``"No prediction available."`` when no N-gram starts with ``prefix``.
    """
    # N-gram keys are tuples, and a tuple never compares equal to a list or a
    # str, so normalise the prefix up front instead of silently matching nothing.
    if isinstance(prefix, str):
        prefix = prefix.split()
    try:
        prefix = tuple(prefix)
    except TypeError:
        # Non-iterable prefix (e.g. None) cannot match any N-gram.
        return "No prediction available."

    # Collect every N-gram whose leading words equal the prefix
    matching_ngrams = [
        (ngram, freq) for ngram, freq in ngram_freq.items() if ngram[:-1] == prefix
    ]

    if not matching_ngrams:
        return "No prediction available."

    # max() returns the first maximal item, so ties resolve exactly as a
    # stable descending sort would, without sorting the whole list.
    best_ngram, _ = max(matching_ngrams, key=lambda item: item[1])

    # The prediction is the final word of the winning N-gram
    return best_ngram[-1]

In [8]:
# Interactive check of the model: read a prefix from the user, lowercase it
# and split it on whitespace into words.
user_input = input("Enter a prefix for next-word prediction: ").lower().split()

# An order-N model needs exactly N - 1 context words; anything else is rejected.
if len(user_input) == N - 1:
    # N-gram keys are tuples, so the prefix has to be a tuple as well
    prefix = tuple(user_input)
    prediction = predict_next_word(prefix)
    print(f"Next word prediction: {prediction}")
else:
    print("Please enter a valid prefix.")

Enter a prefix for next-word prediction:  john


Next word prediction: No prediction available.
