In [1]:
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
import random

In [2]:
# Fetch the NLTK 'punkt' tokenizer data used by nltk.word_tokenize below.
# When the package is already present and current, NLTK only reports that it
# is up to date.
# NOTE(review): newer NLTK releases may additionally require the 'punkt_tab'
# resource for word_tokenize -- confirm against the installed NLTK version.
nltk.download('punkt')

def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens with NLTK.

    Args:
        text: The raw string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` produces for the lower-cased text
        (a list of token strings).
    """
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

[nltk_data] Downloading package punkt to C:\Users\Muhammad Bilal
[nltk_data]     Ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def generate_ngrams(tokens, n):
    """Build the list of n-grams over ``tokens``.

    Delegates to ``nltk.util.ngrams`` and materializes its result into a
    list so it can be indexed and iterated more than once.

    Args:
        tokens: Sequence of tokens to slide the window over.
        n: Size of each n-gram.

    Returns:
        A list containing every item yielded by ``ngrams(tokens, n)``.
    """
    ngram_iter = ngrams(tokens, n)
    return [gram for gram in ngram_iter]

In [5]:
def calculate_ngram_probabilities(ngrams):
    """Estimate conditional next-word probabilities from n-grams.

    Each n-gram is split into a context (all items but the last) and the
    item that follows it. For every context, the relative frequency of each
    following item is computed.

    Args:
        ngrams: Iterable of n-grams; each must support slicing/indexing and
            its context slice must be hashable (e.g. tuples).

    Returns:
        A ``defaultdict(dict)`` mapping ``context -> {next_word: probability}``
        where the probabilities for one context sum to 1.
    """
    # First pass: count how often each word follows each context.
    continuation_counts = defaultdict(Counter)
    for gram in ngrams:
        context, following = gram[:-1], gram[-1]
        continuation_counts[context][following] += 1

    # Second pass: normalize each context's counts into relative frequencies.
    probabilities = defaultdict(dict)
    for context, counts in continuation_counts.items():
        denominator = float(sum(counts.values()))
        probabilities[context] = {
            word: occurrences / denominator
            for word, occurrences in counts.items()
        }

    return probabilities

In [6]:
def predict_next_word(test_input, ngram_probabilities, n):
    """Predict the most probable word to follow ``test_input``.

    The input is tokenized with ``tokenize_text`` and its last ``n - 1``
    tokens form the context that is looked up in ``ngram_probabilities``.

    Args:
        test_input: Text whose continuation should be predicted.
        ngram_probabilities: Mapping from a context tuple of ``n - 1`` tokens
            to a ``{next_word: probability}`` dict.
        n: Order of the n-gram model (1 = unigram, 2 = bigram, ...).

    Returns:
        The candidate with the highest probability for the context (on ties,
        the first one in the candidate dict's iteration order), or ``None``
        when the context is unknown or has no candidates.

    Raises:
        ValueError: If ``n`` is less than 1, or if the input has fewer than
            ``n - 1`` tokens.
    """
    if n < 1:
        raise ValueError("n should be at least 1.")

    tokens = tokenize_text(test_input)
    context_size = n - 1
    if len(tokens) < context_size:
        raise ValueError(f"Test input should have at least {n-1} words.")

    # Slice from an explicit start index. The negative form tokens[-(n-1):]
    # becomes tokens[-0:] for a unigram model, which is the WHOLE list rather
    # than the empty context (), so unigram lookups could never match.
    prefix = tuple(tokens[len(tokens) - context_size:])

    # A missing context and an empty candidate dict both mean "no prediction";
    # max() on an empty dict would otherwise raise.
    next_word_candidates = ngram_probabilities.get(prefix)
    if not next_word_candidates:
        return None
    return max(next_word_candidates, key=next_word_candidates.get)


In [7]:
# End-to-end demo: build an n-gram model from a tiny corpus and query it.

# Training corpus for the model
sample_text = "This is a sample text. This text is just a sample."

# Lower-case and tokenize the corpus into one flat token list
tokens = tokenize_text(sample_text)

# Model order: with n = 3, each 2-token context predicts the token after it
n = 3

# Slide a window of n tokens over the corpus
ngrams_list = generate_ngrams(tokens, n)

# Turn n-gram counts into per-context next-word probabilities
ngram_probabilities = calculate_ngram_probabilities(ngrams_list)

# Query text; predict_next_word requires at least n - 1 tokens here
test_input = "This is"

# Look up the most probable continuation (None if the context was never seen)
next_word = predict_next_word(test_input, ngram_probabilities, n)
print(f"Next word prediction for '{test_input}': {next_word}")

Next word prediction for 'This is': a
