In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict


In [2]:
# Download the NLTK 'punkt' data package (marked as required data; presumably
# what word_tokenize relies on — confirm against the installed NLTK version).
nltk.download('punkt')

def generate_ngrams(text, n):
    """Return the word-level n-grams of ``text`` as a list.

    The text is lower-cased and split with ``word_tokenize`` before the
    n-grams are built with ``nltk.util.ngrams``.

    Args:
        text: Input string.
        n: Number of tokens per n-gram.

    Returns:
        A list holding every item yielded by ``ngrams(tokens, n)``, in the
        order they are produced.
    """
    lowered = text.lower()
    words = word_tokenize(lowered)
    return [gram for gram in ngrams(words, n)]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mubas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
def calculate_ngram_probabilities(text, n, smoothing=False):
    """Estimate conditional next-word probabilities from ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. For every
    observed n-gram ``(w1, ..., wn)`` the probability of ``wn`` given the
    context ``(w1, ..., w(n-1))`` is computed as::

        count(n-gram) / count(context)                      # smoothing=False
        (count(n-gram) + 1) / (count(context) + V)          # smoothing=True

    where ``V`` is the number of distinct tokens in ``text``.

    Args:
        text: Training text.
        n: Order of the model (1 = unigram, 2 = bigram, ...). Must be >= 1.
        smoothing: When True, apply add-one (Laplace) smoothing to the
            observed n-grams.

    Returns:
        A ``defaultdict(dict)`` mapping each context tuple to a dict of
        ``{next_word: probability}``. Only n-grams that occur in ``text``
        are present. For ``n == 1`` the only context is the empty tuple ``()``.

    Raises:
        ValueError: If ``n`` is smaller than 1.

    Note:
        The context count includes every occurrence of the context, also one
        at the very end of the text that has no following word, so unsmoothed
        probabilities for that context can sum to less than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Tokenize once and reuse the tokens for both n-gram orders and the vocabulary.
    tokens = word_tokenize(text.lower())
    n_gram_counts = Counter(ngrams(tokens, n))

    if n == 1:
        # A unigram has the empty context (); its count is the total number of
        # tokens. Without this the denominator below would be 0 for n == 1.
        n_minus_1_gram_counts = Counter({(): len(tokens)})
    else:
        n_minus_1_gram_counts = Counter(ngrams(tokens, n - 1))

    vocabulary_size = len(set(tokens))
    n_gram_probabilities = defaultdict(dict)

    for n_gram, count in n_gram_counts.items():
        context, word = n_gram[:-1], n_gram[-1]
        context_count = n_minus_1_gram_counts[context]
        if smoothing:
            probability = (count + 1) / (context_count + vocabulary_size)
        else:
            probability = count / context_count
        n_gram_probabilities[context][word] = probability

    return n_gram_probabilities


In [12]:

def predict_top_n_next_words(text, n, preceding_words, top_n=3, smoothing=True):
    """Return the ``top_n`` most probable next words after ``preceding_words``.

    An n-gram model is built from ``text`` with
    ``calculate_ngram_probabilities`` and queried with the last ``n - 1``
    tokens of ``preceding_words``.

    Args:
        text: Training text for the n-gram model.
        n: Order of the model (2 = bigram, 3 = trigram, ...).
        preceding_words: Prompt string; only its last ``n - 1`` tokens are
            used as context.
        top_n: Maximum number of predictions to return.
        smoothing: Forwarded to ``calculate_ngram_probabilities``.

    Returns:
        A list of ``(word, probability)`` tuples sorted by descending
        probability, or an empty list when the context was never seen.
    """
    n_gram_probabilities = calculate_ngram_probabilities(text, n, smoothing=smoothing)

    # Tokenize the prompt the same way the training text is tokenized
    # (lower-case + word_tokenize) so punctuation and contractions line up
    # with the model's keys.
    context_tokens = word_tokenize(preceding_words.lower())
    context_size = n - 1
    # A slice of [-0:] would return the whole list, so the unigram case
    # (empty context) has to be handled explicitly.
    preceding_tuple = tuple(context_tokens[-context_size:]) if context_size > 0 else ()

    # .get avoids inserting a new key into the defaultdict for unseen contexts.
    candidates = n_gram_probabilities.get(preceding_tuple)
    if not candidates:
        return []

    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [4]:
def predict_next_word(text, n, preceding_words, smoothing=False):
    """Return the single most probable next word after ``preceding_words``.

    An n-gram model is built from ``text`` with
    ``calculate_ngram_probabilities`` and queried with the last ``n - 1``
    tokens of ``preceding_words``.

    Args:
        text: Training text for the n-gram model.
        n: Order of the model (2 = bigram, 3 = trigram, ...).
        preceding_words: Prompt string; only its last ``n - 1`` tokens are
            used as context.
        smoothing: Forwarded to ``calculate_ngram_probabilities``. Defaults
            to False, which keeps the unsmoothed estimate.

    Returns:
        A ``(word, probability)`` tuple for the best continuation, or
        ``(None, 0.0)`` when the context was never seen in ``text``.
    """
    n_gram_probabilities = calculate_ngram_probabilities(text, n, smoothing=smoothing)

    # Tokenize the prompt the same way the training text is tokenized
    # (lower-case + word_tokenize) so punctuation and contractions line up
    # with the model's keys.
    context_tokens = word_tokenize(preceding_words.lower())
    context_size = n - 1
    # A slice of [-0:] would return the whole list, so the unigram case
    # (empty context) has to be handled explicitly.
    preceding_tuple = tuple(context_tokens[-context_size:]) if context_size > 0 else ()

    # .get avoids inserting a new key into the defaultdict for unseen contexts.
    candidates = n_gram_probabilities.get(preceding_tuple)
    if not candidates:
        return None, 0.0

    next_word = max(candidates, key=candidates.get)
    return next_word, candidates[next_word]

In [14]:
if __name__ == "__main__":
    # Small demo corpus; it is also reused by the top-n prediction cell below.
    sample_text = "I work in the HPC Lab with my team members in the HPC Lab in the peaceful environment"

    # Bigram model: most likely word following "HPC".
    next_word, probability = predict_next_word(sample_text, 2, "HPC")
    print(f"The word next to HPC is: {next_word}, with probability: {probability}")

    # Trigram model: most likely word following the two-word context "in the".
    next_word, probability = predict_next_word(sample_text, 3, "in the")
    print(f"The word next to in the is: {next_word}, with probability: {probability}")

The word next to HPC is: lab, with robability: 1.0
The word next to in the is: hpc, with probability: 0.6666666666666666


In [15]:

# Predict the top 3 most likely next words for the prompt "in the".
# With n=2 (bigram model) only the last prompt word, "the", is used as context.
# smoothing defaults to True here, so the printed probabilities are add-one smoothed.
top_predictions = predict_top_n_next_words(sample_text, 2, "in the", top_n=3)

for word, prob in top_predictions:
    print(f"Next word: {word}, Probability: {prob}")

Next word: hpc, Probability: 0.2
Next word: peaceful, Probability: 0.13333333333333333
