In [3]:
import nltk
from nltk import word_tokenize, FreqDist
from nltk.util import ngrams
from collections import defaultdict, Counter
import random

# Download the 'punkt' package that NLTK's tokenizer uses.
nltk.download('punkt')

# Toy corpus. It is lower-cased before tokenizing so that capitalised and
# lower-case spellings of the same word share one count.
text = "This is a sample text corpus. This corpus is used to demonstrate the generation of unigrams, bigrams, and trigrams. The bigram probabilities are also calculated. Lastly, next word prediction is performed."
tokens = word_tokenize(text.lower())

# N-gram sequences: the unigram sequence is the token list itself, while the
# bigram and trigram sequences are sliding windows of width 2 and 3.
unigrams = tokens
bigrams, trigrams = (list(ngrams(tokens, width)) for width in (2, 3))

# One frequency table per n-gram order.
unigram_freq, bigram_freq, trigram_freq = map(
    FreqDist, (unigrams, bigrams, trigrams)
)

# Conditional bigram probabilities P(w2 | w1), built in two passes over a
# nested mapping keyed first by the leading word, then by the follower.
bigram_probabilities = defaultdict(lambda: defaultdict(lambda: 0))

# Pass 1: raw co-occurrence counts for every (leading word, follower) pair.
for first, second in bigrams:
    followers = bigram_probabilities[first]
    followers[second] = followers[second] + 1

# Pass 2: normalise each leading word's counts in place so they sum to 1.
# Only existing keys are reassigned, so iterating the same dict is safe.
for followers in bigram_probabilities.values():
    total_count = float(sum(followers.values()))
    for second, count in followers.items():
        followers[second] = count / total_count

# Next word prediction
def predict_next_word(prev_word, bigram_probabilities):
    """Return the most probable word to follow ``prev_word``.

    Args:
        prev_word: The word to condition on.
        bigram_probabilities: Mapping of leading word to a mapping of
            follower word -> probability. May be a plain ``dict`` or a
            ``defaultdict``.

    Returns:
        The follower with the highest probability, or ``None`` when
        ``prev_word`` has no recorded followers. On ties, the follower that
        comes first in the inner mapping's iteration order wins.
    """
    # Use .get() instead of subscripting: indexing a defaultdict with an
    # unseen word would silently insert an empty entry into the model, and
    # indexing a plain dict would raise KeyError. .get() does neither.
    next_word_probs = bigram_probabilities.get(prev_word)
    if not next_word_probs:
        return None
    return max(next_word_probs, key=next_word_probs.get)


# Print each frequency table under its heading, as returned by most_common().
for heading, freq in (
    ("Unigrams:", unigram_freq),
    ("\nBigrams:", bigram_freq),
    ("\nTrigrams:", trigram_freq),
):
    print(heading)
    print(freq.most_common())

# Print every conditional probability, grouped by the leading word.
print("\nBigram Probabilities:")
for w1, followers in bigram_probabilities.items():
    for w2, prob in followers.items():
        print(f"P({w2}|{w1}) = {prob:.4f}")

# Demo: predict the most likely follower of a fixed seed word.
prev_word = 'this'
predicted_next_word = predict_next_word(prev_word, bigram_probabilities)
print(f"\nPredicted next word for '{prev_word}': {predicted_next_word}")

Unigrams:
[('.', 4), ('is', 3), (',', 3), ('this', 2), ('corpus', 2), ('the', 2), ('a', 1), ('sample', 1), ('text', 1), ('used', 1), ('to', 1), ('demonstrate', 1), ('generation', 1), ('of', 1), ('unigrams', 1), ('bigrams', 1), ('and', 1), ('trigrams', 1), ('bigram', 1), ('probabilities', 1), ('are', 1), ('also', 1), ('calculated', 1), ('lastly', 1), ('next', 1), ('word', 1), ('prediction', 1), ('performed', 1)]

Bigrams:
[(('this', 'is'), 1), (('is', 'a'), 1), (('a', 'sample'), 1), (('sample', 'text'), 1), (('text', 'corpus'), 1), (('corpus', '.'), 1), (('.', 'this'), 1), (('this', 'corpus'), 1), (('corpus', 'is'), 1), (('is', 'used'), 1), (('used', 'to'), 1), (('to', 'demonstrate'), 1), (('demonstrate', 'the'), 1), (('the', 'generation'), 1), (('generation', 'of'), 1), (('of', 'unigrams'), 1), (('unigrams', ','), 1), ((',', 'bigrams'), 1), (('bigrams', ','), 1), ((',', 'and'), 1), (('and', 'trigrams'), 1), (('trigrams', '.'), 1), (('.', 'the'), 1), (('the', 'bigram'), 1), (('bigram', 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
