In [19]:
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
import math

# Tiny training corpus: two sentences that both begin with "I love".
text = "I love exploring new things. I love playing cricket."

# Lowercase before tokenizing so "I" and "i" are counted as the same token.
tokens = nltk.word_tokenize(text.lower())

# Adjacent token pairs over the whole token stream: (previous, next).
# The stream is not split per sentence, so pairs that straddle a sentence
# boundary are counted as well.
bigrams = list(ngrams(tokens, 2))

# Frequency tables: one for single tokens, one for token pairs.
word_counts = Counter(tokens)
bigram_counts = Counter(bigrams)

# Maximum-likelihood estimate: P(w2 | w1) = count(w1, w2) / count(w1).
# The denominator is the unigram count of w1, so a token that also ends the
# text (and therefore has one occurrence with no successor) gets outgoing
# probabilities that sum to less than 1.
bigram_probabilities = {
    pair: pair_count / word_counts[pair[0]]
    for pair, pair_count in bigram_counts.items()
}

# Report every conditional probability, formatted to two decimals.
print("\n🔹 Bigram Probabilities:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f"P({w2} | {w1}) = {prob:.2f}")

# Step 6: Predict the next word given a starting word
def predict_next_word(word):
    """Return the most probable follower of *word* under the bigram model.

    Reads the module-level ``bigram_probabilities`` mapping, whose keys are
    ``(previous, next)`` pairs and whose values are P(next | previous).
    The match on *word* is exact; the script lowercases its training
    tokens, so callers should pass a lowercase word.

    Args:
        word: The context token whose follower should be predicted.

    Returns:
        The follower with the highest probability. When several followers
        tie, the one stored first in ``bigram_probabilities`` wins. If no
        bigram starts with *word*, the string ``"No prediction available"``
        is returned instead.
    """
    followers = [
        (nxt, prob)
        for (prev, nxt), prob in bigram_probabilities.items()
        if prev == word
    ]

    # Guard clause: the word never appeared as the first half of a bigram.
    if not followers:
        return "No prediction available"

    # max() keeps the first maximal element, so ties resolve by dict order.
    best_follower, _ = max(followers, key=lambda item: item[1])
    return best_follower

# Example prediction: query the model for a couple of seed words that
# occur in the training text.
print("\n🔹 Next word prediction:")
for seed_word in ("i", "love"):
    print(f"Next word after '{seed_word}': {predict_next_word(seed_word)}")



🔹 Bigram Probabilities:
P(love | i) = 1.00
P(exploring | love) = 0.50
P(new | exploring) = 1.00
P(things | new) = 1.00
P(. | things) = 1.00
P(i | .) = 0.50
P(playing | love) = 0.50
P(cricket | playing) = 1.00
P(. | cricket) = 1.00

🔹 Next word prediction:
Next word after 'i': love
Next word after 'love': exploring
