<a href="https://colab.research.google.com/github/ImtiazAhmed07/Word-Predictor/blob/main/Next%20Word%20predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re
from collections import defaultdict

# Sample corpus: three short English sentences used as the toy training text
# for the bigram counts built later in the notebook.
corpus = """
I am happy because I am learning.
Learning is fun because I am gaining knowledge.
Knowledge is power.
"""

# Preprocess the corpus
def preprocess(text):
    """Turn raw text into a list of lowercase word tokens.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted (punctuation, digits, apostrophes, ...), and the
    remainder is split on runs of whitespace.

    Args:
        text: The input string.

    Returns:
        A list of tokens in their original order; empty if nothing survives.
    """
    return re.sub(r'[^a-z\s]', '', text.lower()).split()

# Token list for the whole corpus in reading order; sentence boundaries are
# not kept because punctuation is stripped before splitting.
words = preprocess(corpus)

In [2]:
# Bigram tally: every word paired with its immediate successor in `words`.
# `words` is a single flat list, so a pair can straddle two sentences.
bigram_counts = defaultdict(int)
for left, right in zip(words, words[1:]):
    bigram_counts[left, right] += 1

# Unigram tally: number of occurrences of each individual word.
unigram_counts = defaultdict(int)
for token in words:
    unigram_counts[token] += 1

In [3]:
def predict_next_word(prev_word, bigram_counts, unigram_counts):
    """List the words seen after ``prev_word``, most frequent first.

    Args:
        prev_word: The word whose successors are wanted.
        bigram_counts: Mapping of ``(first_word, second_word)`` to a count.
        unigram_counts: Accepted for signature compatibility; not used here.

    Returns:
        A list of ``(next_word, count)`` tuples sorted by count in descending
        order (ties keep the iteration order of ``bigram_counts``), or
        ``None`` when no bigram starts with ``prev_word``.
    """
    followers = []
    for pair, freq in bigram_counts.items():
        if pair[0] == prev_word:
            followers.append((pair[1], freq))

    if len(followers) == 0:
        return None

    # sorted() is stable, so equal counts stay in their original order.
    return sorted(followers, key=lambda entry: entry[1], reverse=True)

# Example prediction
# Look up every word observed directly after 'i'. The result is a list of
# (word, count) pairs with the highest count first, or None if no bigram
# starts with the query word.
prev_word = 'i'
candidates = predict_next_word(prev_word, bigram_counts, unigram_counts)
print(f"Next word candidates for '{prev_word}': {candidates}")

Next word candidates for 'i': [('am', 3)]


In [6]:
def softmax(x):
    """Return the softmax of ``x`` as a NumPy array.

    The maximum of ``x`` is subtracted before exponentiating. This does not
    change the result mathematically but keeps ``np.exp`` from overflowing on
    large inputs. Normalisation is over all elements of ``x``.
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def predict_with_softmax(prev_word, bigram_counts, unigram_counts):
    """Score the words seen after ``prev_word`` with a softmax over their counts.

    Args:
        prev_word: The word whose successors are wanted.
        bigram_counts: Mapping of ``(first_word, second_word)`` to a count.
        unigram_counts: Accepted for signature compatibility; not used here.

    Returns:
        A list of ``(next_word, probability)`` tuples in the iteration order
        of ``bigram_counts``, where each probability is a built-in ``float``
        and the probabilities sum to 1, or ``None`` when no bigram starts
        with ``prev_word``.
    """
    candidates = [(bigram[1], count) for bigram, count in bigram_counts.items() if bigram[0] == prev_word]
    if not candidates:
        return None
    # Local name chosen so it does not shadow the notebook-level `words` list.
    next_words, counts = zip(*candidates)
    # .tolist() turns NumPy scalars into plain Python floats, so the printed
    # result reads `1.0` on every NumPy version (NumPy 2 would otherwise show
    # `np.float64(1.0)`).
    probabilities = softmax(np.array(counts)).tolist()
    return list(zip(next_words, probabilities))

# Example prediction with softmax
# Query the words observed directly after 'i' and turn their bigram counts
# into probabilities via softmax. The result is a list of (word, probability)
# pairs, or None if no bigram starts with the query word.
prev_word = 'i'
candidates = predict_with_softmax(prev_word, bigram_counts, unigram_counts)
print(f"Next word probabilities for '{prev_word}': {candidates}")

Next word probabilities for 'i': [('am', 1.0)]
