In [6]:
import re
from collections import defaultdict, Counter

# ---------- Preprocessing ----------
def preprocess(text):
    """Normalise raw text into a list of lowercase word tokens.

    The text is lowercased, every character that is neither a word
    character (``\\w``) nor whitespace is deleted, and the remainder is split
    on whitespace.

    Args:
        text: Input string.

    Returns:
        List of token strings; empty when ``text`` contains no word
        characters.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return without_punctuation.split()

# ---------- N-gram Generation ----------
def get_ngrams(tokens, n):
    """Return every contiguous run of ``n`` tokens as a tuple, in order.

    Args:
        tokens: Sequence of tokens.
        n: Window length.

    Returns:
        List of ``n``-tuples; empty when ``tokens`` has fewer than ``n`` items.
    """
    ngrams = []
    window_count = len(tokens) - n + 1
    for start in range(window_count):
        ngrams.append(tuple(tokens[start:start + n]))
    return ngrams

# ---------- Probability Computation ----------
def compute_bigram_prob(bigram_counts, unigram_counts):
    """Estimate P(w2 | w1) for every observed bigram.

    Each probability is ``count(w1, w2) / count(w1)``.

    Args:
        bigram_counts: Mapping of ``(w1, w2)`` -> count.
        unigram_counts: Mapping of ``(w1,)`` -> count.

    Returns:
        ``defaultdict(float)`` keyed by ``(w1, w2)``; bigrams that were never
        observed read as ``0.0``.
    """
    estimates = {
        (first, second): freq / unigram_counts[(first,)]
        for (first, second), freq in bigram_counts.items()
    }
    return defaultdict(float, estimates)

def compute_trigram_prob(trigram_counts, bigram_counts):
    """Estimate P(w3 | w1, w2) for every observed trigram.

    Each probability is ``count(w1, w2, w3) / count(w1, w2)``.

    Args:
        trigram_counts: Mapping of ``(w1, w2, w3)`` -> count.
        bigram_counts: Mapping of ``(w1, w2)`` -> count.

    Returns:
        ``defaultdict(float)`` keyed by ``(w1, w2, w3)``; trigrams that were
        never observed read as ``0.0``.
    """
    estimates = {
        (first, second, third): freq / bigram_counts[(first, second)]
        for (first, second, third), freq in trigram_counts.items()
    }
    return defaultdict(float, estimates)

# ---------- Prediction ----------
def predict_next_word(input_text, bigram_probs, trigram_probs):
    """Predict the word most likely to follow ``input_text``.

    The input is tokenised with ``preprocess``. If at least two tokens are
    available, the trigram model is consulted using the last two tokens; if
    that yields nothing (or only one token is available) the bigram model is
    consulted using the last token.

    Args:
        input_text: Word or phrase typed by the user.
        bigram_probs: Mapping of ``(w1, w2)`` -> probability.
        trigram_probs: Mapping of ``(w1, w2, w3)`` -> probability.

    Returns:
        The highest-probability continuation (the first such entry in the
        mapping's order when several tie), or the string
        ``"No prediction available."`` when no model entry matches.
    """
    history = preprocess(input_text)

    # First choice: trigram lookup on the last two words.
    if len(history) >= 2:
        second_last, last = history[-2], history[-1]
        trigram_matches = {}
        for ngram, prob in trigram_probs.items():
            if ngram[0] == second_last and ngram[1] == last:
                trigram_matches[ngram] = prob
        if trigram_matches:
            best_trigram = max(trigram_matches, key=trigram_matches.get)
            return best_trigram[2]

    # Back off to a bigram lookup on the last word only.
    if history:
        last = history[-1]
        bigram_matches = {}
        for ngram, prob in bigram_probs.items():
            if ngram[0] == last:
                bigram_matches[ngram] = prob
        if bigram_matches:
            best_bigram = max(bigram_matches, key=bigram_matches.get)
            return best_bigram[1]

    return "No prediction available."

# ---------- Display Helper ----------
def display_counts(title, counts):
    """Print a titled listing of n-gram counts, one ``ngram: count`` per line.

    Args:
        title: Heading text, printed between ``---`` markers after a blank line.
        counts: Mapping of n-gram -> count; entries are printed in the
            mapping's iteration order.
    """
    heading = f"\n--- {title} ---"
    print(heading)
    for ngram, freq in counts.items():
        line = f"{ngram}: {freq}"
        print(line)

def display_probs(title, probs):
    """Print a titled listing of n-gram probabilities with four decimals.

    Each entry is shown as ``P<ngram> = <probability>``, e.g.
    ``P('i', 'love') = 1.0000``.

    Args:
        title: Heading text, printed between ``---`` markers after a blank line.
        probs: Mapping of n-gram -> probability; entries are printed in the
            mapping's iteration order.
    """
    heading = f"\n--- {title} ---"
    print(heading)
    for ngram, prob in probs.items():
        line = f"P{ngram!s} = {prob:.4f}"
        print(line)

# ---------- Main Function ----------
def main(text="I love learning new things every day."):
    """Build n-gram models from ``text`` and run an interactive prediction prompt.

    Prints the unigram/bigram/trigram counts and the bigram/trigram
    probabilities, then repeatedly asks for a word or phrase and prints the
    predicted next word. The prompt ends when the user types ``exit`` (any
    case, surrounding whitespace ignored) or when input ends.

    Args:
        text: Training corpus. Defaults to the built-in sample sentence, so
            ``main()`` with no arguments needs no external file.
    """
    # Preprocess
    tokens = preprocess(text)

    # N-gram models
    unigram_counts = Counter(get_ngrams(tokens, 1))
    bigram_counts = Counter(get_ngrams(tokens, 2))
    trigram_counts = Counter(get_ngrams(tokens, 3))

    bigram_probs = compute_bigram_prob(bigram_counts, unigram_counts)
    trigram_probs = compute_trigram_prob(trigram_counts, bigram_counts)

    # Output
    display_counts("Unigram Counts", unigram_counts)
    display_counts("Bigram Counts", bigram_counts)
    display_counts("Trigram Counts", trigram_counts)

    display_probs("Bigram Probabilities", bigram_probs)
    display_probs("Trigram Probabilities", trigram_probs)

    # Prediction
    while True:
        try:
            user_input = input("\nEnter a word or phrase (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the session without a traceback.
            break
        # Strip first so that " exit " is still recognised as the quit command.
        if user_input.strip().lower() == "exit":
            break
        prediction = predict_next_word(user_input, bigram_probs, trigram_probs)
        print("Predicted next word:", prediction)

# Run the program: build the models, print the tables and start the
# interactive prompt. The guard keeps main() from running if this code is
# imported as a module instead of being executed directly.
if __name__ == "__main__":
    main()



--- Unigram Counts ---
('i',): 1
('love',): 1
('learning',): 1
('new',): 1
('things',): 1
('every',): 1
('day',): 1

--- Bigram Counts ---
('i', 'love'): 1
('love', 'learning'): 1
('learning', 'new'): 1
('new', 'things'): 1
('things', 'every'): 1
('every', 'day'): 1

--- Trigram Counts ---
('i', 'love', 'learning'): 1
('love', 'learning', 'new'): 1
('learning', 'new', 'things'): 1
('new', 'things', 'every'): 1
('things', 'every', 'day'): 1

--- Bigram Probabilities ---
P('i', 'love') = 1.0000
P('love', 'learning') = 1.0000
P('learning', 'new') = 1.0000
P('new', 'things') = 1.0000
P('things', 'every') = 1.0000
P('every', 'day') = 1.0000

--- Trigram Probabilities ---
P('i', 'love', 'learning') = 1.0000
P('love', 'learning', 'new') = 1.0000
P('learning', 'new', 'things') = 1.0000
P('new', 'things', 'every') = 1.0000
P('things', 'every', 'day') = 1.0000



Enter a word or phrase (or type 'exit' to quit):  learning


Predicted next word: new



Enter a word or phrase (or type 'exit' to quit):  exit


In [9]:
import re
from collections import defaultdict

# Preprocessing: lowercase, remove punctuation, tokenize
def preprocess(text):
    """Turn raw text into lowercase word tokens.

    Steps: lowercase the text, delete every character that is neither a word
    character (``\\w``) nor whitespace, then split on whitespace.

    Args:
        text: Input string.

    Returns:
        List of token strings (empty if nothing but punctuation/whitespace
        was given).
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^\w\s]", "", lowered)
    return cleaned.split()

# Build n-grams
def build_ngrams(tokens, n):
    """Slide a window of length ``n`` over ``tokens`` and collect each window.

    Args:
        tokens: Sequence of tokens.
        n: Window length.

    Returns:
        List of ``n``-tuples in corpus order; empty when ``tokens`` has fewer
        than ``n`` items.
    """
    last_start = len(tokens) - n
    return list(
        tuple(tokens[start:start + n]) for start in range(last_start + 1)
    )

# Count frequencies
def count_ngrams(ngrams):
    """Tally how many times each n-gram occurs.

    Args:
        ngrams: Iterable of hashable n-grams (tuples of tokens).

    Returns:
        ``defaultdict(int)`` mapping n-gram -> occurrence count, keyed in
        order of first appearance; n-grams never seen read as ``0``.
    """
    tally = defaultdict(int)
    for gram in ngrams:
        tally[gram] = tally.get(gram, 0) + 1
    return tally

# Compute bigram probabilities with explanation
def compute_bigram_prob(bigram_counts, unigram_counts):
    """Compute P(w2 | w1) for every observed bigram and print each calculation.

    Every probability is ``count(w1, w2) / count(w1)``. A header line is
    printed first, then one line per bigram showing the numerator, the
    denominator and the result rounded to three decimals.

    Args:
        bigram_counts: Mapping of ``(w1, w2)`` -> count.
        unigram_counts: Mapping of ``(w1,)`` -> count.

    Returns:
        ``defaultdict(float)`` keyed by ``(w1, w2)``.
    """
    print("\n--- Bigram Probabilities ---")
    table = defaultdict(float)
    for bigram, freq in bigram_counts.items():
        prev_word, next_word = bigram
        context_total = unigram_counts[(prev_word,)]
        ratio = freq / context_total
        table[(prev_word, next_word)] = ratio
        print(f"P({next_word} | {prev_word}) = {freq} / {context_total} = {ratio:.3f}")
    return table

# Compute trigram probabilities with explanation
def compute_trigram_prob(trigram_counts, bigram_counts):
    """Compute P(w3 | w1 w2) for every observed trigram and print each calculation.

    Every probability is ``count(w1, w2, w3) / count(w1, w2)``. A header line
    is printed first, then one line per trigram showing the numerator, the
    denominator and the result rounded to three decimals.

    Args:
        trigram_counts: Mapping of ``(w1, w2, w3)`` -> count.
        bigram_counts: Mapping of ``(w1, w2)`` -> count.

    Returns:
        ``defaultdict(float)`` keyed by ``(w1, w2, w3)``.
    """
    print("\n--- Trigram Probabilities ---")
    table = defaultdict(float)
    for trigram, freq in trigram_counts.items():
        first, second, third = trigram
        context_total = bigram_counts[(first, second)]
        ratio = freq / context_total
        table[(first, second, third)] = ratio
        print(f"P({third} | {first} {second}) = {freq} / {context_total} = {ratio:.3f}")
    return table

# Predict next word
def _most_likely_next(ngram_probs, context):
    """Return every word tied for the highest probability after ``context``.

    Args:
        ngram_probs: Mapping of n-gram tuple -> probability.
        context: Tuple of leading words an n-gram must start with.

    Returns:
        List of candidate next words (the last element of each matching
        n-gram) sharing the maximum probability; empty if nothing matches.
    """
    size = len(context)
    candidates = {
        ngram[-1]: prob
        for ngram, prob in ngram_probs.items()
        if ngram[:size] == context
    }
    if not candidates:
        return []
    top = max(candidates.values())
    return [word for word, prob in candidates.items() if prob == top]


def predict_next_word(input_text, bigram_probs, trigram_probs):
    """Predict the most likely next word(s) after ``input_text``.

    The trigram model is tried first with the last two input words; if it has
    no match (or only one word was given) the bigram model is tried with the
    last word.

    Args:
        input_text: Word or phrase typed by the user.
        bigram_probs: Mapping of ``(w1, w2)`` -> probability.
        trigram_probs: Mapping of ``(w1, w2, w3)`` -> probability.

    Returns:
        List of all words tied for the highest probability, or
        ``["No prediction available"]`` when neither model has a match.
    """
    # Normalise the query the same way preprocess() normalises the corpus
    # (lowercase, drop punctuation, split on whitespace). Splitting alone
    # leaves tokens such as "love." that can never match a model key.
    input_tokens = re.sub(r"[^\w\s]", "", input_text.lower()).split()

    if len(input_tokens) >= 2:
        best_words = _most_likely_next(trigram_probs, tuple(input_tokens[-2:]))
        if best_words:
            return best_words

    if input_tokens:
        best_words = _most_likely_next(bigram_probs, (input_tokens[-1],))
        if best_words:
            return best_words

    return ["No prediction available"]


# Main function
def main(text="I love coding and I love learning."):
    """Build n-gram models from ``text``, print them and run a prediction prompt.

    Prints the unigram/bigram/trigram counts, computes the bigram and trigram
    probabilities, then repeatedly asks for a word or phrase and prints the
    predicted next word(s). The prompt ends when the user types ``exit`` (any
    case, surrounding whitespace ignored) or when input ends.

    Args:
        text: Training corpus. Defaults to the built-in example sentence, so
            ``main()`` with no arguments behaves as before.
    """
    # Preprocess
    tokens = preprocess(text)

    # Build n-grams and count their frequencies
    unigram_counts = count_ngrams(build_ngrams(tokens, 1))
    bigram_counts = count_ngrams(build_ngrams(tokens, 2))
    trigram_counts = count_ngrams(build_ngrams(tokens, 3))

    # Print counts (only the first header has no leading blank line)
    count_tables = (
        ("--- Unigram Counts ---", unigram_counts),
        ("\n--- Bigram Counts ---", bigram_counts),
        ("\n--- Trigram Counts ---", trigram_counts),
    )
    for header, counts in count_tables:
        print(header)
        for ngram, freq in counts.items():
            print(f"{ngram}: {freq}")

    # Calculate and show probabilities
    bigram_probs = compute_bigram_prob(bigram_counts, unigram_counts)
    trigram_probs = compute_trigram_prob(trigram_counts, bigram_counts)

    # Prediction
    print("\n--- Next Word Prediction ---")
    while True:
        try:
            inp = input("Enter a word or phrase (or 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the session without a traceback.
            break
        # Strip first so that " exit " is still recognised as the quit command.
        if inp.strip().lower() == "exit":
            break
        prediction = predict_next_word(inp, bigram_probs, trigram_probs)
        print("Predicted next word:", prediction)

# Run the main function: print the count and probability tables, then start
# the interactive prompt. The guard keeps main() from running if this code is
# imported as a module instead of being executed directly.
if __name__ == "__main__":
    main()


--- Unigram Counts ---
('i',): 2
('love',): 2
('coding',): 1
('and',): 1
('learning',): 1

--- Bigram Counts ---
('i', 'love'): 2
('love', 'coding'): 1
('coding', 'and'): 1
('and', 'i'): 1
('love', 'learning'): 1

--- Trigram Counts ---
('i', 'love', 'coding'): 1
('love', 'coding', 'and'): 1
('coding', 'and', 'i'): 1
('and', 'i', 'love'): 1
('i', 'love', 'learning'): 1

--- Bigram Probabilities ---
P(love | i) = 2 / 2 = 1.000
P(coding | love) = 1 / 2 = 0.500
P(and | coding) = 1 / 1 = 1.000
P(i | and) = 1 / 1 = 1.000
P(learning | love) = 1 / 2 = 0.500

--- Trigram Probabilities ---
P(coding | i love) = 1 / 2 = 0.500
P(and | love coding) = 1 / 1 = 1.000
P(i | coding and) = 1 / 1 = 1.000
P(love | and i) = 1 / 1 = 1.000
P(learning | i love) = 1 / 2 = 0.500

--- Next Word Prediction ---


Enter a word or phrase (or 'exit' to quit):  love


Predicted next word: ['coding', 'learning']


Enter a word or phrase (or 'exit' to quit):  i


Predicted next word: ['love']


Enter a word or phrase (or 'exit' to quit):  exit
