# 📘 Task 03: Text Generation with Markov Chains

This notebook demonstrates how to build a basic **Markov Chain-based text generator** that learns word transitions from a given corpus and uses them to produce **new semi-plausible sentences**.

---

In [None]:
# Install the notebook's dependencies. `%pip` installs into the environment of
# the running kernel, whereas `!pip` runs whichever pip is first on PATH.
%pip install matplotlib markovify spacy nltk
# Download the small English spaCy pipeline that the setup cell loads with spacy.load.
!python -m spacy download en_core_web_sm

In [None]:
import random
import re
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import nltk
import spacy

# Fetch NLTK's 'gutenberg' resource; the dataset cell reads the plays from it
# through nltk.corpus.gutenberg.
nltk.download('gutenberg')
# Load the spaCy pipeline fetched in the install cell. `nlp` is used further
# down to split the cleaned text into sentences.
nlp = spacy.load("en_core_web_sm")

## 📚 Dataset: Shakespeare Corpus

We'll use three Shakespeare plays — *Hamlet*, *Macbeth*, and *Julius Caesar* — from NLTK's Gutenberg corpus.

In [None]:
from nltk.corpus import gutenberg

# Read the raw text of each play from the Gutenberg corpus (one string per play)
hamlet, macbeth, caesar = (
    gutenberg.raw(file_id)
    for file_id in (
        'shakespeare-hamlet.txt',
        'shakespeare-macbeth.txt',
        'shakespeare-caesar.txt',
    )
)

# Clean function
def clean_text(text):
    """Strip markup-like noise from `text` and normalize its whitespace.

    The substitutions are applied in order:
      1. every ``--`` becomes a single space,
      2. bracketed spans ``[...]`` that sit on one line are removed
         (``.`` does not match a newline, so spans crossing lines are kept),
      3. standalone runs of digits are removed.

    Finally every run of whitespace (including newlines) collapses to one
    space and leading/trailing whitespace is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, single-line string.
    """
    substitutions = (
        (r'--', ' '),
        (r'\[.*?\]', ''),
        (r'\b\d+\b', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    tokens = text.split()
    return ' '.join(tokens)

# Run every play through the shared cleaning routine
hamlet_clean, macbeth_clean, caesar_clean = (
    clean_text(play) for play in (hamlet, macbeth, caesar)
)

# Let spaCy segment the combined text into sentences, keep only those longer
# than 5 characters, and glue the survivors back into one training string
doc = nlp(" ".join([hamlet_clean, macbeth_clean, caesar_clean]))
sentences = [span.text for span in doc.sents if len(span.text) > 5]
joined_sentences = " ".join(sentences)

## 🧩 Markov Chain Implementation

In [None]:
class MarkovChain:
    """Word-level Markov chain text generator.

    The model maps each history (a tuple of `order` lowercase words) to the
    list of words observed immediately after it. Repeated successors are kept
    in that list, so a uniform random pick from it follows the observed
    frequencies.
    """

    def __init__(self, order=2):
        """Create an untrained chain.

        Args:
            order: Number of preceding words that make up the history used to
                choose the next word. Must be at least 1.

        Raises:
            ValueError: If `order` is smaller than 1.
        """
        # With order 0 the history slice `output[-0:]` would be the whole
        # output, so generation would silently degenerate.
        if order < 1:
            raise ValueError("Order must be at least 1.")
        self.order = order
        self.model = defaultdict(list)

    @staticmethod
    def _tokenize(text):
        """Lowercase `text` and return its runs of word characters as a list."""
        return re.findall(r'\b\w+\b', text.lower())

    def train(self, text):
        """Add the word transitions found in `text` to the model.

        Calling this repeatedly accumulates transitions; no transition is
        created across the boundary between two separate calls.

        Args:
            text: Training text. It is lowercased and split into words.

        Raises:
            ValueError: If `text` has too few words to form a single
                transition (it needs more than `order` words).
        """
        words = self._tokenize(text)
        # One transition needs `order` history words plus one successor, so
        # exactly `order` words would leave the model silently empty.
        if len(words) <= self.order:
            raise ValueError("Text too short for this model.")
        for i in range(len(words) - self.order):
            history = tuple(words[i:i + self.order])
            next_word = words[i + self.order]
            self.model[history].append(next_word)

    def generate(self, length=50, seed=None):
        """Generate text by walking the chain.

        Args:
            length: Maximum number of words in the result. Generation stops
                early when the current history has no recorded successor.
            seed: Optional starting phrase. It is tokenized like the training
                text and used as the first history if the model contains it;
                otherwise a message is printed and a random history is used.

        Returns:
            The generated words joined by single spaces (an empty string when
            `length` is zero or negative).

        Raises:
            ValueError: If the model holds no transitions yet.
        """
        if not self.model:
            raise ValueError("Model not trained yet.")

        current = None
        if seed:
            # Tokenize the seed like the training text so that letter case or
            # punctuation in the seed does not prevent a match.
            seed_words = tuple(self._tokenize(seed))
            if seed_words in self.model:
                current = seed_words
            else:
                print("Seed not found, starting randomly.")
        if current is None:
            current = random.choice(list(self.model))

        # The starting history alone may already exceed the requested length.
        output = list(current)[:max(length, 0)]

        for _ in range(length - self.order):
            # .get() avoids inserting empty entries into the defaultdict.
            next_words = self.model.get(current)
            if not next_words:
                break
            output.append(random.choice(next_words))
            current = tuple(output[-self.order:])
        return ' '.join(output)

## 🎯 Train and Generate

In [None]:
# Fix the random number generator so that Restart & Run All reproduces the
# same samples (generation relies on random.choice).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Create and train model
mc = MarkovChain(order=2)
mc.train(joined_sentences)

# Generate sample text. Note that `seed=` here is the starting phrase of the
# generated text, not the RNG seed set above.
print("Generated Text (seeded):")
print(mc.generate(length=30, seed="the forest"))

print("\nGenerated Text (random start):")
print(mc.generate(length=30))

## 📊 Optional: Visualize Word Transitions

In [None]:
# History (tuple of preceding words) whose successors we want to inspect
history_to_check = ('the', 'forest')

# Tally how often each word follows that history in the trained model;
# a history the model never saw yields an empty Counter
counts = Counter(mc.model.get(history_to_check, []))

# Report a missing history, otherwise draw one bar per successor word
if not counts:
    print(f"No transitions found for {history_to_check}")
else:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(list(counts.keys()), list(counts.values()), color='teal')
    ax.set_title(f"Next word frequencies after: {history_to_check}")
    ax.set_ylabel("Frequency")
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()