In [1]:
import random
from collections import defaultdict
import re

In [2]:
class MarkovChainTextGenerator:
    def __init__(self, n = 3):
        self.model = defaultdict(list)
        self.n = n
    
    def train(self, text):
        words = text.split()
        if(len(words) - self.n <= 0):
            print("Training text too low!")
            return
        
        for i in range(len(words) - self.n):
            key = tuple(words[i:i + self.n])
            next_word = words[i + self.n]
            self.model[key].append(next_word)

    def generate_text(self, seed_text, num_words = 20):
        words = seed_text.split()
        if len(words) < self.n:
            key = random.choice(list(self.model.keys()))
        else:
            key = tuple(words[-self.n:])
            if key not in self.model:
                key = random.choice(list(self.model.keys()))

        result = list(key)

        for _ in range(num_words):
            next_words = self.model.get(key)
            if not next_words:
                break
            next_word = random.choice(next_words)
            result.append(next_word)
            key = tuple(result[-self.n:])
            
        return " ".join(result)

In [3]:
text = "the cat sat on the mat and the cat lay on the rug and the dog barked"

generator = MarkovChainTextGenerator()
generator.train(text)

for word, next_words in generator.model.items():
    print(f"{word} -> {next_words}")

('the', 'cat', 'sat') -> ['on']
('cat', 'sat', 'on') -> ['the']
('sat', 'on', 'the') -> ['mat']
('on', 'the', 'mat') -> ['and']
('the', 'mat', 'and') -> ['the']
('mat', 'and', 'the') -> ['cat']
('and', 'the', 'cat') -> ['lay']
('the', 'cat', 'lay') -> ['on']
('cat', 'lay', 'on') -> ['the']
('lay', 'on', 'the') -> ['rug']
('on', 'the', 'rug') -> ['and']
('the', 'rug', 'and') -> ['the']
('rug', 'and', 'the') -> ['dog']
('and', 'the', 'dog') -> ['barked']


In [4]:
print(generator.generate_text("the cat sat", 15))

the cat sat on the mat and the cat lay on the rug and the dog barked


In [5]:
def clean_shakespeare_text(text):
    # Remove Project Gutenberg header/footer
    start_marker = "*** START OF"
    end_marker = "*** END OF"
    start = text.find(start_marker)
    end = text.find(end_marker)
    if start != -1 and end != -1:
        text = text[start:end]

    # Remove stage directions in brackets [ ... ]
    text = re.sub(r'\[.*?\]', '', text)

    # Remove all-uppercase lines (usually character names and stage notes)
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped.isupper() and len(stripped) > 0:
            cleaned_lines.append(stripped)
    text = ' '.join(cleaned_lines)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    return text


In [None]:
with open("./shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()
    
cleaned_text = clean_shakespeare_text(text)

generator = MarkovChainTextGenerator(n=3)
generator.train(cleaned_text)

In [7]:
generator = MarkovChainTextGenerator(n=3)
generator.train(text)

In [8]:
print(generator.generate_text("To bear such", 50))

To bear such idleness so near the truth as I will desires among five thousand, and five hundred too. CAIUS. By gar, then I have as much mockvater as de Englishman. Scurvy jack-dog priest! By gar, me vill kill de priest, for he speak for a jackanape to Anne Page. HOST. Let him
