- Build ("train") the Markov model. We will store the probabilities in dictionaries rather than in matrices / arrays.
- Use the trained model to generate poems. We'll do 4 lines at a time.

In [3]:
import string
import numpy as np
from collections import defaultdict


In [4]:
class Dataset:
    """Text corpus read from a file, exposed as cleaned, tokenised lines.

    The file is read eagerly in ``__init__``; cleaning is done lazily on first
    access to ``values`` and cached. Indexing returns the whitespace-split
    tokens of one cleaned line, and ``len()`` is the number of cleaned lines.
    Because ``__getitem__`` raises ``IndexError`` past the end, instances can
    be iterated with a plain ``for`` loop.

    Args:
        text_path: Path of the text file to read.
        label: Integer stored as ``self.label``; it is not used by any method
            of this class.
    """

    # Deletes every character in ``string.punctuation``. Built once here
    # rather than once per line.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, text_path: str, label: int):
        self.file = self._load(text_path)
        self.label = label

        # Cache for the cleaned lines, filled on first access to ``values``.
        self._values = None

    def _load(self, path: str) -> list[str]:
        """Return the raw lines of ``path`` without their line terminators."""
        with open(path, mode="r", newline="\n") as file:
            return file.read().splitlines()

    def clean_lines(self) -> list[str]:
        """Return the lower-cased, punctuation-free, non-empty lines.

        Trailing whitespace is removed. Lines that contain nothing but
        whitespace and/or punctuation are dropped, so every returned string
        yields at least one token when split.
        """
        input_text = []

        for raw_line in self.file:
            # Strip punctuation *before* the emptiness check; otherwise a line
            # such as "---" would survive as an empty sample.
            line = raw_line.lower().translate(self._PUNCTUATION_TABLE).rstrip()

            if not line:
                continue

            input_text.append(line)

        return input_text

    def get_samples(self) -> list[str]:
        """Return the cleaned lines (same result as ``clean_lines``)."""
        return self.clean_lines()

    @property
    def values(self) -> list[str]:
        """Cleaned lines, computed on first access and cached afterwards."""
        if self._values is None:
            self._values = self.get_samples()
        return self._values

    def __getitem__(self, idx: int) -> list[str]:
        """Return the whitespace-separated tokens of cleaned line ``idx``."""
        return self.values[idx].split()

    def __len__(self) -> int:
        """Return the number of cleaned lines."""
        return len(self.values)

In [5]:
# Load the Robert Frost corpus. The label (1) is only stored on the dataset.
frost = Dataset(
    text_path="../datasets/MarkovModelClassification/robert_frost.txt",
    label=1,
)

In [None]:
class MarkovGenerator:
    """Second-order Markov text generator backed by nested dictionaries.

    After ``train`` the public tables hold probabilities:

    * ``initial``      -- ``{first_token: p}``
    * ``first_order``  -- ``{t1: {t2: p}}``
    * ``second_order`` -- ``{(t0, t1): {t2: p}}``

    The string ``"END"`` is used as the end-of-line sentinel inside the
    transition tables, so a literal ``"END"`` token in the training data would
    be indistinguishable from the end of a line.

    Raw counts are stored separately from the probability tables, so ``train``
    may be called repeatedly; each call adds to the counts and rebuilds the
    probabilities from the accumulated totals.
    """

    _END = "END"

    def __init__(self):
        # Raw counts; never overwritten by normalisation.
        self._initial_counts = defaultdict(int)  # Counts of starting tokens
        self._first_counts = defaultdict(lambda: defaultdict(int))  # t1 -> t2 counts
        self._second_counts = defaultdict(lambda: defaultdict(int))  # (t0, t1) -> t2 counts

        # Probability tables derived from the counts by ``_normalize``.
        self.initial = {}
        self.first_order = {}
        self.second_order = {}

    def train(self, dataset):
        """Accumulate transition counts from ``dataset`` and renormalise.

        Args:
            dataset: Iterable of token sequences, one sequence per line.
                Empty sequences are skipped.
        """
        for line in dataset:
            if not line:
                continue

            tokens = list(line)
            self._initial_counts[tokens[0]] += 1

            if len(tokens) == 1:
                # A one-word line has no successor. Record that it ends
                # immediately so ``generate`` never samples from an empty
                # distribution for this starting token.
                self._first_counts[tokens[0]][self._END] += 1
                continue

            # First-order transitions (t1 -> t2)
            for t1, t2 in zip(tokens, tokens[1:]):
                self._first_counts[t1][t2] += 1

            # Second-order transitions ((t0, t1) -> t2); the final pair of the
            # line maps to the end-of-line sentinel.
            for t0, t1, t2 in zip(tokens, tokens[1:], tokens[2:] + [self._END]):
                self._second_counts[(t0, t1)][t2] += 1

        self._normalize()

    @staticmethod
    def _to_probabilities(counts: dict) -> dict:
        """Return ``counts`` rescaled so that its values sum to 1."""
        total = sum(counts.values())
        return {key: cnt / total for key, cnt in counts.items()}

    def _normalize(self):
        """Rebuild the public probability tables from the raw counts."""
        self.initial = self._to_probabilities(self._initial_counts)
        self.first_order = {
            t1: self._to_probabilities(successors)
            for t1, successors in self._first_counts.items()
        }
        self.second_order = {
            context: self._to_probabilities(successors)
            for context, successors in self._second_counts.items()
        }

    def generate(self, lines: int = 4):
        """Print ``lines`` generated lines, one per ``print`` call.

        Each line starts with a token drawn from ``initial``. The second token
        is drawn from ``first_order``, and every later token from
        ``second_order``, until the end-of-line sentinel is drawn.

        Raises:
            ValueError: If lines are requested before any data was trained.
        """
        if lines > 0 and not self.initial:
            raise ValueError("cannot generate text: the model has not been trained")

        for _ in range(lines):
            w0 = self.sample_word(self.initial)
            w1 = self.sample_word(self.first_order[w0])

            words = [w0]
            while w1 != self._END:
                words.append(w1)
                w2 = self.sample_word(self.second_order[(w0, w1)])
                w0, w1 = w1, w2

            print(" ".join(words))

    def sample_word(self, sampler: dict) -> str:
        """Draw one key of ``sampler`` using its values as probabilities."""
        choice = np.random.choice(list(sampler.keys()), p=list(sampler.values()))
        # ``np.random.choice`` returns a ``numpy.str_``; hand back a plain str.
        return str(choice)

In [26]:
# Fit a fresh generator on the Frost corpus, then print a four-line poem.
generator = MarkovGenerator()
generator.train(dataset=frost)
generator.generate(lines=4)

what do you see him living many years
now he snapped his eyes
but then he cant see what ive tasted of desire
you can be
