In [2]:
import csv
from pathlib import Path
import unicodedata as ud

# import tokenizer
from mmdt_tokenizer.core import MyanmarSyllableTokenizer

In [5]:
def normalize(text: str) -> str:
    """Return *text* with outer whitespace removed, in Unicode NFC form."""
    trimmed = text.strip()
    composed = ud.normalize("NFC", trimmed)
    return composed

def flatten_if_nested(seq):
    """Return a one-level-flattened list when the first element is a list.

    Only the first element is inspected to decide whether *seq* is nested;
    empty (or otherwise falsy) input and flat sequences are returned as-is.
    """
    # Guard clause: nothing to flatten unless the first element is a list.
    if not seq or not isinstance(seq[0], list):
        return seq
    return [item for group in seq for item in group]

def build_lexicons(csv_path: Path) -> dict[str, list[tuple[str, ...]]]:
    """Read a ``tag,word`` CSV and group syllable-tokenized words by tag.

    Skipped rows:
      * empty rows and rows whose first cell starts with ``#``;
      * the first remaining row, if its first cell is ``tag``
        (case-insensitive, surrounding whitespace ignored) - the header;
      * rows with fewer than two cells;
      * rows whose tag or word is empty after normalization;
      * rows for which the tokenizer yields no syllables.

    Args:
        csv_path: Path of the CSV file. It is decoded with ``utf-8-sig``,
            so a leading byte-order mark is tolerated.

    Returns:
        A dict mapping each normalized tag to the list of syllable tuples
        for its words, in the order the rows appear in the file.
    """
    tokenizer = MyanmarSyllableTokenizer()
    lexicons: dict[str, list[tuple[str, ...]]] = {}

    # newline="" hands newline handling to the csv module, which it needs
    # to parse quoted fields containing line breaks correctly.
    with open(csv_path, encoding="utf-8-sig", newline="") as f:
        awaiting_first_row = True
        for row in csv.reader(f):
            if not row or row[0].startswith("#"):
                continue

            # Only the first non-comment row may be a header.
            if awaiting_first_row:
                awaiting_first_row = False
                if row[0].strip().lower() == "tag":
                    continue

            if len(row) < 2:
                continue

            tag, word = normalize(row[0]), normalize(row[1])
            # An empty tag would become an unnamed group; an empty word has
            # nothing to tokenize.
            if not tag or not word:
                continue

            syllables = flatten_if_nested(
                tokenizer.tokenize(word, return_list=True)
            )
            if not syllables:
                continue
            lexicons.setdefault(tag, []).append(tuple(syllables))
    return lexicons


def write_lexicons(lexicons: dict[str, list[tuple[str, ...]]], out_path: Path) -> None:
    """Write *lexicons* to *out_path* as Python-style dict definitions.

    For every tag one block of the following form is emitted, with the tag
    upper-cased::

        TAG = {
            ('syl1', 'syl2'): 'TAG',
        }

    Args:
        lexicons: Mapping from tag to a list of syllable tuples.
        out_path: Destination file, written as UTF-8. Missing parent
            directories are created.
    """
    lines = [
        f"# Auto-generated by build_lexicons.py\n# Total tags: {len(lexicons)}\n\n"
    ]
    for tag, entries in lexicons.items():
        name = tag.upper()
        lines.append(f"{name} = {{")
        # repr() of a tuple keeps the trailing comma on one-element keys
        # ("('x',)"); joining the items by hand produced "('x')", which is
        # a plain parenthesised string rather than a tuple.
        lines.extend(f"    {tuple(tokens)!r}: {name!r}," for tokens in entries)
        lines.append("}\n\n")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"[OK] Wrote lexicon definitions to {out_path}")


def main():
    """Convert the raw lexicon CSV into the formatted lexicon file.

    Input and output locations are fixed, relative to the current working
    directory. If the input CSV is missing, an error is printed and nothing
    is written.
    """
    default_in = Path("../data/raw_lex_data.csv")
    default_out = Path("../data/formatted_lex_data.txt")

    # Bail out early when there is nothing to convert.
    if not default_in.exists():
        print(f"[ERROR] Input CSV not found: {default_in}")
        print("Please place your CSV file at data/raw_lex_data.csv")
        return

    write_lexicons(build_lexicons(default_in), default_out)

# Run the conversion immediately when this cell/script is executed.
main()


[OK] Wrote lexicon definitions to ../data/formatted_lex_data.txt
