
# 01 — Morph **n‑gram** mapping (1–3 grams) + DP translator

We keep **emMorph**’s **finest** segments as units (no coalescing), then build a dictionary that covers
**unigrams, bigrams, trigrams** of morphs whenever a *compact* English word matches well.  
At runtime, a **dynamic program** chooses between 1‑, 2‑ or 3‑morph mappings that yield the lowest
phonetic cost for the sentence.


## Config

In [2]:

from pathlib import Path

# HU source
CORPUS_DIR = Path("./data/hu_corpus")   # *.txt, one sentence per line; if empty, falls back to wordfreq

# emtsv
EMTSV_EXEC = "emtsv"                     # or "~/.local/bin/emtsv"

# Outputs
OUT_DIR = Path("./data/coverage"); OUT_DIR.mkdir(parents=True, exist_ok=True)
CSV_UNIGRAMS = OUT_DIR / "hu_morph1_full.csv"
CSV_BIGRAMS  = OUT_DIR / "hu_morph2_full.csv"
CSV_TRIGRAMS = OUT_DIR / "hu_morph3_full.csv"
CSV_MAP      = OUT_DIR / "morph_ngram_map.csv"  # the final merged mapping

# Coverage trimming by token mass
TARGET_COVERAGE = 0.995

# EN candidate pool settings
MIN_ZIPF = 4.0        # drop rare/foreign words; raise to 3.8/4.0 if needed
MAX_CAND = 50000      # cap EN candidates considered
ALLOW_2SYLL_FOR_N = {1: False, 2: True, 3: True}
LEN_RANGE_FOR_N = {1:(2,4), 2:(3,6), 3:(3,7)}  # phone length ranges per n-gram

# Cost weights
LENGTH_GAP_W = 0.6
EN_LEN_W     = 0.08
RARITY_W     = 0.25
STEP_PENALTY = 0.05

# DP limits
NGRAM_MAX = 3


## Imports & project wiring

In [3]:

import sys, os, re, math, json, shlex, subprocess, collections, pandas as pd
from typing import List, Dict, Tuple, Sequence, Optional
from dataclasses import dataclass

# src/
THIS = Path.cwd()
SRC  = (THIS / "./src").resolve()
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

from src.g2p_hu import hu_text_to_ipa
from src.cmudict_utils import load_cmudict
from src.ipa_map import proxies_for_hu_phone
from src.phone_mapping import phone_distance

from wordfreq import top_n_list, zipf_frequency


## emMorph (finest) segmentation → morph list

In [4]:

def _run(cmd: str, text: str):
    return subprocess.run(
        shlex.split(cmd),
        input=(text if text.endswith("\n") else text + "\n").encode("utf-8"),
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
    )

def _emtsv_lines(text: str, emtsv_exec: str):
    tries = [
        f"{emtsv_exec} tok,emMorph --output-header",
        f"{emtsv_exec} emMorph --output-header",
    ]
    alt = os.path.expanduser("~/.local/bin/emtsv")
    if emtsv_exec != alt:
        tries += [f"{alt} tok,emMorph --output-header", f"{alt} emMorph --output-header"]
    last_err = None
    for cmd in tries:
        try:
            p = _run(cmd, text)
            return p.stdout.decode("utf-8", errors="replace").splitlines()
        except Exception as e:
            last_err = e
    raise RuntimeError(f"emtsv failed; last error: {last_err}")

def _parse_tsv(lines: List[str]):
    if not lines: return [], []
    header = [h.strip() for h in lines[0].split("\t")]
    rows = []
    for line in lines[1:]:
        if not line.strip():
            rows.append(None)
            continue
        parts = line.split("\t")
        if len(parts) < len(header):
            parts += [""] * (len(header)-len(parts))
        rows.append(dict(zip(header, parts)))
    return header, rows

def _segments_from_morphana_string(morphana: str) -> List[str]:
    segs = []
    for chunk in morphana.split("+"):
        chunk = chunk.strip()
        if not chunk:
            continue
        if "=" in chunk:
            _, rhs = chunk.split("=", 1)
            rhs = rhs.strip()
            if rhs:
                segs.append(rhs)
    return segs

def _pick_finest_from_anas_json(anas_json: str) -> Optional[List[str]]:
    try:
        arr = json.loads(anas_json)
        best = None; best_score = -1
        for item in arr:
            ma = item.get("morphana") or ""
            score = ma.count("+")
            if score > best_score and ma:
                best_score = score; best = ma
        if best:
            return _segments_from_morphana_string(best)
    except Exception:
        pass
    return None

def morph_segments_for_text(text: str, emtsv_exec: str = "emtsv") -> List[str]:
    lines = _emtsv_lines(text, emtsv_exec)
    header, rows = _parse_tsv(lines)
    segs_all: List[str] = []
    for row in rows:
        if row is None:
            continue
        form = row.get("form") or row.get("FORM") or ""
        anas = row.get("anas") or row.get("ANAS") or ""
        segs = None
        if anas.strip().startswith("[") and "morphana" in anas:
            segs = _pick_finest_from_anas_json(anas)
        if not segs:
            if form and any(ch.isalpha() for ch in form):
                segs = [form]
            else:
                segs = []
        segs_all.extend(segs)
    return segs_all

# smoke test
try:
    print(morph_segments_for_text("Áradás.", emtsv_exec=EMTSV_EXEC))
except Exception as e:
    print("NOTE: emtsv may not be available in this environment. Error:", e)


['ár', 'ad', 'ás']


In [5]:
# --- Simple Hungarian grapheme syllabifier and stem micro-splitter ---

HU_VOWELS = set(list("aáeéiíoóöőuúüűAÁEÉIÍOÓÖŐUÚÜŰ"))
# Treat Hungarian digraphs/trigraphs as single consonant graphemes
DIGRAPHS3 = ["dzs","DZS"]
DIGRAPHS2 = ["dz","cs","gy","ly","ny","sz","ty","zs",
             "DZ","CS","GY","LY","NY","SZ","TY","ZS"]

def _graphemes(word: str) -> list[str]:
    """Split Hungarian word into graphemes, keeping digraphs/trigraphs intact."""
    s = word
    g = []
    i = 0
    while i < len(s):
        # longest-first (dzs)
        if i+3 <= len(s) and s[i:i+3] in DIGRAPHS3:
            g.append(s[i:i+3]); i += 3; continue
        if i+2 <= len(s) and s[i:i+2] in DIGRAPHS2:
            g.append(s[i:i+2]); i += 2; continue
        g.append(s[i]); i += 1
    return g

def _is_vowel_g(gr: str) -> bool:
    # any of its letters is a vowel → treat grapheme as vowel
    return any(ch in HU_VOWELS for ch in gr)

def syllabify_hu_graphemes(word: str) -> list[str]:
    """
    Very simple syllabification by graphemes:
      - each syllable has exactly one vowel nucleus;
      - apply 'maximal onset' with a conservative coda: keep at most 1 consonant in coda.
    """
    g = _graphemes(word)
    if not g: return []
    # positions of vowel nuclei
    nuclei = [i for i,gr in enumerate(g) if _is_vowel_g(gr)]
    if not nuclei:
        return ["".join(g)]  # all-consonant (rare), don't split

    sylls: list[list[str]] = []

    # handle onset before first nucleus
    start = 0
    for idx, nuc in enumerate(nuclei):
        if idx == 0:
            onset = g[:nuc]    # all leading consonants as onset of first syllable
        else:
            prev_nuc = nuclei[idx-1]
            # between prev_nuc and nuc:
            bridge = g[prev_nuc+1:nuc]
            # keep at most 1 consonant as coda for previous syllable, rest goes to onset here
            coda_prev = bridge[:1]
            onset = bridge[1:]
            # append coda to previous syllable
            sylls[-1].extend(coda_prev)
        sylls.append(onset + [g[nuc]])  # onset + nucleus

    # tail after last nucleus → coda of last syllable
    tail = g[nuclei[-1]+1:]
    if tail:
        # keep at most 2 in coda (Hungarian allows some clusters), tweak if you like
        coda = tail[:2]
        onset_extra = tail[2:]  # if any, attach to last coda conservatively
        sylls[-1].extend(coda + onset_extra)

    return ["".join(x) for x in sylls]

# Micro-split long stems based on grapheme syllables
REFINE_LONG_STEMS = True
MAX_CHARS_PER_UNIT = 6   # if a morph is longer than this (characters), split into syllables
MIN_KEEP_CHARS    = 3    # very short morphs are left as-is

def refine_morph_units(morphs: list[str]) -> list[str]:
    out: list[str] = []
    for m in morphs:
        if (not REFINE_LONG_STEMS) or (len(m) <= MAX_CHARS_PER_UNIT) or (len(m) < MIN_KEEP_CHARS):
            out.append(m)
            continue
        # split into grapheme syllables
        syls = syllabify_hu_graphemes(m)
        # merge ultra-short sylls back to neighbors to avoid 1-char pieces
        merged: list[str] = []
        for s in syls:
            if not merged:
                merged.append(s)
            elif len(s) == 1:
                merged[-1] = merged[-1] + s
            else:
                merged.append(s)
        out.extend(merged)
    return out


## Build morph n‑gram inventories (1–3)

In [None]:

def iter_source_lines():
    if CORPUS_DIR.exists():
        for p in sorted(CORPUS_DIR.glob("*.txt")):
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line = line.strip()
                    if line: yield line
        return
    if top_n_list is not None:
        for w in top_n_list("hu", 200000):
            yield w
        return
    yield from ["Sárga bögre, görbe bögre", "Mit sütsz, kis szűcs", "Árvíztűrő tükörfúrógép"]

def count_morph_ngrams(nmax=3, emtsv_exec: str = "emtsv"):
    cnts = {1: collections.Counter(), 2: collections.Counter(), 3: collections.Counter()}
    seen = 0
    for line in iter_source_lines():
        try:
            morphs = morph_segments_for_text(line, emtsv_exec=emtsv_exec)
            morphs = refine_morph_units(morphs)
        except Exception:
            morphs = []
        L = len(morphs)
        for i in range(L):
            for n in (1,2,3):
                if i+n <= L:
                    seq = tuple(morphs[i:i+n])
                    cnts[n][seq] += 1
        seen += 1 
    return cnts, seen

cnts, seen = count_morph_ngrams(3, emtsv_exec=EMTSV_EXEC)
print("Lines processed:", seen, "uni:", len(cnts[1]), "bi:", len(cnts[2]), "tri:", len(cnts[3]))

def write_top(counter, path: Path):
    df = pd.DataFrame([{"unit": " + ".join(u), "count": c} for u,c in counter.items()]).sort_values(["count","unit"], ascending=[False, True]).reset_index(drop=True)
    df.to_csv(path, index=False)
    cum = df["count"].cumsum().astype(float); total=float(df["count"].sum())
    cut = (cum/ max(1.0,total) <= TARGET_COVERAGE)
    if cut.any():
        last = cut[cut].index.max()+1
        df_top = df.iloc[:last].copy()
    else:
        df_top = df
    return df, df_top

df1, top1 = write_top(cnts[1], CSV_UNIGRAMS)
df2, top2 = write_top(cnts[2], CSV_BIGRAMS)
df3, top3 = write_top(cnts[3], CSV_TRIGRAMS)

print("Wrote:", CSV_UNIGRAMS, "rows:", len(df1))
print("Wrote:", CSV_BIGRAMS,  "rows:", len(df2))
print("Wrote:", CSV_TRIGRAMS, "rows:", len(df3))

top1.head(10), top2.head(10), top3.head(10)


Lines processed: 46637 uni: 12564 bi: 27949 tri: 24107
Wrote: data/coverage/hu_morph1_full.csv rows: 12564
Wrote: data/coverage/hu_morph2_full.csv rows: 27949
Wrote: data/coverage/hu_morph3_full.csv rows: 24107


(  unit  count
 0    t   4093
 1    i   2263
 2   ás   1816
 3   et   1794
 4   és   1557
 5   ek   1523
 6   el   1425
 7   at   1348
 8  meg   1282
 9   ok   1236,
        unit  count
 0  tA/ + Ad    416
 1     é + t    344
 2     á + t    317
 3    és + é    282
 4     t + a    274
 5   ek + et    253
 6     t + e    248
 7    ás + a    235
 8    ás + á    224
 9    és + e    220,
                unit  count
 0  tA/ + Ad + j]=té    165
 1    Ad + j]=té + k    161
 2  tA/ + Ad + j]=tá    156
 3    Ad + j]=tá + k    154
 4        ás + á + t    105
 5        és + é + t     95
 6       ás + ár + a     72
 7       és + é + re     67
 8   Ad + j]=ot + tá     62
 9    j]=ot + tá + k     62)

In [None]:
top1 = top1.head(2894)
top2 = top2.head(4040)
top3 = top3.head(1633)


2894

## Curated EN candidate pool (by n‑gram size)

In [8]:

VOWELS = {
    "i","ɪ","e","ɛ","æ","ɑ","a","ɒ","ʌ","ɝ","ə","o","ɔ","ʊ","u","y","ø",
    "eɪ","oʊ","aɪ","aʊ","ɔɪ","i:","e:","a:","o:","u:","y:","ø:","iː","eː","aː","oː","uː","yː","øː"
}
def is_vowel(p): return p in VOWELS
def syllables(pron: Sequence[str]) -> int: return sum(1 for t in pron if is_vowel(t))

def onset_coda(pron: Sequence[str]):
    j = next((i for i,p in enumerate(pron) if is_vowel(p)), None)
    if j is None: return (), None, ()
    return tuple(pron[:j]), j, tuple(pron[j+1:])

ALLOWED_ONSET_1 = {"p","b","t","d","k","g","f","v","θ","ð","s","z","ʃ","ʒ","h","m","n","l","r","w","j","tʃ","dʒ"}
ALLOWED_ONSET_2 = {("s","p"),("s","t"),("s","k"),("s","m"),("s","n"),("s","w"),("s","l"),
                   ("p","l"),("p","r"),("b","l"),("b","r"),("t","r"),("t","w"),("d","r"),
                   ("k","l"),("k","r"),("g","l"),("g","r"),("f","l"),("f","r"),("θ","r"),("ʃ","r")}
ALLOWED_CODA_1 = {"p","b","t","d","k","g","f","v","θ","ð","s","z","ʃ","ʒ","m","n","ŋ","l","r","tʃ","dʒ"}
ALLOWED_CODA_2 = {("p","s"),("t","s"),("k","s"),("g","z"),("f","t"),("f","s"),
                   ("m","p"),("n","t"),("n","d"),("ŋ","k"),("ŋ","g"),
                   ("l","d"),("l","t"),("l","k"),("l","f"),("l","z"),
                   ("r","d"),("r","t"),("r","k"),("r","z"),("r","s")}
BANNED = {"schnee","meh","heh","uh","ugh","yah","ya","yo","bo","shwa","shwaa","umm","mmm"}

def good_shape(p: Sequence[str]) -> bool:
    onset, j, coda = onset_coda(p)
    if j is None or len(onset)>2 or len(coda)>2: return False
    ok_on = (len(onset)==0) or (len(onset)==1 and onset[0] in ALLOWED_ONSET_1) \
            or (len(onset)==2 and tuple(onset) in ALLOWED_ONSET_2)
    ok_cd = (len(coda)==0) or (len(coda)==1 and coda[0] in ALLOWED_CODA_1) \
            or (len(coda)==2 and tuple(coda) in ALLOWED_CODA_2)
    return ok_on and ok_cd

@dataclass
class EnCand:
    word: str
    pron: Tuple[str,...]
    zipf: float
    syll: int
    length: int

cmu = load_cmudict(min_zipf=MIN_ZIPF, max_words=None)
EN_CANDS: List[EnCand] = []
for w, prons in cmu.items():
    if not prons: continue
    p = prons[0]
    if not re.fullmatch(r"[a-z']+", w or ""): continue
    if w in BANNED: continue
    s = syllables(p); L = len(p)
    if s > 2 or not (2 <= L <= 7): continue
    if not good_shape(p): continue
    EN_CANDS.append(EnCand(w, tuple(p), zipf_frequency(w,"en"), s, L))

EN_CANDS.sort(key=lambda x: (-x.zipf, x.syll, x.length))

def en_candidates_for_n(n: int, hu_len: int) -> List[EnCand]:
    minL, maxL = LEN_RANGE_FOR_N.get(n, (2,4))
    allow2 = ALLOW_2SYLL_FOR_N.get(n, False)
    out = []
    for c in EN_CANDS:
        if c.length < minL or c.length > maxL: 
            continue
        if (c.syll > 1) and not allow2:
            continue
        out.append(c)
    out.sort(key=lambda x: (abs(x.length - hu_len), x.syll, -x.zipf, x.length))
    return out[:MAX_CAND] if MAX_CAND else out

len(EN_CANDS)


1372

## Costs (proxy‑aware, length/rarity penalties)

In [9]:

VOWELS_EN = {"i","ɪ","eɪ","ɛ","æ","ɑ","ʌ","ɝ","ə","oʊ","ɔ","ʊ","u","aɪ","aʊ","ɔɪ"}
def isv_en(p): return p in VOWELS_EN

def phone_cost(en_p: str, hu_p: str, alpha=1.5, cv_pen=1.2) -> float:
    best = math.inf
    for en_proxy, pen in proxies_for_hu_phone(hu_p):
        d = phone_distance(en_proxy, en_p)
        if isv_en(en_proxy) != isv_en(en_p):
            d += cv_pen
        cand = alpha*d + pen
        if cand < best:
            best = cand
    return float(best)

def unit_cost_base(en_unit: Tuple[str,...], hu_unit: Tuple[str,...]) -> float:
    if len(en_unit) == len(hu_unit):
        if not hu_unit: return 0.0
        return sum(phone_cost(e,h) for e,h in zip(en_unit, hu_unit)) / len(hu_unit)
    m = min(len(en_unit), len(hu_unit))
    if m == 0:
        return 2.0 + LENGTH_GAP_W * abs(len(en_unit) - len(hu_unit))
    base = sum(phone_cost(en_unit[i], hu_unit[i]) for i in range(m)) / m
    return base + LENGTH_GAP_W * abs(len(en_unit) - len(hu_unit))

def rarity_penalty(z: float) -> float:
    return max(0.0, 4.0 - float(z)) * RARITY_W

def unit_cost(en_unit: Tuple[str,...], hu_unit: Tuple[str,...], z: float) -> float:
    return unit_cost_base(en_unit, hu_unit) + EN_LEN_W * len(en_unit) + rarity_penalty(z)

def hu_text_to_ipa_tokens(text: str) -> Tuple[str,...]:
    iph = [p for p in hu_text_to_ipa(text) if isinstance(p, str) and p]
    return tuple(iph)

def best_en_for_ngram(morphs: Tuple[str,...]) -> Tuple[str, Tuple[str,...], float]:
    hu_text = "".join(morphs)
    hu_ipa  = hu_text_to_ipa_tokens(hu_text)
    if not hu_ipa:
        return "", tuple(), float("inf")
    cand_list = en_candidates_for_n(len(morphs), len(hu_ipa))
    best_w, best_p, best_c = "", tuple(), float("inf")
    for c in cand_list:
        cst = unit_cost(c.pron, hu_ipa, c.zipf)
        if cst < best_c:
            best_c = cst; best_w = c.word; best_p = c.pron
    return best_w, best_p, best_c


## Build mapping for top uni/bi/tri‑grams

In [10]:

def parse_units(df: pd.DataFrame) -> List[Tuple[Tuple[str,...], int]]:
    out = []
    for _,r in df.iterrows():
        u = tuple(str(r["unit"]).split(" + "))
        c = int(r.get("count", 1))
        out.append((u,c))
    return out

rows = []
for n, dftop in [(1, top1), (2, top2), (3, top3)]:
    items = parse_units(dftop)
    for u, c in items:
        w, p, cost = best_en_for_ngram(u)
        rows.append({
            "hu_ngram": " + ".join(u),
            "n": n,
            "en_word": w,
            "en_pron": " ".join(p),
            "cost": round(cost, 3),
            "count": c,
        })

map_df = pd.DataFrame(rows).sort_values(["n","cost","count"], ascending=[True, True, False]).reset_index(drop=True)
map_df.to_csv(CSV_MAP, index=False)
print("Wrote:", CSV_MAP, "rows:", len(map_df))
map_df.head(15)


Wrote: data/coverage/morph_ngram_map.csv rows: 8567


Unnamed: 0,hu_ngram,n,en_word,en_pron,cost,count
0,et,1,et,ɛ t,0.16,1794
1,el,1,l,ɛ l,0.16,1425
2,en,1,n,ɛ n,0.16,858
3,em,1,m,ɛ m,0.16,600
4,an,1,on,ɑ n,0.16,522
5,ad,1,odd,ɑ d,0.16,512
6,Ad,1,odd,ɑ d,0.16,416
7,ot,1,ought,ɔ t,0.16,415
8,ed,1,ed,ɛ d,0.16,329
9,in,1,in,ɪ n,0.16,293


## DP translator (choose 1/2/3‑gram mappings per context)

In [11]:

WORD_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)

def load_ngram_map(csv_path: str | Path) -> Dict[int, Dict[Tuple[str,...], Tuple[str, Tuple[str,...], float]]]:
    df = pd.read_csv(csv_path)
    byn: Dict[int, Dict[Tuple[str,...], Tuple[str, Tuple[str,...], float]]] = {1:{},2:{},3:{}}
    for _, r in df.iterrows():
        n = int(r["n"]); u = tuple(str(r["hu_ngram"]).split(" + "))
        w = str(r["en_word"]); p = tuple(str(r["en_pron"]).split())
        c = float(r["cost"])
        byn[n][u] = (w, p, c)
    return byn

def dp_translate_tokens(morphs: List[str], ngram_map: Dict[int, Dict[Tuple[str,...], Tuple[str, Tuple[str,...], float]]]):
    N = len(morphs)
    dp = [ (float("inf"), []) for _ in range(N+1) ]
    dp[0] = (0.0, [])
    for i in range(N):
        cur_cost, cur_path = dp[i]
        if not math.isfinite(cur_cost): continue
        for k in range(1, NGRAM_MAX+1):
            j = i + k
            if j > N: break
            u = tuple(morphs[i:j])
            if u in ngram_map.get(k, {}):
                w, pr, c = ngram_map[k][u]
            else:
                w, pr, c = best_en_for_ngram(u)
            new_cost = cur_cost + c + STEP_PENALTY
            if new_cost < dp[j][0]:
                dp[j] = (new_cost, cur_path + [w if w else "uh"])
    return dp[N]

def translate_text(text: str, ngram_map_csv: str | Path, emtsv_exec: str = EMTSV_EXEC) -> str:
    nmap = load_ngram_map(ngram_map_csv)
    out = []
    for tok in WORD_RE.findall(text):
        if not tok.strip() or not tok.isalpha():
            out.append(tok); continue
        morphs = morph_segments_for_text(tok, emtsv_exec=emtsv_exec)
        cost, words = dp_translate_tokens(morphs, nmap)
        out.append(" ".join(words))
    s = " ".join(out)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    s = re.sub(r"\(\s+", "(", s); s = re.sub(r"\s+\)", ")", s)
    return s

# Demo (requires emtsv in your environment)
try:
    print(translate_text("Tűnj el, tűnj el te vagy az igazi bűnjel. Áradás.", CSV_MAP, emtsv_exec=EMTSV_EXEC))
except Exception as e:
    print("Demo skipped (needs emtsv).", e)


turn you l, turn you l tell lodge oz blocks been well. blood.
