## Config

In [None]:

from pathlib import Path

# Inputs
# Every data file lives under DATA_DIR, so re-pointing the notebook at another
# data root is a single edit here (the resulting paths are unchanged).
DATA_DIR = Path("./data_v1")
TEST_HU_PATH   = DATA_DIR / "test_set_hu.txt"                        # 1 HU sentence per line
NGRAM_MAP_CSV  = DATA_DIR / "coverage" / "morph_ngram_dp_dict_v1.csv"
AUDIO_DIR = DATA_DIR / "audio_morph_dp_v1"                           # synthesized wav files are written here
AUDIO_DIR.mkdir(exist_ok=True, parents=True)

# emtsv binary
EMTSV_EXEC     = "emtsv"   # or "~/.local/bin/emtsv"

# Fallback behavior for unseen n-grams
USE_FALLBACK_ON_MISSING_NGRAM = True
MAX_NGRAM = 3                          # 1..3

# Cost weights
LENGTH_GAP_W = 0.6     # |len(EN)-len(HU)| in phones
EN_LEN_W     = 0.08    # per-EN-phone penalty
RARITY_W     = 0.25    # penalty for low-frequency EN words
INS_COST     = 0.7     # insertion cost in phone-DP
DEL_COST     = 0.7     # deletion cost in phone-DP

# Outputs
RESULTS_DIR = DATA_DIR / "results"
OUT_CSV = RESULTS_DIR / "results_morph_ngram_dp_v1.csv"
OUT_SUMMARY = RESULTS_DIR / "results_morph_ngram_summary_dp_v1.txt"
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)


## Imports & project wiring

In [2]:

import sys, os, re, math, json, subprocess, shlex, pandas as pd
from typing import List, Tuple, Sequence, Optional, Dict
from dataclasses import dataclass

# Project wiring: expose <cwd>/src on the import path, adding it at most once
# so re-running this cell does not keep growing sys.path.
THIS = Path.cwd()
SRC  = THIS.joinpath("./src").resolve()
if not any(entry == str(SRC) for entry in sys.path):
    sys.path.append(str(SRC))

# Project modules
from src.g2p_hu import hu_text_to_ipa
from src.cmudict_utils import load_cmudict
from src.ipa_map import proxies_for_hu_phone
from src.phone_mapping import phone_distance
from src.metrics import wer, cer, per
from src.tts_asr import synthesize_en_text, transcribe_hu
from tqdm import tqdm
import time

from wordfreq import zipf_frequency

  from .autonotebook import tqdm as notebook_tqdm


## emMorph (finest) segmentation

In [3]:

def _run(cmd: str, text: str):
    return subprocess.run(
        shlex.split(cmd),
        input=(text if text.endswith("\n") else text + "\n").encode("utf-8"),
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
    )

def _emtsv_lines(text: str, emtsv_exec: str):
    tries = [
        f"{emtsv_exec} tok,emMorph --output-header",
        f"{emtsv_exec} emMorph --output-header",
    ]
    alt = os.path.expanduser("~/.local/bin/emtsv")
    if emtsv_exec != alt:
        tries += [f"{alt} tok,emMorph --output-header", f"{alt} emMorph --output-header"]
    last_err = None
    for cmd in tries:
        try:
            p = _run(cmd, text)
            return p.stdout.decode("utf-8", errors="replace").splitlines()
        except Exception as e:
            last_err = e
    raise RuntimeError(f"emtsv failed; last error: {last_err}")

def _parse_tsv(lines: List[str]):
    if not lines: return [], []
    header = [h.strip() for h in lines[0].split("\t")]
    rows = []
    for line in lines[1:]:
        if not line.strip():
            rows.append(None)
            continue
        parts = line.split("\t")
        if len(parts) < len(header):
            parts += [""] * (len(header)-len(parts))
        rows.append(dict(zip(header, parts)))
    return header, rows

def _segments_from_morphana_string(morphana: str) -> List[str]:
    segs = []
    for chunk in morphana.split("+"):
        chunk = chunk.strip()
        if not chunk:
            continue
        if "=" in chunk:
            _, rhs = chunk.split("=", 1)
            rhs = rhs.strip()
            if rhs:
                segs.append(rhs)
    return segs

def _pick_finest_from_anas_json(anas_json: str) -> Optional[List[str]]:
    try:
        arr = json.loads(anas_json)
        best = None; best_score = -1
        for item in arr:
            ma = item.get("morphana") or ""
            score = ma.count("+")
            if score > best_score and ma:
                best_score = score; best = ma
        if best:
            return _segments_from_morphana_string(best)
    except Exception:
        pass
    return None

def morph_segments_for_text(text: str, emtsv_exec: str = "emtsv") -> List[str]:
    """Segment `text` into a flat list of morph strings using emtsv output.

    For every non-blank output row, the "anas" cell is used when it looks like
    a JSON list mentioning "morphana"; its finest analysis supplies the
    segments. Otherwise (or when that yields nothing) the token's surface form
    is used as a single segment, provided it contains at least one alphabetic
    character; tokens without letters contribute nothing.
    """
    _, rows = _parse_tsv(_emtsv_lines(text, emtsv_exec))
    collected: List[str] = []
    for row in rows:
        if row is None:  # blank line in the emtsv output
            continue
        form = row.get("form") or row.get("FORM") or ""
        anas = row.get("anas") or row.get("ANAS") or ""
        has_json_analyses = anas.strip().startswith("[") and "morphana" in anas
        segs = _pick_finest_from_anas_json(anas) if has_json_analyses else None
        if not segs:
            # Fallback: whole token, but only if it carries a letter.
            keep_form = bool(form) and any(ch.isalpha() for ch in form)
            segs = [form] if keep_form else []
        collected.extend(segs)
    return collected


## EN candidates & costs (for fallback scoring)

In [4]:

# Build a broad EN candidate set (moderate filter; used only when an n‑gram is missing from the map)
@dataclass
class EnCand:
    """One English candidate word used by the fallback scorer.

    Instances are created while building EN_CANDS from the CMU dictionary
    entries returned by load_cmudict.
    """
    # Word spelling (dictionary key).
    word: str
    # Phone sequence of the word's first listed pronunciation.
    pron: Tuple[str,...]
    # zipf_frequency(word, "en") value; higher means more frequent.
    zipf: float
    # Number of phones in `pron`.
    length: int
    # Number of vowel phones in `pron`, as counted by syllables().
    syll: int

# Phone symbols counted as vowels when estimating syllables.
VOWELS = {"i","ɪ","e","ɛ","æ","ɑ","a","ɒ","ʌ","ɝ","ə","o","ɔ","ʊ","u","y","ø",
          "eɪ","oʊ","aɪ","aʊ","ɔɪ","i:","e:","a:","o:","u:","y:","ø:","iː","eː","aː","oː","uː","yː","øː"}


def is_vowel(p):
    """Return True when phone symbol `p` is a member of VOWELS."""
    return p in VOWELS


def syllables(pron: Sequence[str]) -> int:
    """Count the vowel phones in `pron` (one syllable per vowel phone)."""
    return len([phone for phone in pron if is_vowel(phone)])

cmu = load_cmudict(min_zipf=3.2, max_words=None)  # fairly broad

# Candidate table: plain lowercase words (letters and apostrophes only) whose
# first listed pronunciation has at most two vowel phones.
EN_CANDS: List[EnCand] = []
for w, prons in cmu.items():
    if not prons:
        continue
    p = tuple(prons[0])
    if re.fullmatch(r"[a-z']+", w or "") is None:
        continue
    s = syllables(p)
    if s <= 2:
        EN_CANDS.append(
            EnCand(word=w, pron=p, zipf=zipf_frequency(w, "en"), length=len(p), syll=s)
        )

# Most frequent first; ties broken by fewer syllables, then fewer phones.
EN_CANDS.sort(key=lambda cand: (-cand.zipf, cand.syll, cand.length))

# English-side vowel symbols used for the consonant/vowel mismatch check in phone_cost.
VOWELS_EN = {"i","ɪ","eɪ","ɛ","æ","ɑ","ʌ","ɝ","ə","oʊ","ɔ","ʊ","u","aɪ","aʊ","ɔɪ"}


def isv_en(p):
    """Return True when phone symbol `p` is a member of VOWELS_EN."""
    return p in VOWELS_EN

def phone_cost(en_p: str, hu_p: str, alpha=1.5, cv_pen=1.2) -> float:
    """Cheapest cost of matching EN phone `en_p` against HU phone `hu_p`.

    Each (proxy, penalty) pair from proxies_for_hu_phone(hu_p) is scored as
    alpha * distance + penalty, where distance is phone_distance(proxy, en_p)
    plus `cv_pen` when exactly one of the two phones is in VOWELS_EN.
    Returns the minimum over all proxies, or infinity when there are none.
    """
    def _scores():
        for proxy, proxy_penalty in proxies_for_hu_phone(hu_p):
            dist = phone_distance(proxy, en_p)
            if isv_en(proxy) != isv_en(en_p):
                dist += cv_pen
            yield alpha * dist + proxy_penalty

    # Seeding with inf reproduces "keep the current best unless strictly smaller".
    return float(min([math.inf, *_scores()]))

def unit_cost_base(en_unit: Tuple[str,...], hu_unit: Tuple[str,...]) -> float:
    """Mean positional phone cost between two phone sequences, plus a length-gap term.

    Equal lengths: the average phone_cost over aligned positions (0.0 when both
    are empty). Unequal lengths: the average over the shared prefix plus
    LENGTH_GAP_W per phone of length difference; when one side is empty the
    prefix average is replaced by a flat 2.0.
    """
    n_en, n_hu = len(en_unit), len(hu_unit)
    if n_en == n_hu:
        if n_hu == 0:
            return 0.0
        return sum(map(phone_cost, en_unit, hu_unit)) / n_hu

    gap_term = LENGTH_GAP_W * abs(n_en - n_hu)
    shared = min(n_en, n_hu)
    if shared == 0:
        return 2.0 + gap_term
    # map() stops at the shorter sequence, i.e. exactly the shared prefix.
    prefix_mean = sum(map(phone_cost, en_unit, hu_unit)) / shared
    return prefix_mean + gap_term

def rarity_penalty(z: float) -> float:
    """Penalty of RARITY_W per Zipf unit that `z` falls below 4.0; zero at or above 4.0."""
    shortfall = 4.0 - float(z)
    clipped = shortfall if shortfall > 0.0 else 0.0
    return clipped * RARITY_W

def unit_cost(en_unit: Tuple[str,...], hu_unit: Tuple[str,...], z: float) -> float:
    """Total cost of rendering `hu_unit` with an EN word pronounced `en_unit`.

    Sum of the phone-level base cost, EN_LEN_W per EN phone, and the rarity
    penalty for the word's Zipf frequency `z`.
    """
    base = unit_cost_base(en_unit, hu_unit)
    length_term = EN_LEN_W * len(en_unit)
    return base + length_term + rarity_penalty(z)


## Load n‑gram map & DP translator

In [5]:

def load_ngram_map(csv_path: str | Path) -> Dict[int, Dict[Tuple[str,...], Tuple[str, Tuple[str,...], float]]]:
    df = pd.read_csv(csv_path)
    byn: Dict[int, Dict[Tuple[str,...], Tuple[str, Tuple[str,...], float]]] = {1:{},2:{},3:{}}
    for _, r in df.iterrows():
        n = int(r["n"]); u = tuple(str(r["hu_ngram"]).split(" + "))
        w = str(r["en_word"]); p = tuple(str(r["en_pron"]).split())
        c = float(r["cost"])
        byn[n][u] = (w, p, c)
    return byn

def hu_text_to_ipa_tokens(text: str) -> Tuple[str,...]:
    """Return hu_text_to_ipa(text) as a tuple, keeping only non-empty string items."""
    return tuple(
        token
        for token in hu_text_to_ipa(text)
        if isinstance(token, str) and token
    )

def best_en_for_ngram(morphs: Tuple[str,...]) -> Tuple[str, Tuple[str,...], float]:
    """Fallback search: pick the EN candidate that best matches a HU morph n-gram.

    The morphs are concatenated and converted to HU phones. The 256 candidates
    from EN_CANDS closest in phone count (with a 0.5 handicap for words of more
    than one syllable) are then scored with unit_cost.

    Returns:
        (word, pron, cost) of the cheapest candidate; ("", (), inf) when the
        n-gram yields no phones or no candidate has a finite-comparable cost.
    """
    hu_ipa = hu_text_to_ipa_tokens("".join(morphs))
    if not hu_ipa:
        return "", tuple(), float("inf")

    target_len = len(hu_ipa)

    def _rough_cost(cand):
        # lightweight prefilter by length
        return abs(cand.length - target_len) + (0.5 if cand.syll > 1 else 0.0)

    # Stable sort: equally rough candidates keep their EN_CANDS order.
    shortlist = sorted(EN_CANDS, key=_rough_cost)[:256]  # beam

    winner = ("", tuple(), float("inf"))
    for cand in shortlist:
        score = unit_cost(cand.pron, hu_ipa, cand.zipf)
        if score < winner[2]:
            winner = (cand.word, cand.pron, score)
    return winner

def dp_translate_morphs(morphs: List[str], ngram_map: Dict[int, Dict[Tuple[str,...], Tuple[str, Tuple[str,...], float]]],
                        max_ngram: int = MAX_NGRAM, use_fallback: bool = USE_FALLBACK_ON_MISSING_NGRAM):
    """Cover `morphs` with n-grams (n <= max_ngram) at minimum total cost.

    Dynamic programme over morph positions: the state for position j holds the
    cheapest (cost, EN words, n-gram sizes) covering morphs[:j]. Each n-gram is
    priced from `ngram_map`; when it is missing, best_en_for_ngram is used if
    `use_fallback` is true, otherwise the filler word "uh" is charged
    2.0 + 0.5 * (n - 1). Every unit also pays a 0.05 step penalty.

    Returns:
        (total_cost, words, ks) for the full sequence. When the end cannot be
        reached with finite cost this is (inf, [], []).
    """
    total = len(morphs)
    # best[j] = (cost, words, ks) for the prefix morphs[:j]
    best = [(0.0, [], [])] + [(float("inf"), [], []) for _ in range(total)]

    for start in range(total):
        cost_so_far, words_so_far, ks_so_far = best[start]
        if not math.isfinite(cost_so_far):
            continue  # no way to reach this position
        longest = min(max_ngram, total - start)
        for k in range(1, longest + 1):
            end = start + k
            unit = tuple(morphs[start:end])
            table = ngram_map.get(k, {})
            if unit in table:
                word, _pron, step = table[unit]
            elif use_fallback:
                word, _pron, step = best_en_for_ngram(unit)
            else:
                word, step = "uh", 2.0 + 0.5*(k-1)  # high cost
            candidate = cost_so_far + step + 0.05  # small step penalty
            if candidate < best[end][0]:
                best[end] = (candidate, words_so_far + [word], ks_so_far + [k])

    return best[total]  # total_cost, words, ks


In [None]:

# Load the HU n-gram -> EN word table once; eval_one passes this module-level
# object to dp_translate_morphs for every sentence.
ngram_map = load_ngram_map(NGRAM_MAP_CSV)

def eval_one(text: str,
             do_tts_asr: bool = False,
             idx: int = 0,
             audio_dir = AUDIO_DIR):
    """Translate one HU sentence into EN words and optionally score it via TTS -> ASR.

    Args:
        text: Hungarian input sentence.
        do_tts_asr: when true (and the EN text is non-blank), synthesize the EN
            text, transcribe the audio as Hungarian and add error metrics.
        idx: item number used in the wav file name (item_<idx>_dp.wav).
        audio_dir: directory receiving the wav file.

    Returns:
        Dict with "hu_text", "en_text_morph", "morph_cost" and "time" (seconds
        spent on segmentation plus DP). With the audio loop it also holds
        "hu_hyp_morph", "morph_wer", "morph_cer" and "morph_per"; these keys
        are absent when the audio loop is skipped.
    """
    # DP search (timed together with the emtsv segmentation)
    t0 = time.perf_counter()
    morphs = morph_segments_for_text(text, emtsv_exec=EMTSV_EXEC)
    cost_morph, words_morph, _ks = dp_translate_morphs(
        morphs, ngram_map, max_ngram=MAX_NGRAM, use_fallback=USE_FALLBACK_ON_MISSING_NGRAM
    )
    en_text = " ".join(words_morph)
    elapsed = time.perf_counter() - t0

    row = {
        "hu_text": text,
        "en_text_morph": en_text,
        "morph_cost": cost_morph,
        "time": elapsed,
    }

    # Metrics via audio loop (optional); nothing to synthesize for blank EN text
    if not do_tts_asr or not en_text.strip():
        return row

    wav_path = str(audio_dir / f"item_{idx:02d}_dp.wav")
    synthesize_en_text(en_text, wav_path)
    hypothesis = transcribe_hu(wav_path)

    ref_lower, hyp_lower = text.lower(), hypothesis.lower()
    row["hu_hyp_morph"] = hypothesis
    row["morph_wer"] = wer(ref_lower.split(), hyp_lower.split())
    row["morph_cer"] = cer(list(ref_lower), list(hyp_lower))
    row["morph_per"] = per(hu_text_to_ipa(text), hu_text_to_ipa(hypothesis), sc=phone_distance)
    return row


## Evaluation loop

In [None]:

# Load test HU sentences
if TEST_HU_PATH.exists():
    items = []
    with open(TEST_HU_PATH, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if s:
                # Drop a leading "<digits>." item number when the line has one.
                number, dot, remainder = s.partition(".")
                numbered = bool(dot) and number.isdigit()
                items.append(remainder.strip() if numbered else s)
else:
    # Fallback: tiny demo list if the file is missing (edit as needed)
    items = ["Sárga bögre, görbe bögre.", "Mit sütsz, kis szűcs?"]
print(f"Loaded {len(items)} items.")

# Run the full pipeline (including the TTS -> ASR loop) on every sentence.
# A plain loop keeps the rows gathered so far available if an item fails.
rows = []
for i, s in enumerate(tqdm(items, desc="Evaluating")):
    row = eval_one(s, True, idx=i)
    rows.append(row)

# NOTE: this overrides the OUT_CSV defined in the Config cell for this run.
OUT_CSV = Path("./data_v1/results/results_morph_ngram_dp_v1_partial2.csv")

df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)
df.head(10)


Loaded 28 items.


Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 28/28 [10:25<00:00, 22.33s/it]


Unnamed: 0,hu_text,en_text_morph,morph_cost,time,hu_hyp_morph,morph_wer,morph_cer,morph_per
0,Hány nyár,hands,3.21,2.453442,Hands.,1.0,0.777778,1.333333
1,Szép zöld fű,subjects,3.415,11.838759,Subjects,1.0,0.916667,1.777778
2,Vár a víz,victims,3.267143,2.536568,"Köszönöm, hogy megnézted!",1.0,2.444444,3.714286
3,Gyúl a Gyertya,girlfriends,3.383333,3.09013,Göröfréns.,1.0,0.928571,1.777778


## Summary & diagnostics

In [None]:

import numpy as np

def fmt(x):
    """Format a metric with four decimals; None, NaN and infinite floats become "NA"."""
    if x is None:
        return "NA"
    if isinstance(x, float) and not np.isfinite(x):
        return "NA"
    return f"{x:.4f}"

def summarize(df):
    """Build the plain-text summary: example count plus mean WER, CER and PER.

    Means are taken over the `morph_wer`, `morph_cer` and `morph_per` columns
    (missing values are ignored by the mean). A column that is absent, e.g.
    because the audio loop never ran, is reported as "NA" instead of raising
    KeyError, as is a column with no values at all.
    """
    def _mean(column):
        if column not in df.columns:
            return None
        return df[column].astype(float).mean()

    lines = [f"Examples: {len(df)}"]
    for label, column in (("WER", "morph_wer"), ("CER", "morph_cer"), ("PER", "morph_per")):
        lines.append(f"{label}: {fmt(_mean(column))}")
    return "\n".join(lines)

# Persist the summary, then echo what was actually written to disk.
summary_text = summarize(df)
OUT_SUMMARY.write_text(summary_text, encoding="utf-8")
print(OUT_SUMMARY.read_text(encoding="utf-8"))
print(AUDIO_DIR)


Examples: 4
WER: 2.5833
CER: 4.7480
PER: 3.2460
data/audio_morph_dp
