In [None]:
# Configuration
from pathlib import Path

# Paths (relative to HU_EN_HOMOPHONC dir, root of all files and env. Jupyter server ran at this level)
DATA_DIR = Path("./data")
AUDIO_DIR = DATA_DIR / "audio_dp_vs_gemini_5zipf"
AUDIO_DIR.mkdir(exist_ok=True, parents=True)

# Toggles
RUN_TTS_ASR   = False      # Set True to synthesize EN and transcribe to HU for metrics
RUN_GEMINI    = True      # Set True if you have GEMINI_API_KEY and google-generativeai installed
GEMINI_MODEL  = "gemini-2.5-flash"  # or "gemini-2.5-pro" etc.
THINKING_BUDGET = 0       # at least 128 for "gemini-2.5-pro"

# DP search knobs
BEAM_SIZE     = 48
RARITY_LAMBDA = 0.1
MIN_ZIPF      = 3.9
MAX_WORDS     = 8000
MAX_PRON_LEN  = 12         # filter long pronunciations in cmudict if you've added that option

RESULTS_CSV   = DATA_DIR / "results/results_dp_vs_gemini_8k_time.csv"


In [2]:
# Imports
from tqdm import tqdm
import pandas as pd

from src.g2p_hu import hu_text_to_ipa
from src.cmudict_utils import load_cmudict
from src.search_dp import decode_dp_beam
from src.metrics import wer, cer, per
from src.phone_mapping import phone_distance
from src.tts_asr import synthesize_en_text, transcribe_hu
from src.llm_baseline import GeminiChat
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read test sentences; accept numbered lines "1. text"
test_path = DATA_DIR / "test_set_hu.txt"
items = []
with open(test_path, "r", encoding="utf-8") as f:
    for line in f:
        s = line.strip()
        if not s:
            continue
        parts = s.split(".", 1)
        text = parts[1].strip() if len(parts) == 2 and parts[0].isdigit() else s
        items.append(text)
print(f"Loaded {len(items)} items.")
items[:3]


Loaded 28 items.


['Mit sütsz kis szűcs',
 'Sárga bögre görbe bögre',
 'Fekete bikapata kopog a patika pepita kövezetén']

## Load lexicon (CMUdict subset)

In [4]:
lex = load_cmudict(min_zipf=MIN_ZIPF, max_words=MAX_WORDS)
len(lex)

8000

## Gemini helper (optional)

In [5]:
constraints = "Perform homophonic translation from hungarian to english. Meaning give me common english word sqequence \
    which spoken aloud sounds exactly like the hungarian sentence I gave you. Please use the most common 10000 english words. \
    I will give you input sentences one by one, just answer with the english word sqequence. No comma, nothing else:"

agent = GeminiChat(constraints, GEMINI_MODEL, THINKING_BUDGET)


## Evaluation helpers

In [7]:
def eval_one(text: str,
             do_tts_asr: bool = False,
             do_gemini: bool = False,
             idx: int = 0,
             audio_dir = AUDIO_DIR):
    row = {"hu_text": text}

    # DP search
    start_dp = time.perf_counter()
    hu = hu_text_to_ipa(text)
    sent_dp, words_dp, cost_dp = decode_dp_beam(
        target_phones=hu,
        lexicon=lex,
        beam_size=BEAM_SIZE,
        rarity_lambda=RARITY_LAMBDA
    )
    elapsed_s_dp = time.perf_counter() - start_dp

    row.update({
        "en_text_dp": sent_dp,
        "dp_cost": cost_dp,
        "dp_time": elapsed_s_dp
    })

    sent_gm = ""
    elapsed_s_gm = 0
    if do_gemini:
        try:
            start_gm = time.perf_counter()
            sent_gm = agent.send_message(text)
            elapsed_s_gm = time.perf_counter() - start_gm
        except Exception as e:
            sent_gm = ""
            row["gemini_error"] = str(e)
    row["en_text_gemini"] = sent_gm
    row["gm_time"] = elapsed_s_gm

    # Metrics via audio loop (optional)
    if do_tts_asr:
        # DP
        if sent_dp.strip():
            wav_dp = audio_dir / f"item_{idx:02d}_dp.wav"
            synthesize_en_text(sent_dp, str(wav_dp))
            hyp_hu_dp = transcribe_hu(str(wav_dp))
            row["hu_hyp_dp"] = hyp_hu_dp
            row["dp_wer"] = wer(text.lower().split(), hyp_hu_dp.lower().split())
            row["dp_cer"] = cer(list(text.lower()), list(hyp_hu_dp.lower()))
            row["dp_per"] = per(hu_text_to_ipa(text), hu_text_to_ipa(hyp_hu_dp), sc=phone_distance)

        # Gemini
        if sent_gm:
            wav_gm = audio_dir / f"item_{idx:02d}_gm.wav"
            synthesize_en_text(sent_gm, str(wav_gm))
            hyp_hu_gm = transcribe_hu(str(wav_gm))
            row["hu_hyp_gm"] = hyp_hu_gm
            row["gm_wer"] = wer(text.lower().split(), hyp_hu_gm.lower().split())
            row["gm_cer"] = cer(list(text.lower()), list(hyp_hu_gm.lower()))
            row["gm_per"] = per(hu_text_to_ipa(text), hu_text_to_ipa(hyp_hu_gm), sc=phone_distance)

    return row


## Run evaluation

In [9]:
rows = []
for i, s in enumerate(tqdm(items, desc="Evaluating")):
    rows.append(eval_one(s, do_tts_asr=RUN_TTS_ASR, do_gemini=RUN_GEMINI, idx=i))

df = pd.DataFrame(rows)
df.to_csv(RESULTS_CSV, index=False)
df.head(10)

Evaluating: 100%|████████████████████████████████████████████████████████████████████| 28/28 [5:37:53<00:00, 724.07s/it]


OSError: Cannot save file into a non-existent directory: '/results'

In [None]:
df.to_csv(RESULTS_CSV, index=False)

## Summaries

In [23]:
def mean_or_none(series):
    s = series.dropna()
    return None if s.empty else float(s.mean())

summary = {
    "n_items": len(df),
    "dp_nonempty": int(df["en_text_dp"].astype(str).str.len().gt(0).sum()),
#    "gm_nonempty": int(df.get("en_text_gemini", pd.Series([])).astype(str).str.len().gt(0).sum()),
}

if RUN_TTS_ASR:
    for tag in ["wer","cer","per"]:
        summary[f"dp_{tag}_avg"] = mean_or_none(df[f"dp_{tag}"])
#        summary[f"gm_{tag}_avg"] = mean_or_none(df[f"gm_{tag}"])

pd.DataFrame([summary]).T

Unnamed: 0,0
n_items,28.0
dp_nonempty,28.0
dp_wer_avg,1.420323
dp_cer_avg,1.319986
dp_per_avg,1.574185


In [None]:
lex = load_cmudict(min_zipf=5, max_words=4000)
len(lex)

1146

In [41]:
("mm" in lex.keys())
import re, cmudict
raw = cmudict.dict()
raw

{"'bout": [['B', 'AW1', 'T']],
 "'cause": [['K', 'AH0', 'Z']],
 "'course": [['K', 'AO1', 'R', 'S']],
 "'cuse": [['K', 'Y', 'UW1', 'Z']],
 "'em": [['AH0', 'M']],
 "'frisco": [['F', 'R', 'IH1', 'S', 'K', 'OW0']],
 "'gain": [['G', 'EH1', 'N']],
 "'kay": [['K', 'EY1']],
 "'m": [['AH0', 'M']],
 "'n": [['AH0', 'N']],
 "'round": [['R', 'AW1', 'N', 'D']],
 "'s": [['EH1', 'S']],
 "'til": [['T', 'IH1', 'L']],
 "'tis": [['T', 'IH1', 'Z']],
 "'twas": [['T', 'W', 'AH1', 'Z']],
 'a': [['AH0'], ['EY1']],
 "a's": [['EY1', 'Z']],
 'a.': [['EY1']],
 "a.'s": [['EY1', 'Z']],
 'a.d.': [['EY2', 'D', 'IY1']],
 'a.m.': [['EY2', 'EH1', 'M']],
 'a.s': [['EY1', 'Z']],
 'aaa': [['T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1']],
 'aaberg': [['AA1', 'B', 'ER0', 'G']],
 'aachen': [['AA1', 'K', 'AH0', 'N']],
 'aachener': [['AA1', 'K', 'AH0', 'N', 'ER0']],
 'aaker': [['AA1', 'K', 'ER0']],
 'aalborg': [['AO1', 'L', 'B', 'AO0', 'R', 'G'],
  ['AA1', 'L', 'B', 'AO0', 'R', 'G']],
 'aalburg': [['AE1', 'L', 'B', 'ER0', 'G']],
 'aal

In [None]:
from wordfreq import zipf_frequency
import re, cmudict

raw = cmudict.dict()

word_freq_list = []
for word in raw.keys():
    if zipf_frequency(word,"en") > 4.0:
        word_freq_list.append((word, zipf_frequency(word,"en")))
word_freq_list.sort(key=lambda x: zipf_frequency(x[0], "en"), reverse=True)
print(len(word_freq_list))
word_freq_list

3540


[('the', 7.73),
 ('to', 7.43),
 ('and', 7.41),
 ('of', 7.4),
 ('a', 7.36),
 ('a.', 7.36),
 ('in', 7.27),
 ('in.', 7.27),
 ('i', 7.09),
 ('i.', 7.09),
 ('is', 7.07),
 ('for', 7.01),
 ('that', 7.01),
 ('you', 6.98),
 ('it', 6.95),
 ('on', 6.91),
 ('with', 6.85),
 ('this', 6.82),
 ("this'", 6.82),
 ('was', 6.82),
 ('be', 6.79),
 ('as', 6.77),
 ('are', 6.74),
 ('have', 6.71),
 ('at', 6.7),
 ('he', 6.69),
 ('not', 6.69),
 ('by', 6.66),
 ('but', 6.63),
 ('from', 6.63),
 ('my', 6.57),
 ('or', 6.54),
 ('we', 6.54),
 ('an', 6.53),
 ('your', 6.53),
 ('all', 6.52),
 ('so', 6.52),
 ('his', 6.51),
 ('they', 6.5),
 ('me', 6.48),
 ('if', 6.47),
 ('one', 6.47),
 ('can', 6.46),
 ('will', 6.45),
 ('just', 6.43),
 ('like', 6.41),
 ('about', 6.4),
 ('up', 6.39),
 ('out', 6.38),
 ('what', 6.38),
 ('has', 6.37),
 ('when', 6.37),
 ('more', 6.36),
 ('do', 6.35),
 ('no', 6.35),
 ('were', 6.34),
 ('who', 6.34),
 ('had', 6.33),
 ("it's", 6.33),
 ('their', 6.33),
 ('to-do', 6.32),
 ('there', 6.31),
 ('her', 6.3),