# Piper

# Read Outputs

In [None]:
import re
import json

vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'

# Define phoneme replacement mapping
def normalize_phonemes(s: str) -> str:
    # Handle multi-character phonemes first (order matters)
    replacements = [
        ("j", "y"),
        ("d í", "j"),
        ("t É", "C"),
        (" í", "Z"),   # note: must come after d í replacement
        (" î", "?"),
        ("…ë", "A"),
        ("Àà", ""),
        ("Àå", ""),
        ("Àê", ""),
        (" É", "S"),
        ("q1", "q"),
        ("…°", "g")
    ]
    for old, new in replacements:
        s = s.replace(old, new)

    s = re.sub(rf'([^\w\-\?]|^){vowels_regex}', r'\1?', s)
    return s


for exp_num, input_path in enumerate([
    "piper_experiment_log1.txt",
    "piper_experiment_log2.txt",
    "piper_experiment_log3.txt",
]):
    # Regex patterns
    text_pattern = re.compile(r"PASSED TEXT:\s*(.*)")
    phonemes_pattern = re.compile(r"CORRECTED PHONEMES:\s*(\[\[.*?\]\])", re.DOTALL)
    rtf_pattern = re.compile(
        r"Synthesis time:\s*([\d.]+)s,\s*Audio duration:\s*([\d.]+)s,\s*RTF:\s*([\d.]+)"
    )

    # Read entire file
    with open(input_path, "r", encoding="utf-8") as f:
        data = f.read()

    # Find all matches
    texts = text_pattern.findall(data)
    phonemes = phonemes_pattern.findall(data)
    rtfs = rtf_pattern.findall(data)

    results = []
    for i in range(min(len(texts), len(phonemes), len(rtfs))):
        synth_time, audio_dur, rtf = rtfs[i]

        # Convert to valid JSON then flatten phonemes into one string
        try:
            phoneme_list = json.loads(phonemes[i].replace("'", '"'))
            # Flatten nested lists and join with no separator
            flat_phonemes = "".join(sum(phoneme_list, []))
            # Apply mapping replacements
            flat_phonemes = normalize_phonemes(flat_phonemes)
        except Exception as e:
            flat_phonemes = ""
            print(f"Warning: could not parse phonemes at index {i}: {e}")

        results.append({
            "passed_text": texts[i].strip(),
            "corrected_phonemes": flat_phonemes.strip(),
            "synthesis_time_sec": float(synth_time),
            "audio_duration_sec": float(audio_dur),
            "rtf": float(rtf)
        })

    output_path = f"output_{exp_num + 1}.txt"
    # Save to JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"‚úÖ Extracted {len(results)} records and saved to {output_path}")


‚úÖ Extracted 400 records and saved to output_1.txt
‚úÖ Extracted 400 records and saved to output_2.txt
‚úÖ Extracted 400 records and saved to output_3.txt


# Get Evaluation Data

In [None]:
import pandas as pd

In [None]:
!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv

--2025-11-13 11:49:59--  https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv
Resolving huggingface.co (huggingface.co)... 3.170.185.25, 3.170.185.14, 3.170.185.35, ...
Connecting to huggingface.co (huggingface.co)|3.170.185.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56026 (55K) [text/plain]
Saving to: ‚ÄòSentenceBench.csv‚Äô


2025-11-13 11:49:59 (4.36 MB/s) - ‚ÄòSentenceBench.csv‚Äô saved [56026/56026]



In [None]:
sentence_bench = pd.read_csv('SentenceBench.csv')

In [None]:
sentence_bench.head(3)

Unnamed: 0,dataset,grapheme,phoneme,homograph word,pronunciation
0,homograph,ŸÖŸÜ ŸÇÿØÿ± ÿ™Ÿà ÿ±ÿß ŸÖ€å‚ÄåÿØÿßŸÜŸÖ,man qadr-e to rA mi-dAnam,ŸÇÿØÿ±,qadr
1,homograph,ÿßÿ≤ ŸÇÿ∂ÿß€å ÿßŸÑŸá€å ÿ®Ÿá ŸÇÿØÿ± ÿßŸÑŸá€å ŸæŸÜÿßŸá ŸÖ€å‚Äåÿ®ÿ±ŸÖ,?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram,ŸÇÿØÿ±,qadar
2,homograph,ÿ®Ÿá ÿØÿ≥ÿ™ Ÿà ÿµŸàÿ±ÿ™ŸÖ ⁄©ÿ±ŸÖ ÿ≤ÿØŸÖ,be dast-o suratam kerem zadam,⁄©ÿ±ŸÖ,kerem


### Get ManaTTS

In [None]:
filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]

# Convert to a list of tuples
mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))
# mana_evaluation_data = [(s, l.replace('-', '')) for s, l in mana_evaluation_data]

mana_evaluation_data[:3]

[('ÿØÿ± ÿß€åŸÜ ŸÜŸàÿ¥ÿ™Ÿá ÿ®ŸÜÿß ÿØÿßÿ±€åŸÖ ÿ®ÿß €å⁄© ÿßÿ®ÿ≤ÿßÿ± ÿ≥ÿßÿØŸá Ÿà ŸÖ⁄©ÿßŸÜ€å⁄©€å ÿßŸÅÿ≤ÿß€åÿ¥ ÿ®€åŸÜÿß€å€å ÿ®ÿ±ÿß€å ÿßŸÅÿ±ÿßÿØ ⁄©ŸÖ\u200cÿ®€åŸÜÿß ',
  'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\u200cbinA '),
 ('ÿ®Ÿá ŸÜÿßŸÖ ÿ®€å\u200cŸàŸæÿ™€å⁄© €åÿß ÿπÿØÿ≥€å ÿØŸàÿ±ŸÜŸÖÿß ÿ¢ÿ¥ŸÜÿß ÿ¥Ÿà€åŸÖ. ',
  'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),
 ('ÿØÿ±ÿß€åŸÜ\u200cÿµŸàÿ±ÿ™ÿå ÿßŸÜÿ¨ÿßŸÖ ÿÆŸàÿØÿßÿ±ÿ≤€åÿßÿ®€å Ÿà ÿßÿ±ÿßÿ¶Ÿá ÿ®ÿßÿ≤ÿÆŸàÿ±ÿØ ÿ®ÿ± ÿπŸáÿØŸá ÿÆŸàÿØÿ™ÿßŸÜ ÿßÿ≥ÿ™. ',
  'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]

### Get CommonVoice

In [None]:
filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]

# Convert to a list of tuples
commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))
# commonvoice_evaluation_data = [(s, l.replace('-', '')) for s, l in commonvoice_evaluation_data]

commonvoice_evaluation_data[:3]

[('ÿØÿ± ÿß⁄©ÿ´ÿ± ÿ¥Ÿáÿ±Ÿáÿßÿå ŸÖÿ±⁄©ÿ≤€å ÿ®ÿ±ÿß€å ÿÆÿ±€åÿØ ÿØŸà⁄Üÿ±ÿÆŸá Ÿàÿ¨ŸàÿØ ÿØÿßÿ±ÿØ.',
  'dar ?aksar-e Sahr-hA, markazi barAye xarid-e  doCarxe vojud dArad.'),
 ('Ÿæÿ≥ ÿßÿ≤ ŸÖÿØÿ±ÿ≥Ÿá ⁄©ŸàÿØ⁄©ÿßŸÜ ÿ®Ÿá ÿ≥Ÿà€å ÿÆÿßŸÜŸá ÿ¨ÿ≥ÿ™ Ÿà ÿÆ€åÿ≤ ⁄©ÿ±ÿØŸÜÿØ.',
  'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),
 ('ÿ¥ŸÖÿß ŸÜ⁄Øÿ±ÿßŸÜ ÿ≤ŸÜ Ÿà ÿ®⁄ÜŸá ÿß€åŸÜ ŸÜÿ®ÿßÿ¥.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]

### Get Homograph

In [None]:
filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',	'pronunciation']]

# Convert to a list of tuples
homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))
# homograph_evaluation_data = [(s, l.replace('-', ''), x, y) for s, l, x, y in homograph_evaluation_data]

homograph_evaluation_data[:3]

[('ŸÖŸÜ ŸÇÿØÿ± ÿ™Ÿà ÿ±ÿß ŸÖ€å\u200cÿØÿßŸÜŸÖ', 'man qadr-e to rA mi-dAnam', 'ŸÇÿØÿ±', 'qadr'),
 ('ÿßÿ≤ ŸÇÿ∂ÿß€å ÿßŸÑŸá€å ÿ®Ÿá ŸÇÿØÿ± ÿßŸÑŸá€å ŸæŸÜÿßŸá ŸÖ€å\u200cÿ®ÿ±ŸÖ',
  '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',
  'ŸÇÿØÿ±',
  'qadar'),
 ('ÿ®Ÿá ÿØÿ≥ÿ™ Ÿà ÿµŸàÿ±ÿ™ŸÖ ⁄©ÿ±ŸÖ ÿ≤ÿØŸÖ', 'be dast-o suratam kerem zadam', '⁄©ÿ±ŸÖ', 'kerem')]

In [None]:
benchmark = []

for g, p in mana_evaluation_data:
  benchmark.append((g, p, '', ''))

for g, p in commonvoice_evaluation_data:
  benchmark.append((g, p, '', ''))

for g, p, w, r in homograph_evaluation_data:
  benchmark.append((g, p, w, r))

# Evaluate Method Outputs

## PER Evaluation

In [None]:
def remove_non_word_chars(text):
    pattern = r'[^\w\s\?]'
    cleaned_text = re.sub(pattern, ' ', text)
    return cleaned_text

In [None]:
def remove_white_spaces(text):
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [None]:
def get_word_only_text(text):
  word_only_text = remove_non_word_chars(text)
  extra_space_removed_text = remove_white_spaces(word_only_text)

  return extra_space_removed_text

In [None]:
def get_texts_cer(reference, model_output):
  # Preprocess input texts to only contain word characters
  word_only_reference = get_word_only_text(reference)
  word_only_output = get_word_only_text(model_output)

  # Return +infinity for CER if any of the texts is empty
  if not word_only_reference.strip() or not word_only_output.strip():
    return float('inf')

  return cer(word_only_reference, word_only_output)

In [None]:
def get_avg_cer_of_method(method_outputs, references):
  cers = []
  for idx, o in enumerate(method_outputs):
    cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))
    if cer != float('inf'):
      cers.append(cer)

  return sum(cers) / len(cers)

## Homograph Evaluation

In [None]:
def get_homograph_performance(outputs, references):
  corrects = 0
  total = 0

  for idx, (g, p, homograph, right) in enumerate(references):
    if homograph != '':
      total += 1
      if right in outputs[idx]:
        corrects += 1

  return corrects / total

## Ezafe Evaluation

In [None]:
!git clone https://huggingface.co/datasets/MahtaFetrat/KaamelDict

Cloning into 'KaamelDict'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 43 (delta 2), reused 0 (delta 0), pack-reused 37 (from 1)[K
Unpacking objects: 100% (43/43), 2.76 MiB | 2.34 MiB/s, done.


In [None]:
kaamel_dict = pd.read_csv('/content/KaamelDict/KaamelDict.csv')
word_set = set(kaamel_dict['grapheme'])
phoneme_set = set([''.join(eval(p)[0]) for p in list(kaamel_dict['phoneme']) if eval(p)])

def word_in_dict(word, lookup_set=word_set):
  return word in lookup_set

In [None]:
def get_EZ_words_from_ground_truth(text):
  pattern = r'\b(\w+)(-e|-ye)\b'
  matches = re.findall(pattern, text,)

  # Extract the words along with the suffix
  words_with_suffix = [match[0] + match[1] for match in matches]
  EZ_words = [tuple(re.split(r'(?=-)', w)) for w in words_with_suffix]

  return EZ_words

In [None]:
def get_EZ_words_from_phonetic_model_output(text):
    EZ_words = re.findall(r'\b(\w+)(-e|-ye)', text)
    EZ_word_candidates = []

    other_words = re.findall(r'\b(\w+)(?=(?:[^-\w]|$))', text)
    for word in other_words:
      if len(word) >= 4 and word[-3] in '–µeiuoaƒÅ√§√¢ƒÅ…í√°A' and word.endswith('ye') and word_in_dict(word[:-2], phoneme_set) and not word_in_dict(word, phoneme_set) and not word_in_dict(word[:-1], phoneme_set):
        EZ_words.append((word[:-2], '-ye'))
        continue

      if len(word) >= 3 and word.endswith('e') and word_in_dict(word[:-1], phoneme_set) and not word_in_dict(word, phoneme_set):
        EZ_words.append((word[:-1], '-e'))
        continue

      if len(word) >= 4 and word[-3] in '–µeiuoaƒÅ√§√¢ƒÅ…í√°A' and word.endswith('ye'):
        EZ_word_candidates.append((word[:-2], '-ye'))
        continue

      if len(word) >= 3 and word.endswith('e'):
        EZ_word_candidates.append((word[:-1], '-e'))

    return EZ_words, EZ_word_candidates

In [None]:
from difflib import SequenceMatcher

def get_phonetic_model_TP_FP_TN_FN(gt_finglish, model_finglish):
  gt_word_count = len(re.findall(r'\b\w+(?:-\w+)*\b', gt_finglish))
  gt_EZ_words = get_EZ_words_from_ground_truth(gt_finglish)

  model_EZ_words, model_candidate_EZ_words = get_EZ_words_from_phonetic_model_output(model_finglish)

  TP = 0
  FP = 0
  TN = 0
  FN = 0

  gt_matched_indices = set()
  model_matched_indices = set()
  model_candidate_matched_indices = set()

  for gt_idx, (word, EZ) in enumerate(gt_EZ_words):
    for model_idx, (w, E) in enumerate(model_EZ_words):
      if model_idx not in model_matched_indices and SequenceMatcher(None, word, w).ratio() > 0.65:
        TP += 1
        gt_matched_indices.add(gt_idx)
        model_matched_indices.add(model_idx)
        break
    else:
      for model_c_idx, (w, E) in enumerate(model_candidate_EZ_words):
        if model_c_idx not in model_candidate_matched_indices and SequenceMatcher(None, word, w).ratio() > 0.65:
          TP += 1
          gt_matched_indices.add(gt_idx)
          model_candidate_matched_indices.add(model_c_idx)
          break

  # Calculate FP: model_EZ_words that are not TP
  FP = len(model_EZ_words) - (TP - len(list(model_candidate_matched_indices)))

  # Calculate FN: gt_EZ_words that were not detected
  FN = len(gt_EZ_words) - TP

  # Calculate TN: non-Ezafe words that are correctly not detected as Ezafe
  TN = (gt_word_count - len(gt_EZ_words)) - FP

  return TP, FP, TN, FN


In [None]:
def get_phonetic_model_performance(outputs, references):
  total_TP, total_FP, total_TN, total_FN = 0, 0, 0, 0

  for idx, o in enumerate(outputs):
    TP, FP, TN, FN = get_phonetic_model_TP_FP_TN_FN(references[idx][1], o)
    total_TP += TP
    total_FP += FP
    total_TN += TN
    total_FN += FN


  total_model_EZ = total_TP + total_FP
  total_gt_EZ = total_TP + total_FN

  total_model_T = total_TP + total_TN

  total_gt_words = total_TP + total_TN + total_FP + total_FN

  accuracy = (total_model_T) / (total_gt_words) * 100
  precision = (total_TP) / (total_model_EZ) * 100 if total_model_EZ != 0 else -1
  recall = (total_TP) / (total_gt_EZ) * 100

  return accuracy, precision, recall

# Final Results

In [None]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


## Original Piper

In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract predicted phoneme strings
        predictions = [item["corrected_phonemes"] for item in data]

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating output_1.txt ...
PER: 6.3246
ACC: 86.9997, PREC: 65.3846, RECALL: 11.5124, F1: 19.5777
HOMOGRAPH: 43.8679

üìò Evaluating output_2.txt ...
PER: 6.3246
ACC: 86.9997, PREC: 65.3846, RECALL: 11.5124, F1: 19.5777
HOMOGRAPH: 43.8679

üìò Evaluating output_3.txt ...
PER: 6.3246
ACC: 86.9997, PREC: 65.3846, RECALL: 11.5124, F1: 19.5777
HOMOGRAPH: 43.8679

üìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===
PER       : 6.3246 ¬± 0.0000
ACC       : 86.9997 ¬± 0.0000
PREC      : 65.3846 ¬± 0.0000
RECALL    : 11.5124 ¬± 0.0000
F1        : 19.5777 ¬± 0.0000
HOMOGRAPH : 43.8679 ¬± 0.0000


In [None]:
import json
import numpy as np

def compute_overall_rtf(file_path):
    """
    Compute overall RTF for one experiment file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_synth = sum(item["synthesis_time_sec"] for item in data)
    total_audio = sum(item["audio_duration_sec"] for item in data)

    if total_audio == 0:
        return 0.0
    return total_synth / total_audio


def compute_rtf_across_experiments(run_files):
    """
    Compute overall RTF for each experiment and report mean ¬± std.
    """
    rtfs = []

    for path in run_files:
        rtf_value = compute_overall_rtf(path)
        rtfs.append(rtf_value)
        print(f"{path}: overall RTF = {rtf_value:.4f}")

    mean_rtf = np.mean(rtfs)
    std_rtf = np.std(rtfs)

    print("\nüìä === TOTAL RTF SUMMARY ===")
    print(f"Mean RTF: {mean_rtf:.4f} ¬± {std_rtf:.4f}")

    return rtfs, mean_rtf, std_rtf


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    rtfs, mean_rtf, std_rtf = compute_rtf_across_experiments(run_files)


output_1.txt: overall RTF = 0.1378
output_2.txt: overall RTF = 0.1533
output_3.txt: overall RTF = 0.1671

üìä === TOTAL RTF SUMMARY ===
Mean RTF: 0.1527 ¬± 0.0120


# Our Piper Piped

In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract predicted phoneme strings
        predictions = [item["corrected_phonemes"] for item in data]

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 2)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating output_2.txt ...
PER: 6.2927
ACC: 96.9594, PREC: 88.0795, RECALL: 90.0677, F1: 89.0625
HOMOGRAPH: 77.3585

üìò Evaluating output_3.txt ...
PER: 4.0484
ACC: 97.3627, PREC: 88.9130, RECALL: 92.3251, F1: 90.5869
HOMOGRAPH: 77.8302

üìò Evaluating output_4.txt ...
PER: 4.0484
ACC: 97.3627, PREC: 88.9130, RECALL: 92.3251, F1: 90.5869
HOMOGRAPH: 77.8302

üìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===
PER       : 4.7965 ¬± 1.0580
ACC       : 97.2283 ¬± 0.1901
PREC      : 88.6352 ¬± 0.3930
RECALL    : 91.5726 ¬± 1.0641
F1        : 90.0788 ¬± 0.7186
HOMOGRAPH : 77.6730 ¬± 0.2224


In [None]:
import json
import numpy as np

def compute_overall_rtf(file_path):
    """
    Compute overall RTF for one experiment file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_synth = sum(item["synthesis_time_sec"] for item in data)
    total_audio = sum(item["audio_duration_sec"] for item in data)

    if total_audio == 0:
        return 0.0
    return total_synth / total_audio


def compute_rtf_across_experiments(run_files):
    """
    Compute overall RTF for each experiment and report mean ¬± std.
    """
    rtfs = []

    for path in run_files:
        rtf_value = compute_overall_rtf(path)
        rtfs.append(rtf_value)
        print(f"{path}: overall RTF = {rtf_value:.4f}")

    mean_rtf = np.mean(rtfs)
    std_rtf = np.std(rtfs)

    print("\nüìä === TOTAL RTF SUMMARY ===")
    print(f"Mean RTF: {mean_rtf:.4f} ¬± {std_rtf:.4f}")

    return rtfs, mean_rtf, std_rtf


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    rtfs, mean_rtf, std_rtf = compute_rtf_across_experiments(run_files)


output_2.txt: overall RTF = 0.1457
output_3.txt: overall RTF = 0.1741
output_4.txt: overall RTF = 0.1803

üìä === TOTAL RTF SUMMARY ===
Mean RTF: 0.1667 ¬± 0.0151


# Piper with HomoGE2PE Piped

In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract predicted phoneme strings
        predictions = [item["corrected_phonemes"] for item in data]

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating output_1.txt ...
PER: 5.9033
ACC: 96.2768, PREC: 85.6512, RECALL: 87.5847, F1: 86.6071
HOMOGRAPH: 74.0566

üìò Evaluating output_2.txt ...
PER: 4.5919
ACC: 96.6801, PREC: 86.2069, RECALL: 90.2935, F1: 88.2029
HOMOGRAPH: 74.5283

üìò Evaluating output_3.txt ...
PER: 4.3447
ACC: 96.7111, PREC: 86.3931, RECALL: 90.2935, F1: 88.3002
HOMOGRAPH: 75.0000

üìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===
PER       : 4.9466 ¬± 0.6839
ACC       : 96.5560 ¬± 0.1979
PREC      : 86.0837 ¬± 0.3151
RECALL    : 89.3905 ¬± 1.2769
F1        : 87.7034 ¬± 0.7762
HOMOGRAPH : 74.5283 ¬± 0.3851


In [None]:
import json
import numpy as np

def compute_overall_rtf(file_path):
    """
    Compute overall RTF for one experiment file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_synth = sum(item["synthesis_time_sec"] for item in data)
    total_audio = sum(item["audio_duration_sec"] for item in data)

    if total_audio == 0:
        return 0.0
    return total_synth / total_audio


def compute_rtf_across_experiments(run_files):
    """
    Compute overall RTF for each experiment and report mean ¬± std.
    """
    rtfs = []

    for path in run_files:
        rtf_value = compute_overall_rtf(path)
        rtfs.append(rtf_value)
        print(f"{path}: overall RTF = {rtf_value:.4f}")

    mean_rtf = np.mean(rtfs)
    std_rtf = np.std(rtfs)

    print("\nüìä === TOTAL RTF SUMMARY ===")
    print(f"Mean RTF: {mean_rtf:.4f} ¬± {std_rtf:.4f}")

    return rtfs, mean_rtf, std_rtf


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    rtfs, mean_rtf, std_rtf = compute_rtf_across_experiments(run_files)


output_1.txt: overall RTF = 0.4540
output_2.txt: overall RTF = 0.4707
output_3.txt: overall RTF = 0.2625

üìä === TOTAL RTF SUMMARY ===
Mean RTF: 0.3957 ¬± 0.0945


# Piper with HomoGE2PE not Piped

In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract predicted phoneme strings
        predictions = [item["corrected_phonemes"] for item in data]

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating output_1.txt ...
PER: 4.3447
ACC: 96.7111, PREC: 86.3931, RECALL: 90.2935, F1: 88.3002
HOMOGRAPH: 75.0000

üìò Evaluating output_2.txt ...
PER: 4.3447
ACC: 96.7111, PREC: 86.3931, RECALL: 90.2935, F1: 88.3002
HOMOGRAPH: 75.0000

üìò Evaluating output_3.txt ...
PER: 4.3447
ACC: 96.7111, PREC: 86.3931, RECALL: 90.2935, F1: 88.3002
HOMOGRAPH: 75.0000

üìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===
PER       : 4.3447 ¬± 0.0000
ACC       : 96.7111 ¬± 0.0000
PREC      : 86.3931 ¬± 0.0000
RECALL    : 90.2935 ¬± 0.0000
F1        : 88.3002 ¬± 0.0000
HOMOGRAPH : 75.0000 ¬± 0.0000


In [None]:
import json
import numpy as np

def compute_overall_rtf(file_path):
    """
    Compute overall RTF for one experiment file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_synth = sum(item["synthesis_time_sec"] for item in data)
    total_audio = sum(item["audio_duration_sec"] for item in data)

    if total_audio == 0:
        return 0.0
    return total_synth / total_audio


def compute_rtf_across_experiments(run_files):
    """
    Compute overall RTF for each experiment and report mean ¬± std.
    """
    rtfs = []

    for path in run_files:
        rtf_value = compute_overall_rtf(path)
        rtfs.append(rtf_value)
        print(f"{path}: overall RTF = {rtf_value:.4f}")

    mean_rtf = np.mean(rtfs)
    std_rtf = np.std(rtfs)

    print("\nüìä === TOTAL RTF SUMMARY ===")
    print(f"Mean RTF: {mean_rtf:.4f} ¬± {std_rtf:.4f}")

    return rtfs, mean_rtf, std_rtf


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    rtfs, mean_rtf, std_rtf = compute_rtf_across_experiments(run_files)


output_1.txt: overall RTF = 4.3242
output_2.txt: overall RTF = 3.3100
output_3.txt: overall RTF = 3.8842

üìä === TOTAL RTF SUMMARY ===
Mean RTF: 3.8395 ¬± 0.4152


# Our Piper not Piped

In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract predicted phoneme strings
        predictions = [item["corrected_phonemes"] for item in data]

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating output_1.txt ...
PER: 4.4026
ACC: 97.1455, PREC: 88.9135, RECALL: 90.5192, F1: 89.7092
HOMOGRAPH: 76.8868

üìò Evaluating output_2.txt ...
PER: 4.5924
ACC: 97.2696, PREC: 89.1832, RECALL: 91.1964, F1: 90.1786
HOMOGRAPH: 75.9434

üìò Evaluating output_3.txt ...
PER: 4.1885
ACC: 97.1145, PREC: 89.0625, RECALL: 90.0677, F1: 89.5623
HOMOGRAPH: 75.9434

üìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===
PER       : 4.3945 ¬± 0.1650
ACC       : 97.1765 ¬± 0.0670
PREC      : 89.0531 ¬± 0.1103
RECALL    : 90.5944 ¬± 0.4638
F1        : 89.8167 ¬± 0.2628
HOMOGRAPH : 76.2579 ¬± 0.4447


In [None]:
import json
import numpy as np

def compute_overall_rtf(file_path):
    """
    Compute overall RTF for one experiment file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_synth = sum(item["synthesis_time_sec"] for item in data)
    total_audio = sum(item["audio_duration_sec"] for item in data)

    if total_audio == 0:
        return 0.0
    return total_synth / total_audio


def compute_rtf_across_experiments(run_files):
    """
    Compute overall RTF for each experiment and report mean ¬± std.
    """
    rtfs = []

    for path in run_files:
        rtf_value = compute_overall_rtf(path)
        rtfs.append(rtf_value)
        print(f"{path}: overall RTF = {rtf_value:.4f}")

    mean_rtf = np.mean(rtfs)
    std_rtf = np.std(rtfs)

    print("\nüìä === TOTAL RTF SUMMARY ===")
    print(f"Mean RTF: {mean_rtf:.4f} ¬± {std_rtf:.4f}")

    return rtfs, mean_rtf, std_rtf


# Example usage
if __name__ == "__main__":
    run_files = [f"output_{i}.txt" for i in range(1, 4)]
    rtfs, mean_rtf, std_rtf = compute_rtf_across_experiments(run_files)


output_1.txt: overall RTF = 4.2944
output_2.txt: overall RTF = 5.5603
output_3.txt: overall RTF = 6.7036

üìä === TOTAL RTF SUMMARY ===
Mean RTF: 5.5194 ¬± 0.9840


# Matcha

## RTF

In [None]:
import re
import numpy as np
import glob

def extract_rtf_from_file(filename):
    """Extract all RTF values from a single log file"""
    rtf_values = []

    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        # Use regex to find all RTF values
        # Pattern looks for "RTF: " followed by a decimal number
        pattern = r'RTF:\s*([0-9]+\.?[0-9]*)'
        matches = re.findall(pattern, content)

        for match in matches:
            try:
                rtf_values.append(float(match))
            except ValueError:
                print(f"Warning: Could not convert '{match}' to float in {filename}")

        print(f"Found {len(rtf_values)} RTF values in {filename}: {rtf_values}")
        return rtf_values

    except FileNotFoundError:
        print(f"Error: File {filename} not found")
        return []
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        return []

def calculate_statistics(rtf_values):
    """Calculate mean and standard deviation of RTF values"""
    if not rtf_values:
        return None, None, 0

    mean_rtf = np.mean(rtf_values)
    std_rtf = np.std(rtf_values)

    return mean_rtf, std_rtf, len(rtf_values)

def main():
    # You can modify this list to match your actual filenames
    log_files = ['matcha_experiment_log.txt', 'matcha_experiment_log2.txt', 'matcha_experiment_log3.txt']  # Replace with your actual filenames

    # Alternative: if files follow a pattern, you can use glob
    # log_files = glob.glob('*benchmark*.txt')

    all_rtf_values = []

    print("RTF Extraction Report")
    print("=" * 50)

    # Process each file
    for file in log_files:
        rtf_values = extract_rtf_from_file(file)
        all_rtf_values.extend(rtf_values)

    print("\n" + "=" * 50)

    if not all_rtf_values:
        print("No RTF values found in any files.")
        return

    # Calculate overall statistics
    mean_rtf, std_rtf, total_count = calculate_statistics(all_rtf_values)

    print(f"\nOverall Statistics:")
    print(f"Total RTF values: {total_count}")
    print(f"Average RTF: {mean_rtf:.4f}")
    print(f"Standard Deviation: {std_rtf:.4f}")
    print(f"RTF Range: {min(all_rtf_values):.4f} - {max(all_rtf_values):.4f}")

    # Optional: Also show statistics per file
    print(f"\nPer-file Statistics:")
    print("-" * 30)
    for file in log_files:
        rtf_values = extract_rtf_from_file(file)
        if rtf_values:
            mean, std, count = calculate_statistics(rtf_values)
            print(f"{file}: {count} values, Avg: {mean:.4f}, Std: {std:.4f}")

if __name__ == "__main__":
    main()

RTF Extraction Report
Found 400 RTF values in matcha_experiment_log.txt: [0.151, 0.189, 0.156, 0.192, 0.179, 0.152, 0.15, 0.161, 0.167, 0.163, 0.16, 0.154, 0.148, 0.152, 0.158, 0.153, 0.153, 0.172, 0.185, 0.164, 0.174, 0.157, 0.168, 0.165, 0.148, 0.191, 0.168, 0.17, 0.167, 0.169, 0.169, 0.149, 0.153, 0.162, 0.142, 0.183, 0.157, 0.162, 0.147, 0.171, 0.165, 0.149, 0.162, 0.158, 0.155, 0.143, 0.194, 0.166, 0.16, 0.183, 0.17, 0.168, 0.185, 0.156, 0.179, 0.174, 0.171, 0.188, 0.162, 0.158, 0.166, 0.175, 0.16, 0.156, 0.175, 0.256, 0.162, 0.176, 0.166, 0.163, 0.162, 0.139, 0.148, 0.166, 0.144, 0.174, 0.171, 0.141, 0.148, 0.158, 0.148, 0.159, 0.139, 0.157, 0.162, 0.16, 0.176, 0.196, 0.159, 0.163, 0.158, 0.166, 0.169, 0.149, 0.165, 0.164, 0.183, 0.149, 0.153, 0.155, 0.158, 0.172, 0.199, 0.228, 0.177, 0.172, 0.177, 0.161, 0.182, 0.243, 0.233, 0.206, 0.212, 0.163, 0.182, 0.195, 0.165, 0.154, 0.152, 0.201, 0.196, 0.209, 0.161, 0.154, 0.154, 0.158, 0.159, 0.196, 0.157, 0.176, 0.155, 0.199, 0.151, 0.

## Phonemes

In [None]:
import re
import os
import json

vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'

# Define phoneme replacement mapping
def normalize_phonemes(s: str) -> str:
    # Handle multi-character phonemes first (order matters)
    replacements = [
        ("j", "y"),
        ("d í", "j"),
        ("t É", "C"),
        (" í", "Z"),   # note: must come after d í replacement
        (" î", "?"),
        ("…ë", "A"),
        ("Àà", ""),
        ("Àå", ""),
        ("Àê", ""),
        (" É", "S"),
        ("q1", "q"),
        ("…°", "g")
    ]
    for old, new in replacements:
        s = s.replace(old, new)

    s = re.sub(rf'([^\w\-\?]|^){vowels_regex}', r'\1?', s)
    return s

# ----------------------------------------------------------------------
# List of log files to process
log_files = [
    "inference_benchmark.log",
    "inference_benchmark2.log",
    "inference_benchmark3.log"
]

# Regex to find a Python-style list: ['a', 'b', ...]
list_pattern = re.compile(r"\[.*?\]", re.DOTALL)

# Regex to detect a line that ends with "Status: SUCCESS"
status_pattern = re.compile(r"Status:\s*SUCCESS")

for log_file in log_files:
    if not os.path.exists(log_file):
        print(f"Warning: {log_file} not found. Skipping.")
        continue

    output_file = log_file.replace(".log", "_phonemes.txt")
    phoneme_lines = []

    with open(log_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # ---- Look for "Status: SUCCESS" ----
        if status_pattern.search(line):
            # Walk backwards to find the previous non-empty line that contains a list
            j = i - 1
            while j >= 0:
                prev_line = lines[j].strip()
                if not prev_line:
                    j -= 1
                    continue

                # Skip if the previous token is '=' (preceded by =)
                if prev_line.endswith('='):
                    # the list is on the next line, but we skip it
                    j -= 1
                    continue

                # Check if the previous line contains a list
                m = list_pattern.search(prev_line)
                if m:
                    raw_list = m.group(0)
                    # Clean the list: ['d', 'Àà', 'a', ...] ‚Üí d Àà a ...
                    cleaned = raw_list.strip("[]")
                    phonemes = [p.strip().strip("'\"") for p in cleaned.split(",") if p.strip()]
                    phoneme_str = "".join(phonemes)
                    mapped_phoneme_str = normalize_phonemes(phoneme_str)
                    phoneme_lines.append(mapped_phoneme_str)
                    break
                else:
                    # No list found before Status ‚Äì unusual, but move on
                    break
                j -= 1
        i += 1

    # ------------------------------------------------------------------
    # Write results
    with open(output_file, 'w', encoding='utf-8') as out:
        for pl in phoneme_lines:
            out.write(pl + "\n")

    print(f"Extracted {len(phoneme_lines)} phoneme sequences from {log_file} ‚Üí {output_file}")

Extracted 400 phoneme sequences from inference_benchmark.log ‚Üí inference_benchmark_phonemes.txt
Extracted 400 phoneme sequences from inference_benchmark2.log ‚Üí inference_benchmark2_phonemes.txt
Extracted 400 phoneme sequences from inference_benchmark3.log ‚Üí inference_benchmark3_phonemes.txt


In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            predictions = f.readlines()

        # Extract predicted phoneme strings
        predictions = [p.strip() for p in predictions]

        print(predictions)

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"inference_benchmark{i}_phonemes.txt" for i in range(1, 4)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating inference_benchmark1_phonemes.txt ...
['dar ?in neveSte banA dArim bA yek ?abzAr sAde va mekAniki ?afzAyeS binAi barAye ?afrAd kambinA', 'be nAm bivapatik yA ?adasi durnamA ?ASnA Savim.', 'darAyensurat ?anjAm xodArazyAbi va ?erAe bAzxord bar ?ohde xodetAn ?ast.', 'va pas ?az peygirihAye mote?added', '?u dar mored xarid xadamAt', 'dastgAh mA sari? kamsedA va sAde ?ast va barAye kAr dar madrese va bA SAgerdAn dabestAni jAn midahad.', 'Soql peydAkardan ?afrAd nAbinA va kambinA', 'barAye lorA vazAyef gunAguni dar gugel ta?rifSode ?ast; ?az jomle ?inke', 'hamCenin dar rAstAye ?ejrAi Sodan mofAdd moxtalef har qAnuni', 'taxsis budjehA be ?ostAnhA va tozi? ?An dar SahrestAnhA', '?in maqAle be ?ejmAl sir tahavvolAt ?in nahAd rA', 'yeki digar ?az rAhkArhAye taqir negareS ?ertebAt bA ?afrAd ?Asibdide binAi ?ast.', 'tA dargiri zehniaS rA bartaraf konad ba?d be sorAq ?u biyAyad.', 'va mitavAnand be ?afrAd ?Asibdideye binAi komak konand ?az digar nokAt matrahSode', 'ba?d ?az ?enteSA

# Glow

## RTF

In [None]:
import re
import os

# Input log files
log_files = [
    "tts_benchmark_log1.txt",
    "tts_benchmark_log2.txt",
    "tts_benchmark_log3.txt"
]

# Regex to match: Real-time factor: 1.2663363701875638
rtf_pattern = re.compile(r"Real-time factor:\s*([\d\.]+)")

for log_file in log_files:
    if not os.path.exists(log_file):
        print(f"Warning: {log_file} not found. Skipping.")
        continue

    output_file = log_file.replace(".txt", "_rtf.txt")
    rtf_values = []

    with open(log_file, 'r', encoding='utf-8') as f:
        for line in f:
            match = rtf_pattern.search(line)
            if match:
                rtf_values.append(match.group(1))

    # Save extracted RTFs
    with open(output_file, 'w', encoding='utf-8') as out:
        for rtf in rtf_values:
            out.write(rtf + "\n")

    print(f"Extracted {len(rtf_values)} RTF values from {log_file} ‚Üí {output_file}")

Extracted 400 RTF values from tts_benchmark_log1.txt ‚Üí tts_benchmark_log1_rtf.txt
Extracted 400 RTF values from tts_benchmark_log2.txt ‚Üí tts_benchmark_log2_rtf.txt
Extracted 400 RTF values from tts_benchmark_log3.txt ‚Üí tts_benchmark_log3_rtf.txt


In [None]:
import re
import os
import numpy as np

# ----------------------------------------------------------------------
# Input files
log_files = [
    "tts_benchmark_log1.txt",
    "tts_benchmark_log2.txt",
    "tts_benchmark_log3.txt"
]

# Regex for "Real-time factor: 1.23456"
rtf_pattern = re.compile(r"Real-time factor:\s*([\d\.]+)")

# Store results
all_rtf_values = []      # all RTF values across all files
per_file_rtf = {}         # file ‚Üí list of RTF values
per_file_avg = {}         # file ‚Üí average RTF

# ----------------------------------------------------------------------
# 1. Extract RTF from each file
for log_file in log_files:
    if not os.path.exists(log_file):
        print(f"Warning: {log_file} not found. Skipping.")
        continue

    rtf_values = []
    with open(log_file, 'r', encoding='utf-8') as f:
        for line in f:
            m = rtf_pattern.search(line)
            if m:
                rtf_values.append(float(m.group(1)))

    per_file_rtf[log_file] = rtf_values
    all_rtf_values.extend(rtf_values)

    # Save extracted RTF to file (optional)
    out_file = log_file.replace(".txt", "_rtf.txt")
    with open(out_file, 'w', encoding='utf-8') as out:
        for v in rtf_values:
            out.write(f"{v}\n")
    print(f"Extracted {len(rtf_values)} RTF values ‚Üí {out_file}")

# ----------------------------------------------------------------------
# 2. Compute per-file averages
print("\n" + "="*60)
print("PER-FILE RTF AVERAGES")
print("="*60)
for log_file, rtfs in per_file_rtf.items():
    avg = np.mean(rtfs)
    per_file_avg[log_file] = avg
    print(f"{log_file:25} ‚Üí Average RTF = {avg:.6f}")

# ----------------------------------------------------------------------
# 3. Overall average and std across all utterances
if all_rtf_values:
    overall_avg = np.mean(all_rtf_values)
    overall_std = np.std(all_rtf_values, ddof=1)  # sample std
    total_utterances = len(all_rtf_values)

    print("\n" + "="*60)
    print("OVERALL STATISTICS (ALL UTTERANCES)")
    print("="*60)
    print(f"Total utterances       : {total_utterances}")
    print(f"Overall average RTF    : {overall_avg:.6f}")
    print(f"Standard deviation     : {overall_std:.6f}")
else:
    print("No RTF values found in any file.")

# ----------------------------------------------------------------------
# Optional: Save summary to a file
summary_file = "tts_rtf_summary.txt"
with open(summary_file, 'w', encoding='utf-8') as s:
    s.write("TTS RTF ANALYSIS SUMMARY\n")
    s.write("="*50 + "\n\n")
    s.write("Per-file averages:\n")
    for f, avg in per_file_avg.items():
        s.write(f"  {f}: {avg:.6f}\n")
    s.write(f"\nOverall average RTF: {overall_avg:.6f}\n")
    s.write(f"Standard deviation : {overall_std:.6f}\n")
    s.write(f"Total utterances   : {total_utterances}\n")

print(f"\nSummary saved to: {summary_file}")

Extracted 400 RTF values ‚Üí tts_benchmark_log1_rtf.txt
Extracted 400 RTF values ‚Üí tts_benchmark_log2_rtf.txt
Extracted 400 RTF values ‚Üí tts_benchmark_log3_rtf.txt

PER-FILE RTF AVERAGES
tts_benchmark_log1.txt    ‚Üí Average RTF = 1.484517
tts_benchmark_log2.txt    ‚Üí Average RTF = 0.770505
tts_benchmark_log3.txt    ‚Üí Average RTF = 1.835568

OVERALL STATISTICS (ALL UTTERANCES)
Total utterances       : 1200
Overall average RTF    : 1.363530
Standard deviation     : 0.704638

Summary saved to: tts_rtf_summary.txt


## Phonemes

In [None]:
import re
import os

# Input log files
log_files = [
    "phonemizer_benchmark_log1.txt",
    "phonemizer_benchmark_log2.txt",
    "phonemizer_benchmark_log3.txt"
]

# Regex to match: Phonemes: d|Ààa|r iÀê|n ...
phoneme_pattern = re.compile(r"Phonemes:\s*(.+)")

for log_file in log_files:
    if not os.path.exists(log_file):
        print(f"Warning: {log_file} not found. Skipping.")
        continue

    output_file = log_file.replace(".txt", "_phonemes.txt")
    phoneme_lines = []

    with open(log_file, 'r', encoding='utf-8') as f:
        for line in f:
            match = phoneme_pattern.search(line)
            if match:
                raw_phonemes = match.group(1).strip()
                # Replace '|' with space and clean up
                clean_phonemes = raw_phonemes.replace('|', '').strip()
                mapped_phonemes = normalize_phonemes(clean_phonemes)
                phoneme_lines.append(mapped_phonemes)

    # Save to output file
    with open(output_file, 'w', encoding='utf-8') as out:
        for line in phoneme_lines:
            out.write(line + "\n")

    print(f"Extracted {len(phoneme_lines)} phoneme sequences ‚Üí {output_file}")

Extracted 400 phoneme sequences ‚Üí phonemizer_benchmark_log1_phonemes.txt
Extracted 400 phoneme sequences ‚Üí phonemizer_benchmark_log2_phonemes.txt
Extracted 400 phoneme sequences ‚Üí phonemizer_benchmark_log3_phonemes.txt


In [None]:
import json
import numpy as np
from jiwer import cer


# --- assume you already have these ---
# from your_module import print_all_metrics, get_avg_cer_of_method, get_phonetic_model_performance, get_homograph_performance

def evaluate_all_runs(run_files, benchmark):
    """
    Evaluate all prediction runs, report average and std for metrics.
    """
    metrics = {
        "per": [],
        "acc": [],
        "prec": [],
        "recall": [],
        "f1": [],
        "homograph": []
    }

    for path in run_files:
        print(f"\nüìò Evaluating {path} ...")
        with open(path, "r", encoding="utf-8") as f:
            predictions = f.readlines()

        # Extract predicted phoneme strings
        predictions = [p.strip() for p in predictions]

        print(predictions)

        # Compute metrics individually (assuming you have these functions)
        per = get_avg_cer_of_method(predictions, benchmark) * 100
        acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)
        homograph = get_homograph_performance(predictions, benchmark) * 100
        f1 = (2 * prec * recall) / (prec + recall) if (prec + recall) != 0 else 0.0

        # Print per-run results
        print(f"PER: {per:.4f}")
        print(f"ACC: {acc:.4f}, PREC: {prec:.4f}, RECALL: {recall:.4f}, F1: {f1:.4f}")
        print(f"HOMOGRAPH: {homograph:.4f}")

        # Collect metrics
        metrics["per"].append(per)
        metrics["acc"].append(acc)
        metrics["prec"].append(prec)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["homograph"].append(homograph)

    # ---- Summary ----
    print("\nüìä === AVERAGE ¬± STD RESULTS ACROSS RUNS ===")
    for key, values in metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{key.upper():<10}: {mean:.4f} ¬± {std:.4f}")

    return metrics


# Example usage
if __name__ == "__main__":
    run_files = [f"phonemizer_benchmark_log{i}_phonemes.txt" for i in range(1, 4)]
    metrics = evaluate_all_runs(run_files, benchmark)



üìò Evaluating phonemizer_benchmark_log1_phonemes.txt ...
['dar ?in neveSte banA dArim bA yek ?abzAr sAde va mekAniki ?afzAyeS binAi barAye ?afrAd kambinA', 'be nAm bivapatik yA ?adasi durnamA ?ASnA Savim', 'darAyensuratanjAm xodArazyAbi va ?erAe bAzxord bar ?ohde xodetAn ?ast', 'va pas ?az peygirihAye mote?added', '?u dar mored xarid xadamAt', 'dastgAh mA sari?kamsedA va sAde ?ast va barAye kAr dar madrese va bA SAgerdAn dabestAni jAn midahad', 'Soql peydAkardan ?afrAd nAbinA va kambinA', 'barAye lorA vazAyef gunAguni dar gugel ta?rifSode ?astaz jomle ?inke', 'hamCenin dar rAstAye ?ejrAi Sodan mofAdd moxtalef har qAnuni', 'taxsis budjehA be ?ostAnhA va tozi? ?An dar SahrestAnhA', '?in maqAlebe ?ejmAlsir tahavvolAt ?in nahAd rA', 'yeki digar ?az rAhkArhAye taqir negareSertebAt bA ?afrAd ?Asibdide binAi ?ast', 'tA dargiri zehniaS rA bartaraf konad ba?d be sorAq ?u biyAyad', 'va mitavAnand be ?afrAd ?Asibdideye binAi komak konandaz digar nokAt matrahSode', 'ba?d ?az ?enteSAr ?in xabard