In [3]:
from textgrid import TextGrid
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Load the multilingual sentence-embedding model; it is used below to encode
# the Korean and English texts for cosine-similarity scoring.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def normalize(text):
    """Return *text* lowercased with spaces, periods, and commas removed."""
    # One pass that deletes the three ignored characters.
    stripped = text.translate(str.maketrans("", "", " .,"))
    return stripped.lower()

def get_combined_text_and_time(tier, words):
    """Collect the intervals of *tier* whose label matches one of *words*.

    Interval marks and target words are both passed through ``normalize``
    before being compared.

    Args:
        tier: Iterable of interval objects exposing ``mark``, ``minTime``
            and ``maxTime``.
        words: Iterable of target word strings.

    Returns:
        A ``(text, start, end)`` tuple. ``text`` is the normalized matching
        marks joined by single spaces, in tier order. ``start`` is the
        ``minTime`` of the first matching interval and ``end`` is the
        ``maxTime`` of the last one. When nothing matches the result is
        ``("", None, None)``.
    """
    # Normalize the targets once, instead of rebuilding the list for every
    # interval; a set also makes each membership test O(1).
    targets = {normalize(w) for w in words}

    matched = []
    start, end = None, None
    # NOTE: matching is by label only, not by position. Every interval in the
    # tier whose normalized mark is in ``targets`` contributes, so a word that
    # recurs elsewhere in the tier widens the returned time span.
    for interval in tier:
        mark = normalize(interval.mark)
        if mark not in targets:
            continue
        matched.append(mark)
        if start is None:
            start = interval.minTime
        end = interval.maxTime
    return " ".join(matched), start, end

# Load the Korean and English TextGrid files and take the first tier named
# "words" from each.
tg_ko = TextGrid.fromFile("./sound_sample/korean_yunjang/윤장목소리1.TextGrid")
tg_en = TextGrid.fromFile("./sound_sample/AI_eng_yunjang/mine_en.TextGrid")
tier_ko = tg_ko.getFirst("words")
tier_en = tg_en.getFirst("words")

# Manual, meaning-based alignment: each entry pairs the Korean word labels of
# one sentence with the English word labels of its counterpart.
sentence_mapping = [
    (["안녕", "하", "세요"], ["hello"]),
    (["제", "이름", "은", "조윤장", "입니다"], ["my", "name", "is", "jo", "yoon", "jang"]),
    (["만나", "서", "반갑", "습니다"], ["nice", "to", "meet", "you"]),
    (["잘", "부탁", "드립니다"], ["please", "take", "care", "of", "me"])
]

dur_diffs = []
semantic_scores = []
relaxation_allowance = 0.3  # ±300ms

print("🔍 문장 단위 비교:\n")

for ko_words, en_words in sentence_mapping:
    ko_text, ko_start, ko_end = get_combined_text_and_time(tier_ko, ko_words)
    en_text, en_start, en_end = get_combined_text_and_time(tier_en, en_words)

    # Skip the pair when either side matched no interval (report mapping failure).
    if not ko_text or not en_text:
        print(f"⚠️ 매핑 실패: {ko_words} ↔ {en_words}")
        continue

    ko_dur = ko_end - ko_start
    en_dur = en_end - en_start

    # Semantic similarity: cosine similarity between the embeddings of the
    # Korean and English texts (cross-lingual match).
    embedding_ko = model.encode(ko_text, convert_to_tensor=True)
    embedding_en = model.encode(en_text, convert_to_tensor=True)
    semantic_score = util.cos_sim(embedding_ko, embedding_en).item()

    # Relaxed duration comparison: only the part of the absolute difference
    # that exceeds the allowance counts toward the penalty.
    diff = abs(ko_dur - en_dur)
    dur_diffs.append(max(0, diff - relaxation_allowance))

    semantic_scores.append(semantic_score)

    print(f"[{ko_text}] ({ko_dur:.2f}s) ↔ [{en_text}] ({en_dur:.2f}s) → Δ = {diff:.2f}s, semantic: {semantic_score:.3f}")

# Final scores. The isochrony score is 1 minus the mean excess duration
# difference divided by the larger of the two tiers' maxTime.
if dur_diffs:
    iso_score = 1 - (np.mean(dur_diffs) / max(tier_ko.maxTime, tier_en.maxTime))
    sem_score = np.mean(semantic_scores)
else:
    # Every mapping failed. np.mean([]) would emit a RuntimeWarning and return
    # nan, so state the reason explicitly and keep the scores as nan.
    print("⚠️ 매칭된 문장이 없어 점수를 계산할 수 없습니다.")
    iso_score = sem_score = float("nan")

print(f"\n✅ Isochrony Score (with relaxation): {iso_score:.3f}")
print(f"✅ Avg Semantic Similarity: {sem_score:.3f}")


🔍 문장 단위 비교:

[안녕 하 세요] (0.80s) ↔ [hello] (0.78s) → Δ = 0.02s, semantic: 0.871
[제 이름 은 조윤장 입니다] (2.59s) ↔ [my name is jo yoon jang] (1.64s) → Δ = 0.95s, semantic: 0.729
[만나 서 반갑 습니다] (1.55s) ↔ [nice to meet you] (2.08s) → Δ = 0.53s, semantic: 0.536
[잘 부탁 드립니다] (1.24s) ↔ [please take care of me] (2.97s) → Δ = 1.73s, semantic: 0.493

✅ Isochrony Score (with relaxation): 0.949
✅ Avg Semantic Similarity: 0.657
