In [2]:
from faster_whisper import WhisperModel
import os

MODEL_NAME = "small"   # 필요시 "small", "large-v3" 등으로 변경
DEVICE     = "cpu"      # M1 환경은 보통 cpu. (CUDA가 없으면 auto도 결국 cpu)
CPU_THREADS = max(1, (os.cpu_count() or 4) - 1)  # 여유 1코어 남기기
NUM_WORKERS = 1         # 데이터 로딩 워커. 파일 단위 평가면 1이면 충분

# CPU에서는 이 순서가 안전합니다.
CANDIDATE_CTYPES = ["int8", "float32"]

whisper = None
last_err = None
for ct in CANDIDATE_CTYPES:
    try:
        print(f"Trying faster-whisper: device={DEVICE}, compute_type={ct}, threads={CPU_THREADS}")
        whisper = WhisperModel(
            MODEL_NAME,
            device=DEVICE,
            compute_type=ct,
            cpu_threads=CPU_THREADS,
            num_workers=NUM_WORKERS,
        )
        print(f"✅ Loaded: compute_type={ct}")
        COMPUTE_TYPE = ct
        break
    except Exception as e:
        print(f"  -> failed: {type(e).__name__}: {e}")
        last_err = e

if whisper is None:
    raise RuntimeError(f"모델 로드 실패. 시도한 compute_types={CANDIDATE_CTYPES}") from last_err


  from .autonotebook import tqdm as notebook_tqdm


Trying faster-whisper: device=cpu, compute_type=int8, threads=7
✅ Loaded: compute_type=int8


In [3]:
def fw_transcribe(path, language="ko", beam_size=5):
    """
    faster-whisper로 한 파일을 텍스트로 변환.
    """
    # vad_filter=False: 파일 단위 평가라 VAD는 생략(속도 ↑, 변동 ↓)
    segments, info = whisper.transcribe(
        str(path),
        language=language,
        task="transcribe",
        beam_size=beam_size,
        vad_filter=False,
        without_timestamps=True,
        condition_on_previous_text=False,  # 파일 단위라 문맥 의존 비활성
    )
    # segments는 generator. join해서 1줄 텍스트로 반환
    return " ".join(seg.text for seg in segments).strip()


In [4]:
import time, csv
from pathlib import Path
import soundfile as sf

def evaluate_split_fw(wav_dir, trn_path, csv_out=None, preview_miss=10, log_every=20):
    wav_dir  = Path(wav_dir)
    trn_path = Path(trn_path)
    csv_out  = Path(csv_out) if csv_out is not None else None

    refs = load_trn(trn_path)
    wavs = sorted(wav_dir.glob("*.wav"))
    print(f"WAV: {len(wavs)} | TRN entries: {len(refs)}")

    stems = [w.stem for w in wavs]
    missing = [s for s in stems if s not in refs]
    if missing:
        print(f"[경고] TRN에서 찾을 수 없는 wav 키: {len(missing)}/{len(wavs)}개")
        for x in missing[:preview_miss]:
            print("  -", x)

    rows = []
    tot_cdist = tot_cN = 0
    tot_wdist = tot_wN = 0
    tot_secs  = tot_infer = 0.0

    for i, wav in enumerate(wavs, 1):
        utt = wav.stem
        ref = refs.get(utt)
        if ref is None:
            continue

        # 길이(sec)
        audio, sr = sf.read(str(wav), dtype="float32", always_2d=False)
        dur = float(len(audio) / sr)

        # 추론
        t0 = time.perf_counter()
        hyp = fw_transcribe(wav, language="ko", beam_size=5)
        t1 = time.perf_counter()

        infer = t1 - t0
        rtf = infer / max(dur, 1e-9)

        cer, cdist, cN = cer_score(ref, hyp)
        wer, wdist, wN = wer_score(ref, hyp)

        rows.append({
            "utt": utt,
            "dur_s": round(dur, 3),
            "infer_s": round(infer, 3),
            "rtf": round(rtf, 3),
            "CER": round(cer, 4),
            "WER": round(wer, 4),
            "ref": ref,
            "hyp": hyp,
        })

        tot_cdist += cdist; tot_cN += cN
        tot_wdist += wdist; tot_wN += wN
        tot_secs  += dur;   tot_infer += infer

        if i % log_every == 0:
            print(f"[{i}/{len(wavs)}] RTF {rtf:.2f} | CER {cer:.3f} | WER {wer:.3f}")

    if not rows:
        print("\n[중단] 평가에 매칭된 항목이 없습니다. 경로/TRN을 점검하세요.")
        return {"rows": [], "summary": {"files": 0, "unmatched": len(missing)}}

    overall_cer = (tot_cdist / tot_cN) if tot_cN else 0.0
    overall_wer = (tot_wdist / tot_wN) if tot_wN else 0.0
    avg_rtf     = (tot_infer / tot_secs) if tot_secs else 0.0

    print("\n=== Summary (faster-whisper) ===")
    print(f"Files scored     : {len(rows)}  (unmatched: {len(missing)})")
    print(f"Total audio (s)  : {tot_secs:.1f}")
    print(f"Total infer (s)  : {tot_infer:.1f}")
    print(f"Avg RTF          : {avg_rtf:.3f}")
    print(f"CER (char)       : {overall_cer:.4f}")
    print(f"WER (word)       : {overall_wer:.4f}")

    if csv_out:
        csv_out.parent.mkdir(parents=True, exist_ok=True)
        fieldnames = ["utt","dur_s","infer_s","rtf","CER","WER","ref","hyp"]
        with open(csv_out, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            for r in rows:
                w.writerow(r)
        print(f"Saved: {csv_out}")

    # 오차 큰 샘플 5개
    rows_sorted = sorted(rows, key=lambda r: (-r["CER"], -r["WER"], -r["rtf"]))
    print("\nTop-5 by CER:")
    for r in rows_sorted[:5]:
        print(f"- {r['utt']} | CER {r['CER']:.3f} WER {r['WER']:.3f} RTF {r['rtf']:.2f}")
        print(f"  ref: {r['ref']}")
        print(f"  hyp: {r['hyp']}")

    return {
        "rows": rows,
        "summary": dict(files=len(rows), unmatched=len(missing),
                        total_audio_s=tot_secs, total_infer_s=tot_infer,
                        avg_rtf=avg_rtf, cer=overall_cer, wer=overall_wer)
    }


In [5]:
import os, re, time, unicodedata, csv
from pathlib import Path

def u_nfc(s: str) -> str:
    return unicodedata.normalize("NFC", s)

# 간단 정규화: 한글 보존, 구두점 제거, 공백 정리
import re
_re_punct = re.compile(r"[^\w\s]", flags=re.UNICODE)
_re_ws = re.compile(r"\s+")

def normalize_for_wer(s: str) -> str:
    s = u_nfc(s).lower()
    s = _re_punct.sub(" ", s)
    s = _re_ws.sub(" ", s).strip()
    return s

def normalize_for_cer(s: str) -> str:
    s = normalize_for_wer(s)
    s = s.replace(" ", "")
    return s

def levenshtein(seq_a, seq_b):
    n, m = len(seq_a), len(seq_b)
    if n == 0: return m
    if m == 0: return n
    dp = list(range(m+1))
    for i in range(1, n+1):
        prev, dp[0] = dp[0], i
        for j in range(1, m+1):
            cur = dp[j]
            cost = 0 if seq_a[i-1] == seq_b[j-1] else 1
            dp[j] = min(dp[j] + 1, dp[j-1] + 1, prev + cost)
            prev = cur
    return dp[m]

def cer_score(ref: str, hyp: str):
    r = normalize_for_cer(ref)
    h = normalize_for_cer(hyp)
    if len(r) == 0: return 0.0, 0, 0
    dist = levenshtein(r, h)
    return dist / len(r), dist, len(r)

def wer_score(ref: str, hyp: str):
    r = normalize_for_wer(ref).split()
    h = normalize_for_wer(hyp).split()
    if len(r) == 0: return 0.0, 0, 0
    dist = levenshtein(r, h)
    return dist / len(r), dist, len(r)

def load_trn(trn_path) -> dict:
    """
    TRN 라인 예시 여러 형태를 모두 허용:
      1) '문장 텍스트 ... (KsponSpeech_E00001)'
      2) 'KsponSpeech_E00001 문장 텍스트 ...'
      3) 'KsponSpeech_E00001.wav\t문장 텍스트 ...'
    반환: { 'KsponSpeech_E00001': '문장 텍스트 ...', ... }
    """
    trn_path = Path(trn_path)
    mapping = {}
    with trn_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # 케이스 1) 마지막 괄호의 ID
            m = re.search(r"\(([^)]+)\)\s*$", line)
            if m:
                utt = m.group(1)
                text = line[:m.start()].strip()
            else:
                # 케이스 2/3) <utt>[.wav] <sep> <text>
                parts = re.split(r"[\t ]+", line, maxsplit=1)
                if len(parts) == 2:
                    utt, text = parts[0], parts[1]
                else:
                    # 파싱 실패 시 스킵
                    continue

            utt = os.path.basename(utt)
            utt = os.path.splitext(utt)[0]  # .wav 제거
            mapping[utt] = text
    return mapping


In [None]:
from pathlib import Path

wav_dir  = Path("/Users/leejeje/Desktop/DSL/25-1/Modeling/data/KsponSpeech_eval/eval_clean")
trn_path = Path("/Users/leejeje/Desktop/DSL/25-1/Modeling/data/KsponSpeech_scripts/eval_clean.trn")
out_csv  = Path("results_eval") / "fws_eval_clean_results.csv"

res_fw_clean = evaluate_split_fw(wav_dir, trn_path, csv_out=out_csv)


WAV: 3000 | TRN entries: 3000
[경고] TRN에서 찾을 수 없는 wav 키: 19/3000개
  - KsponSpeech_E00054
  - KsponSpeech_E00135
  - KsponSpeech_E00277
  - KsponSpeech_E00511
  - KsponSpeech_E00581
  - KsponSpeech_E00950
  - KsponSpeech_E01113
  - KsponSpeech_E01343
  - KsponSpeech_E01352
  - KsponSpeech_E01377
[20/3000] RTF 0.67 | CER 0.051 | WER 0.158
[40/3000] RTF 2.04 | CER 0.000 | WER 0.000
[60/3000] RTF 1.79 | CER 0.143 | WER 0.333
[80/3000] RTF 3.00 | CER 0.143 | WER 0.250
[100/3000] RTF 1.31 | CER 0.254 | WER 0.556
