In [1]:
import sys, platform, shutil, os
from pathlib import Path
import urllib.request, sys
print("Python:", sys.version)
print("Platform:", platform.platform(), "| Machine:", platform.machine())
print("Conda Prefix:", os.environ.get("CONDA_PREFIX"))
print("which python:", shutil.which("python"))

Python: 3.11.13 | packaged by conda-forge | (main, Jun  4 2025, 14:52:34) [Clang 18.1.8 ]
Platform: macOS-15.6.1-arm64-arm-64bit | Machine: arm64
Conda Prefix: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean
which python: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin/python


In [2]:
WORK_DIR = "/Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16"

In [3]:
MODEL_URL = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2"
TAR_NAME = "sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2"
EXTRACTED_DIR_NAME = "sherpa-onnx-streaming-zipformer-korean-2024-06-16"

In [4]:
print("WORK_DIR:", WORK_DIR)
print("MODEL_URL:", MODEL_URL)

WORK_DIR: /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16
MODEL_URL: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2


In [8]:
if isinstance(WORK_DIR, str):
    WORK_DIR = Path(WORK_DIR)
tar_path = WORK_DIR / TAR_NAME

if tar_path.exists():
    print(f"[skip] 이미 존재: {tar_path.name} ({tar_path.stat().st_size/1e6:.2f} MB)")
else:
    print(f"다운로드 시작: {MODEL_URL}")
    def _progress(block_num, block_size, total_size):
        downloaded = block_num * block_size
        percent = min(100, downloaded * 100 / (total_size or 1))
        sys.stdout.write(f"\r  {percent:6.2f}% ({downloaded/1e6:.1f} / {total_size/1e6 if total_size>0 else 0:.1f} MB)")
        sys.stdout.flush()
    urllib.request.urlretrieve(MODEL_URL, filename=tar_path, reporthook=_progress)
    print("\n완료:", tar_path)

print("파일 확인:", tar_path.exists(), "| 크기:", f"{tar_path.stat().st_size/1e6:.2f} MB" if tar_path.exists() else None)

[skip] 이미 존재: sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2 (418.22 MB)
파일 확인: True | 크기: 418.22 MB


In [9]:
import tarfile
extract_dir = WORK_DIR / EXTRACTED_DIR_NAME

# if extract_dir.exists() and any(extract_dir.iterdir()):
#     print(f"[skip] 이미 풀림: {extract_dir}")
# else:
#     print(f"압축 해제 → {extract_dir}")
#     with tarfile.open(tar_path, "r:bz2") as tar:
#         tar.extractall(WORK_DIR)

# print("내용물 예시:")
# for p in sorted(extract_dir.iterdir()):
#     print(" -", p.name)

In [10]:
from pathlib import Path

def pick_one(patterns):
    for pat in patterns:
        items = sorted(Path(extract_dir).glob(pat))
        if items:
            return str(items[0])
    return None

tokens       = pick_one(["tokens.txt", "**/tokens.txt"])
encoder_fp32 = pick_one(["encoder-epoch-*.onnx"])
decoder_fp32 = pick_one(["decoder-epoch-*.onnx"])
joiner_fp32  = pick_one(["joiner-epoch-*.onnx"])

# (선택) int8 변형
encoder_int8 = pick_one(["*encoder*-int8.onnx", "encoder-*-int8.onnx"])
joiner_int8  = pick_one(["*joiner*-int8.onnx",  "joiner-*-int8.onnx"])

print("tokens       :", tokens)
print("encoder_fp32 :", encoder_fp32)
print("decoder_fp32 :", decoder_fp32)
print("joiner_fp32  :", joiner_fp32)
print("encoder_int8 :", encoder_int8)
print("joiner_int8  :", joiner_int8)

assert tokens and encoder_fp32 and decoder_fp32 and joiner_fp32, "필수 onnx/tokens 파일을 찾지 못했습니다."


tokens       : /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt
encoder_fp32 : /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.int8.onnx
decoder_fp32 : /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.int8.onnx
joiner_fp32  : /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.int8.onnx
encoder_int8 : None
joiner_int8  : None


In [11]:
import subprocess, shlex
from pathlib import Path

test_wav = Path(extract_dir) / "test_wavs" / "0.wav"
assert test_wav.exists(), f"샘플 wav를 찾지 못했습니다: {test_wav}"

cmd = [
    "sherpa-onnx",
    "--tokens", tokens,
    "--encoder", encoder_fp32,
    "--decoder", decoder_fp32,
    "--joiner",  joiner_fp32,
    str(test_wav),
]
print("실행 명령:", " ".join(shlex.quote(x) for x in cmd))

try:
    out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, text=True)
    print(out)
except FileNotFoundError:
    print("[문제] sherpa-onnx CLI를 찾을 수 없습니다. 같은 커널/환경에 설치되어 있는지 확인하세요.")
except subprocess.CalledProcessError as e:
    print("실행 실패! returncode=", e.returncode)
    print("출력:\n", e.output)

실행 명령: sherpa-onnx --tokens /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt --encoder /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.int8.onnx --decoder /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.int8.onnx --joiner /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.int8.onnx /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav
[문제] sherpa-onnx CLI를 찾을 수 없습니다. 같은 커널/환경에 설치되어 있는지 확인하세요.


In [25]:
import sys, shutil
import importlib

print("Python exe:", sys.executable)
try:
    m = importlib.import_module("sherpa_onnx")
    print("sherpa_onnx module:", m.__file__)
except Exception as e:
    print("[문제] sherpa_onnx가 현재 커널에 설치되지 않았습니다.", e)

Python exe: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin/python
sherpa_onnx module: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/lib/python3.11/site-packages/sherpa_onnx/__init__.py


In [26]:
import os, sysconfig, shutil
from pathlib import Path

scripts_dir = Path(sysconfig.get_path("scripts"))  # 이 커널이 사용하는 Scripts/bin 디렉터리
os.environ["PATH"] = str(scripts_dir) + os.pathsep + os.environ.get("PATH", "")
print("scripts_dir:", scripts_dir)
print("which sherpa-onnx ->", shutil.which("sherpa-onnx"))

# (보강) conda 환경의 bin 도 함께 확인
conda_prefix = os.environ.get("CONDA_PREFIX")
if not shutil.which("sherpa-onnx") and conda_prefix:
    cand = Path(conda_prefix) / "bin" / "sherpa-onnx"
    if cand.exists():
        os.environ["PATH"] = str(cand.parent) + os.pathsep + os.environ["PATH"]
        print("added:", cand.parent)
        print("which sherpa-onnx ->", shutil.which("sherpa-onnx"))

scripts_dir: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin
which sherpa-onnx -> None


In [12]:
import sys, subprocess, importlib

print("Python exe:", sys.executable)
print(subprocess.check_output([sys.executable, "-m", "pip", "--version"], text=True))
try:
    import sherpa_onnx
    print("sherpa_onnx module:", sherpa_onnx.__file__)
except Exception as e:
    print("[문제] 현재 커널에 sherpa_onnx가 없습니다:", e)
    print("→ 아래 명령으로 현재 커널에 재설치하세요:")
    print(f"{sys.executable} -m pip install --upgrade pip wheel")
    print(f"{sys.executable} -m pip install --force-reinstall --no-cache-dir sherpa-onnx")

Python exe: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin/python
pip 25.2 from /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/lib/python3.11/site-packages/pip (python 3.11)

sherpa_onnx module: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/lib/python3.11/site-packages/sherpa_onnx/__init__.py


In [13]:
import sys, shutil, sysconfig
from pathlib import Path

CLI = None
cli_path = shutil.which("sherpa-onnx")
if cli_path:
    CLI = [cli_path]
else:
    # fallback: 모듈 실행 (콘솔 스크립트 없이도 100% 동작)
    CLI = [sys.executable, "-m", "sherpa_onnx"]

print("CLI command prefix:", " ".join(CLI))
print("scripts_dir:", sysconfig.get_path("scripts"))

CLI command prefix: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin/python -m sherpa_onnx
scripts_dir: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin


In [16]:
import sys, importlib
print("Python:", sys.executable)
try:
    import sherpa_onnx
    print("sherpa_onnx:", getattr(sherpa_onnx, "__version__", "unknown"))
except Exception as e:
    print("sherpa_onnx import 실패:", e)
    raise SystemExit("현재 커널(asr-korean)에 설치되어 있는지 확인하세요: %pip install --upgrade pip wheel && %pip install sherpa-onnx")


Python: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin/python
sherpa_onnx: 1.12.10


In [17]:
from pathlib import Path

# 너의 경로 그대로 사용
extract_dir = Path("/Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16")
assert extract_dir.exists(), f"extract_dir 없음: {extract_dir}"

def pick_latest(glob_pat):
    files = sorted(extract_dir.glob(glob_pat))
    return files[-1] if files else None

# 파일을 fp32 / int8로 분리 ('.int8.' 포함 여부로 판단)
def split_fp32_int8(prefix):
    all_files = sorted(extract_dir.glob(f"{prefix}-epoch-*.onnx"))
    fp32 = [p for p in all_files if ".int8." not in p.name]
    int8 = [p for p in all_files if ".int8." in p.name]
    return (fp32[-1] if fp32 else None, int8[-1] if int8 else None)

encoder_fp32, encoder_int8 = split_fp32_int8("encoder")
decoder_fp32, decoder_int8 = split_fp32_int8("decoder")
joiner_fp32,  joiner_int8  = split_fp32_int8("joiner")

tokens = extract_dir / "tokens.txt"
test_wav = extract_dir / "test_wavs" / "0.wav"

print("tokens     :", tokens.name, tokens.exists())
print("encoder fp32/int8:", getattr(encoder_fp32, "name", None), "|", getattr(encoder_int8, "name", None))
print("decoder fp32/int8:", getattr(decoder_fp32, "name", None), "|", getattr(decoder_int8, "name", None))
print("joiner  fp32/int8:", getattr(joiner_fp32,  "name", None), "|", getattr(joiner_int8,  "name", None))
print("test_wav   :", test_wav.name, test_wav.exists())

assert tokens.exists(), "tokens.txt가 없습니다."
assert test_wav.exists(), "샘플 wav가 없습니다."

# 안전한 기본 조합: fp32가 있으면 fp32 사용, 없으면 int8 사용
ENC = (encoder_fp32 or encoder_int8)
DEC = (decoder_fp32 or decoder_int8)
JOIN = (joiner_fp32  or joiner_int8)
assert ENC and DEC and JOIN, "encoder/decoder/joiner onnx 파일이 부족합니다."
print("사용 조합:", ENC.name, "|", DEC.name, "|", JOIN.name)


tokens     : tokens.txt True
encoder fp32/int8: encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx
decoder fp32/int8: decoder-epoch-99-avg-1.onnx | decoder-epoch-99-avg-1.int8.onnx
joiner  fp32/int8: joiner-epoch-99-avg-1.onnx | joiner-epoch-99-avg-1.int8.onnx
test_wav   : 0.wav True
사용 조합: encoder-epoch-99-avg-1.onnx | decoder-epoch-99-avg-1.onnx | joiner-epoch-99-avg-1.onnx


In [19]:
import sherpa_onnx, sys, importlib
print("Python:", sys.executable)
print("sherpa_onnx version:", getattr(sherpa_onnx, "__version__", "unknown"))
print("has OnlineRecognizer.from_transducer:", hasattr(sherpa_onnx.OnlineRecognizer, "from_transducer"))
print("has OnlineRecognizerConfig:", hasattr(sherpa_onnx, "OnlineRecognizerConfig"))


Python: /opt/homebrew/Caskroom/miniforge/base/envs/asr-korean/bin/python
sherpa_onnx version: 1.12.10
has OnlineRecognizer.from_transducer: True
has OnlineRecognizerConfig: False


In [25]:
from pathlib import Path
import soundfile as sf
import numpy as np
import sherpa_onnx

# 1) 네가 쓰고 있는 모델 디렉터리 그대로
extract_dir = Path("/Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16")
tokens    = extract_dir / "tokens.txt"
test_wav  = extract_dir / "test_wavs" / "0.wav"

# 2) 파일 자동 선택 (int8 우선, 없으면 fp32)
def pick(prefix: str):
    int8 = sorted(extract_dir.glob(f"{prefix}-epoch-*.int8.onnx"))
    fp32 = sorted(p for p in extract_dir.glob(f"{prefix}-epoch-*.onnx") if ".int8." not in p.name)
    return (int8[-1] if int8 else (fp32[-1] if fp32 else None))

ENC = pick("encoder")
DEC = pick("decoder")
JOIN = pick("joiner")

print("tokens:", tokens, tokens.exists())
print("ENC/DEC/JOIN:", ENC.name if ENC else None, DEC.name if DEC else None, JOIN.name if JOIN else None)
print("test_wav:", test_wav, test_wav.exists())

assert tokens.exists(), "tokens.txt가 없습니다."
assert ENC and DEC and JOIN, "encoder/decoder/joiner onnx를 찾지 못했습니다."
assert test_wav.exists(), "샘플 wav가 없습니다."

# 3) WAV 로드 (모노 float32 권장)
wave, sr = sf.read(str(test_wav), dtype="float32", always_2d=False)
if wave.ndim == 2:
    wave = wave.mean(axis=1)
if sr != 16000:
    print(f"[경고] 샘플레이트 {sr}Hz 입니다. 16kHz 권장입니다.")

# 4) OnlineRecognizer 생성 — from_transducer (너 버전에 존재)
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
    tokens=str(tokens),
    encoder=str(ENC),
    decoder=str(DEC),
    joiner=str(JOIN),
    decoding_method="greedy_search",  # 기본값
    num_threads=2,                    # 필요 시 조정
    provider="cpu",                   # 성능 튜닝시 "coreml" 시도 가능 (아래 팁 참고)
)

# 5) 스트림 생성/디코딩
stream = recognizer.create_stream()
stream.accept_waveform(16000, wave if sr == 16000 else wave.astype("float32"))
stream.input_finished()

while recognizer.is_ready(stream):
    recognizer.decode_stream(stream)

res = recognizer.get_result(stream)

# sherpa-onnx 버전에 따라 반환 타입이 다를 수 있어 안전하게 처리
if hasattr(res, "text"):          # 예전 일부 버전: 객체에 .text 속성
    text = res.text
elif isinstance(res, bytes):       # 혹시 bytes면 디코드
    text = res.decode("utf-8", "ignore")
else:                              # 현재(1.12.10): str 직접 반환
    text = str(res)

print("인식 결과:", text)


tokens: /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt True
ENC/DEC/JOIN: encoder-epoch-99-avg-1.int8.onnx decoder-epoch-99-avg-1.int8.onnx joiner-epoch-99-avg-1.int8.onnx
test_wav: /Users/leejeje/Desktop/DSL/25-1/Modeling/model/sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav True
인식 결과: 걔는 괜찮은 척하려구 애 쓰는 거 같
