In [1]:


import argparse
import sys
from pathlib import Path

import soundfile as sf
import sherpa_onnx




def create_recognizer(
    provider="cpu",
    model_dir="./SenseVoice-Small/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
    language="zh",
    num_threads=4,
    debug=True,
) -> "sherpa_onnx.OfflineRecognizer":
    """Build an offline SenseVoice recognizer from a local model directory.

    Args:
        provider: Execution provider name forwarded to sherpa-onnx
            (this notebook passes "cpu" or "cuda").
        model_dir: Directory expected to contain ``model.onnx`` and
            ``tokens.txt``. Defaults to the location this notebook was
            originally hard-coded to.
        language: Value forwarded as the ``language`` argument.
        num_threads: Value forwarded as the ``num_threads`` argument.
        debug: Value forwarded as the ``debug`` argument.

    Returns:
        Whatever ``sherpa_onnx.OfflineRecognizer.from_sense_voice`` returns.

    Raises:
        FileNotFoundError: If ``model.onnx`` or ``tokens.txt`` is not a file
            inside ``model_dir``.
    """
    model_dir = Path(model_dir)
    model = model_dir / "model.onnx"
    tokens = model_dir / "tokens.txt"

    # Fail early with a readable Python error instead of handing a
    # non-existent path to the native library.
    missing = [path.as_posix() for path in (model, tokens) if not path.is_file()]
    if missing:
        raise FileNotFoundError(
            "SenseVoice model files not found: " + ", ".join(missing)
        )

    return sherpa_onnx.OfflineRecognizer.from_sense_voice(
        model=model.as_posix(),
        tokens=tokens.as_posix(),
        use_itn=True,
        debug=debug,
        language=language,
        num_threads=num_threads,
        provider=provider,
    )

def decode_file(
    recognizer: sherpa_onnx.OfflineRecognizer,
    filename: str,
):
    """Run offline recognition on one audio file and return the stream result.

    The file is read with ``soundfile`` as float32 samples; only the first
    channel is passed to the recognizer, together with the sample rate
    reported by the file.

    Args:
        recognizer: Recognizer used to create and decode the stream.
        filename: Path of the audio file to read.

    Returns:
        The ``result`` attribute of the decoded stream.
    """
    samples, rate = sf.read(filename, dtype="float32", always_2d=True)
    # always_2d=True yields a 2-D array; column 0 is the first channel.
    first_channel = samples[:, 0]

    offline_stream = recognizer.create_stream()
    offline_stream.accept_waveform(rate, first_channel)
    recognizer.decode_stream(offline_stream)
    return offline_stream.result


In [2]:
# Build the recognizer once with the CUDA provider; later cells reuse it.
recognizer_gpu = create_recognizer(provider="cuda")

In [3]:

# Decode the demo clip with the recognizer created with provider 'cuda',
# then print the whole result followed by its individual fields.
wav_path = "./demo4.mp3"  # plain literal: the f-prefix had no placeholders
result = decode_file(recognizer_gpu, wav_path)
print(result)
print(result.text)
print(result.timestamps)
print(result.tokens)


{"lang": "<|zh|>", "emotion": "<|NEUTRAL|>", "event": "<|Speech|>", "text": "财联社1月7日电港股午间收盘，恒生指数跌1.01%，恒生科技指数跌1.65%，大型科技股疲软，阿里巴巴跌于4%，美团网易跌超2%。", "timestamps": [0.36, 0.54, 0.72, 1.08, 1.26, 1.44, 1.62, 1.92, 2.94, 3.12, 3.48, 3.66, 3.96, 4.08, 4.26, 4.50, 4.68, 4.92, 5.10, 5.46, 6.54, 6.72, 7.02, 7.20, 7.32, 7.44, 8.04, 8.22, 8.46, 8.58, 8.76, 8.94, 9.18, 9.96, 10.14, 10.26, 10.44, 10.50, 10.68, 11.10, 11.22, 11.52, 11.70, 11.88, 12.24, 12.36, 12.66, 13.14, 13.32, 13.44, 13.62, 13.92, 14.10, 14.88, 15.06, 15.24, 15.78, 15.90, 16.68, 16.80, 17.34, 17.58, 18.48, 18.60, 18.84], "durations": [], "tokens":["财", "联", "社", "1", "月", "7", "日", "电", "港", "股", "午", "间", "收", "盘", "，", "恒", "生", "指", "数", "跌", "1", ".", "0", "1", "%", "，", "恒", "生", "科", "技", "指", "数", "跌", "1", ".", "6", "5", "%", "，", "大", "型", "科", "技", "股", "疲", "软", "，", "阿", "里", "巴", "巴", "跌", "于", "4", "%", "，", "美", "团", "网", "易", "跌", "超", "2", "%", "。"], "ys_log_probs": [], "words": []}
财联社1月7日电港股午间收盘，恒生指数跌1.01%，恒生科技指数

In [4]:
# Optional CPU comparison (disabled): uncomment to build a CPU-backed recognizer.
# recognizer_cpu = create_recognizer('cpu')


In [5]:
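# Optional CPU comparison (disabled): requires `recognizer_cpu` from the cell above.
# NOTE: this points at ../demo.mp3, not the ./demo4.mp3 used in the GPU cell.
# wav_path = f"../demo.mp3"
# result = decode_file(recognizer_cpu, wav_path)
# print(result)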