In [1]:


import argparse
import sys
from pathlib import Path

import soundfile as sf
import sherpa_onnx




def create_recognizer(
    provider="cpu",
    model_dir="./FireRed/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16",
    itn_fst="./ITN/itn_zh_number.fst",
    num_threads=4,
    debug=True,
) -> sherpa_onnx.OfflineRecognizer:
    """Build an offline FireRed ASR recognizer with an ITN rule FST.

    Args:
        provider: Execution provider name forwarded to sherpa-onnx
            (this notebook uses "cpu" and "cuda").
        model_dir: Directory holding ``encoder.int8.onnx``,
            ``decoder.int8.onnx`` and ``tokens.txt`` (str or Path).
        itn_fst: Path of the rule FST passed as ``rule_fsts`` (str or Path).
        num_threads: Thread count forwarded to the recognizer.
        debug: Debug flag forwarded to the recognizer.

    Returns:
        The recognizer created by
        ``sherpa_onnx.OfflineRecognizer.from_fire_red_asr``.

    Raises:
        FileNotFoundError: If any of the model files or the ITN FST is
            missing. The message lists every missing path.
    """
    model_dir = Path(model_dir)
    required = {
        "encoder": model_dir / "encoder.int8.onnx",
        "decoder": model_dir / "decoder.int8.onnx",
        "tokens": model_dir / "tokens.txt",
        "rule_fsts": Path(itn_fst),
    }

    # All paths are relative to the kernel's working directory by default, so
    # check them here and fail with a readable error instead of deferring the
    # problem to the native model loader.
    missing = [
        f"{name}: {path}" for name, path in required.items() if not path.is_file()
    ]
    if missing:
        raise FileNotFoundError(
            "Required model file(s) not found -> " + "; ".join(missing)
        )

    return sherpa_onnx.OfflineRecognizer.from_fire_red_asr(
        encoder=required["encoder"].as_posix(),
        decoder=required["decoder"].as_posix(),
        tokens=required["tokens"].as_posix(),
        debug=debug,
        num_threads=num_threads,
        provider=provider,
        rule_fsts=required["rule_fsts"].as_posix(),
    )

def decode_file(
    recognizer: sherpa_onnx.OfflineRecognizer,
    filename: str,
):
    """Run offline recognition on one audio file and return the stream result.

    The file is read with soundfile as float32 in 2-D form, and only the
    first channel (column 0) is passed to the recognizer.

    Args:
        recognizer: Recognizer used to create and decode the stream.
        filename: Path of the audio file to read.

    Returns:
        The ``result`` attribute of the decoded stream.
    """
    samples, rate = sf.read(filename, dtype="float32", always_2d=True)
    # always_2d=True gives a 2-D array even for mono input, so selecting
    # column 0 works for any channel count.
    first_channel = samples[:, 0]

    asr_stream = recognizer.create_stream()
    asr_stream.accept_waveform(rate, first_channel)
    recognizer.decode_stream(asr_stream)
    return asr_stream.result


In [2]:
# Build the recognizer once, requesting the "cuda" execution provider; later
# cells reuse this object for decoding.
recognizer_gpu = create_recognizer(provider="cuda")

In [3]:

# Decode the demo clip, then show the full result followed by its text,
# timestamps and tokens, one per line.
wav_path = "./demo4.mp3"
result = decode_file(recognizer_gpu, wav_path)
print(result, result.text, result.timestamps, result.tokens, sep="\n")


{"lang": "", "emotion": "", "event": "", "text": "财联社1月7日电港股午间收盘恒生指数跌百分之1点01恒生科技指数跌百分之1点65大型科技股疲软阿里巴巴跌逾百分之4美团网易跌超百分之2", "timestamps": [], "durations": [], "tokens":["财", "联", "社", "一", "月", "七", "日", "电", "港", "股", "午", "间", "收", "盘", "恒", "生", "指", "数", "跌", "百", "分", "之", "一", "点", "零", "一", "恒", "生", "科", "技", "指", "数", "跌", "百", "分", "之", "一", "点", "六", "五", "大", "型", "科", "技", "股", "疲", "软", "阿", "里", "巴", "巴", "跌", "逾", "百", "分", "之", "四", "美", "团", "网", "易", "跌", "超", "百", "分", "之", "二"], "ys_log_probs": [], "words": []}
财联社1月7日电港股午间收盘恒生指数跌百分之1点01恒生科技指数跌百分之1点65大型科技股疲软阿里巴巴跌逾百分之4美团网易跌超百分之2
[]
['财', '联', '社', '一', '月', '七', '日', '电', '港', '股', '午', '间', '收', '盘', '恒', '生', '指', '数', '跌', '百', '分', '之', '一', '点', '零', '一', '恒', '生', '科', '技', '指', '数', '跌', '百', '分', '之', '一', '点', '六', '五', '大', '型', '科', '技', '股', '疲', '软', '阿', '里', '巴', '巴', '跌', '逾', '百', '分', '之', '四', '美', '团', '网', '易', '跌', '超', '百', '分', '之', '二']


In [4]:
# recognizer_cpu = create_recognizer('cpu')


In [5]:
# wav_path = f"../demo.mp3"
# result = decode_file(recognizer_cpu, wav_path)
# print(result)