In [1]:


import argparse
import sys
from pathlib import Path
import token

import soundfile as sf
import sherpa_onnx


def create_recognizer(
    provider="cpu",
    model_dir="./Paraformer/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
) -> sherpa_onnx.OfflineRecognizer:
    """Build an offline Paraformer recognizer from a local model directory.

    Args:
        provider: Execution provider name forwarded to sherpa-onnx
            (this notebook calls it with "cpu" and "cuda").
        model_dir: Directory expected to contain ``model.onnx`` and
            ``tokens.txt``. Defaults to the path this notebook originally
            hard-coded; a relative path is resolved against the current
            working directory.

    Returns:
        The object returned by ``sherpa_onnx.OfflineRecognizer.from_paraformer``.

    Raises:
        FileNotFoundError: If ``model.onnx`` or ``tokens.txt`` is not a file
            inside ``model_dir``.
    """
    model_dir = Path(model_dir)
    model = model_dir / "model.onnx"
    tokens = model_dir / "tokens.txt"

    # Check the files here so a wrong path is reported as a Python exception
    # that names the missing file, before the paths reach the native loader.
    missing = [path.as_posix() for path in (model, tokens) if not path.is_file()]
    if missing:
        raise FileNotFoundError(
            "Paraformer model file(s) not found: " + ", ".join(missing)
        )

    # NOTE(review): debug=True and num_threads=4 are kept from the original
    # cell; confirm whether debug output is still wanted.
    return sherpa_onnx.OfflineRecognizer.from_paraformer(
        paraformer=model.as_posix(),
        tokens=tokens.as_posix(),
        debug=True,
        num_threads=4,
        provider=provider,
    )

def decode_file(
    recognizer: sherpa_onnx.OfflineRecognizer,
    filename: str,
):
    """Run offline recognition on one audio file and return the stream result.

    Args:
        recognizer: Recognizer used to create and decode the stream.
        filename: Path of the audio file, read with ``soundfile``.

    Returns:
        The ``result`` attribute of the decoded stream.
    """
    # always_2d=True yields a 2-D array even for mono input, so the first
    # column can be selected unconditionally.
    samples, rate = sf.read(filename, dtype="float32", always_2d=True)
    first_channel = samples[:, 0]

    offline_stream = recognizer.create_stream()
    offline_stream.accept_waveform(rate, first_channel)
    recognizer.decode_stream(offline_stream)
    return offline_stream.result


In [2]:
# Recognizer on the CUDA execution provider; the decoding cell below reuses it.
recognizer_gpu = create_recognizer(provider="cuda")

In [6]:

# Decode the demo clip with the CUDA recognizer and dump the result fields.
# The path is a plain literal: the original f-string had no placeholders.
wav_path = "./demo.mp3"
result = decode_file(recognizer_gpu, wav_path)
print(result)
for field in ("text", "timestamps", "tokens"):
    print(getattr(result, field))


{"lang": "", "emotion": "", "event": "", "text": "财联社一月七日电从国家邮政局了解到十四五以来我国建成世界上规模最大受益人数最多的寄 d 网络邮政快递业业务年均增长百分之十", "timestamps": [0.32, 0.50, 0.60, 0.86, 0.96, 1.16, 1.32, 1.50, 2.40, 2.78, 2.94, 3.08, 3.24, 3.36, 3.62, 3.72, 3.82, 4.46, 4.66, 4.84, 5.02, 5.14, 5.48, 5.68, 5.94, 6.10, 6.32, 6.46, 6.58, 6.78, 6.88, 7.06, 7.22, 7.54, 7.70, 7.90, 8.06, 8.34, 8.48, 8.66, 9.32, 9.62, 9.94, 10.16, 11.74, 12.00, 12.40, 12.66, 12.84, 13.46, 13.74, 14.18, 14.52, 14.92, 15.12, 15.48, 15.60, 15.78, 15.90], "durations": [], "tokens":["财", "联", "社", "一", "月", "七", "日", "电", "从", "国", "家", "邮", "政", "局", "了", "解", "到", "十", "四", "五", "以", "来", "我", "国", "建", "成", "世", "界", "上", "规", "模", "最", "大", "受", "益", "人", "数", "最", "多", "的", "寄", "d", "网", "络", "邮", "政", "快", "递", "业", "业", "务", "年", "均", "增", "长", "百", "分", "之", "十"], "ys_log_probs": [], "words": []}
财联社一月七日电从国家邮政局了解到十四五以来我国建成世界上规模最大受益人数最多的寄 d 网络邮政快递业业务年均增长百分之十
[0.3199999928474426, 0.5, 0.5999999642372131, 0.85999995470047, 0.9599999785423279

In [7]:
# Recognizer on the CPU execution provider; the decoding cell below reuses it.
recognizer_cpu = create_recognizer(provider="cpu")

In [10]:
# Decode the same demo clip with the CPU recognizer and dump the result fields.
# The path is a plain literal: the original f-string had no placeholders.
wav_path = "./demo.mp3"
result = decode_file(recognizer_cpu, wav_path)
print(result)
for field in ("text", "timestamps", "tokens"):
    print(getattr(result, field))

{"lang": "", "emotion": "", "event": "", "text": "财联社一月七日电从国家邮政局了解到十四五以来我国建成世界上规模最大受益人数最多的寄 d 网络邮政快递业业务年均增长百分之十", "timestamps": [0.32, 0.50, 0.60, 0.86, 0.96, 1.16, 1.32, 1.50, 2.40, 2.78, 2.94, 3.08, 3.24, 3.36, 3.62, 3.72, 3.82, 4.46, 4.66, 4.84, 5.02, 5.14, 5.48, 5.68, 5.94, 6.10, 6.32, 6.46, 6.58, 6.78, 6.88, 7.06, 7.22, 7.54, 7.70, 7.90, 8.06, 8.34, 8.48, 8.66, 9.32, 9.62, 9.94, 10.16, 11.74, 12.00, 12.40, 12.66, 12.84, 13.46, 13.74, 14.18, 14.52, 14.92, 15.12, 15.48, 15.60, 15.78, 15.90], "durations": [], "tokens":["财", "联", "社", "一", "月", "七", "日", "电", "从", "国", "家", "邮", "政", "局", "了", "解", "到", "十", "四", "五", "以", "来", "我", "国", "建", "成", "世", "界", "上", "规", "模", "最", "大", "受", "益", "人", "数", "最", "多", "的", "寄", "d", "网", "络", "邮", "政", "快", "递", "业", "业", "务", "年", "均", "增", "长", "百", "分", "之", "十"], "ys_log_probs": [], "words": []}
财联社一月七日电从国家邮政局了解到十四五以来我国建成世界上规模最大受益人数最多的寄 d 网络邮政快递业业务年均增长百分之十
[0.3199999928474426, 0.5, 0.5999999642372131, 0.85999995470047, 0.9599999785423279