In [8]:
import sys
import os
import numpy as np
import sounddevice as sd
from sherpa_onnx import OnlineRecognizer

current_dir = os.getcwd() 
sys.path.append(os.path.join(current_dir, '../caption-engine'))

from sysaudio.win import AudioStream
from audioprcs import resampleRawChunk, mergeChunkChannels

In [10]:
devices = sd.query_devices()
if len(devices) == 0:
    print("No microphone devices found")
    sys.exit(0)

# print(devices)
default_input_device_idx = sd.default.device[0]
# print(f'Use default device: {devices[default_input_device_idx]["name"]}') # type: ignore

m_path = "D:/Projects/auto-caption/caption-engine/models/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10"
recognizer = OnlineRecognizer.from_transducer(
    tokens=f"{m_path}/tokens.txt",
    encoder=f"{m_path}/encoder-epoch-75-avg-11-chunk-16-left-128.int8.onnx",
    decoder=f"{m_path}/decoder-epoch-75-avg-11-chunk-16-left-128.onnx",
    joiner=f"{m_path}/joiner-epoch-75-avg-11-chunk-16-left-128.int8.onnx",
    num_threads=2,
    sample_rate=16000,
    feature_dim=80,
    enable_endpoint_detection=True,
    rule1_min_trailing_silence=2.4,
    rule2_min_trailing_silence=1.2,
    rule3_min_utterance_length=300,  # it essentially disables this rule
)

print("Started! Please speak")

# The model is using 16 kHz, we use 48 kHz here to demonstrate that
# sherpa-onnx will do resampling inside.
sample_rate = 48000
samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
last_result = ""
stream = recognizer.create_stream()
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
    while True:
        samples, _ = s.read(samples_per_read)  # a blocking read
        samples = samples.reshape(-1)
        stream.accept_waveform(sample_rate, samples)
        while recognizer.is_ready(stream):
            recognizer.decode_stream(stream)
        result = recognizer.get_result(stream)
        if last_result != result:
            last_result = result
            print("\r{}".format(result), end="", flush=True)

Started! Please speak
木のデップハートへいてかいものしましたNIさんはまいばんなにちを兄さんは毎朝七時に家を出かけます机の上にねこがいます测试

KeyboardInterrupt: 

In [None]:
# m_path = "D:/Projects/auto-caption/caption-engine/models/sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms-int8"
# recognizer = OnlineRecognizer.from_transducer(
#     tokens=f"{m_path}/tokens.txt",
#     encoder=f"{m_path}/encoder.int8.onnx",
#     decoder=f"{m_path}/decoder.int8.onnx",
#     joiner=f"{m_path}/joiner.int8.onnx",
#     enable_endpoint_detection=True,
# )

m_path = "D:/Projects/auto-caption/caption-engine/models/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10"
recognizer = OnlineRecognizer.from_transducer(
    tokens=f"{m_path}/tokens.txt",
    encoder=f"{m_path}/encoder-epoch-75-avg-11-chunk-16-left-128.int8.onnx",
    decoder=f"{m_path}/decoder-epoch-75-avg-11-chunk-16-left-128.onnx",
    joiner=f"{m_path}/joiner-epoch-75-avg-11-chunk-16-left-128.int8.onnx",
    num_threads=1,
    sample_rate=16000,
    feature_dim=80,
    enable_endpoint_detection=True,
    rule1_min_trailing_silence=2.4,
    rule2_min_trailing_silence=1.2,
    rule3_min_utterance_length=300,  # it essentially disables this rule
)

In [None]:
rec_stream = recognizer.create_stream()

stream = AudioStream(0, 1)
stream.printInfo()

stream.openStream()


for i in range(300):
    chunk = stream.read_chunk()
    chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)
    chunk_mono = np.frombuffer(chunk_mono, dtype=np.int16)
    chunk_mono = chunk_mono.astype(np.float32)
    print(i, chunk_mono.shape)
    # print(type(chunk_mono), chunk_mono.shape)
    rec_stream.accept_waveform(16000, chunk_mono)
    while recognizer.is_ready(rec_stream):
        recognizer.decode_stream(rec_stream)
    result = recognizer.get_result(rec_stream)
    if result:
        print(result)
    if recognizer.is_endpoint(rec_stream):
        recognizer.reset(rec_stream)
