In [1]:
from paddlespeech.server.engine.asr.python.asr_engine import ASREngine, PaddleASRConnectionHandler

from paddlespeech.cli.asr.infer import ASRExecutor
import pyaudio, wave
import numpy as np
import speech_recognition as sr
import io
import time

import warnings
warnings.filterwarnings("ignore")



In [2]:
import threading
import queue

class mic_thread(threading.Thread):
    threads = []
    id = 0
    thread_lock = threading.Lock()
    mic_queue = queue.Queue(10)
    
    def __init__(self, name, source):
        threading.Thread.__init__(self)
        self.name = name
        self.source = source
        self.stop = False
        
        self.id = mic_thread.id
        mic_thread.id = mic_thread.id+1
        
        mic_thread.threads.append(self)
        
    def run(self):
        with mic as source:
            # r.energy_threshold = 8000 # This filters noise
            r.dynamic_energy_threshold = False
            r.adjust_for_ambient_noise(source,2)
            
            r.pause_threshold = 1.5
            print("Start talk..")
            while not self.stop:
                print("Listening..")
                try:
                    audio = r.listen(source,timeout=10) # no talk within 2 sec, re-loop
                except sr.exceptions.WaitTimeoutError:
                    continue
                        
                print("Done lisening")
                wav_data = audio.get_wav_data(convert_rate = 16000)
                
                mic_thread.thread_lock.acquire()
                mic_thread.mic_queue.put(io.BytesIO(wav_data))
                mic_thread.thread_lock.release()
                
    def terminate(self):
        self.stop = True
        
                

In [9]:
class crASR:

    def __init__(self):
        # 创建对象参数
        class _conf:
            def __init__(self, **kwargs):
                for key in kwargs:
                    setattr(self, key, kwargs[key])
        # 初始化ASR
        self.asr = ASREngine()
        # 以指定参数方式生成对象并初始化ASR
        self.asr.init(_conf(
            model='deepspeech2online_wenetspeech',
            lang='zh', # zh_en en zh
            sample_rate=16000,
            cfg_path=None,  # 自定义配置文件路径(可选)
            ckpt_path=None,  # 自定义模型文件路径(可选)
            decode_method='attention_rescoring',
            force_yes=True,
            device=None))
        self.asr_handle = PaddleASRConnectionHandler(self.asr)

    def predict(self, wavData):
        """
        ASR预测
        :param wavData: 需要预测的音频，会根据传入类型自动识别：路径(str 相对|绝对)、音频流、内存BytesIO对象
        :return:
        """
        if type(wavData) == str:
            wavData = filesystem_get_contents(wavData, 'rb')
        elif type(wavData) == io.BytesIO:
            wavData = wavData.getvalue()

        start = time.time()
        self.asr_handle.run(wavData)
        text = self.asr_handle.output
        print("ASR预测消耗时间：%dms, 识别结果: %s" % (round((time.time() - start) * 1000), text))
        return text

In [10]:
asr = crASR()
r = sr.Recognizer()
mic = sr.Microphone(device_index=1)

100%|█████████████████████████████████████████████████████████████████████████████| 2.62G/2.62G [12:59<00:00, 3.36MB/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2.95G/2.95G [04:41<00:00, 10.5MB/s]
[2023-10-09 13:25:16,197] [    INFO] - Initialize ASR server engine successfully on device: gpu:0.


In [11]:
sr.Microphone.list_microphone_names()

['Microsoft Sound Mapper - Input',
 'Microphone (JOUNIVO JV601)',
 '麦克风 (Dubbing Virtual Device)',
 'Headset Microphone (Oculus Virt',
 'Microphone (WEB CAM)',
 'Microsoft Sound Mapper - Output',
 '扬声器 (Realtek High Definition Au',
 'Realtek Digital Output (Realtek',
 '扬声器 (Dubbing Virtual Device)',
 'R240HY (NVIDIA High Definition ',
 '鑰虫満 (Oculus Virtual Audio Device',
 '主声音捕获驱动程序',
 'Microphone (JOUNIVO JV601)',
 '麦克风 (Dubbing Virtual Device)',
 'Headset Microphone (Oculus Virtual Audio Device)',
 'Microphone (WEB CAM)',
 '主声音驱动程序',
 '扬声器 (Realtek High Definition Audio)',
 'Realtek Digital Output (Realtek High Definition Audio)',
 '扬声器 (Dubbing Virtual Device)',
 'R240HY (NVIDIA High Definition Audio)',
 '鑰虫満 (Oculus Virtual Audio Device)',
 'Realtek Digital Output (Realtek High Definition Audio)',
 '扬声器 (Realtek High Definition Audio)',
 '扬声器 (Dubbing Virtual Device)',
 'R240HY (NVIDIA High Definition Audio)',
 '鑰虫満 (Oculus Virtual Audio Device)',
 '麦克风 (Dubbing Virtual Device)',
 

In [12]:
microphone = mic_thread("local mic",mic)

In [13]:

microphone.start()

flag = True
while flag:
    if mic_thread.mic_queue.empty():
        time.sleep(0.1)
        continue
        
        
    mic_thread.thread_lock.acquire()
    msg = mic_thread.mic_queue.get(0)
    mic_thread.thread_lock.release()

        
    replyTxt = asr.predict(msg)
        
    # send to LLM afterward
    # 音纹识别
        
        
    print(replyTxt)
        
    if (replyTxt == "停下"):
        flag = False
    
    

Start talk..
Listening..
Done lisening
Listening..


[2023-10-09 13:27:18,491] [    INFO] - name 'Scorer' is not defined


[32m2023-10-09 13:27:18.490[0m | [1mINFO    [0m | [36mpaddlespeech.s2t.modules.ctc[0m:[36m_init_ext_scorer[0m:[36m190[0m - [1mbegin to initialize the external scorer for decoding[0m


AttributeError: 'tuple' object has no attribute 'tb_frame'

Done lisening
Listening..
Listening..
Listening..
Done lisening
Listening..
Listening..
Listening..
Listening..
Listening..
Listening..
Done lisening
Listening..
Done lisening
Listening..
Done lisening
Listening..
Listening..
Done lisening
Listening..
Done lisening
Listening..
Done lisening
Listening..
Done lisening
Listening..
Done lisening
Listening..
Done lisening


In [7]:
microphone.start()

In [21]:
microphone.terminate()

In [22]:
mic_thread.mic_queue.qsize()

1

In [29]:
mic_thread.threads

[<mic_thread(local mic, stopped 7024)>, <mic_thread(local mic, stopped 12264)>]

In [16]:
with mic as source:
    r.adjust_for_ambient_noise(source) # This filters noise
    r.pause_threshold = 1
    print("Start talk..")
    while True:
        print("Listening..")
        audio = r.listen(source)
        
    

Start talk..


In [17]:
wav_data = audio.get_wav_data(convert_rate = 16000)
io_wav = io.BytesIO(wav_data)

In [14]:
audio = pyaudio.PyAudio()

# 查找虚拟麦克风的设备索引
# 使用VB-Audio Virtual Cable

mic_index = None
for i in range(audio.get_device_count()):
    print(audio.get_device_info_by_index(i))

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone (JOUNIVO JV601)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': '麦克风 (Dubbing Virtual Device)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Headset Microphone (Oculus Virt', 'hostApi': 0, 'maxInputChannels': 1,

In [30]:


with open("audio_file.wav", "wb") as file:
    file.write(wav_data)

In [13]:
asr = ASREngine()
asr.init({'model':'conformer_wenetspeech',
            'lang':'zh',
            'sample_rate':16000,
            'cfg_path':None,  # 自定义配置文件路径(可选)
            'ckpt_path':None,  # 自定义模型文件路径(可选)
            'decode_method':'attention_rescoring',
            'force_yes':True,
            'device':'cuda'})

[2023-09-30 10:39:44,235] [   ERROR] - Set device failed, please check if device is already used and the parameter 'device' in the yaml file
[2023-09-30 10:39:44,236] [   ERROR] - 'dict' object has no attribute 'device'


False