In [6]:
import pvporcupine
import pyaudio
import numpy as np
import whisper
import torch
from time import time

import os  # imported here so this setup block stays runnable on its own

# Load the Silero VAD model from torch hub. `utils` is a tuple of helper functions;
# only its first entry (get_speech_timestamps) is used in this notebook. Indexing
# avoids assigning to `_`, which IPython uses for the last cell output.
vad_model, utils = torch.hub.load('snakers4/silero-vad', model='silero_vad', trust_repo=True)
get_speech_timestamps = utils[0]

# Read the Picovoice access key from the environment rather than committing it to the
# notebook. Raises KeyError (naming the variable) when it is not set.
ACCESS_KEY = os.environ["PICOVOICE_ACCESS_KEY"]

# Path to the custom wake-word model. A raw string keeps the Windows backslashes
# literal (the original relied on "\D" and "\h" not being recognised escapes).
# Can be overridden with the PORCUPINE_KEYWORD_PATH environment variable.
KEYWORD_PATH = os.environ.get(
    "PORCUPINE_KEYWORD_PATH",
    r"D:\Downloads\hey-assistant_en_windows_v3_0_0.ppn",
)

porcupine = pvporcupine.create(
    access_key=ACCESS_KEY,
    keyword_paths=[KEYWORD_PATH]
)

# Open a mono 16-bit input stream whose rate and buffer size match what Porcupine reports.
pa = pyaudio.PyAudio()
stream = pa.open(
    rate=porcupine.sample_rate,
    channels=1,
    format=pyaudio.paInt16,
    input=True,
    frames_per_buffer=porcupine.frame_length
)

# Whisper "base" model used to transcribe the recorded command.
whisper_model = whisper.load_model("base")

def vad_detect(audio_np):
    """Run the VAD model over a NumPy waveform and return the detected speech timestamps.

    Args:
        audio_np: NumPy array of audio samples; it is wrapped as a tensor with
            ``torch.from_numpy`` before being passed to the VAD helper.

    Returns:
        The result of ``get_speech_timestamps`` for the waveform, evaluated at
        ``porcupine.sample_rate``.
    """
    waveform = torch.from_numpy(audio_np)
    return get_speech_timestamps(
        waveform,
        vad_model,
        sampling_rate=porcupine.sample_rate,
    )

def normalize_audio(audio):
    """Peak-normalize ``audio`` so that its largest absolute sample becomes 1.

    Args:
        audio: Array of audio samples.

    Returns:
        ``audio`` divided by its peak absolute value when that peak is positive.
        Empty or all-zero input is returned unchanged.
    """
    # np.max raises ValueError on a zero-size array, so return empty input as-is.
    if np.size(audio) == 0:
        return audio
    peak = np.max(np.abs(audio))
    if peak > 0:
        return audio / peak
    # All-zero signal: nothing to scale, and dividing would produce NaNs.
    return audio

print("Listening for wake word...")

# End-of-command detection settings; they never change, so define them once.
max_silence_duration = 1.0   # 1 second of trailing silence ends the command
min_speech_duration = 0.5    # at least half a second of speech is required
max_command_duration = 10.0  # record for at most 10 seconds
vad_check_interval = 0.25    # seconds of new audio between VAD passes

sample_rate = porcupine.sample_rate
frame_length = porcupine.frame_length
# Re-running VAD over the whole recording on every single frame is quadratic in the
# recording length, so only run it once every few frames.
frames_per_vad_check = max(1, round(vad_check_interval * sample_rate / frame_length))

try:
    while True:
        # exception_on_overflow=False: frames dropped while Whisper/VAD were busy
        # should not abort the loop.
        pcm = stream.read(frame_length, exception_on_overflow=False)
        pcm_int16 = np.frombuffer(pcm, dtype=np.int16)

        # A non-negative result is treated as a wake-word detection.
        keyword_index = porcupine.process(pcm_int16)
        if keyword_index < 0:
            continue

        print("Wake word detected! Listening for command...")

        command_frames = []
        speech_time = 0.0
        start_time = time()

        while True:
            data = stream.read(frame_length, exception_on_overflow=False)
            command_frames.append(np.frombuffer(data, dtype=np.int16))

            timed_out = time() - start_time > max_command_duration
            if len(command_frames) % frames_per_vad_check != 0 and not timed_out:
                continue

            # int16 samples scaled by 1/32768 to float32.
            audio_np = np.concatenate(command_frames).astype(np.float32) / 32768.0

            if timed_out:
                break

            speech_timestamps = vad_detect(audio_np)
            if not speech_timestamps:
                # No speech anywhere in the recording yet; keep listening until the cap.
                continue

            # Timestamps are sample offsets into audio_np; convert to seconds.
            speech_time = sum(ts['end'] - ts['start'] for ts in speech_timestamps) / sample_rate

            # VAD runs on the whole recording, so its result stays non-empty once any
            # speech has been heard. Silence must therefore be measured as the gap
            # between the end of the last speech segment and the end of the buffer.
            trailing_silence = (len(audio_np) - speech_timestamps[-1]['end']) / sample_rate
            if trailing_silence > max_silence_duration and speech_time >= min_speech_duration:
                break

        command_audio = normalize_audio(audio_np)

        transcription = whisper_model.transcribe(command_audio, fp16=False, language="en")
        command = transcription["text"].strip()
        print(f"Command: {command}")

        # Spoken "turn off" exits the listening loop.
        if "turn off" in command.lower():
            print("Turning off the program...")
            break

        print("Listening for wake word...")

except KeyboardInterrupt:
    print("Stopping...")

finally:
    # Always release the audio device and the Porcupine handle.
    stream.stop_stream()
    stream.close()
    pa.terminate()
    porcupine.delete()


Using cache found in C:\Users\feres/.cache\torch\hub\snakers4_silero-vad_master


Listening for wake word...
Wake word detected! Listening for command...
Stopping...


In [5]:
import pvporcupine
import pyaudio
import numpy as np
import whisper
from time import time

import os  # imported here so this setup block stays runnable on its own

# Read the Picovoice access key from the environment rather than committing it to the
# notebook. Raises KeyError (naming the variable) when it is not set.
ACCESS_KEY = os.environ["PICOVOICE_ACCESS_KEY"]

porcupine = pvporcupine.create(
    access_key=ACCESS_KEY,
    keywords=["picovoice"]  # or substitute your preferred wake word
)

# Open a mono 16-bit input stream whose rate and buffer size match what Porcupine reports.
pa = pyaudio.PyAudio()
stream = pa.open(
    rate=porcupine.sample_rate,
    channels=1,
    format=pyaudio.paInt16,
    input=True,
    frames_per_buffer=porcupine.frame_length
)

# Whisper "base" model used to transcribe the recorded command.
whisper_model = whisper.load_model("base")

print("Listening for wake word...")

try:
    while True:
        raw_frame = stream.read(porcupine.frame_length)
        samples = np.frombuffer(raw_frame, dtype=np.int16)

        # Only a non-negative result counts as a wake-word detection.
        if porcupine.process(samples) < 0:
            continue

        print("Wake word detected! Say your command:")

        # Record frames for a fixed 5-second wall-clock window after detection.
        recorded = []
        started = time()
        while time() - started < 5:
            chunk = stream.read(porcupine.frame_length)
            recorded.append(np.frombuffer(chunk, dtype=np.int16))

        # Join the frames and scale int16 samples by 1/32768 to float32.
        command_audio = np.concatenate(recorded).astype(np.float32) / 32768.0

        transcription = whisper_model.transcribe(command_audio, fp16=False, language="en")
        command = transcription["text"].strip()
        print(f"Command: {command}")

        print("Listening for wake word...")

except KeyboardInterrupt:
    print("Stopping...")

finally:
    # Always release the audio device and the Porcupine handle.
    stream.stop_stream()
    stream.close()
    pa.terminate()
    porcupine.delete()


Listening for wake word...
Wake word detected! Say your command:
Command: Hi3 cats
Listening for wake word...
Wake word detected! Say your command:
Command: Hi, open the door.
Listening for wake word...
Wake word detected! Say your command:
Command: Open the door.
Listening for wake word...
Wake word detected! Say your command:
Command: Please open the door and turn off the air conditioner.
Listening for wake word...
Stopping...
