In [2]:
%pip install vosk 


Collecting vosk
  Downloading vosk-0.3.45-py3-none-win_amd64.whl.metadata (1.8 kB)
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting websockets (from vosk)
  Downloading websockets-15.0.1-cp311-cp311-win_amd64.whl.metadata (7.0 kB)
Downloading vosk-0.3.45-py3-none-win_amd64.whl (14.0 MB)
   ---------------------------------------- 0.0/14.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/14.0 MB 1.7 MB/s eta 0:00:09
    --------------------------------------- 0.2/14.0 MB 2.5 MB/s eta 0:00:06
   - -------------------------------------- 0.4/14.0 MB 3.0 MB/s eta 0:00:05
   - -------------------------------------- 0.5/14.0 MB 3.1 MB/s eta 0:00:05
   - -------------------------------------- 0.5/14.0 MB 2.2 MB/s eta 0:00:07
   - -------------------------------------- 0.5/14.0 MB 2.3 MB/s eta 0:00:06
   -- ------------------------------------- 1.0/14


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import speech_recognition as sr

def main():
    """Transcribe microphone speech in a loop until interrupted.

    Calibrates the recognizer against ambient noise once, then repeatedly
    records one phrase from the microphone and sends it to
    ``recognize_google``. Unintelligible audio and API request failures are
    reported and the loop carries on; ``KeyboardInterrupt`` ends it.
    """
    rec = sr.Recognizer()
    microphone = sr.Microphone()

    print("Adjusting for ambient noise... Please wait.")
    with microphone as calibration_source:
        rec.adjust_for_ambient_noise(calibration_source)
        print("Listening! Say something...")

    def capture_phrase():
        # The microphone is re-entered for every phrase, so it is only held
        # open while a phrase is actually being recorded.
        with microphone as live_source:
            print("\nListening...")
            return rec.listen(live_source)

    try:
        while True:
            utterance = capture_phrase()
            try:
                transcript = rec.recognize_google(utterance)
            except sr.UnknownValueError:
                print("Sorry, I couldn't understand that.")
            except sr.RequestError as err:
                print(f"API error: {err}")
            else:
                print("You said:", transcript)
    except KeyboardInterrupt:
        print("\nExiting...")

if __name__ == "__main__":
    main()


Adjusting for ambient noise... Please wait.
Listening! Say something...

Listening...
You said: I'm going to try

Listening...
You said: do a keyboard thing

Listening...
You said: which key is

Listening...
You said: is it enter no

Listening...
You said: Escape

Listening...
Sorry, I couldn't understand that.

Listening...
You said: you're still going

Listening...
You said: just going to press every oh oh is collapse okay

Listening...
You said: slave work

Listening...
Sorry, I couldn't understand that.

Listening...
You said: not bad

Listening...


: 

# Using offline models

## Vosk

In [3]:
import sys
import json
import queue
import pyaudio
from vosk import Model, KaldiRecognizer

# Path handed to vosk.Model.
# NOTE(review): presumably the unpacked model directory, resolved relative to
# the notebook's working directory — confirm.
MODEL_PATH = "vosk-model-small-en-us-0.15"

def main():
    """Stream microphone audio through an offline Vosk recognizer.

    Audio is captured by a PyAudio callback stream, handed to the main loop
    through a queue, and fed to ``KaldiRecognizer``. Every time the
    recognizer finalizes a phrase with non-empty text, it is printed.
    ``KeyboardInterrupt`` stops the loop; whatever the recognizer was still
    decoding at that point is flushed and printed before exiting.

    The PyAudio instance and the stream are always released, including when
    opening or starting the stream fails.
    """
    # Single source of truth: the recognizer and the capture stream must
    # agree on the sample rate.
    sample_rate = 16000
    frames_per_buffer = 8000  # half a second of audio per callback at 16 kHz

    # Load model
    model = Model(MODEL_PATH)
    recognizer = KaldiRecognizer(model, sample_rate)
    audio_queue = queue.Queue()

    def callback(in_data, frame_count, time_info, status):
        # Runs on PyAudio's callback thread: only enqueue the raw bytes and
        # let the main loop do the (slow) recognition.
        audio_queue.put(in_data)
        return (None, pyaudio.paContinue)

    # Setup PyAudio
    p = pyaudio.PyAudio()
    stream = None
    try:
        # Opened inside the try block so a failure here still reaches the
        # cleanup below instead of leaking the PyAudio instance.
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=frames_per_buffer,
                        stream_callback=callback)
        stream.start_stream()

        print("Listening (offline)... Press Ctrl+C to stop.")

        while True:
            data = audio_queue.get()
            if recognizer.AcceptWaveform(data):
                result = json.loads(recognizer.Result())
                if result.get("text"):
                    print("You said:", result["text"])
            # To show partial results while a phrase is still in progress:
            # else:
            #     partial = json.loads(recognizer.PartialResult())
            #     print("Partial:", partial["partial"])
    except KeyboardInterrupt:
        # Flush the phrase that was still being decoded so the last words
        # spoken before the interrupt are not silently dropped.
        final = json.loads(recognizer.FinalResult())
        if final.get("text"):
            print("You said:", final["text"])
        print("\nExiting...")
    finally:
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

if __name__ == "__main__":
    main()


Listening (offline)... Press Ctrl+C to stop.
You said: oh okay
You said: half assed are you can just keep speaking and you'll just start picking it up or do i have to stop speaking as in
You said: half past
You said: slave good a slave work
You said: this is not good this is not a good model
You said: oh look it's a pick that up transferring were if i go back over here
You said: yeah this is not good
You said: oh okay i actually want a bigger
You said: oh
You said: okay
You said: this can be stopped
You said: oh


: 

## Whisper (OpenAI model, via faster-whisper)

In [1]:
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel
import queue

# Set up model: "base", "small", "medium", "large-v2"
model_size = "small"
# NOTE(review): float16 presumably requires a backend that supports it
# (typically a GPU) — confirm for the target machine.
model = WhisperModel(model_size, compute_type="float16")  # Use "int8" or "float32" if needed

samplerate = 16000
blocksize = 4000
# Run recognition every ~5 seconds of captured audio.
window_samples = samplerate * 5
audio_queue = queue.Queue()

# Callback to collect audio blocks
def callback(indata, frames, time, status):
    """Enqueue a copy of each captured block for the main loop to consume."""
    # Copy before queueing: the main loop reads the block later, after this
    # callback has returned.
    audio_queue.put(indata.copy())

# Create the audio stream; it is started inside the try block below so that
# it is always stopped and closed, whatever ends the loop.
stream = sd.InputStream(samplerate=samplerate, channels=1, callback=callback, blocksize=blocksize)

try:
    stream.start()
    print("Listening with Whisper... (Ctrl+C to stop)")

    buffer = np.empty((0,), dtype=np.float32)

    while True:
        # Each queued block is 2-D (one column per channel); flatten to 1-D.
        block = audio_queue.get().flatten()
        buffer = np.concatenate((buffer, block))

        if len(buffer) >= window_samples:
            # Transcribe one fixed-size window; keep the remainder for the next.
            segment = buffer[:window_samples]
            buffer = buffer[window_samples:]

            segments, _ = model.transcribe(segment, language="en")
            for seg in segments:
                print("You said:", seg.text.strip())

except KeyboardInterrupt:
    print("\nExiting...")
finally:
    # Previously the stream was only stopped on Ctrl+C and never closed, so
    # any other error left the audio stream open.
    stream.stop()
    stream.close()


  from .autonotebook import tqdm as notebook_tqdm


: 

In [2]:
# Substring check: the start keyword is detected even when fused to the previous word.
"slave" in "Okay, so I'm talkingslave"

True

In [5]:
# Case-insensitive substring check for the end keyword ("wOrk" matches after lowercasing).
"work" in "Okay, so I'm twOrkalkingslave".lower()

True

In [14]:
def find_command_span(text, start_word='slave', end_word='work'):
    """Locate the last ``start_word`` and the first ``end_word`` after it.

    Matching is case-insensitive. The end keyword is searched for only
    after the start keyword, so an ``end_word`` that occurs earlier in the
    text cannot produce an inverted (end-before-start) span.

    Returns:
        ``(start, end)`` where ``start`` is the index of the last
        ``start_word`` and ``end`` is the index of the first ``end_word``
        following it. ``end`` is -1 when no such ``end_word`` exists;
        both are -1 when ``start_word`` is missing.
    """
    lowered = text.lower()
    start = lowered.rfind(start_word)
    if start == -1:
        return -1, -1
    end = lowered.find(end_word, start + len(start_word))
    return start, end


txt = '''Okay, so I'm talkingslave something slave else and then I add work with extra words to add error testing'''

start, end = find_command_span(txt)

In [15]:
# Text between the start keyword (skipped by its own length) and the end keyword.
txt[start + len('slave'):end]

' else and then I add '