# Playground

----

## Wake Word Detection testing

In [1]:
from transformers import pipeline
import torch
# Pick the best available accelerator. Fall back to the CPU so the notebook
# still runs on machines that have neither CUDA nor Apple-silicon MPS
# (previously "mps" was selected unconditionally whenever CUDA was missing).
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


2025-02-12 17:12:48.093639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739376768.132483    8611 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739376768.143608    8611 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-12 17:12:48.235027: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Build an audio-classification pipeline from the
# `MIT/ast-finetuned-speech-commands-v2` checkpoint on the selected device.
classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)
# Display the class-id -> label mapping; these labels are the candidate wake words.
classifier.model.config.id2label

Device set to use mps


{0: 'backward',
 1: 'follow',
 2: 'five',
 3: 'bed',
 4: 'zero',
 5: 'on',
 6: 'learn',
 7: 'two',
 8: 'house',
 9: 'tree',
 10: 'dog',
 11: 'stop',
 12: 'seven',
 13: 'eight',
 14: 'down',
 15: 'six',
 16: 'forward',
 17: 'cat',
 18: 'right',
 19: 'visual',
 20: 'four',
 21: 'wow',
 22: 'no',
 23: 'nine',
 24: 'off',
 25: 'three',
 26: 'left',
 27: 'marvin',
 28: 'yes',
 29: 'up',
 30: 'sheila',
 31: 'happy',
 32: 'bird',
 33: 'go',
 34: 'one'}

In [3]:
# Spot-check a single class id: in the mapping displayed by the previous cell,
# id 27 corresponds to the label 'marvin'.
classifier.model.config.id2label[27]

'marvin'

In [4]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live


def launch_fn(
    wake_word="marvin",
    prob_threshold=0.8,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    """Listen on the microphone until ``wake_word`` is detected.

    Streams live microphone audio through the module-level ``classifier``
    pipeline and inspects the first (top) prediction of every result it
    yields.

    Args:
        wake_word: Label to wait for. Must be one of the keys of
            ``classifier.model.config.label2id``.
        prob_threshold: The top prediction's score must be strictly greater
            than this value for the wake word to count as detected.
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live`` as
            ``chunk_length_s``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live`` as
            ``stream_chunk_s``.
        debug: When true, print the top prediction of every result.

    Returns:
        ``True`` as soon as the top prediction is ``wake_word`` with a score
        above ``prob_threshold``; ``False`` if the prediction stream ends
        without a detection.

    Raises:
        ValueError: If ``wake_word`` is not a label known to the classifier.
    """
    label2id = classifier.model.config.label2id
    if wake_word not in label2id:
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {label2id.keys()}."
        )

    # Record at the rate the model's feature extractor expects.
    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for predictions in classifier(mic):
        # Only the first entry of each result is considered.
        top_prediction = predictions[0]
        if debug:
            print(top_prediction)
        if (
            top_prediction["label"] == wake_word
            and top_prediction["score"] > prob_threshold
        ):
            return True

    # The stream was exhausted without hearing the wake word; return an
    # explicit boolean instead of falling through with an implicit None.
    return False

In [5]:
# Listen for the default wake word ("marvin"); debug=True prints the top
# prediction for every result the classifier yields.
launch_fn(debug=True)

Listening for wake word...


  waveform = torch.from_numpy(waveform).unsqueeze(0)


{'score': 0.053641922771930695, 'label': 'one'}
{'score': 0.30270814895629883, 'label': 'nine'}
{'score': 0.39697304368019104, 'label': 'learn'}
{'score': 0.4017743468284607, 'label': 'learn'}
{'score': 0.4017743468284607, 'label': 'learn'}
{'score': 0.4017743468284607, 'label': 'learn'}
{'score': 0.6992917060852051, 'label': 'one'}
{'score': 0.9686111211776733, 'label': 'one'}
{'score': 0.9663316607475281, 'label': 'one'}
{'score': 0.9663316607475281, 'label': 'one'}
{'score': 0.9663316607475281, 'label': 'one'}
{'score': 0.9663316607475281, 'label': 'one'}
{'score': 0.8052412271499634, 'label': 'sheila'}
{'score': 0.22134172916412354, 'label': 'stop'}
{'score': 0.22134172916412354, 'label': 'stop'}
{'score': 0.22134172916412354, 'label': 'stop'}
{'score': 0.22134172916412354, 'label': 'stop'}
{'score': 0.9599952101707458, 'label': 'marvin'}


True