# Playground

----

## Wake Word Detection testing

In [2]:
from transformers import pipeline
import torch
# Pick the best available accelerator, falling back to the CPU so the
# notebook still runs on machines with neither CUDA nor Apple MPS.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


In [4]:
# Build the audio-classification pipeline from the
# MIT/ast-finetuned-speech-commands-v2 checkpoint on the selected device.
classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)

# Display the class-index -> label table; any of these labels can be used
# as a wake word.
classifier.model.config.id2label

Device set to use mps


{0: 'backward',
 1: 'follow',
 2: 'five',
 3: 'bed',
 4: 'zero',
 5: 'on',
 6: 'learn',
 7: 'two',
 8: 'house',
 9: 'tree',
 10: 'dog',
 11: 'stop',
 12: 'seven',
 13: 'eight',
 14: 'down',
 15: 'six',
 16: 'forward',
 17: 'cat',
 18: 'right',
 19: 'visual',
 20: 'four',
 21: 'wow',
 22: 'no',
 23: 'nine',
 24: 'off',
 25: 'three',
 26: 'left',
 27: 'marvin',
 28: 'yes',
 29: 'up',
 30: 'sheila',
 31: 'happy',
 32: 'bird',
 33: 'go',
 34: 'one'}

In [5]:
# Check which label class index 27 maps to in the model's id2label table.
classifier.model.config.id2label[27]

'marvin'

In [24]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live


def launch_fn(
    wake_word="marvin",
    prob_threshold=0.8,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    """Listen on the microphone until the wake word is detected.

    Streams live microphone audio through the notebook-level ``classifier``
    pipeline and inspects the first prediction returned for each chunk
    (presumably the highest-scoring class -- confirm against the pipeline
    documentation).

    Args:
        wake_word: Label to listen for. Must be a key of
            ``classifier.model.config.label2id``.
        prob_threshold: Score the prediction must strictly exceed for the
            wake word to count as detected.
        chunk_length_s: Passed to ``ffmpeg_microphone_live`` as
            ``chunk_length_s``.
        stream_chunk_s: Passed to ``ffmpeg_microphone_live`` as
            ``stream_chunk_s``.
        debug: If true, print the inspected prediction for every chunk.

    Returns:
        ``True`` as soon as a prediction is labelled ``wake_word`` with a
        score above ``prob_threshold``; ``False`` if the prediction stream
        is exhausted without a detection.

    Raises:
        ValueError: If ``wake_word`` is not one of the classifier's labels.
    """
    label2id = classifier.model.config.label2id
    if wake_word not in label2id:
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {label2id.keys()}."
        )

    # Record at the rate the feature extractor expects so no resampling is
    # needed between the microphone and the model.
    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for prediction in classifier(mic):
        # Only the first entry of each per-chunk result is considered.
        top_prediction = prediction[0]
        if debug:
            print(top_prediction)
        if (
            top_prediction["label"] == wake_word
            and top_prediction["score"] > prob_threshold
        ):
            return True

    # The stream ended without a detection: return an explicit boolean
    # instead of falling through to an implicit None.
    return False

In [25]:
launch_fn(debug=True)

Listening for wake word...




{'score': 0.050247229635715485, 'label': 'no'}
{'score': 0.059809740632772446, 'label': 'no'}
{'score': 0.09184109419584274, 'label': 'off'}
{'score': 0.1321328729391098, 'label': 'off'}
{'score': 0.14427167177200317, 'label': 'off'}
{'score': 0.13177059590816498, 'label': 'off'}
{'score': 0.13177059590816498, 'label': 'off'}
{'score': 0.13177059590816498, 'label': 'off'}
{'score': 0.9419751167297363, 'label': 'one'}
{'score': 0.9330095052719116, 'label': 'one'}
{'score': 0.9314926266670227, 'label': 'one'}
{'score': 0.9314926266670227, 'label': 'one'}
{'score': 0.9314926266670227, 'label': 'one'}
{'score': 0.9314926266670227, 'label': 'one'}
{'score': 0.5056683421134949, 'label': 'one'}
{'score': 0.5366392731666565, 'label': 'one'}
{'score': 0.5366392731666565, 'label': 'one'}
{'score': 0.5366392731666565, 'label': 'one'}
{'score': 0.5366392731666565, 'label': 'one'}
{'score': 0.991104006767273, 'label': 'two'}
{'score': 0.9998354911804199, 'label': 'two'}
{'score': 0.9998797178268433

True