In [89]:
# !pip install openwakeword
# !pip install speechbrain
# !pip install datasets
# !pip install scipy matplotlib

In [90]:
import os
import collections
import numpy as np
from numpy.lib.format import open_memmap
from pathlib import Path

from onnxruntime.transformers.shape_infer_helper import file_path
from tqdm import tqdm
import openwakeword
# import openwakeword.data
import openwakeword.utils
import openwakeword.metrics


import scipy
import datasets
import matplotlib.pyplot as plt
import torch
from torch import nn
import IPython.display as ipd

import datasets
import os
import numpy as np
import scipy.io.wavfile
from tqdm import tqdm

import urllib.request


import warnings

warnings.filterwarnings("ignore")

In [91]:


ds = datasets.load_dataset(
    "speech_commands",
    "v0.02",
    split="test",
    streaming=True
)
ds_iter = iter(ds)

os.makedirs("./data/speech_commands_test", exist_ok=True)
limit = 4890

for i in tqdm(range(limit)):
    output_file = f"./data/speech_commands_test/{i:05d}.wav"

    if os.path.exists(output_file):
        continue

    example = next(ds_iter)
    wav_data = (example["audio"]["array"] * 32767).astype(np.int16) # Convert to 16-bit PCM Format
    scipy.io.wavfile.write(output_file, 16000, wav_data)

100%|██████████| 4890/4890 [00:00<00:00, 16879.25it/s]


# ⇲ Compute Audio Embeddings

Create audio pre-processing object to get openwakeword audio embeddings

In [92]:
model_dir = "./resources/models"
os.makedirs(model_dir, exist_ok=True)

List Model and URL corresponding

In [93]:
models = {
    "embedding_model.onnx": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx",
    "embedding_model.tflite": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite",
    "melspectrogram.onnx": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx",
    "melspectrogram.tflite": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite"
}

In [94]:
for filename, url in models.items():
    file_path = os.path.join(model_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(url, file_path)
    else:
        print(f"Found {filename}.")


Found embedding_model.onnx.
Found embedding_model.tflite.
Found melspectrogram.onnx.
Found melspectrogram.tflite.


In [95]:
F = openwakeword.utils.AudioFeatures(
    melspec_model_path=f"{model_dir}/melspectrogram.onnx",
    embedding_model_path=f"{model_dir}/embedding_model.onnx",
    inference_framework="onnx",
)

## ⌀ Negative Clips

Get negative example paths, filtering out clips that are too long or too short

 pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cpu

In [96]:
import openwakeword.data

In [97]:
negative_clips, negative_durations = openwakeword.data.filter_audio_paths(
    [
        "./data/fma_sample",
        "./data/fsd50k_sample",
        "./data/speech_commands_test",
    ],
    min_length_secs=1.0, # minimum clip length in seconds
    max_length_secs=60*30, # maximum clip length in seconds
    duration_method="header" # use the file header to calculate duration
)

print(f"{len(negative_clips)} negative clips after filtering, representing ~{sum(negative_durations)//3600} hours")

200it [00:00, 6419.54it/s]
100%|██████████| 200/200 [00:01<00:00, 107.29it/s]
1000it [00:00, 13571.47it/s]
100%|██████████| 1000/1000 [00:04<00:00, 240.41it/s]
4890it [00:00, 93591.67it/s]
100%|██████████| 4890/4890 [00:08<00:00, 581.03it/s]

5987 negative clips after filtering, representing ~5.0 hours





Use HuggingFace datasets to load files from disk by batches, because:
- Load lazy
- Memmory-Mapping for doesn't load entire in RAM
- map() -> feature extraction, augmentation
- filter() -> remove audio too long/short

In [98]:
audio_dataset = datasets.Dataset.from_dict({
    "audio": negative_clips
})

In [99]:
audio_dataset = audio_dataset.cast_column(
    "audio",
    datasets.Audio(sampling_rate=16000)
)

- Get Audio Embeddings (features) for negative clips and save to .npy file
- Process files by batch and save to Numpy memory mapped file so that
- An array larger than the available system memory can be created

In [100]:
batch_size = 64 # number of files to load, compute features, and write to mmap at a time

In [101]:
clip_size = 3 # The desired window size (in seconds) for the trained openWakeWord model

In [102]:
N_total = int(sum(negative_durations)//clip_size) # Maximum number of rows in mmap file

N_total

6441

In [103]:
n_feature_cols = F.get_embedding_shape(clip_size)

n_feature_cols

(28, 96)

In [104]:
output_file = "negative_features.npy"

In [105]:
output_array_shape = (N_total, n_feature_cols[0], n_feature_cols[1])

output_array_shape

(6441, 28, 96)

Remove .npy before create mmap

In [106]:
for f in [output_file, output_file + ".npy"]:
    if os.path.exists(f):
        os.remove(f)

In [107]:
fp = open_memmap(
    output_file,
    mode="w+",
    dtype=np.float32,
    shape=output_array_shape
)

fp

memmap([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

In [108]:
row_counter = 0

In [109]:
print("\n===== START PROCESS =====")
print(f"Total rows: {audio_dataset.num_rows}")
print(f"Batch size: {batch_size}")
print(f"Expected batches: {int(np.ceil(audio_dataset.num_rows / batch_size))}")
print("")

for i in tqdm(np.arange(0, audio_dataset.num_rows, batch_size)):
    print(f"\n--- Batch index: {int(i / batch_size)} ---")

    print("Loading audio batch...")
    block = audio_dataset[i:i + batch_size]["audio"]
    print(f"Loaded {len(block)} clips")

    print("Converting to int16 arrays...")
    wav_data = [(j["array"] * 32767).astype(np.int16) for j in block]
    print("Conversion done.")

    print("Stacking clips (pad/trim)...")
    wav_data = openwakeword.data.stack_clips(
        wav_data,
        clip_size=16000 * clip_size
    ).astype(np.int16)
    print("Stack clips done.")

    print("Computing embeddings...")
    features = F.embed_clips(
        x=wav_data,
        batch_size=1024,
        ncpu=8
    )
    print(f"Embedding done. features.shape = {features.shape}")

    print("Saving to mmap...")

    rows_needed = features.shape[0]

    if row_counter + rows_needed > N_total:
        print(f"Final batch: writing {N_total - row_counter} rows then break.")
        fp[row_counter:N_total, :, :] = features[0:N_total - row_counter]
        fp.flush()
        break

    else:
        print(f"Writing rows {row_counter} → {row_counter + rows_needed}")
        fp[row_counter:row_counter + rows_needed, :, :] = features
        row_counter += rows_needed
        fp.flush()


===== START PROCESS =====
Total rows: 5987
Batch size: 64
Expected batches: 94



  0%|          | 0/94 [00:00<?, ?it/s]


--- Batch index: 0 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  1%|          | 1/94 [00:07<11:04,  7.15s/it]

Embedding done. features.shape = (640, 28, 96)
Saving to mmap...
Writing rows 0 → 640

--- Batch index: 1 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  2%|▏         | 2/94 [00:14<11:20,  7.40s/it]

Embedding done. features.shape = (640, 28, 96)
Saving to mmap...
Writing rows 640 → 1280

--- Batch index: 2 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  3%|▎         | 3/94 [00:22<11:19,  7.47s/it]

Embedding done. features.shape = (638, 28, 96)
Saving to mmap...
Writing rows 1280 → 1918

--- Batch index: 3 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  4%|▍         | 4/94 [00:25<08:27,  5.64s/it]

Embedding done. features.shape = (234, 28, 96)
Saving to mmap...
Writing rows 1918 → 2152

--- Batch index: 4 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  5%|▌         | 5/94 [00:27<06:52,  4.63s/it]

Embedding done. features.shape = (249, 28, 96)
Saving to mmap...
Writing rows 2152 → 2401

--- Batch index: 5 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  6%|▋         | 6/94 [00:30<05:42,  3.89s/it]

Embedding done. features.shape = (222, 28, 96)
Saving to mmap...
Writing rows 2401 → 2623

--- Batch index: 6 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  7%|▋         | 7/94 [00:32<04:54,  3.38s/it]

Embedding done. features.shape = (207, 28, 96)
Saving to mmap...
Writing rows 2623 → 2830

--- Batch index: 7 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  9%|▊         | 8/94 [00:35<04:25,  3.09s/it]

Embedding done. features.shape = (192, 28, 96)
Saving to mmap...
Writing rows 2830 → 3022

--- Batch index: 8 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 10%|▉         | 9/94 [00:38<04:17,  3.03s/it]

Embedding done. features.shape = (228, 28, 96)
Saving to mmap...
Writing rows 3022 → 3250

--- Batch index: 9 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 11%|█         | 10/94 [00:41<04:19,  3.09s/it]

Embedding done. features.shape = (228, 28, 96)
Saving to mmap...
Writing rows 3250 → 3478

--- Batch index: 10 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 12%|█▏        | 11/94 [00:44<04:07,  2.98s/it]

Embedding done. features.shape = (210, 28, 96)
Saving to mmap...
Writing rows 3478 → 3688

--- Batch index: 11 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 13%|█▎        | 12/94 [00:46<03:46,  2.76s/it]

Embedding done. features.shape = (184, 28, 96)
Saving to mmap...
Writing rows 3688 → 3872

--- Batch index: 12 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 14%|█▍        | 13/94 [00:48<03:37,  2.68s/it]

Embedding done. features.shape = (194, 28, 96)
Saving to mmap...
Writing rows 3872 → 4066

--- Batch index: 13 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 15%|█▍        | 14/94 [00:51<03:25,  2.57s/it]

Embedding done. features.shape = (195, 28, 96)
Saving to mmap...
Writing rows 4066 → 4261

--- Batch index: 14 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 16%|█▌        | 15/94 [00:52<02:56,  2.23s/it]

Embedding done. features.shape = (125, 28, 96)
Saving to mmap...
Writing rows 4261 → 4386

--- Batch index: 15 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 17%|█▋        | 16/94 [00:55<03:04,  2.37s/it]

Embedding done. features.shape = (226, 28, 96)
Saving to mmap...
Writing rows 4386 → 4612

--- Batch index: 16 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 18%|█▊        | 17/94 [00:57<02:58,  2.32s/it]

Embedding done. features.shape = (186, 28, 96)
Saving to mmap...
Writing rows 4612 → 4798

--- Batch index: 17 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 19%|█▉        | 18/94 [00:58<02:16,  1.79s/it]

Embedding done. features.shape = (42, 28, 96)
Saving to mmap...
Writing rows 4798 → 4840

--- Batch index: 18 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 20%|██        | 19/94 [00:58<01:41,  1.36s/it]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4840 → 4862

--- Batch index: 19 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 21%|██▏       | 20/94 [00:58<01:17,  1.04s/it]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4862 → 4884

--- Batch index: 20 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 22%|██▏       | 21/94 [00:59<01:02,  1.18it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4884 → 4906

--- Batch index: 21 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 23%|██▎       | 22/94 [00:59<00:51,  1.39it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4906 → 4928

--- Batch index: 22 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 24%|██▍       | 23/94 [00:59<00:43,  1.65it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4928 → 4950

--- Batch index: 23 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 26%|██▌       | 24/94 [01:00<00:38,  1.81it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4950 → 4972

--- Batch index: 24 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 27%|██▋       | 25/94 [01:00<00:34,  1.97it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4972 → 4994

--- Batch index: 25 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 28%|██▊       | 26/94 [01:01<00:33,  2.05it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4994 → 5016

--- Batch index: 26 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 29%|██▊       | 27/94 [01:01<00:30,  2.17it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5016 → 5038

--- Batch index: 27 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 30%|██▉       | 28/94 [01:01<00:29,  2.24it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5038 → 5060

--- Batch index: 28 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 31%|███       | 29/94 [01:02<00:27,  2.39it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5060 → 5082

--- Batch index: 29 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 32%|███▏      | 30/94 [01:02<00:25,  2.51it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5082 → 5104

--- Batch index: 30 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 33%|███▎      | 31/94 [01:02<00:24,  2.60it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5104 → 5126

--- Batch index: 31 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 34%|███▍      | 32/94 [01:03<00:22,  2.71it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5126 → 5148

--- Batch index: 32 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 35%|███▌      | 33/94 [01:03<00:22,  2.66it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5148 → 5170

--- Batch index: 33 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 36%|███▌      | 34/94 [01:04<00:24,  2.41it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5170 → 5192

--- Batch index: 34 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 37%|███▋      | 35/94 [01:04<00:23,  2.51it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5192 → 5214

--- Batch index: 35 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 38%|███▊      | 36/94 [01:04<00:21,  2.66it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5214 → 5236

--- Batch index: 36 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 39%|███▉      | 37/94 [01:05<00:23,  2.43it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5236 → 5258

--- Batch index: 37 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 40%|████      | 38/94 [01:06<00:26,  2.09it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5258 → 5280

--- Batch index: 38 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 41%|████▏     | 39/94 [01:06<00:26,  2.07it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5280 → 5302

--- Batch index: 39 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 43%|████▎     | 40/94 [01:06<00:25,  2.14it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5302 → 5324

--- Batch index: 40 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 44%|████▎     | 41/94 [01:07<00:23,  2.24it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5324 → 5346

--- Batch index: 41 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 45%|████▍     | 42/94 [01:07<00:21,  2.39it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5346 → 5368

--- Batch index: 42 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 46%|████▌     | 43/94 [01:08<00:21,  2.40it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5368 → 5390

--- Batch index: 43 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 47%|████▋     | 44/94 [01:08<00:20,  2.50it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5390 → 5412

--- Batch index: 44 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 48%|████▊     | 45/94 [01:08<00:18,  2.60it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5412 → 5434

--- Batch index: 45 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 49%|████▉     | 46/94 [01:09<00:18,  2.66it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5434 → 5456

--- Batch index: 46 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 50%|█████     | 47/94 [01:09<00:16,  2.77it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5456 → 5478

--- Batch index: 47 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 51%|█████     | 48/94 [01:09<00:16,  2.77it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5478 → 5500

--- Batch index: 48 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 52%|█████▏    | 49/94 [01:10<00:15,  2.81it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5500 → 5522

--- Batch index: 49 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 53%|█████▎    | 50/94 [01:10<00:15,  2.78it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5522 → 5544

--- Batch index: 50 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 54%|█████▍    | 51/94 [01:10<00:14,  2.88it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5544 → 5566

--- Batch index: 51 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 55%|█████▌    | 52/94 [01:11<00:14,  2.89it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5566 → 5588

--- Batch index: 52 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 56%|█████▋    | 53/94 [01:11<00:14,  2.92it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5588 → 5610

--- Batch index: 53 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 57%|█████▋    | 54/94 [01:11<00:14,  2.84it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5610 → 5632

--- Batch index: 54 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 59%|█████▊    | 55/94 [01:12<00:13,  2.96it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5632 → 5654

--- Batch index: 55 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 60%|█████▉    | 56/94 [01:12<00:12,  3.04it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5654 → 5676

--- Batch index: 56 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 61%|██████    | 57/94 [01:12<00:12,  2.87it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5676 → 5698

--- Batch index: 57 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 62%|██████▏   | 58/94 [01:13<00:12,  3.00it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5698 → 5720

--- Batch index: 58 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 63%|██████▎   | 59/94 [01:13<00:11,  3.08it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5720 → 5742

--- Batch index: 59 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 64%|██████▍   | 60/94 [01:13<00:11,  3.05it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5742 → 5764

--- Batch index: 60 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 65%|██████▍   | 61/94 [01:14<00:10,  3.07it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5764 → 5786

--- Batch index: 61 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 66%|██████▌   | 62/94 [01:14<00:10,  3.08it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5786 → 5808

--- Batch index: 62 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 67%|██████▋   | 63/94 [01:14<00:10,  3.02it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5808 → 5830

--- Batch index: 63 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 68%|██████▊   | 64/94 [01:15<00:10,  2.89it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5830 → 5852

--- Batch index: 64 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 69%|██████▉   | 65/94 [01:15<00:11,  2.60it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5852 → 5874

--- Batch index: 65 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 70%|███████   | 66/94 [01:16<00:11,  2.51it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5874 → 5896

--- Batch index: 66 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 71%|███████▏  | 67/94 [01:16<00:10,  2.53it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5896 → 5918

--- Batch index: 67 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 72%|███████▏  | 68/94 [01:17<00:10,  2.47it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5918 → 5940

--- Batch index: 68 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 73%|███████▎  | 69/94 [01:17<00:09,  2.65it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5940 → 5962

--- Batch index: 69 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 74%|███████▍  | 70/94 [01:17<00:08,  2.72it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5962 → 5984

--- Batch index: 70 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 76%|███████▌  | 71/94 [01:17<00:08,  2.80it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5984 → 6006

--- Batch index: 71 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 77%|███████▋  | 72/94 [01:18<00:07,  2.90it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6006 → 6028

--- Batch index: 72 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 78%|███████▊  | 73/94 [01:18<00:07,  2.87it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6028 → 6050

--- Batch index: 73 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 79%|███████▊  | 74/94 [01:18<00:06,  3.00it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6050 → 6072

--- Batch index: 74 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 80%|███████▉  | 75/94 [01:19<00:06,  3.03it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6072 → 6094

--- Batch index: 75 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 81%|████████  | 76/94 [01:19<00:05,  3.10it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6094 → 6116

--- Batch index: 76 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 82%|████████▏ | 77/94 [01:19<00:05,  3.10it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6116 → 6138

--- Batch index: 77 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 83%|████████▎ | 78/94 [01:20<00:05,  3.01it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6138 → 6160

--- Batch index: 78 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 84%|████████▍ | 79/94 [01:20<00:04,  3.02it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6160 → 6182

--- Batch index: 79 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 85%|████████▌ | 80/94 [01:20<00:04,  2.98it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6182 → 6204

--- Batch index: 80 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 86%|████████▌ | 81/94 [01:21<00:04,  2.81it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6204 → 6226

--- Batch index: 81 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 87%|████████▋ | 82/94 [01:21<00:04,  2.84it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6226 → 6248

--- Batch index: 82 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 88%|████████▊ | 83/94 [01:22<00:03,  2.80it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6248 → 6270

--- Batch index: 83 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 89%|████████▉ | 84/94 [01:22<00:03,  2.75it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6270 → 6292

--- Batch index: 84 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 90%|█████████ | 85/94 [01:22<00:03,  2.76it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6292 → 6314

--- Batch index: 85 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 91%|█████████▏| 86/94 [01:23<00:02,  2.88it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6314 → 6336

--- Batch index: 86 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 93%|█████████▎| 87/94 [01:23<00:02,  2.99it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6336 → 6358

--- Batch index: 87 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 94%|█████████▎| 88/94 [01:23<00:01,  3.07it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6358 → 6380

--- Batch index: 88 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 95%|█████████▍| 89/94 [01:24<00:01,  3.13it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6380 → 6402

--- Batch index: 89 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 96%|█████████▌| 90/94 [01:24<00:01,  3.18it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6402 → 6424

--- Batch index: 90 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 96%|█████████▌| 90/94 [01:24<00:03,  1.06it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Final batch: writing 17 rows then break.





In [110]:
print("\n===== Trimming mmap file =====")
openwakeword.data.trim_mmap(output_file)
print("===== FINISHED =====\n")


===== Trimming mmap file =====


Trimming empty rows: 7it [00:01,  5.44it/s]                       

===== FINISHED =====






Now we have all of the negative features prepared, and saved to fixed durations clips in Numpy array. For this data, the array is small at ~160MB, but in-practice the memory mapping allows to be very large.

## ✰ Negative Clips