In [45]:
# !pip install openwakeword
# !pip install speechbrain
# !pip install datasets
# !pip install scipy matplotlib

In [46]:
import os
import collections
import numpy as np
from numpy.lib.format import open_memmap
from pathlib import Path

from onnxruntime.transformers.shape_infer_helper import file_path
from tqdm import tqdm
import openwakeword
# import openwakeword.data
import openwakeword.utils
import openwakeword.metrics


import scipy
import datasets
import matplotlib.pyplot as plt
import torch
from torch import nn
import IPython.display as ipd

import datasets
import os
import numpy as np
import scipy.io.wavfile
from tqdm import tqdm

import urllib.request


import warnings

warnings.filterwarnings("ignore")

In [47]:


ds = datasets.load_dataset(
    "speech_commands",
    "v0.02",
    split="test",
    streaming=True
)
ds_iter = iter(ds)

os.makedirs("./data/speech_commands_test", exist_ok=True)
limit = 4890

for i in tqdm(range(limit)):
    output_file = f"./data/speech_commands_test/{i:05d}.wav"

    if os.path.exists(output_file):
        continue

    example = next(ds_iter)
    wav_data = (example["audio"]["array"] * 32767).astype(np.int16) # Convert to 16-bit PCM Format
    scipy.io.wavfile.write(output_file, 16000, wav_data)

100%|██████████| 4890/4890 [00:00<00:00, 16652.73it/s]


# ⇲ Compute Audio Embeddings

Create audio pre-processing object to get openwakeword audio embeddings

In [48]:
model_dir = "./resources/models"
os.makedirs(model_dir, exist_ok=True)

List Model and URL corresponding

In [49]:
models = {
    "embedding_model.onnx": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx",
    "embedding_model.tflite": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite",
    "melspectrogram.onnx": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx",
    "melspectrogram.tflite": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite"
}

In [50]:
for filename, url in models.items():
    file_path = os.path.join(model_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(url, file_path)
    else:
        print(f"Found {filename}.")


Found embedding_model.onnx.
Found embedding_model.tflite.
Found melspectrogram.onnx.
Found melspectrogram.tflite.


In [51]:
F = openwakeword.utils.AudioFeatures(
    melspec_model_path=f"{model_dir}/melspectrogram.onnx",
    embedding_model_path=f"{model_dir}/embedding_model.onnx",
    inference_framework="onnx",
)

## ⌀ Negative Clips

Get negative example paths, filtering out clips that are too long or too short

 pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cpu

In [52]:
import openwakeword.data

In [53]:
negative_clips, negative_durations = openwakeword.data.filter_audio_paths(
    [
        "./data/fma_sample",
        "./data/fsd50k_sample",
        "./data/speech_commands_test",
    ],
    min_length_secs=1.0, # minimum clip length in seconds
    max_length_secs=60*30, # maximum clip length in seconds
    duration_method="header" # use the file header to calculate duration
)

print(f"{len(negative_clips)} negative clips after filtering, representing ~{sum(negative_durations)//3600} hours")

200it [00:00, 6478.74it/s]
100%|██████████| 200/200 [00:01<00:00, 111.20it/s]
1000it [00:00, 13335.11it/s]
100%|██████████| 1000/1000 [00:04<00:00, 227.16it/s]
4890it [00:00, 56887.77it/s]
100%|██████████| 4890/4890 [00:07<00:00, 617.77it/s]

5987 negative clips after filtering, representing ~5.0 hours





Use HuggingFace datasets to load files from disk by batches, because:
- Load lazy
- Memmory-Mapping for doesn't load entire in RAM
- map() -> feature extraction, augmentation
- filter() -> remove audio too long/short

In [54]:
audio_dataset = datasets.Dataset.from_dict({
    "audio": negative_clips
})

In [55]:
audio_dataset = audio_dataset.cast_column(
    "audio",
    datasets.Audio(sampling_rate=16000)
)

- Get Audio Embeddings (features) for negative clips and save to .npy file
- Process files by batch and save to Numpy memory mapped file so that
- An array larger than the available system memory can be created

In [56]:
batch_size = 64 # number of files to load, compute features, and write to mmap at a time

In [57]:
clip_size = 3 # The desired window size (in seconds) for the trained openWakeWord model

In [58]:
N_total = int(sum(negative_durations)//clip_size) # Maximum number of rows in mmap file

N_total

6441

In [59]:
n_feature_cols = F.get_embedding_shape(clip_size)

n_feature_cols

(28, 96)

In [60]:
output_file = "negative_features.npy"

In [61]:
output_array_shape = (N_total, n_feature_cols[0], n_feature_cols[1])

output_array_shape

(6441, 28, 96)

Remove .npy before create mmap

In [62]:
for f in [output_file, output_file + ".npy"]:
    if os.path.exists(f):
        os.remove(f)

In [63]:
fp = open_memmap(
    output_file,
    mode="w+",
    dtype=np.float32,
    shape=output_array_shape
)

fp

memmap([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

In [64]:
row_counter = 0

In [65]:
print("\n===== START PROCESS =====")
print(f"Total rows: {audio_dataset.num_rows}")
print(f"Batch size: {batch_size}")
print(f"Expected batches: {int(np.ceil(audio_dataset.num_rows / batch_size))}")
print("")

for i in tqdm(np.arange(0, audio_dataset.num_rows, batch_size)):
    print(f"\n--- Batch index: {int(i / batch_size)} ---")

    print("Loading audio batch...")
    block = audio_dataset[i:i + batch_size]["audio"]
    print(f"Loaded {len(block)} clips")

    print("Converting to int16 arrays...")
    wav_data = [(j["array"] * 32767).astype(np.int16) for j in block]
    print("Conversion done.")

    print("Stacking clips (pad/trim)...")
    wav_data = openwakeword.data.stack_clips(
        wav_data,
        clip_size=16000 * clip_size
    ).astype(np.int16)
    print("Stack clips done.")

    print("Computing embeddings...")
    features = F.embed_clips(
        x=wav_data,
        batch_size=1024,
        ncpu=8
    )
    print(f"Embedding done. features.shape = {features.shape}")

    print("Saving to mmap...")

    rows_needed = features.shape[0]

    if row_counter + rows_needed > N_total:
        print(f"Final batch: writing {N_total - row_counter} rows then break.")
        fp[row_counter:N_total, :, :] = features[0:N_total - row_counter]
        fp.flush()
        break

    else:
        print(f"Writing rows {row_counter} → {row_counter + rows_needed}")
        fp[row_counter:row_counter + rows_needed, :, :] = features
        row_counter += rows_needed
        fp.flush()


===== START PROCESS =====
Total rows: 5987
Batch size: 64
Expected batches: 94



  0%|          | 0/94 [00:00<?, ?it/s]


--- Batch index: 0 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  1%|          | 1/94 [00:06<10:38,  6.87s/it]

Embedding done. features.shape = (640, 28, 96)
Saving to mmap...
Writing rows 0 → 640

--- Batch index: 1 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  2%|▏         | 2/94 [00:13<10:38,  6.95s/it]

Embedding done. features.shape = (640, 28, 96)
Saving to mmap...
Writing rows 640 → 1280

--- Batch index: 2 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  3%|▎         | 3/94 [00:21<10:45,  7.09s/it]

Embedding done. features.shape = (638, 28, 96)
Saving to mmap...
Writing rows 1280 → 1918

--- Batch index: 3 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  4%|▍         | 4/94 [00:23<07:56,  5.30s/it]

Embedding done. features.shape = (234, 28, 96)
Saving to mmap...
Writing rows 1918 → 2152

--- Batch index: 4 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  5%|▌         | 5/94 [00:26<06:25,  4.33s/it]

Embedding done. features.shape = (249, 28, 96)
Saving to mmap...
Writing rows 2152 → 2401

--- Batch index: 5 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  6%|▋         | 6/94 [00:28<05:21,  3.65s/it]

Embedding done. features.shape = (222, 28, 96)
Saving to mmap...
Writing rows 2401 → 2623

--- Batch index: 6 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  7%|▋         | 7/94 [00:30<04:36,  3.18s/it]

Embedding done. features.shape = (207, 28, 96)
Saving to mmap...
Writing rows 2623 → 2830

--- Batch index: 7 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


  9%|▊         | 8/94 [00:32<04:05,  2.85s/it]

Embedding done. features.shape = (192, 28, 96)
Saving to mmap...
Writing rows 2830 → 3022

--- Batch index: 8 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 10%|▉         | 9/94 [00:35<03:57,  2.80s/it]

Embedding done. features.shape = (228, 28, 96)
Saving to mmap...
Writing rows 3022 → 3250

--- Batch index: 9 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 11%|█         | 10/94 [00:38<03:50,  2.74s/it]

Embedding done. features.shape = (228, 28, 96)
Saving to mmap...
Writing rows 3250 → 3478

--- Batch index: 10 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 12%|█▏        | 11/94 [00:40<03:36,  2.61s/it]

Embedding done. features.shape = (210, 28, 96)
Saving to mmap...
Writing rows 3478 → 3688

--- Batch index: 11 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 13%|█▎        | 12/94 [00:42<03:18,  2.42s/it]

Embedding done. features.shape = (184, 28, 96)
Saving to mmap...
Writing rows 3688 → 3872

--- Batch index: 12 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...
Embedding done. features.shape = (194, 28, 96)
Saving to mmap...
Writing rows 3872 → 4066


 14%|█▍        | 13/94 [00:45<03:25,  2.53s/it]


--- Batch index: 13 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 15%|█▍        | 14/94 [00:47<03:12,  2.41s/it]

Embedding done. features.shape = (195, 28, 96)
Saving to mmap...
Writing rows 4066 → 4261

--- Batch index: 14 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 16%|█▌        | 15/94 [00:48<02:45,  2.09s/it]

Embedding done. features.shape = (125, 28, 96)
Saving to mmap...
Writing rows 4261 → 4386

--- Batch index: 15 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 17%|█▋        | 16/94 [00:51<02:51,  2.20s/it]

Embedding done. features.shape = (226, 28, 96)
Saving to mmap...
Writing rows 4386 → 4612

--- Batch index: 16 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 18%|█▊        | 17/94 [00:53<02:44,  2.14s/it]

Embedding done. features.shape = (186, 28, 96)
Saving to mmap...
Writing rows 4612 → 4798

--- Batch index: 17 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 19%|█▉        | 18/94 [00:53<02:06,  1.66s/it]

Embedding done. features.shape = (42, 28, 96)
Saving to mmap...
Writing rows 4798 → 4840

--- Batch index: 18 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 20%|██        | 19/94 [00:54<01:34,  1.25s/it]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4840 → 4862

--- Batch index: 19 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 21%|██▏       | 20/94 [00:54<01:11,  1.03it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4862 → 4884

--- Batch index: 20 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 22%|██▏       | 21/94 [00:54<00:55,  1.30it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4884 → 4906

--- Batch index: 21 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 23%|██▎       | 22/94 [00:55<00:45,  1.58it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4906 → 4928

--- Batch index: 22 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 24%|██▍       | 23/94 [00:55<00:37,  1.88it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4928 → 4950

--- Batch index: 23 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 26%|██▌       | 24/94 [00:55<00:32,  2.15it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4950 → 4972

--- Batch index: 24 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 27%|██▋       | 25/94 [00:55<00:28,  2.41it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4972 → 4994

--- Batch index: 25 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 28%|██▊       | 26/94 [00:56<00:26,  2.56it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 4994 → 5016

--- Batch index: 26 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 29%|██▊       | 27/94 [00:56<00:24,  2.76it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5016 → 5038

--- Batch index: 27 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 30%|██▉       | 28/94 [00:56<00:23,  2.85it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5038 → 5060

--- Batch index: 28 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 31%|███       | 29/94 [00:57<00:21,  3.00it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5060 → 5082

--- Batch index: 29 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 32%|███▏      | 30/94 [00:57<00:21,  3.04it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5082 → 5104

--- Batch index: 30 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 33%|███▎      | 31/94 [00:57<00:20,  3.13it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5104 → 5126

--- Batch index: 31 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 34%|███▍      | 32/94 [00:58<00:19,  3.18it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5126 → 5148

--- Batch index: 32 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 35%|███▌      | 33/94 [00:58<00:19,  3.20it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5148 → 5170

--- Batch index: 33 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 36%|███▌      | 34/94 [00:58<00:18,  3.21it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5170 → 5192

--- Batch index: 34 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 37%|███▋      | 35/94 [00:59<00:18,  3.24it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5192 → 5214

--- Batch index: 35 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 38%|███▊      | 36/94 [00:59<00:17,  3.25it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5214 → 5236

--- Batch index: 36 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 39%|███▉      | 37/94 [00:59<00:17,  3.17it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5236 → 5258

--- Batch index: 37 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 40%|████      | 38/94 [01:00<00:17,  3.13it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5258 → 5280

--- Batch index: 38 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 41%|████▏     | 39/94 [01:00<00:18,  3.03it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5280 → 5302

--- Batch index: 39 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 43%|████▎     | 40/94 [01:00<00:17,  3.02it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5302 → 5324

--- Batch index: 40 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 44%|████▎     | 41/94 [01:01<00:17,  3.11it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5324 → 5346

--- Batch index: 41 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 45%|████▍     | 42/94 [01:01<00:16,  3.06it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5346 → 5368

--- Batch index: 42 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 46%|████▌     | 43/94 [01:01<00:16,  3.15it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5368 → 5390

--- Batch index: 43 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 47%|████▋     | 44/94 [01:01<00:15,  3.16it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5390 → 5412

--- Batch index: 44 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 48%|████▊     | 45/94 [01:02<00:15,  3.15it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5412 → 5434

--- Batch index: 45 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 49%|████▉     | 46/94 [01:02<00:15,  3.16it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5434 → 5456

--- Batch index: 46 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 50%|█████     | 47/94 [01:02<00:15,  3.00it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5456 → 5478

--- Batch index: 47 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 51%|█████     | 48/94 [01:03<00:15,  3.03it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5478 → 5500

--- Batch index: 48 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 52%|█████▏    | 49/94 [01:03<00:14,  3.13it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5500 → 5522

--- Batch index: 49 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 53%|█████▎    | 50/94 [01:03<00:13,  3.18it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5522 → 5544

--- Batch index: 50 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 54%|█████▍    | 51/94 [01:04<00:13,  3.19it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5544 → 5566

--- Batch index: 51 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 55%|█████▌    | 52/94 [01:04<00:13,  3.16it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5566 → 5588

--- Batch index: 52 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 56%|█████▋    | 53/94 [01:04<00:12,  3.18it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5588 → 5610

--- Batch index: 53 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 57%|█████▋    | 54/94 [01:05<00:13,  3.01it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5610 → 5632

--- Batch index: 54 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 59%|█████▊    | 55/94 [01:05<00:12,  3.08it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5632 → 5654

--- Batch index: 55 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 60%|█████▉    | 56/94 [01:05<00:12,  3.08it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5654 → 5676

--- Batch index: 56 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 61%|██████    | 57/94 [01:06<00:11,  3.15it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5676 → 5698

--- Batch index: 57 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 62%|██████▏   | 58/94 [01:06<00:11,  3.13it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5698 → 5720

--- Batch index: 58 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 63%|██████▎   | 59/94 [01:06<00:11,  3.14it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5720 → 5742

--- Batch index: 59 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 64%|██████▍   | 60/94 [01:07<00:11,  3.08it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5742 → 5764

--- Batch index: 60 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 65%|██████▍   | 61/94 [01:07<00:10,  3.10it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5764 → 5786

--- Batch index: 61 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 66%|██████▌   | 62/94 [01:07<00:10,  3.14it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5786 → 5808

--- Batch index: 62 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 67%|██████▋   | 63/94 [01:08<00:09,  3.19it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5808 → 5830

--- Batch index: 63 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 68%|██████▊   | 64/94 [01:08<00:09,  3.14it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5830 → 5852

--- Batch index: 64 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 69%|██████▉   | 65/94 [01:08<00:09,  3.14it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5852 → 5874

--- Batch index: 65 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 70%|███████   | 66/94 [01:09<00:08,  3.18it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5874 → 5896

--- Batch index: 66 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 71%|███████▏  | 67/94 [01:09<00:08,  3.09it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5896 → 5918

--- Batch index: 67 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 72%|███████▏  | 68/94 [01:09<00:08,  3.07it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5918 → 5940

--- Batch index: 68 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 73%|███████▎  | 69/94 [01:09<00:08,  3.12it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5940 → 5962

--- Batch index: 69 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 74%|███████▍  | 70/94 [01:10<00:07,  3.11it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5962 → 5984

--- Batch index: 70 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 76%|███████▌  | 71/94 [01:10<00:07,  3.19it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 5984 → 6006

--- Batch index: 71 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 77%|███████▋  | 72/94 [01:10<00:06,  3.22it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6006 → 6028

--- Batch index: 72 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 78%|███████▊  | 73/94 [01:11<00:06,  3.27it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6028 → 6050

--- Batch index: 73 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 79%|███████▊  | 74/94 [01:11<00:06,  3.31it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6050 → 6072

--- Batch index: 74 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 80%|███████▉  | 75/94 [01:11<00:05,  3.20it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6072 → 6094

--- Batch index: 75 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 81%|████████  | 76/94 [01:12<00:05,  3.23it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6094 → 6116

--- Batch index: 76 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 82%|████████▏ | 77/94 [01:12<00:05,  3.24it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6116 → 6138

--- Batch index: 77 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 83%|████████▎ | 78/94 [01:12<00:04,  3.23it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6138 → 6160

--- Batch index: 78 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 84%|████████▍ | 79/94 [01:13<00:04,  3.28it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6160 → 6182

--- Batch index: 79 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 85%|████████▌ | 80/94 [01:13<00:04,  3.20it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6182 → 6204

--- Batch index: 80 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 86%|████████▌ | 81/94 [01:13<00:04,  3.14it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6204 → 6226

--- Batch index: 81 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 87%|████████▋ | 82/94 [01:14<00:03,  3.19it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6226 → 6248

--- Batch index: 82 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 88%|████████▊ | 83/94 [01:14<00:03,  3.23it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6248 → 6270

--- Batch index: 83 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 89%|████████▉ | 84/94 [01:14<00:03,  3.24it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6270 → 6292

--- Batch index: 84 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 90%|█████████ | 85/94 [01:14<00:02,  3.19it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6292 → 6314

--- Batch index: 85 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 91%|█████████▏| 86/94 [01:15<00:02,  3.24it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6314 → 6336

--- Batch index: 86 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 93%|█████████▎| 87/94 [01:15<00:02,  3.30it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6336 → 6358

--- Batch index: 87 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 94%|█████████▎| 88/94 [01:15<00:01,  3.29it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6358 → 6380

--- Batch index: 88 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 95%|█████████▍| 89/94 [01:16<00:01,  3.23it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6380 → 6402

--- Batch index: 89 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 96%|█████████▌| 90/94 [01:16<00:01,  3.26it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Writing rows 6402 → 6424

--- Batch index: 90 ---
Loading audio batch...
Loaded 64 clips
Converting to int16 arrays...
Conversion done.
Stacking clips (pad/trim)...
Stack clips done.
Computing embeddings...


 96%|█████████▌| 90/94 [01:16<00:03,  1.17it/s]

Embedding done. features.shape = (22, 28, 96)
Saving to mmap...
Final batch: writing 17 rows then break.





In [66]:
print("\n===== Trimming mmap file =====")
openwakeword.data.trim_mmap(output_file)
print("===== FINISHED =====\n")


===== Trimming mmap file =====


Trimming empty rows: 7it [00:01,  4.76it/s]                       

===== FINISHED =====






Now we have all of the negative features prepared, and saved to fixed durations clips in Numpy array. For this data, the array is small at ~160MB, but in-practice the memory mapping allows to be very large.

## ✰ Negative Clips

The positive clips will be prepared in two way:
1. Mixing the synthetic positive clips with negative data at random SNRs to simulate noise data
2. Aligning the positive clips with background data such as that the end of the input window aligns with the end of the positive clip. This way the model will learn to predict the presence of the wakeword/phrase immediately after it is spoken.

In practice, there are other possible ways to augment the positive data (e.g, creating reverberation with room impulse response files, mixing with synthetic noise, etc.) but in pratice we have observed that mixing with realistic background data provides the best results.

After this preparation, the positive clips will be converted into the openWakeWord features in the same way as the negative files.

**Get positive example paths, filtering out clips that are too long or too short**

In [67]:
positive_clips, durations = openwakeword.data.filter_audio_paths(
    [
        "./data/turn_on_the_office_lights"
    ],
    min_length_secs=1.0,
    max_length_secs=2.0,
    duration_method="header" # use the file header to calculate duration
)

print(f"{len(positive_clips)} positive clips after filtering, representing ~{sum(durations)//3600} hours")

3388it [00:00, 11175.21it/s]
100%|██████████| 3388/3388 [00:26<00:00, 125.96it/s]

3203 positive clips after filtering, representing ~1.0 hours





Define starting point for each positive clip based on its length, so that each one ends between 0-200ms from the end of the total window size chosen for the model.

This results in the model being most confident in the prediction right after the end of the wakeword in the audio stream, reducing latency in operation.

Get start and end position for the positive audio in the full window

In [68]:
sr = 16000
total_length_seconds = 3 # must be some window length as that used for the negative examples
total_length = int(sr * total_length_seconds)

random delay (0-0.2s) -> make sure the audio don't need right position -> reduce overfitting

In [69]:
jitters = (np.random.uniform(0, 0.2, len(positive_clips)) * sr).astype(np.int32)

jitters

array([2095, 1494, 1934, ...,  500, 2201,  185],
      shape=(3203,), dtype=int32)

start position of wakeword for 3 seconds = total_length - (clip_length + jitter)


In [70]:
starts = [
    total_length
    -
    (
        int(np.ceil(i * sr)) + j
    ) for i,j in zip(durations, jitters)
]


starts

[np.int32(17299),
 np.int32(25143),
 np.int32(28233),
 np.int32(24940),
 np.int32(23750),
 np.int32(25444),
 np.int32(24960),
 np.int32(25562),
 np.int32(21872),
 np.int32(23157),
 np.int32(21154),
 np.int32(20644),
 np.int32(18381),
 np.int32(17686),
 np.int32(21227),
 np.int32(22864),
 np.int32(24021),
 np.int32(23330),
 np.int32(20297),
 np.int32(20653),
 np.int32(20692),
 np.int32(22218),
 np.int32(26625),
 np.int32(30312),
 np.int32(13911),
 np.int32(18013),
 np.int32(25166),
 np.int32(21778),
 np.int32(20872),
 np.int32(15585),
 np.int32(21245),
 np.int32(28610),
 np.int32(18309),
 np.int32(24545),
 np.int32(23233),
 np.int32(26866),
 np.int32(20321),
 np.int32(21804),
 np.int32(27507),
 np.int32(19126),
 np.int32(19475),
 np.int32(23735),
 np.int32(25364),
 np.int32(22741),
 np.int32(16092),
 np.int32(24663),
 np.int32(19958),
 np.int32(19805),
 np.int32(18974),
 np.int32(21658),
 np.int32(28217),
 np.int32(21222),
 np.int32(18151),
 np.int32(21389),
 np.int32(18552),
 np.int32(

end = clip_length + start_index

In [71]:
ends = [
    (int(i * sr) + j) for i, j in zip(durations, starts)
]

ends

[np.int32(45905),
 np.int32(46506),
 np.int32(46066),
 np.int32(47974),
 np.int32(47898),
 np.int32(47920),
 np.int32(46880),
 np.int32(45067),
 np.int32(45463),
 np.int32(45634),
 np.int32(46418),
 np.int32(46464),
 np.int32(45687),
 np.int32(46292),
 np.int32(46861),
 np.int32(47012),
 np.int32(47612),
 np.int32(44879),
 np.int32(46489),
 np.int32(44801),
 np.int32(47812),
 np.int32(47295),
 np.int32(46130),
 np.int32(47960),
 np.int32(45490),
 np.int32(45133),
 np.int32(47829),
 np.int32(46298),
 np.int32(47621),
 np.int32(44935),
 np.int32(46879),
 np.int32(46815),
 np.int32(46358),
 np.int32(45351),
 np.int32(45152),
 np.int32(45257),
 np.int32(45212),
 np.int32(46324),
 np.int32(45526),
 np.int32(46989),
 np.int32(46038),
 np.int32(45097),
 np.int32(46355),
 np.int32(45403),
 np.int32(46185),
 np.int32(47326),
 np.int32(45778),
 np.int32(47668),
 np.int32(46280),
 np.int32(47850),
 np.int32(47165),
 np.int32(47042),
 np.int32(45457),
 np.int32(45166),
 np.int32(45488),
 np.int32(

Create generator to mix the positive audio with background audio

In [72]:
batch_size = 8

In [73]:
mixing_generator = openwakeword.data.mix_clips_batch(
    foreground_clips = positive_clips,
    background_clips = negative_clips,
    combined_size = total_length,
    batch_size = batch_size,
    snr_low = 5,
    snr_high = 15,
    start_index = starts,
    volume_augmentation = True # Randomly scale the volume of the audio after mixing
)

(Optionally) listen to mixed clips to confirm that the mixing appears correctly

In [74]:
mixed_clips, labels, background_clips = next(mixing_generator)
ipd.display(ipd.Audio(mixed_clips[0], rate=16000, normalize=True, autoplay=False))

Iterate through the mixing generator, computing audio features for positive examples and saving them

In [75]:
N_total = len(positive_clips) # Maximum number of rows in mmap files

N_total

3203

In [76]:
n_feature_cols = F.get_embedding_shape(
    total_length_seconds
)

n_feature_cols

(28, 96)

In [77]:
output_file = "turn_on_the_office_lights_features.npy"

In [78]:
output_array_shape = (N_total, n_feature_cols[0], n_feature_cols[1])

output_array_shape

(3203, 28, 96)

Remove .npy before create mmap


In [79]:
for f in [output_file, output_file + ".npy"]:
    if os.path.exists(f):
        os.remove(f)

In [80]:
fp = open_memmap(
    output_file,
    "w+",
    dtype=np.float32,
    shape=output_array_shape
)

fp

memmap([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

In [81]:
row_counter = 0

In [82]:
import numpy as np
from tqdm import tqdm

print("===== START FEATURE EXTRACTION =====")
print(f"Total rows to process: {N_total}")
print(f"Batch size: {batch_size}")
print(f"Expected batches: {N_total // batch_size}")

row_counter = 0

for batch_idx, batch in enumerate(tqdm(mixing_generator, total=N_total//batch_size, desc="Processing batches")):

    # Unpack batch
    audio_batch, lbls, background = batch[0], batch[1], batch[2]
    print(f"\n--- Batch index: {batch_idx} ---")
    print(f"Audio batch shape: {audio_batch.shape}")

    # Compute embeddings
    print("Computing embeddings...")
    features = F.embed_clips(
        audio_batch,
        batch_size=256
    )
    print(f"Embeddings shape: {features.shape}")

    # Save computed features to memory-mapped file
    start_row = row_counter
    end_row = row_counter + features.shape[0]

    if end_row > N_total:
        # Truncate if last batch exceeds total rows
        print(f"Saving final partial batch: rows {start_row} → {N_total}")
        fp[start_row:N_total, :, :] = features[:N_total - start_row]
        fp.flush()
        row_counter = N_total
        break
    else:
        print(f"Saving batch rows: {start_row} → {end_row}")
        fp[start_row:end_row, :, :] = features
        fp.flush()
        row_counter = end_row

print("\n===== FEATURE EXTRACTION FINISHED =====")
print(f"Total rows written: {row_counter}")


===== START FEATURE EXTRACTION =====
Total rows to process: 3203
Batch size: 8
Expected batches: 400


Processing batches:   0%|          | 0/400 [00:00<?, ?it/s]


--- Batch index: 0 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   0%|          | 1/400 [00:00<03:14,  2.05it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 0 → 8

--- Batch index: 1 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   0%|          | 2/400 [00:01<03:27,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 8 → 16

--- Batch index: 2 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   1%|          | 3/400 [00:01<03:27,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 16 → 24

--- Batch index: 3 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   1%|          | 4/400 [00:02<03:25,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 24 → 32

--- Batch index: 4 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   1%|▏         | 5/400 [00:02<03:25,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 32 → 40

--- Batch index: 5 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   2%|▏         | 6/400 [00:03<03:24,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 40 → 48

--- Batch index: 6 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   2%|▏         | 7/400 [00:03<03:24,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 48 → 56

--- Batch index: 7 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   2%|▏         | 8/400 [00:04<03:32,  1.85it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 56 → 64

--- Batch index: 8 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   2%|▏         | 9/400 [00:05<05:54,  1.10it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 64 → 72

--- Batch index: 9 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   2%|▎         | 10/400 [00:08<08:27,  1.30s/it]

Embeddings shape: (8, 28, 96)
Saving batch rows: 72 → 80

--- Batch index: 10 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   3%|▎         | 11/400 [00:09<07:41,  1.19s/it]

Embeddings shape: (8, 28, 96)
Saving batch rows: 80 → 88

--- Batch index: 11 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   3%|▎         | 12/400 [00:09<06:18,  1.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 88 → 96

--- Batch index: 12 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   3%|▎         | 13/400 [00:10<05:19,  1.21it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 96 → 104

--- Batch index: 13 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   4%|▎         | 14/400 [00:10<04:45,  1.35it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 104 → 112

--- Batch index: 14 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   4%|▍         | 15/400 [00:11<04:21,  1.47it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 112 → 120

--- Batch index: 15 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   4%|▍         | 16/400 [00:11<04:00,  1.59it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 120 → 128

--- Batch index: 16 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   4%|▍         | 17/400 [00:12<03:54,  1.63it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 128 → 136

--- Batch index: 17 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   4%|▍         | 18/400 [00:12<03:40,  1.73it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 136 → 144

--- Batch index: 18 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   5%|▍         | 19/400 [00:13<03:31,  1.80it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 144 → 152

--- Batch index: 19 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   5%|▌         | 20/400 [00:13<03:29,  1.81it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 152 → 160

--- Batch index: 20 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   5%|▌         | 21/400 [00:14<03:30,  1.80it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 160 → 168

--- Batch index: 21 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   6%|▌         | 22/400 [00:14<03:28,  1.81it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 168 → 176

--- Batch index: 22 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   6%|▌         | 23/400 [00:15<03:26,  1.83it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 176 → 184

--- Batch index: 23 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   6%|▌         | 24/400 [00:15<03:17,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 184 → 192

--- Batch index: 24 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   6%|▋         | 25/400 [00:16<03:14,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 192 → 200

--- Batch index: 25 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   6%|▋         | 26/400 [00:16<03:16,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 200 → 208

--- Batch index: 26 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   7%|▋         | 27/400 [00:17<03:12,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 208 → 216

--- Batch index: 27 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   7%|▋         | 28/400 [00:17<03:07,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 216 → 224

--- Batch index: 28 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   7%|▋         | 29/400 [00:18<03:08,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 224 → 232

--- Batch index: 29 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   8%|▊         | 30/400 [00:18<03:14,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 232 → 240

--- Batch index: 30 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   8%|▊         | 31/400 [00:19<03:11,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 240 → 248

--- Batch index: 31 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   8%|▊         | 32/400 [00:19<03:09,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 248 → 256

--- Batch index: 32 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   8%|▊         | 33/400 [00:20<03:07,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 256 → 264

--- Batch index: 33 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   8%|▊         | 34/400 [00:20<03:11,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 264 → 272

--- Batch index: 34 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   9%|▉         | 35/400 [00:21<03:01,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 272 → 280

--- Batch index: 35 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   9%|▉         | 36/400 [00:21<03:02,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 280 → 288

--- Batch index: 36 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:   9%|▉         | 37/400 [00:22<03:05,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 288 → 296

--- Batch index: 37 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  10%|▉         | 38/400 [00:23<03:06,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 296 → 304

--- Batch index: 38 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  10%|▉         | 39/400 [00:23<03:01,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 304 → 312

--- Batch index: 39 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  10%|█         | 40/400 [00:24<03:05,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 312 → 320

--- Batch index: 40 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  10%|█         | 41/400 [00:24<03:04,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 320 → 328

--- Batch index: 41 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  10%|█         | 42/400 [00:25<03:03,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 328 → 336

--- Batch index: 42 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  11%|█         | 43/400 [00:25<03:01,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 336 → 344

--- Batch index: 43 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  11%|█         | 44/400 [00:26<03:03,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 344 → 352

--- Batch index: 44 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  11%|█▏        | 45/400 [00:26<03:02,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 352 → 360

--- Batch index: 45 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  12%|█▏        | 46/400 [00:27<03:01,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 360 → 368

--- Batch index: 46 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  12%|█▏        | 47/400 [00:27<02:56,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 368 → 376

--- Batch index: 47 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  12%|█▏        | 48/400 [00:28<02:56,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 376 → 384

--- Batch index: 48 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  12%|█▏        | 49/400 [00:28<02:53,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 384 → 392

--- Batch index: 49 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  12%|█▎        | 50/400 [00:29<02:52,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 392 → 400

--- Batch index: 50 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  13%|█▎        | 51/400 [00:29<02:56,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 400 → 408

--- Batch index: 51 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  13%|█▎        | 52/400 [00:30<02:51,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 408 → 416

--- Batch index: 52 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  13%|█▎        | 53/400 [00:30<02:50,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 416 → 424

--- Batch index: 53 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  14%|█▎        | 54/400 [00:31<02:51,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 424 → 432

--- Batch index: 54 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  14%|█▍        | 55/400 [00:31<02:53,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 432 → 440

--- Batch index: 55 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  14%|█▍        | 56/400 [00:32<02:51,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 440 → 448

--- Batch index: 56 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  14%|█▍        | 57/400 [00:32<02:50,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 448 → 456

--- Batch index: 57 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  14%|█▍        | 58/400 [00:33<02:53,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 456 → 464

--- Batch index: 58 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  15%|█▍        | 59/400 [00:33<02:46,  2.05it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 464 → 472

--- Batch index: 59 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  15%|█▌        | 60/400 [00:34<02:47,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 472 → 480

--- Batch index: 60 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  15%|█▌        | 61/400 [00:34<02:47,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 480 → 488

--- Batch index: 61 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  16%|█▌        | 62/400 [00:35<02:47,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 488 → 496

--- Batch index: 62 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  16%|█▌        | 63/400 [00:35<02:48,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 496 → 504

--- Batch index: 63 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  16%|█▌        | 64/400 [00:36<02:49,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 504 → 512

--- Batch index: 64 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  16%|█▋        | 65/400 [00:36<02:50,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 512 → 520

--- Batch index: 65 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  16%|█▋        | 66/400 [00:37<02:52,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 520 → 528

--- Batch index: 66 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  17%|█▋        | 67/400 [00:37<02:56,  1.89it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 528 → 536

--- Batch index: 67 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  17%|█▋        | 68/400 [00:38<02:53,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 536 → 544

--- Batch index: 68 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  17%|█▋        | 69/400 [00:38<02:54,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 544 → 552

--- Batch index: 69 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  18%|█▊        | 70/400 [00:39<02:52,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 552 → 560

--- Batch index: 70 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  18%|█▊        | 71/400 [00:39<02:53,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 560 → 568

--- Batch index: 71 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  18%|█▊        | 72/400 [00:40<02:50,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 568 → 576

--- Batch index: 72 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  18%|█▊        | 73/400 [00:40<02:51,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 576 → 584

--- Batch index: 73 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  18%|█▊        | 74/400 [00:41<02:48,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 584 → 592

--- Batch index: 74 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  19%|█▉        | 75/400 [00:41<02:45,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 592 → 600

--- Batch index: 75 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  19%|█▉        | 76/400 [00:42<02:44,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 600 → 608

--- Batch index: 76 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  19%|█▉        | 77/400 [00:42<02:43,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 608 → 616

--- Batch index: 77 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  20%|█▉        | 78/400 [00:43<02:42,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 616 → 624

--- Batch index: 78 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  20%|█▉        | 79/400 [00:43<02:47,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 624 → 632

--- Batch index: 79 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  20%|██        | 80/400 [00:44<02:45,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 632 → 640

--- Batch index: 80 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  20%|██        | 81/400 [00:44<02:39,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 640 → 648

--- Batch index: 81 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  20%|██        | 82/400 [00:45<02:37,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 648 → 656

--- Batch index: 82 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  21%|██        | 83/400 [00:45<02:41,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 656 → 664

--- Batch index: 83 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  21%|██        | 84/400 [00:46<02:39,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 664 → 672

--- Batch index: 84 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  21%|██▏       | 85/400 [00:46<02:39,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 672 → 680

--- Batch index: 85 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  22%|██▏       | 86/400 [00:47<02:36,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 680 → 688

--- Batch index: 86 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  22%|██▏       | 87/400 [00:47<02:35,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 688 → 696

--- Batch index: 87 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  22%|██▏       | 88/400 [00:48<02:33,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 696 → 704

--- Batch index: 88 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  22%|██▏       | 89/400 [00:48<02:34,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 704 → 712

--- Batch index: 89 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  22%|██▎       | 90/400 [00:49<02:37,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 712 → 720

--- Batch index: 90 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  23%|██▎       | 91/400 [00:49<02:35,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 720 → 728

--- Batch index: 91 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  23%|██▎       | 92/400 [00:50<02:36,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 728 → 736

--- Batch index: 92 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  23%|██▎       | 93/400 [00:50<02:36,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 736 → 744

--- Batch index: 93 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  24%|██▎       | 94/400 [00:51<02:32,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 744 → 752

--- Batch index: 94 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  24%|██▍       | 95/400 [00:51<02:36,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 752 → 760

--- Batch index: 95 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  24%|██▍       | 96/400 [00:52<02:34,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 760 → 768

--- Batch index: 96 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  24%|██▍       | 97/400 [00:52<02:37,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 768 → 776

--- Batch index: 97 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  24%|██▍       | 98/400 [00:53<02:37,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 776 → 784

--- Batch index: 98 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  25%|██▍       | 99/400 [00:53<02:35,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 784 → 792

--- Batch index: 99 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  25%|██▌       | 100/400 [00:54<02:31,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 792 → 800

--- Batch index: 100 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  25%|██▌       | 101/400 [00:54<02:32,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 800 → 808

--- Batch index: 101 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  26%|██▌       | 102/400 [00:55<02:30,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 808 → 816

--- Batch index: 102 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  26%|██▌       | 103/400 [00:55<02:30,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 816 → 824

--- Batch index: 103 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  26%|██▌       | 104/400 [00:56<02:29,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 824 → 832

--- Batch index: 104 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  26%|██▋       | 105/400 [00:56<02:28,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 832 → 840

--- Batch index: 105 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  26%|██▋       | 106/400 [00:57<02:25,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 840 → 848

--- Batch index: 106 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  27%|██▋       | 107/400 [00:57<02:25,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 848 → 856

--- Batch index: 107 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  27%|██▋       | 108/400 [00:58<02:25,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 856 → 864

--- Batch index: 108 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  27%|██▋       | 109/400 [00:58<02:28,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 864 → 872

--- Batch index: 109 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  28%|██▊       | 110/400 [00:59<02:33,  1.89it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 872 → 880

--- Batch index: 110 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  28%|██▊       | 111/400 [01:00<02:32,  1.89it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 880 → 888

--- Batch index: 111 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  28%|██▊       | 112/400 [01:00<02:33,  1.88it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 888 → 896

--- Batch index: 112 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  28%|██▊       | 113/400 [01:01<02:27,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 896 → 904

--- Batch index: 113 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  28%|██▊       | 114/400 [01:01<02:27,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 904 → 912

--- Batch index: 114 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  29%|██▉       | 115/400 [01:02<02:29,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 912 → 920

--- Batch index: 115 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  29%|██▉       | 116/400 [01:02<02:24,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 920 → 928

--- Batch index: 116 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  29%|██▉       | 117/400 [01:03<02:28,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 928 → 936

--- Batch index: 117 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  30%|██▉       | 118/400 [01:03<02:26,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 936 → 944

--- Batch index: 118 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  30%|██▉       | 119/400 [01:04<02:22,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 944 → 952

--- Batch index: 119 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  30%|███       | 120/400 [01:04<02:20,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 952 → 960

--- Batch index: 120 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  30%|███       | 121/400 [01:05<02:19,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 960 → 968

--- Batch index: 121 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  30%|███       | 122/400 [01:05<02:17,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 968 → 976

--- Batch index: 122 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  31%|███       | 123/400 [01:06<02:18,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 976 → 984

--- Batch index: 123 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  31%|███       | 124/400 [01:06<02:18,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 984 → 992

--- Batch index: 124 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  31%|███▏      | 125/400 [01:07<02:17,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 992 → 1000

--- Batch index: 125 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  32%|███▏      | 126/400 [01:07<02:17,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1000 → 1008

--- Batch index: 126 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  32%|███▏      | 127/400 [01:08<02:19,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1008 → 1016

--- Batch index: 127 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  32%|███▏      | 128/400 [01:08<02:16,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1016 → 1024

--- Batch index: 128 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  32%|███▏      | 129/400 [01:09<02:18,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1024 → 1032

--- Batch index: 129 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  32%|███▎      | 130/400 [01:09<02:17,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1032 → 1040

--- Batch index: 130 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  33%|███▎      | 131/400 [01:10<02:15,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1040 → 1048

--- Batch index: 131 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  33%|███▎      | 132/400 [01:10<02:13,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1048 → 1056

--- Batch index: 132 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  33%|███▎      | 133/400 [01:11<02:12,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1056 → 1064

--- Batch index: 133 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  34%|███▎      | 134/400 [01:11<02:14,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1064 → 1072

--- Batch index: 134 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  34%|███▍      | 135/400 [01:12<02:17,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1072 → 1080

--- Batch index: 135 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  34%|███▍      | 136/400 [01:12<02:14,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1080 → 1088

--- Batch index: 136 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  34%|███▍      | 137/400 [01:13<02:16,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1088 → 1096

--- Batch index: 137 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  34%|███▍      | 138/400 [01:13<02:15,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1096 → 1104

--- Batch index: 138 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  35%|███▍      | 139/400 [01:14<02:09,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1104 → 1112

--- Batch index: 139 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  35%|███▌      | 140/400 [01:14<02:11,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1112 → 1120

--- Batch index: 140 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  35%|███▌      | 141/400 [01:15<02:13,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1120 → 1128

--- Batch index: 141 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  36%|███▌      | 142/400 [01:15<02:21,  1.83it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1128 → 1136

--- Batch index: 142 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  36%|███▌      | 143/400 [01:16<02:21,  1.82it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1136 → 1144

--- Batch index: 143 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  36%|███▌      | 144/400 [01:16<02:17,  1.87it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1144 → 1152

--- Batch index: 144 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  36%|███▋      | 145/400 [01:17<02:14,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1152 → 1160

--- Batch index: 145 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  36%|███▋      | 146/400 [01:18<02:13,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1160 → 1168

--- Batch index: 146 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  37%|███▋      | 147/400 [01:18<02:11,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1168 → 1176

--- Batch index: 147 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  37%|███▋      | 148/400 [01:19<02:11,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1176 → 1184

--- Batch index: 148 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  37%|███▋      | 149/400 [01:19<02:14,  1.86it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1184 → 1192

--- Batch index: 149 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  38%|███▊      | 150/400 [01:20<02:10,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1192 → 1200

--- Batch index: 150 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  38%|███▊      | 151/400 [01:20<02:09,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1200 → 1208

--- Batch index: 151 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  38%|███▊      | 152/400 [01:21<02:11,  1.89it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1208 → 1216

--- Batch index: 152 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  38%|███▊      | 153/400 [01:21<02:08,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1216 → 1224

--- Batch index: 153 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  38%|███▊      | 154/400 [01:22<02:09,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1224 → 1232

--- Batch index: 154 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  39%|███▉      | 155/400 [01:22<02:04,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1232 → 1240

--- Batch index: 155 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  39%|███▉      | 156/400 [01:23<02:05,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1240 → 1248

--- Batch index: 156 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  39%|███▉      | 157/400 [01:23<02:04,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1248 → 1256

--- Batch index: 157 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  40%|███▉      | 158/400 [01:24<02:02,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1256 → 1264

--- Batch index: 158 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  40%|███▉      | 159/400 [01:24<02:01,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1264 → 1272

--- Batch index: 159 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  40%|████      | 160/400 [01:25<02:03,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1272 → 1280

--- Batch index: 160 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  40%|████      | 161/400 [01:25<02:01,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1280 → 1288

--- Batch index: 161 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  40%|████      | 162/400 [01:26<01:58,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1288 → 1296

--- Batch index: 162 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  41%|████      | 163/400 [01:26<01:57,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1296 → 1304

--- Batch index: 163 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  41%|████      | 164/400 [01:27<01:54,  2.05it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1304 → 1312

--- Batch index: 164 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  41%|████▏     | 165/400 [01:27<01:55,  2.04it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1312 → 1320

--- Batch index: 165 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  42%|████▏     | 166/400 [01:28<01:54,  2.04it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1320 → 1328

--- Batch index: 166 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  42%|████▏     | 167/400 [01:28<01:54,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1328 → 1336

--- Batch index: 167 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  42%|████▏     | 168/400 [01:29<01:57,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1336 → 1344

--- Batch index: 168 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  42%|████▏     | 169/400 [01:29<01:57,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1344 → 1352

--- Batch index: 169 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  42%|████▎     | 170/400 [01:30<01:55,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1352 → 1360

--- Batch index: 170 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  43%|████▎     | 171/400 [01:30<01:57,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1360 → 1368

--- Batch index: 171 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  43%|████▎     | 172/400 [01:31<01:55,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1368 → 1376

--- Batch index: 172 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  43%|████▎     | 173/400 [01:31<01:53,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1376 → 1384

--- Batch index: 173 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  44%|████▎     | 174/400 [01:32<01:54,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1384 → 1392

--- Batch index: 174 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  44%|████▍     | 175/400 [01:32<01:52,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1392 → 1400

--- Batch index: 175 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  44%|████▍     | 176/400 [01:33<01:58,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1400 → 1408

--- Batch index: 176 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  44%|████▍     | 177/400 [01:33<01:54,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1408 → 1416

--- Batch index: 177 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  44%|████▍     | 178/400 [01:34<01:54,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1416 → 1424

--- Batch index: 178 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  45%|████▍     | 179/400 [01:34<01:53,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1424 → 1432

--- Batch index: 179 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  45%|████▌     | 180/400 [01:35<01:59,  1.83it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1432 → 1440

--- Batch index: 180 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  45%|████▌     | 181/400 [01:35<01:56,  1.88it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1440 → 1448

--- Batch index: 181 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  46%|████▌     | 182/400 [01:36<01:54,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1448 → 1456

--- Batch index: 182 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  46%|████▌     | 183/400 [01:36<01:53,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1456 → 1464

--- Batch index: 183 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  46%|████▌     | 184/400 [01:37<01:50,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1464 → 1472

--- Batch index: 184 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  46%|████▋     | 185/400 [01:37<01:48,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1472 → 1480

--- Batch index: 185 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  46%|████▋     | 186/400 [01:38<01:46,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1480 → 1488

--- Batch index: 186 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  47%|████▋     | 187/400 [01:38<01:46,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1488 → 1496

--- Batch index: 187 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  47%|████▋     | 188/400 [01:39<01:46,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1496 → 1504

--- Batch index: 188 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  47%|████▋     | 189/400 [01:39<01:45,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1504 → 1512

--- Batch index: 189 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  48%|████▊     | 190/400 [01:40<01:45,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1512 → 1520

--- Batch index: 190 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  48%|████▊     | 191/400 [01:40<01:44,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1520 → 1528

--- Batch index: 191 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  48%|████▊     | 192/400 [01:41<01:44,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1528 → 1536

--- Batch index: 192 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  48%|████▊     | 193/400 [01:41<01:42,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1536 → 1544

--- Batch index: 193 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  48%|████▊     | 194/400 [01:42<01:45,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1544 → 1552

--- Batch index: 194 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  49%|████▉     | 195/400 [01:43<01:45,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1552 → 1560

--- Batch index: 195 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  49%|████▉     | 196/400 [01:43<01:43,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1560 → 1568

--- Batch index: 196 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  49%|████▉     | 197/400 [01:43<01:42,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1568 → 1576

--- Batch index: 197 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  50%|████▉     | 198/400 [01:44<01:46,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1576 → 1584

--- Batch index: 198 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  50%|████▉     | 199/400 [01:45<01:43,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1584 → 1592

--- Batch index: 199 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  50%|█████     | 200/400 [01:45<01:42,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1592 → 1600

--- Batch index: 200 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  50%|█████     | 201/400 [01:46<01:41,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1600 → 1608

--- Batch index: 201 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  50%|█████     | 202/400 [01:46<01:40,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1608 → 1616

--- Batch index: 202 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  51%|█████     | 203/400 [01:47<01:39,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1616 → 1624

--- Batch index: 203 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  51%|█████     | 204/400 [01:47<01:41,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1624 → 1632

--- Batch index: 204 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  51%|█████▏    | 205/400 [01:48<01:39,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1632 → 1640

--- Batch index: 205 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  52%|█████▏    | 206/400 [01:48<01:37,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1640 → 1648

--- Batch index: 206 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  52%|█████▏    | 207/400 [01:49<01:39,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1648 → 1656

--- Batch index: 207 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  52%|█████▏    | 208/400 [01:49<01:39,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1656 → 1664

--- Batch index: 208 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  52%|█████▏    | 209/400 [01:50<01:35,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1664 → 1672

--- Batch index: 209 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  52%|█████▎    | 210/400 [01:50<01:36,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1672 → 1680

--- Batch index: 210 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  53%|█████▎    | 211/400 [01:51<01:36,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1680 → 1688

--- Batch index: 211 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  53%|█████▎    | 212/400 [01:51<01:37,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1688 → 1696

--- Batch index: 212 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  53%|█████▎    | 213/400 [01:52<01:36,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1696 → 1704

--- Batch index: 213 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  54%|█████▎    | 214/400 [01:52<01:37,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1704 → 1712

--- Batch index: 214 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  54%|█████▍    | 215/400 [01:53<01:35,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1712 → 1720

--- Batch index: 215 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  54%|█████▍    | 216/400 [01:53<01:36,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1720 → 1728

--- Batch index: 216 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  54%|█████▍    | 217/400 [01:54<01:33,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1728 → 1736

--- Batch index: 217 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  55%|█████▍    | 218/400 [01:54<01:33,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1736 → 1744

--- Batch index: 218 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  55%|█████▍    | 219/400 [01:55<01:34,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1744 → 1752

--- Batch index: 219 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  55%|█████▌    | 220/400 [01:55<01:32,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1752 → 1760

--- Batch index: 220 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  55%|█████▌    | 221/400 [01:56<01:33,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1760 → 1768

--- Batch index: 221 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  56%|█████▌    | 222/400 [01:56<01:32,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1768 → 1776

--- Batch index: 222 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  56%|█████▌    | 223/400 [01:57<01:31,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1776 → 1784

--- Batch index: 223 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  56%|█████▌    | 224/400 [01:57<01:32,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1784 → 1792

--- Batch index: 224 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  56%|█████▋    | 225/400 [01:58<01:32,  1.89it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1792 → 1800

--- Batch index: 225 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  56%|█████▋    | 226/400 [01:59<01:32,  1.88it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1800 → 1808

--- Batch index: 226 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  57%|█████▋    | 227/400 [01:59<01:30,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1808 → 1816

--- Batch index: 227 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  57%|█████▋    | 228/400 [02:00<01:28,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1816 → 1824

--- Batch index: 228 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  57%|█████▋    | 229/400 [02:00<01:27,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1824 → 1832

--- Batch index: 229 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  57%|█████▊    | 230/400 [02:01<01:28,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1832 → 1840

--- Batch index: 230 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  58%|█████▊    | 231/400 [02:01<01:29,  1.88it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1840 → 1848

--- Batch index: 231 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  58%|█████▊    | 232/400 [02:02<01:32,  1.81it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1848 → 1856

--- Batch index: 232 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  58%|█████▊    | 233/400 [02:02<01:30,  1.84it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1856 → 1864

--- Batch index: 233 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  58%|█████▊    | 234/400 [02:03<01:31,  1.82it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1864 → 1872

--- Batch index: 234 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  59%|█████▉    | 235/400 [02:03<01:28,  1.86it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1872 → 1880

--- Batch index: 235 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  59%|█████▉    | 236/400 [02:04<01:30,  1.82it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1880 → 1888

--- Batch index: 236 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  59%|█████▉    | 237/400 [02:04<01:28,  1.83it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1888 → 1896

--- Batch index: 237 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  60%|█████▉    | 238/400 [02:05<01:29,  1.82it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1896 → 1904

--- Batch index: 238 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  60%|█████▉    | 239/400 [02:05<01:25,  1.87it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1904 → 1912

--- Batch index: 239 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  60%|██████    | 240/400 [02:06<01:25,  1.87it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1912 → 1920

--- Batch index: 240 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  60%|██████    | 241/400 [02:07<01:26,  1.83it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1920 → 1928

--- Batch index: 241 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  60%|██████    | 242/400 [02:07<01:23,  1.88it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1928 → 1936

--- Batch index: 242 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  61%|██████    | 243/400 [02:08<01:22,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1936 → 1944

--- Batch index: 243 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  61%|██████    | 244/400 [02:08<01:22,  1.90it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1944 → 1952

--- Batch index: 244 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  61%|██████▏   | 245/400 [02:09<01:20,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1952 → 1960

--- Batch index: 245 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  62%|██████▏   | 246/400 [02:09<01:18,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1960 → 1968

--- Batch index: 246 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  62%|██████▏   | 247/400 [02:10<01:16,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1968 → 1976

--- Batch index: 247 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  62%|██████▏   | 248/400 [02:10<01:17,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1976 → 1984

--- Batch index: 248 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  62%|██████▏   | 249/400 [02:11<01:17,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1984 → 1992

--- Batch index: 249 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  62%|██████▎   | 250/400 [02:11<01:17,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 1992 → 2000

--- Batch index: 250 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  63%|██████▎   | 251/400 [02:12<01:15,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2000 → 2008

--- Batch index: 251 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  63%|██████▎   | 252/400 [02:12<01:14,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2008 → 2016

--- Batch index: 252 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  63%|██████▎   | 253/400 [02:13<01:13,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2016 → 2024

--- Batch index: 253 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  64%|██████▎   | 254/400 [02:13<01:16,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2024 → 2032

--- Batch index: 254 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  64%|██████▍   | 255/400 [02:14<01:15,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2032 → 2040

--- Batch index: 255 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  64%|██████▍   | 256/400 [02:14<01:14,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2040 → 2048

--- Batch index: 256 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  64%|██████▍   | 257/400 [02:15<01:13,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2048 → 2056

--- Batch index: 257 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  64%|██████▍   | 258/400 [02:15<01:12,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2056 → 2064

--- Batch index: 258 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  65%|██████▍   | 259/400 [02:16<01:11,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2064 → 2072

--- Batch index: 259 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  65%|██████▌   | 260/400 [02:16<01:11,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2072 → 2080

--- Batch index: 260 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  65%|██████▌   | 261/400 [02:17<01:09,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2080 → 2088

--- Batch index: 261 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  66%|██████▌   | 262/400 [02:17<01:09,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2088 → 2096

--- Batch index: 262 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  66%|██████▌   | 263/400 [02:18<01:10,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2096 → 2104

--- Batch index: 263 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  66%|██████▌   | 264/400 [02:18<01:10,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2104 → 2112

--- Batch index: 264 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  66%|██████▋   | 265/400 [02:19<01:08,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2112 → 2120

--- Batch index: 265 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  66%|██████▋   | 266/400 [02:19<01:09,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2120 → 2128

--- Batch index: 266 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  67%|██████▋   | 267/400 [02:20<01:08,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2128 → 2136

--- Batch index: 267 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  67%|██████▋   | 268/400 [02:20<01:06,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2136 → 2144

--- Batch index: 268 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  67%|██████▋   | 269/400 [02:21<01:06,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2144 → 2152

--- Batch index: 269 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  68%|██████▊   | 270/400 [02:21<01:06,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2152 → 2160

--- Batch index: 270 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  68%|██████▊   | 271/400 [02:22<01:05,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2160 → 2168

--- Batch index: 271 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  68%|██████▊   | 272/400 [02:22<01:05,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2168 → 2176

--- Batch index: 272 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  68%|██████▊   | 273/400 [02:23<01:03,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2176 → 2184

--- Batch index: 273 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  68%|██████▊   | 274/400 [02:23<01:02,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2184 → 2192

--- Batch index: 274 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  69%|██████▉   | 275/400 [02:24<01:04,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2192 → 2200

--- Batch index: 275 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  69%|██████▉   | 276/400 [02:24<01:02,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2200 → 2208

--- Batch index: 276 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  69%|██████▉   | 277/400 [02:25<01:01,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2208 → 2216

--- Batch index: 277 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  70%|██████▉   | 278/400 [02:25<01:01,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2216 → 2224

--- Batch index: 278 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  70%|██████▉   | 279/400 [02:26<01:01,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2224 → 2232

--- Batch index: 279 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  70%|███████   | 280/400 [02:26<01:00,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2232 → 2240

--- Batch index: 280 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  70%|███████   | 281/400 [02:27<00:59,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2240 → 2248

--- Batch index: 281 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  70%|███████   | 282/400 [02:27<00:58,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2248 → 2256

--- Batch index: 282 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  71%|███████   | 283/400 [02:28<00:57,  2.05it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2256 → 2264

--- Batch index: 283 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  71%|███████   | 284/400 [02:28<00:57,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2264 → 2272

--- Batch index: 284 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  71%|███████▏  | 285/400 [02:29<00:57,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2272 → 2280

--- Batch index: 285 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  72%|███████▏  | 286/400 [02:29<00:55,  2.06it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2280 → 2288

--- Batch index: 286 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  72%|███████▏  | 287/400 [02:30<00:55,  2.05it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2288 → 2296

--- Batch index: 287 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  72%|███████▏  | 288/400 [02:30<00:56,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2296 → 2304

--- Batch index: 288 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  72%|███████▏  | 289/400 [02:31<00:55,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2304 → 2312

--- Batch index: 289 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  72%|███████▎  | 290/400 [02:31<00:55,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2312 → 2320

--- Batch index: 290 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  73%|███████▎  | 291/400 [02:32<00:56,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2320 → 2328

--- Batch index: 291 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  73%|███████▎  | 292/400 [02:32<00:54,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2328 → 2336

--- Batch index: 292 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  73%|███████▎  | 293/400 [02:33<00:52,  2.04it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2336 → 2344

--- Batch index: 293 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  74%|███████▎  | 294/400 [02:33<00:51,  2.05it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2344 → 2352

--- Batch index: 294 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  74%|███████▍  | 295/400 [02:34<00:52,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2352 → 2360

--- Batch index: 295 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  74%|███████▍  | 296/400 [02:34<00:52,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2360 → 2368

--- Batch index: 296 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  74%|███████▍  | 297/400 [02:35<00:52,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2368 → 2376

--- Batch index: 297 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  74%|███████▍  | 298/400 [02:35<00:50,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2376 → 2384

--- Batch index: 298 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  75%|███████▍  | 299/400 [02:36<00:50,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2384 → 2392

--- Batch index: 299 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  75%|███████▌  | 300/400 [02:36<00:51,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2392 → 2400

--- Batch index: 300 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  75%|███████▌  | 301/400 [02:37<00:49,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2400 → 2408

--- Batch index: 301 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  76%|███████▌  | 302/400 [02:37<00:49,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2408 → 2416

--- Batch index: 302 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  76%|███████▌  | 303/400 [02:38<00:48,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2416 → 2424

--- Batch index: 303 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  76%|███████▌  | 304/400 [02:38<00:47,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2424 → 2432

--- Batch index: 304 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  76%|███████▋  | 305/400 [02:39<00:47,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2432 → 2440

--- Batch index: 305 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  76%|███████▋  | 306/400 [02:39<00:46,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2440 → 2448

--- Batch index: 306 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  77%|███████▋  | 307/400 [02:40<00:45,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2448 → 2456

--- Batch index: 307 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  77%|███████▋  | 308/400 [02:40<00:46,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2456 → 2464

--- Batch index: 308 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  77%|███████▋  | 309/400 [02:41<00:45,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2464 → 2472

--- Batch index: 309 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  78%|███████▊  | 310/400 [02:41<00:44,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2472 → 2480

--- Batch index: 310 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  78%|███████▊  | 311/400 [02:42<00:44,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2480 → 2488

--- Batch index: 311 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  78%|███████▊  | 312/400 [02:42<00:45,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2488 → 2496

--- Batch index: 312 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  78%|███████▊  | 313/400 [02:43<00:43,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2496 → 2504

--- Batch index: 313 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  78%|███████▊  | 314/400 [02:43<00:42,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2504 → 2512

--- Batch index: 314 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  79%|███████▉  | 315/400 [02:44<00:42,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2512 → 2520

--- Batch index: 315 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  79%|███████▉  | 316/400 [02:44<00:42,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2520 → 2528

--- Batch index: 316 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  79%|███████▉  | 317/400 [02:45<00:41,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2528 → 2536

--- Batch index: 317 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  80%|███████▉  | 318/400 [02:45<00:39,  2.06it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2536 → 2544

--- Batch index: 318 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  80%|███████▉  | 319/400 [02:46<00:40,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2544 → 2552

--- Batch index: 319 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  80%|████████  | 320/400 [02:46<00:39,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2552 → 2560

--- Batch index: 320 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  80%|████████  | 321/400 [02:47<00:38,  2.04it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2560 → 2568

--- Batch index: 321 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  80%|████████  | 322/400 [02:47<00:38,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2568 → 2576

--- Batch index: 322 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  81%|████████  | 323/400 [02:48<00:39,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2576 → 2584

--- Batch index: 323 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  81%|████████  | 324/400 [02:48<00:39,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2584 → 2592

--- Batch index: 324 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  81%|████████▏ | 325/400 [02:49<00:37,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2592 → 2600

--- Batch index: 325 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  82%|████████▏ | 326/400 [02:49<00:36,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2600 → 2608

--- Batch index: 326 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  82%|████████▏ | 327/400 [02:50<00:36,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2608 → 2616

--- Batch index: 327 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  82%|████████▏ | 328/400 [02:50<00:35,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2616 → 2624

--- Batch index: 328 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  82%|████████▏ | 329/400 [02:51<00:35,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2624 → 2632

--- Batch index: 329 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  82%|████████▎ | 330/400 [02:51<00:34,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2632 → 2640

--- Batch index: 330 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  83%|████████▎ | 331/400 [02:52<00:34,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2640 → 2648

--- Batch index: 331 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  83%|████████▎ | 332/400 [02:52<00:33,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2648 → 2656

--- Batch index: 332 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  83%|████████▎ | 333/400 [02:53<00:33,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2656 → 2664

--- Batch index: 333 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  84%|████████▎ | 334/400 [02:53<00:32,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2664 → 2672

--- Batch index: 334 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  84%|████████▍ | 335/400 [02:54<00:32,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2672 → 2680

--- Batch index: 335 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  84%|████████▍ | 336/400 [02:54<00:32,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2680 → 2688

--- Batch index: 336 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  84%|████████▍ | 337/400 [02:55<00:32,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2688 → 2696

--- Batch index: 337 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  84%|████████▍ | 338/400 [02:55<00:30,  2.01it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2696 → 2704

--- Batch index: 338 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  85%|████████▍ | 339/400 [02:56<00:30,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2704 → 2712

--- Batch index: 339 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  85%|████████▌ | 340/400 [02:56<00:30,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2712 → 2720

--- Batch index: 340 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  85%|████████▌ | 341/400 [02:57<00:30,  1.91it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2720 → 2728

--- Batch index: 341 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  86%|████████▌ | 342/400 [02:57<00:30,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2728 → 2736

--- Batch index: 342 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  86%|████████▌ | 343/400 [02:58<00:29,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2736 → 2744

--- Batch index: 343 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  86%|████████▌ | 344/400 [02:58<00:28,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2744 → 2752

--- Batch index: 344 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  86%|████████▋ | 345/400 [02:59<00:28,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2752 → 2760

--- Batch index: 345 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  86%|████████▋ | 346/400 [02:59<00:27,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2760 → 2768

--- Batch index: 346 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  87%|████████▋ | 347/400 [03:00<00:26,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2768 → 2776

--- Batch index: 347 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  87%|████████▋ | 348/400 [03:01<00:26,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2776 → 2784

--- Batch index: 348 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  87%|████████▋ | 349/400 [03:01<00:25,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2784 → 2792

--- Batch index: 349 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  88%|████████▊ | 350/400 [03:02<00:25,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2792 → 2800

--- Batch index: 350 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  88%|████████▊ | 351/400 [03:02<00:25,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2800 → 2808

--- Batch index: 351 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  88%|████████▊ | 352/400 [03:03<00:24,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2808 → 2816

--- Batch index: 352 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  88%|████████▊ | 353/400 [03:03<00:23,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2816 → 2824

--- Batch index: 353 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  88%|████████▊ | 354/400 [03:03<00:23,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2824 → 2832

--- Batch index: 354 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  89%|████████▉ | 355/400 [03:04<00:22,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2832 → 2840

--- Batch index: 355 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  89%|████████▉ | 356/400 [03:05<00:22,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2840 → 2848

--- Batch index: 356 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  89%|████████▉ | 357/400 [03:05<00:21,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2848 → 2856

--- Batch index: 357 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  90%|████████▉ | 358/400 [03:06<00:21,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2856 → 2864

--- Batch index: 358 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  90%|████████▉ | 359/400 [03:06<00:20,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2864 → 2872

--- Batch index: 359 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  90%|█████████ | 360/400 [03:06<00:19,  2.04it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2872 → 2880

--- Batch index: 360 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  90%|█████████ | 361/400 [03:07<00:19,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2880 → 2888

--- Batch index: 361 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  90%|█████████ | 362/400 [03:07<00:18,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2888 → 2896

--- Batch index: 362 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  91%|█████████ | 363/400 [03:08<00:18,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2896 → 2904

--- Batch index: 363 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  91%|█████████ | 364/400 [03:09<00:18,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2904 → 2912

--- Batch index: 364 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  91%|█████████▏| 365/400 [03:09<00:17,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2912 → 2920

--- Batch index: 365 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  92%|█████████▏| 366/400 [03:09<00:16,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2920 → 2928

--- Batch index: 366 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  92%|█████████▏| 367/400 [03:10<00:16,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2928 → 2936

--- Batch index: 367 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  92%|█████████▏| 368/400 [03:10<00:15,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2936 → 2944

--- Batch index: 368 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  92%|█████████▏| 369/400 [03:11<00:15,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2944 → 2952

--- Batch index: 369 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  92%|█████████▎| 370/400 [03:12<00:15,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2952 → 2960

--- Batch index: 370 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  93%|█████████▎| 371/400 [03:12<00:14,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2960 → 2968

--- Batch index: 371 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  93%|█████████▎| 372/400 [03:13<00:14,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2968 → 2976

--- Batch index: 372 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  93%|█████████▎| 373/400 [03:13<00:13,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2976 → 2984

--- Batch index: 373 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  94%|█████████▎| 374/400 [03:14<00:12,  2.02it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2984 → 2992

--- Batch index: 374 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  94%|█████████▍| 375/400 [03:14<00:12,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 2992 → 3000

--- Batch index: 375 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  94%|█████████▍| 376/400 [03:15<00:12,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3000 → 3008

--- Batch index: 376 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  94%|█████████▍| 377/400 [03:15<00:11,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3008 → 3016

--- Batch index: 377 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  94%|█████████▍| 378/400 [03:16<00:11,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3016 → 3024

--- Batch index: 378 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  95%|█████████▍| 379/400 [03:16<00:10,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3024 → 3032

--- Batch index: 379 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  95%|█████████▌| 380/400 [03:17<00:10,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3032 → 3040

--- Batch index: 380 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  95%|█████████▌| 381/400 [03:17<00:09,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3040 → 3048

--- Batch index: 381 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  96%|█████████▌| 382/400 [03:18<00:09,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3048 → 3056

--- Batch index: 382 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  96%|█████████▌| 383/400 [03:18<00:08,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3056 → 3064

--- Batch index: 383 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  96%|█████████▌| 384/400 [03:19<00:07,  2.03it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3064 → 3072

--- Batch index: 384 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  96%|█████████▋| 385/400 [03:19<00:07,  1.97it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3072 → 3080

--- Batch index: 385 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  96%|█████████▋| 386/400 [03:20<00:06,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3080 → 3088

--- Batch index: 386 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  97%|█████████▋| 387/400 [03:20<00:06,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3088 → 3096

--- Batch index: 387 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  97%|█████████▋| 388/400 [03:21<00:06,  1.93it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3096 → 3104

--- Batch index: 388 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  97%|█████████▋| 389/400 [03:21<00:05,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3104 → 3112

--- Batch index: 389 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  98%|█████████▊| 390/400 [03:22<00:05,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3112 → 3120

--- Batch index: 390 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  98%|█████████▊| 391/400 [03:22<00:04,  1.92it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3120 → 3128

--- Batch index: 391 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  98%|█████████▊| 392/400 [03:23<00:04,  1.95it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3128 → 3136

--- Batch index: 392 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  98%|█████████▊| 393/400 [03:23<00:03,  1.99it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3136 → 3144

--- Batch index: 393 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  98%|█████████▊| 394/400 [03:24<00:03,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3144 → 3152

--- Batch index: 394 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  99%|█████████▉| 395/400 [03:24<00:02,  1.98it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3152 → 3160

--- Batch index: 395 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  99%|█████████▉| 396/400 [03:25<00:01,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3160 → 3168

--- Batch index: 396 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches:  99%|█████████▉| 397/400 [03:25<00:01,  1.94it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3168 → 3176

--- Batch index: 397 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches: 100%|█████████▉| 398/400 [03:26<00:01,  1.96it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3176 → 3184

--- Batch index: 398 ---
Audio batch shape: (8, 48000)
Computing embeddings...


Processing batches: 100%|█████████▉| 399/400 [03:26<00:00,  2.00it/s]

Embeddings shape: (8, 28, 96)
Saving batch rows: 3184 → 3192

--- Batch index: 399 ---
Audio batch shape: (3, 48000)
Computing embeddings...


Processing batches: 100%|██████████| 400/400 [03:26<00:00,  1.93it/s]

Embeddings shape: (3, 28, 96)
Saving batch rows: 3192 → 3195

===== FEATURE EXTRACTION FINISHED =====
Total rows written: 3195





Trip empty rows from the mmapped array

In [84]:
print("\n===== Trimming mmap file =====")
openwakeword.data.trim_mmap(output_file)
print("===== FINISHED =====\n")


===== Trimming mmap file =====


Trimming empty rows: 4it [00:00, 10.12it/s]                       


===== FINISHED =====



Alright! At this point the positive and negative features have been pre-computed and saved to disk, and now a model can be trained that takes these features and predicts whether the wakeword/phrase in present.