# 🎤 09 — Keyword Spotting (Voice Command Detection)

**Detect spoken keywords from audio MFCC features — a core TinyML use case**

| Property | Value |
|----------|-------|
| **Task** | Wake-word / command detection |
| **Dataset** | Google Speech Commands v0.02 (~2.3GB) |
| **Keywords** | yes, no, up, down, left, right, on, off, stop, go |
| **Input** | 13 MFCC × 32 frames = 416 features |
| **Architecture** | Dense(416→128) → ReLU → Dense(128→64) → ReLU → Dense(64→10) |
| **MCU Memory** | ~62KB Flash + 16KB Arena |

### Audio Processing Pipeline
```
WAV (16kHz) → Pad/Truncate to 1s → MFCC (13 coefficients × 32 frames)
            → Normalize → Flatten [416] → MLP → [10 keywords]
```

> **Prerequisites**: `pip install nano-rust-py[train,audio]` (includes torchaudio, soundfile)


## Setup

In [None]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import soundfile as sf
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
from nano_rust_py.utils import quantize_to_i8, quantize_weights, calibrate_model
import nano_rust_py

# Target vocabulary. A keyword's position in this list is its class index.
KEYWORDS = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
# Size of the MFCC grid fed to the model: coefficients x time frames.
N_MFCC, N_FRAMES = 13, 32
N_FEATURES = N_MFCC * N_FRAMES  # 416

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
print(f'Features: {N_MFCC} MFCC × {N_FRAMES} frames = {N_FEATURES}')


## Step 1: Download & Load Speech Commands

Downloads Google Speech Commands v0.02 (~2.3GB on first run).
Audio is processed into MFCC features (13 coefficients × 32 time frames).


In [None]:
import tarfile
import urllib.request

DATASET_URL = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
# Prefer an existing copy in the first location; otherwise use (and, if
# needed, create) the local fallback directory.
DATASET_DIR = Path('./data/SpeechCommands/speech_commands_v0.02')
if not DATASET_DIR.exists():
    DATASET_DIR = Path('./data/speech_commands_v002')

if not DATASET_DIR.exists():
    archive = Path('./data/speech_commands_v0.02.tar.gz')
    archive.parent.mkdir(parents=True, exist_ok=True)
    if not archive.exists():
        print('Downloading Speech Commands dataset (~2.3GB)...')
        # Download under a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete archive.
        partial = archive.with_name(archive.name + '.part')
        urllib.request.urlretrieve(DATASET_URL, str(partial))
        partial.replace(archive)
    print('Extracting...')
    DATASET_DIR.mkdir(parents=True, exist_ok=True)
    with tarfile.open(str(archive), 'r:gz') as tar:
        # The archive comes from the network: where the interpreter supports
        # extraction filters, refuse members that would escape DATASET_DIR
        # (absolute paths, '..' components, links pointing outside).
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(str(DATASET_DIR), filter='data')
        else:
            tar.extractall(str(DATASET_DIR))

# Official split lists: one '<keyword>/<file>.wav' entry per line.
# splitlines() copes with both LF and CRLF line endings.
val_set = set((DATASET_DIR / 'validation_list.txt').read_text().strip().splitlines())
test_set = set((DATASET_DIR / 'testing_list.txt').read_text().strip().splitlines())
print(f'Dataset ready at: {DATASET_DIR}')


In [None]:
class KeywordDataset(Dataset):
    """Speech Commands clips for the words in ``KEYWORDS``, served as MFCC vectors.

    Each item is a ``(features, label)`` pair: ``features`` is a flat tensor of
    standardized MFCC values and ``label`` is the keyword's index in
    ``KEYWORDS``.
    """

    def __init__(self, subset='training'):
        """Index the WAV files that belong to ``subset``.

        Args:
            subset: ``'training'``, ``'validation'`` or ``'testing'``.
                Validation/testing membership comes from ``val_set`` and
                ``test_set``; training is every clip in neither list. Any
                other value applies no filter and keeps every clip.
        """
        self.mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=16000, n_mfcc=N_MFCC,
            melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 23}
        )
        self.files, self.labels = [], []
        for label, kw in enumerate(KEYWORDS):
            kw_dir = DATASET_DIR / kw
            if not kw_dir.exists():
                continue
            # Sort the listing: directory iteration order is filesystem
            # dependent, and later cells pick samples by index, so an
            # unsorted listing makes their results vary between machines.
            for wav in sorted(kw_dir.glob('*.wav')):
                rel = f'{kw}/{wav.name}'
                if subset == 'testing' and rel not in test_set:
                    continue
                elif subset == 'validation' and rel not in val_set:
                    continue
                elif subset == 'training' and (rel in test_set or rel in val_set):
                    continue
                self.files.append(str(wav))
                self.labels.append(label)
        print(f'  {subset}: {len(self.files)} samples')

    def __len__(self):
        """Return the number of indexed clips."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(features, label)``.

        The waveform is zero-padded or truncated to 16000 samples, converted
        to MFCCs, resized to ``N_FRAMES`` frames, flattened, and standardized
        with the clip's own mean and standard deviation.
        """
        # The file's sample rate is discarded here.
        # NOTE(review): assumes every clip is mono 16 kHz - confirm.
        audio_np, _ = sf.read(self.files[idx], dtype='float32')
        waveform = torch.from_numpy(audio_np).unsqueeze(0)
        if waveform.shape[1] < 16000:
            waveform = torch.nn.functional.pad(waveform, (0, 16000 - waveform.shape[1]))
        else:
            waveform = waveform[:, :16000]
        mfcc = self.mfcc_transform(waveform).squeeze(0)
        # Resample along the time axis whenever the transform produced a
        # frame count other than N_FRAMES.
        if mfcc.shape[1] != N_FRAMES:
            mfcc = torch.nn.functional.interpolate(
                mfcc.unsqueeze(0), size=N_FRAMES, mode='linear', align_corners=False
            ).squeeze(0)
        features = mfcc.flatten()
        # Per-clip standardization; the epsilon avoids division by zero when
        # all feature values are equal.
        features = (features - features.mean()) / (features.std() + 1e-8)
        return features, self.labels[idx]

train_ds = KeywordDataset('training')
test_ds = KeywordDataset('testing')
# Pinned host memory only helps host-to-GPU copies; requesting it on a
# CPU-only machine does nothing useful and makes PyTorch emit a warning.
use_pinned = device.type == 'cuda'
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True, pin_memory=use_pinned, num_workers=0)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False, pin_memory=use_pinned, num_workers=0)


## Step 2: Train MLP (10 epochs)

In [None]:
# Three-layer MLP: N_FEATURES -> 128 -> 64 -> one logit per keyword.
model = nn.Sequential(
    nn.Linear(N_FEATURES, 128), nn.ReLU(),
    nn.Linear(128, 64),         nn.ReLU(),
    nn.Linear(64, len(KEYWORDS)),
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
EPOCHS = 10

t0 = time.time()
for epoch in range(EPOCHS):
    model.train()
    # Running training accuracy for this epoch, measured on the batches as
    # they are seen (i.e. while the weights are still being updated).
    correct, total = 0, 0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        out = model(features)
        criterion(out, labels).backward()
        optimizer.step()
        correct += out.argmax(1).eq(labels).sum().item()
        total += labels.size(0)
    # Report every second epoch.
    if (epoch + 1) % 2 == 0:
        print(f'  Epoch {epoch+1}/{EPOCHS} — Acc: {100.*correct/total:.1f}%')
train_time = time.time() - t0


## Step 3: Evaluate & Quantize

In [None]:
# Float (PyTorch) reference accuracy over the whole test loader.
model.eval()
correct_pt = 0
total_pt = 0
with torch.no_grad():
    for features, labels in test_loader:
        features = features.to(device)
        labels = labels.to(device)
        predicted = model(features).argmax(1)
        correct_pt += predicted.eq(labels).sum().item()
        total_pt += labels.size(0)
pt_acc = 100. * correct_pt / total_pt
print(f'PyTorch Test Accuracy: {pt_acc:.2f}%')


In [None]:
model_cpu = model.cpu().eval()
q_weights = quantize_weights(model_cpu)

# Global input scale from test features.
# The dataset is ordered keyword by keyword, so taking the first 100 items
# would calibrate on a single keyword. Evenly spaced indices draw the
# calibration samples from across the whole test set instead.
n_cal = min(100, len(test_ds))
cal_indices = np.linspace(0, len(test_ds) - 1, num=n_cal).astype(int).tolist()
all_feats = np.vstack([test_ds[i][0].numpy() for i in cal_indices])
global_max = float(np.max(np.abs(all_feats)))
# Map the largest observed magnitude onto the int8 limit of 127.
global_scale = global_max / 127.0
print(f'Global scale: {global_scale:.6f}')

def quantize_global(data):
    """Quantize ``data`` to int8 with the shared ``global_scale``.

    Values are divided by the scale, rounded, and clipped to [-128, 127], so
    inputs larger than the calibration maximum saturate.
    """
    return np.clip(np.round(data / global_scale), -128, 127).astype(np.int8)

# NOTE(review): calibration uses a single test sample as input; whether
# calibrate_model accepts a larger batch is not visible here - confirm.
cal_input = test_ds[0][0].unsqueeze(0)
requant = calibrate_model(model_cpu, cal_input, q_weights, global_scale)


## Step 4: NANO-RUST Test (500 samples)

In [None]:
def build_nano():
    """Assemble a fresh NANO-RUST sequential model mirroring the PyTorch MLP.

    Adds one requantized dense layer per entry '0', '2' and '4' of
    ``q_weights`` / ``requant``, with a ReLU after every dense layer except
    the last.

    Returns:
        The populated ``nano_rust_py.PySequentialModel``.
    """
    nano = nano_rust_py.PySequentialModel(input_shape=[N_FEATURES], arena_size=16384)
    dense_keys = ('0', '2', '4')
    last_key = dense_keys[-1]
    for key in dense_keys:
        # Each requant entry unpacks into three values that are forwarded
        # unchanged. NOTE(review): presumably multiplier, shift and
        # corrected bias - confirm against calibrate_model.
        m, s, bc = requant[key]
        flat_weights = q_weights[key]['weights'].flatten().tolist()
        nano.add_dense_with_requant(flat_weights, bc, m, s)
        if key != last_key:
            nano.add_relu()
    return nano

N_TEST = min(500, len(test_ds))
# The dataset is ordered keyword by keyword, so the first N_TEST items would
# cover only the first one or two keywords. Evenly spaced indices spread the
# evaluation over the whole test set, which keeps the accuracy comparable to
# the full-test PyTorch figure.
sample_indices = np.linspace(0, len(test_ds) - 1, num=N_TEST).astype(int).tolist()
correct_nano, match_count, max_diffs = 0, 0, []
t0 = time.time()
for i, idx in enumerate(sample_indices):
    feat, label = test_ds[idx]
    q_feat = quantize_global(feat.numpy())
    # A new model is built for every sample, as in the original flow.
    # NOTE(review): reuse across forward() calls may be possible - confirm.
    nano_out = build_nano().forward(q_feat.tolist())
    nano_cls = int(np.argmax(nano_out))
    with torch.no_grad():
        pt_out = model_cpu(feat.unsqueeze(0)).numpy().flatten()
    pt_cls = int(np.argmax(pt_out))
    # Compare the two outputs in the int8 domain; widen to int32 first so the
    # subtraction cannot wrap around.
    q_pt, _ = quantize_to_i8(pt_out)
    diff = np.abs(q_pt.astype(np.int32) - np.array(nano_out, dtype=np.int8).astype(np.int32))
    max_diffs.append(int(np.max(diff)))
    if nano_cls == label: correct_nano += 1
    if nano_cls == pt_cls: match_count += 1
    if (i+1) % 100 == 0: print(f'  {i+1}/{N_TEST}...')
# Covers the whole loop: feature extraction, model build, and both forwards.
infer_time = time.time() - t0


## 📊 Results

In [None]:
# Summarize accuracy, float-vs-int8 agreement, and the memory footprint.
nano_acc = 100. * correct_nano / N_TEST
agreement = 100. * match_count / N_TEST
# Sums only the 'weights' array of each quantized layer.
total_w = sum(q['weights'].nbytes for q in q_weights.values())
print('=' * 60)
print('       KEYWORD SPOTTING RESULTS')
print('=' * 60)
print(f'Keywords: {", ".join(KEYWORDS)}')
print(f'PyTorch Accuracy:     {pt_acc:.2f}%')
print(f'NANO-RUST Accuracy:   {nano_acc:.2f}% (n={N_TEST})')
print(f'Classification Match: {agreement:.1f}%')
print(f'Max Diff (median):    {int(np.median(max_diffs))}')
print(f'Memory: {total_w:,} bytes ({total_w/1024:.1f}KB) + 16KB arena')
# Weights plus the 16 KB arena must fit within the 520 KB budget.
print(f'Fits ESP32 (520KB)? {"YES" if total_w + 16384 < 520*1024 else "NO"}')
print('=' * 60)
# Pass when the int8 model picks the same class as PyTorch on more than 85%
# of the evaluated samples.
print(f'{"✅ PASS" if agreement > 85 else "❌ FAIL"}: {agreement:.1f}% agreement')
