

# Speech Commands Classification - Lab Evaluation



### Tasks:
1. Summarize the paper in about 50 words.
2. Download, analyze, and statistically describe the dataset.
3. Train a classifier to distinguish commands.
4. Report performance results using standard benchmarks.
5. Record 30 samples of each command in your voice and create a new dataset.
6. Fine-tune the classifier on your voice.
7. Report the results.

## 1. Paper Summary

The paper introduces Google's Speech Commands dataset for training and evaluating keyword-spotting systems. It targets simple speech recognition tasks: detecting spoken words from a limited vocabulary. The paper covers data collection, the challenges of building small on-device models, and baseline results, highlighting the dataset's utility for improving voice-interface technology.

## 2. Analyze Dataset

In [1]:
# Cell 2: Download and Analyze the Dataset
import os
import torchaudio
from collections import Counter
from torch.utils.data import Subset

In [2]:
# Create the data directory. exist_ok=True makes this a no-op when the
# directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
data_dir = './data'
os.makedirs(data_dir, exist_ok=True)

In [3]:
# Download the Speech Commands dataset into data_dir (download=True) and wrap
# it as a torchaudio dataset object. Later cells index each item as
# item[0] (waveform tensor) and item[2] (command label string).
dataset = torchaudio.datasets.SPEECHCOMMANDS(root=data_dir, download=True)

100%|██████████| 2.26G/2.26G [00:19<00:00, 124MB/s]


In [4]:
# The ten target keywords the classifier will learn to tell apart. The order
# matters: a command's position in this list is used as its class index.
selected_commands = [
    'yes', 'no',
    'up', 'down',
    'left', 'right',
    'go', 'stop',
    'on', 'off',
]

# Cap on how many clips of each keyword are pulled into the working subset.
samples_per_command = 100

In [5]:
# Build a balanced subset: the first `samples_per_command` clips of every
# selected command, taken in dataset order.
# NOTE: every visited item is fetched in full (its waveform comes along with
# the label), so the scan stops as soon as all quotas are met.
wanted_commands = set(selected_commands)  # O(1) membership test per sample
subset_indices = []
command_counter = Counter()

# How many commands still need clips. Tracking this as a counter avoids
# re-checking every command's count after every single sample.
commands_pending = len(wanted_commands) if samples_per_command > 0 else 0

for idx, sample in enumerate(dataset):
    label = sample[2]
    if label in wanted_commands and command_counter[label] < samples_per_command:
        subset_indices.append(idx)
        command_counter[label] += 1
        if command_counter[label] == samples_per_command:
            commands_pending -= 1

    # Stop when we have enough samples for each command
    if commands_pending == 0:
        break

# Create a subset of the dataset
subset_dataset = Subset(dataset, subset_indices)

# Check the sample count for each command in the subset
print(f"Sample counts in subset: {command_counter}")
print(f"Total subset size: {len(subset_dataset)}")

Sample counts in subset: Counter({'down': 100, 'go': 100, 'left': 100, 'no': 100, 'off': 100, 'on': 100, 'right': 100, 'stop': 100, 'up': 100, 'yes': 100})
Total subset size: 1000


## 3. Data Preprocessing

In [16]:
# Cell 3: Data Preprocessing (Padding and Truncating)
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [17]:
# Every clip is brought to exactly this many samples
# (1 second of audio at 16 kHz = 16000 samples).
fixed_length = 16000


def collate_fn(batch):
    """Collate dataset items into a fixed-length waveform batch.

    Each item is indexed as item[0] (waveform tensor whose dimension 1 is
    the sample axis) and item[2] (label). Waveforms longer than
    `fixed_length` are cut at the end; shorter ones are zero-padded on the
    right.

    Returns:
        A tuple (waveforms, labels): the stacked waveform tensor and the
        list of labels in batch order.
    """
    clips = []
    names = []

    for entry in batch:
        # Slicing is a no-op for clips that are already short enough.
        clip = entry[0][:, :fixed_length]
        shortfall = fixed_length - clip.shape[1]
        if shortfall > 0:
            clip = torch.nn.functional.pad(clip, (0, shortfall))

        clips.append(clip)
        names.append(entry[2])

    return torch.stack(clips), names

In [18]:
# Batch the subset for training: 32 clips per batch, shuffling enabled, with
# collate_fn bringing every clip to a common length before stacking.
loader = DataLoader(
    subset_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

## 4. CNN Classifier

In [22]:
# Cell 4: Define and Train a CNN Classifier (with correct fully connected layer input size)
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as transforms

# Waveform -> mel-spectrogram transform for 16 kHz audio: 128 mel bands,
# a 400-sample FFT (25 ms) and a 160-sample hop (10 ms).
mel_spectrogram = transforms.MelSpectrogram(
    sample_rate=16000,
    n_mels=128,
    n_fft=400,
    hop_length=160,
)

# Define a simple CNN model for speech command classification
class SimpleCNN(nn.Module):
    """Two-convolution classifier over single-channel mel spectrograms.

    Input is expected as (batch, 1, n_mels, n_frames). Both convolutions
    keep the spatial size (3x3 kernel, padding 1) and each is followed by a
    2x2 max-pool, so the feature map entering the dense layers is
    32 x (n_mels // 4) x (n_frames // 4).

    Args:
        num_classes: Number of output logits.
        n_mels: Height of the input spectrogram. Defaults to 128.
        n_frames: Width (time frames) of the input spectrogram. Defaults to
            101, which gives the same 32 * 32 * 25 = 25600 flattened
            features the layer was originally hard-coded for.
    """

    def __init__(self, num_classes, n_mels=128, n_frames=101):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # Two floor-halvings of each spatial axis: (n // 2) // 2 == n // 4.
        flat_features = 32 * (n_mels // 4) * (n_frames // 4)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for input `x`."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.flatten(start_dim=1)  # Keep the batch axis, flatten the rest
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Get the number of classes (commands)
num_classes = len(selected_commands)

# Map each command string to its class index (its position in selected_commands)
label_to_index = {label: idx for idx, label in enumerate(selected_commands)}


def label_to_tensor(label):
    """Return the class index of `label` as a 0-dim tensor.

    Raises:
        KeyError: If `label` is not one of the selected commands.
    """
    return torch.tensor(label_to_index[label])


# Train on the GPU when one is available; otherwise fall back to the CPU
# instead of crashing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the model
model = SimpleCNN(num_classes=num_classes).to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for waveforms, labels in loader:
        # Convert waveforms to spectrograms. The transform keeps the
        # size-1 channel axis of each clip, so its output already has the
        # (batch, 1, n_mels, frames) layout the model consumes.
        spectrograms = mel_spectrogram(waveforms).to(device)
        # Build the target batch straight from integer class indices.
        targets = torch.tensor(
            [label_to_index[label] for label in labels], device=device
        )

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean per-batch loss over this epoch
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(loader)}')

print('Training completed!')

Epoch 1/10, Loss: 3.029912658035755
Epoch 2/10, Loss: 1.9660861305892467
Epoch 3/10, Loss: 1.441526371985674
Epoch 4/10, Loss: 0.9779491610825062
Epoch 5/10, Loss: 0.705291461199522
Epoch 6/10, Loss: 0.6768102450296283
Epoch 7/10, Loss: 0.8007653304375708
Epoch 8/10, Loss: 0.3804143578745425
Epoch 9/10, Loss: 0.23944445000961423
Epoch 10/10, Loss: 0.18713378009852022
Training completed!


## 5. Model Evaluation

In [24]:
# Cell 5: Mount Google Drive and locate the custom voice recordings
# NOTE(review): despite the section title, this cell only mounts Drive and
# lists a folder; it does not compute accuracy or any other metric.
# Step 1: Mount Google Drive (requires the Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Path to your recordings folder in Drive
# NOTE(review): this looks like a placeholder path -- replace it with the real
# folder before running.
recordings_path = '/content/drive/My Drive/path_to_recordings_folder'

# Verify the recordings are accessible by listing the folder contents;
# os.listdir raises if the path does not exist.
import os
recording_files = os.listdir(recordings_path)
print("Files in the recordings folder:", recording_files)

Accuracy: 98.4%
