In [1]:
# !git clone https://github.com/IlyaBolkisev/speech_course
%cd speech_course/assignment1/

/content/speech_course/assignment1


# 1. Implement a PyTorch layer

In [2]:
import torch
import torchaudio

from melbanks import LogMelFilterBanks

In [3]:
signal, sr = torchaudio.load('../assignment2/examples/sample1.wav')
signal.shape

torch.Size([1, 243520])

In [4]:
melspec = torchaudio.transforms.MelSpectrogram(
    hop_length=160,
    n_mels=80
)(signal)
logmelbanks = LogMelFilterBanks()(signal)

assert torch.log(melspec + 1e-6).shape == logmelbanks.shape
assert torch.allclose(torch.log(melspec + 1e-6), logmelbanks)

# 2. Train a simple CNN model

In [5]:
import os
import time

import numpy as np
from tqdm import tqdm
from thop import profile

from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchaudio.datasets import SPEECHCOMMANDS

## Load data

In [6]:
class SpeechDataset(Dataset):
    def __init__(self, root, subset):
        self.data = SPEECHCOMMANDS(root=root, download=True, subset=subset)

        self._yes_no_indices = []
        for i in tqdm(range(len(self.data))):
            _, _, label, _, _ = self.data[i]
            if label.lower() in ['yes', 'no']:
                self._yes_no_indices.append(i)

    def __getitem__(self, idx):
        real_idx = self._yes_no_indices[idx]
        waveform, label, *_ = self.data[real_idx]
        target = 0 if label == 'no' else 1
        return waveform, target

    def __len__(self):
        return len(self._yes_no_indices)

In [7]:
def collate_fn(batch, max_len=16000):
    waveforms = []
    targets = []
    for (waveform, label) in batch:
        if waveform.shape[1] > max_len:
            waveform = waveform[:, :max_len]
        elif waveform.shape[1] < max_len:
            pad_length = max_len - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, pad_length))
        waveforms.append(waveform)
        targets.append(label)

    waveforms = torch.stack(waveforms, dim=0)
    targets = torch.tensor(targets, dtype=torch.long)
    waveforms = waveforms.squeeze(1)
    return waveforms, targets

data_dir = './'

train_dataset = SpeechDataset(data_dir, subset='training')
val_dataset = SpeechDataset(data_dir, subset='validation')
test_dataset = SpeechDataset(data_dir, subset='testing')

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

100%|██████████| 84843/84843 [06:08<00:00, 229.94it/s]
100%|██████████| 9981/9981 [00:35<00:00, 284.43it/s]
100%|██████████| 11005/11005 [00:39<00:00, 280.95it/s]


## Init model

In [23]:
class SpeechModel(nn.Module):
    def __init__(self, n_mels=40, groups=1, num_classes=2):
        super(SpeechModel, self).__init__()
        self.n_mels = n_mels

        self.logmel = LogMelFilterBanks(n_mels=n_mels)

        self.conv1 = nn.Conv1d(n_mels, 32, 3, padding=1, groups=groups)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv1d(32, 64, 3, padding=1, groups=groups)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv1d(64, 128, 3, padding=1, groups=groups)
        self.pool3 = nn.AdaptiveMaxPool1d(1)

        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.logmel(x)

        x = self.conv1(x)
        x = torch.nn.functional.relu(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = torch.nn.functional.relu(x)
        x = self.pool2(x)

        x = self.conv3(x)
        x = torch.nn.functional.relu(x)
        x = self.pool3(x)

        x = x.squeeze(-1)
        x = self.fc(x)
        return x

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def compute_flops(model, input_tensor):
    flops, params = profile(model, inputs=(input_tensor,))
    return flops

## Train model

In [26]:
device = torch.device("cuda" )
model = SpeechModel(n_mels=40, groups=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

num_epochs = 10
train_losses = []
val_accuracies = []
epoch_times = []

for epoch in range(num_epochs):
    model.train()
    epoch_start = time.time()
    running_loss = 0.0
    total_train_samples = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        total_train_samples += inputs.size(0)

    epoch_loss = running_loss / total_train_samples
    train_losses.append(epoch_loss)

    epoch_time = time.time() - epoch_start
    epoch_times.append(epoch_time)

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)
    val_acc = correct / total
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss:.4f}, Val Acc: {val_acc:.4f}, Epoch Time: {epoch_time:.2f} sec")

Epoch 1/10 - Train Loss: 0.0022, Val Acc: 1.0000, Epoch Time: 23.33 sec
Epoch 2/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 22.05 sec
Epoch 3/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 21.97 sec
Epoch 4/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 22.03 sec
Epoch 5/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 22.05 sec
Epoch 6/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 22.08 sec
Epoch 7/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 22.11 sec
Epoch 8/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 21.94 sec
Epoch 9/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 21.84 sec
Epoch 10/10 - Train Loss: 0.0000, Val Acc: 1.0000, Epoch Time: 21.67 sec


In [27]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == targets).sum().item()
        total += targets.size(0)
test_acc = correct / total
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 1.0000


In [28]:
num_params = count_parameters(model)
print("Trainable parameters:", num_params)
sample_input = next(iter(train_loader))[0][:1].to(device)
flops = compute_flops(model, sample_input)
print("FLOPs:", flops)

Trainable parameters: 35042
[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv1d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.MaxPool1d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.AdaptiveMaxPool1d'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
FLOPs: 1309696.0
