In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchaudio import datasets, transforms
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.nn.functional as F


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Download (if not already present under ./data) and open the Speech Commands
# training and testing subsets. The 'validation' subset is not loaded here.
train_data, test_data = (
    datasets.SPEECHCOMMANDS(root='./data', download=True, subset=split)
    for split in ('training', 'testing')
)

100%|██████████| 2.26G/2.26G [00:57<00:00, 42.5MB/s]


In [None]:
def _collect_labels_and_speakers(dataset):
    """Return (sorted unique labels, list of unique speaker ids) for ``dataset``.

    Every item is visited exactly once. When the dataset exposes
    ``get_metadata`` it is used instead of indexing, so that only the
    per-item fields are read rather than the waveform itself; otherwise the
    function falls back to ``dataset[n]``. In both cases field 2 is taken as
    the label and field 3 as the speaker id.
    """
    get_metadata = getattr(dataset, 'get_metadata', None)
    label_set, speaker_set = set(), set()
    for n in range(len(dataset)):
        item = get_metadata(n) if get_metadata is not None else dataset[n]
        label_set.add(item[2])
        speaker_set.add(item[3])
    return sorted(label_set), list(speaker_set)


# Class vocabulary (sorted for a stable label -> index mapping) and speaker ids.
labels, speaker = _collect_labels_and_speakers(train_data)
label_to_index = {label: i for i, label in enumerate(labels)}
index_to_label = dict(enumerate(labels))
# Backward-compatible alias for the original (misspelled) name.
insex_to_label = index_to_label

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


In [None]:
# Waveform -> mel spectrogram front end: 16 kHz sample rate, 32 mel bands.
transform = transforms.MelSpectrogram(sample_rate=16000, n_mels=32)

In [None]:
max_len = 100


def collate_fn(batch):
    """Turn a list of dataset items into a (spectrograms, targets) batch.

    Each item is unpacked as ``(waveform, sample_rate, label, ...)``. The
    waveform goes through the module-level ``transform``, the leading
    dimension is squeezed away, and the second axis is truncated or
    right-padded with zeros to exactly ``max_len`` columns. Labels are
    mapped to integer ids through ``label_to_index``.

    Returns:
        A stacked tensor of all spectrograms and a tensor of target ids.
    """
    fixed_specs = []
    class_ids = []

    for audio, _rate, word, *_rest in batch:
        mel = transform(audio).squeeze(0)
        n_frames = mel.shape[1]

        if n_frames > max_len:
            # Too long: keep only the first max_len columns.
            mel = mel[:, :max_len]
        elif n_frames < max_len:
            # Too short: zero-pad on the right of the last axis.
            mel = F.pad(mel, (0, max_len - n_frames))

        fixed_specs.append(mel)
        class_ids.append(label_to_index[word])

    return torch.stack(fixed_specs), torch.tensor(class_ids)

In [None]:
# Batch both splits through the same collate function, 64 items per batch;
# only the training loader is shuffled.
_loader_kwargs = dict(batch_size=64, collate_fn=collate_fn)
train = DataLoader(train_data, shuffle=True, **_loader_kwargs)
test = DataLoader(test_data, **_loader_kwargs)

In [None]:
# One output class per distinct label collected from the training data.
num_classes: int = len(labels)

In [None]:
class CheckAudio(nn.Module):
    """Small 2-D CNN classifier over single-channel spectrogram-like inputs.

    ``first`` is a two-stage conv/ReLU/max-pool feature extractor that ends in
    an adaptive average pool to a fixed 8x8 grid, so the classifier head does
    not depend on the input height/width. ``second`` flattens those features
    and maps them through one hidden layer to ``n_classes`` raw scores.

    Args:
        n_classes: Size of the output layer. When omitted, the module-level
            ``num_classes`` is used, which keeps ``CheckAudio()`` working
            exactly as before.
    """

    def __init__(self, n_classes=None):
        super().__init__()
        if n_classes is None:
            # Backward-compatible default: fall back to the notebook global.
            n_classes = num_classes
        self.first = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.AdaptiveAvgPool2d((8, 8))
        )
        self.second = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, n_classes)
        )

    def forward(self, x):
        """Return raw class scores of shape (batch, n_classes).

        ``x`` is expected without a channel axis, i.e. (batch, height, width);
        the single channel dimension is inserted here before the convolutions.
        """
        x = x.unsqueeze(1)
        x = self.first(x)
        x = self.second(x)
        return x

In [None]:
# Build the classifier, then move its parameters onto the selected device.
model = CheckAudio()
model = model.to(device)

In [None]:
# Cross-entropy over the model's raw scores, optimised with Adam (lr = 1e-3).
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train for 20 passes over the training loader. After each pass, print the
# SUM of the per-batch losses (not the mean).
for epoch in range(20):
    model.train()
    total_loss = 0

    for x_batch, y_batch in train:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()

        y_pred_train = model(x_batch)
        loss = loss_fn(y_pred_train, y_batch)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'epoch {epoch+1}, loss: {total_loss:.4f}')


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


epoch 1, loss: 2764.8122
epoch 2, loss: 1437.8333
epoch 3, loss: 1114.3701
epoch 4, loss: 939.7304
epoch 5, loss: 822.8629
epoch 6, loss: 738.5819
epoch 7, loss: 669.3671
epoch 8, loss: 619.1836
epoch 9, loss: 568.9691
epoch 10, loss: 529.6503
epoch 11, loss: 493.1725
epoch 12, loss: 474.8859
epoch 13, loss: 435.7707
epoch 14, loss: 426.3704
epoch 15, loss: 378.3901
epoch 16, loss: 369.1787
epoch 17, loss: 355.2971
epoch 18, loss: 347.1122
epoch 19, loss: 328.1899
epoch 20, loss: 306.6942


In [None]:
# Measure top-1 accuracy on the test loader with gradient tracking disabled.
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for x_batch, y_batch in test:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        y_pred_test = model(x_batch)
        # The predicted class is the index of the largest score in each row.
        predicted = y_pred_test.argmax(dim=1)

        total += y_batch.shape[0]
        correct += predicted.eq(y_batch).sum().item()

accuracy = 100 * correct / total
print(f'toch models is test datasets:  {accuracy:.2f}%')