In [13]:
import numpy as np
import os
import  torch
import torchaudio
from torch.utils.data import DataLoader, Dataset, random_split
from torchaudio import datasets
import matplotlib.pyplot as plt
from torch import nn
import torch.optim as optim
from torchvision.transforms import ToTensor
import tensorflow as tf
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder



In [14]:
class VGG_audio(nn.Module):
    def __init__(self, in_channel=1, num_classes=10, stride=1, padding=1, channel_factor=0.65, kernel_factor=1.25):
        super(VGG_audio, self).__init__()
        self.in_channel = in_channel
        self.channel_factor = channel_factor
        self.kernel_factor = kernel_factor

        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channel, 64, kernel_size=3, stride=stride, padding=padding),
            nn.Conv1d(64, 64, kernel_size=3, stride=stride, padding=padding),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        in_channel = 64
        out_channel = int(in_channel * channel_factor)
        kernels = int(3 * kernel_factor)
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        in_channel = out_channel
        out_channel = int(in_channel * channel_factor)
        kernels = int(kernels * kernel_factor)
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        in_channel = out_channel
        out_channel = int(in_channel * channel_factor)
        kernels = int(kernels * kernel_factor)
        self.conv4 = nn.Sequential(
            nn.Conv1d(in_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        in_channel = out_channel
        out_channel = int(in_channel * channel_factor)
        kernels = int(kernels * kernel_factor)
        self.conv5 = nn.Sequential(
            nn.Conv1d(in_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.Conv1d(out_channel, out_channel, kernel_size=kernels, stride=stride, padding=padding),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        self.dense = nn.Sequential(
            nn.Linear(out_channel, 4096),
            nn.ReLU(True),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = x.view(x.size(0), -1)
        x = self.dense(x)
        return x

In [16]:
# Load the complete dataset
dataset =datasets.SPEECHCOMMANDS(root="data", download=True)
labels = [data[2] for data in dataset]
label_encoder = LabelEncoder()
label_encoder.fit(labels)

In [19]:
def custom_collate_func(batch):
    waveforms = pad_sequence([i[0].squeeze(0) for i in batch], batch_first=True)
    sr = torch.tensor([i[1] for i in batch])
    labels = torch.tensor([label_encoder[i[2]] for i in batch])  # Encode labels
    speaker_ids = torch.tensor([i[3] for i in batch])
    utterance_num = torch.tensor([i[4] for i in batch])
    return waveforms, sr, labels, speaker_ids, utterance_num

In [20]:
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size
train_data, val_data, test_data = random_split(dataset, [train_size, val_size, test_size])
print(total_size)

# Create DataLoader instances for each dataset
train_loader = DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=custom_collate_func)
val_loader = DataLoader(val_data, batch_size=128, shuffle=False, collate_fn=custom_collate_func)
test_loader = DataLoader(test_data, batch_size=128, shuffle=False,collate_fn=custom_collate_func)

105829


In [21]:
#Sample
for waveform, sample_rate, label, speaker_id, utterance_number in train_loader:
    print(f"Waveform shape: {waveform.shape}")
    print(f"Sample rate: {sample_rate}")
    print(f"Label: {label}")
    print(f"Speaker ID: {speaker_id}")
    print(f"Utterance number: {utterance_number}")
    break

TypeError: 'LabelEncoder' object is not subscriptable