In [102]:
import torch, torchaudio, os
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
from tqdm import tqdm

In [103]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device} used")

cpu used


In [104]:
dataset_path = "/content/drive/MyDrive/prepared_dataset"

In [105]:
epochs = 40
lr = 0.001
batch_size = 3

In [106]:
class Model(nn.Module):
    def __init__(self, n_input=1, n_output=10, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        x = F.softmax(x, dim=1)
        return x


    def accuracy(self, preds, labels):
        maxs, indices = torch.max(preds, 1)
        acc = torch.sum(indices == labels) / len(preds)
        return acc.cpu()


In [107]:
class AudioData(Dataset):
    def __init__(self, root):
        self.directory_path = root
        self.classes = os.listdir(self.directory_path)
        print(self.classes)
        self.data_paths = []
        self.labels = []

        for root, dirs, files in os.walk(self.directory_path):
            for file in files:
                if file != ".DS_Store":
                    label = os.path.basename(root)
                    data_path = os.path.join(root, file)
                    self.data_paths.append(data_path)
                    self.labels.append(self.classes.index(label))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        data_path = self.data_paths[index]
        label = self.labels[index]
        signal, sample_rate = torchaudio.load(data_path)
        signal = torch.mean(signal, dim=0, keepdim=True)

        new_sample_rate = 8000
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
        signal = transform(signal)

        return signal, label

In [108]:
dataset = AudioData(dataset_path)

['Parisa', 'Zeinab', 'Alireza', 'Maryam', 'Morteza', 'Nahid', 'MohammadAli', 'Zahra', 'Sajjad', 'Hossein']


In [109]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [110]:
for audios, labels in (train_data_loader):
  print(labels)

tensor([9, 1, 9])
tensor([5, 6, 2])
tensor([0, 1, 2])
tensor([2, 2, 2])
tensor([9, 2, 9])
tensor([7, 9, 6])
tensor([3, 8, 8])
tensor([1, 7, 0])
tensor([2, 9, 9])
tensor([9, 3, 9])
tensor([1, 9, 0])
tensor([6, 2, 7])
tensor([1, 0, 5])
tensor([3, 7, 9])
tensor([3, 0, 9])
tensor([1, 9, 0])
tensor([5, 9, 9])
tensor([5, 7, 9])
tensor([2, 3, 9])
tensor([6, 6, 0])
tensor([7, 7, 7])
tensor([7, 3, 9])
tensor([9, 7, 9])
tensor([7, 7, 2])
tensor([5, 2, 0])
tensor([1, 2, 5])
tensor([6, 4, 2])
tensor([1, 5, 2])
tensor([6, 1, 7])
tensor([7, 2, 2])
tensor([7, 7, 0])
tensor([2, 4, 8])
tensor([3, 6, 1])
tensor([4, 6, 8])
tensor([2, 5, 0])
tensor([2, 5, 6])
tensor([9, 3, 9])
tensor([5, 0, 1])
tensor([3, 8, 4])
tensor([1, 1, 3])
tensor([5, 5, 9])
tensor([7, 2, 3])
tensor([5, 0, 9])
tensor([7, 2, 3])
tensor([6, 7, 0])
tensor([9, 2, 1])
tensor([3, 7, 0])
tensor([1, 1, 5])
tensor([2, 3, 3])
tensor([1, 8, 9])
tensor([1, 7, 1])
tensor([9, 5, 1])
tensor([3, 0, 6])
tensor([5, 9, 0])
tensor([6, 5, 3])
tensor([0,

In [111]:
model = Model(n_output=10).to(device)
model

Model(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(16,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=10, bias=True)
)

In [112]:
n = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: %s" % n)

Number of parameters: 25290


In [113]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

### Train


In [114]:
model.train()

for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for audios, labels in tqdm(train_data_loader):
        audios, labels = audios.to(device), labels.to(device)
        labels_one_hot = F.one_hot(labels, num_classes=10).type(torch.LongTensor).to(device)
        # print(audios, labels, labels_one_hot)
        preds = model(audios)
        loss = loss_function(preds, labels_one_hot)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss
        train_acc += model.accuracy(preds, labels)
    
    total_loss = train_loss / len(train_data_loader)
    total_acc = train_acc / len(train_data_loader)

    print(f"Epoch: {epoch}, Loss: {total_loss}, Acc: {total_acc}")


  0%|          | 0/284 [00:00<?, ?it/s]


IndexError: ignored

In [None]:
model.eval()

test_acc = 0.0
test_loss = 0.0
for audios, labels in tqdm(test_data_loader):
    audios = audios.to(device)
    labels = labels.to(device)
    labels_one_hot = F.one_hot(labels, num_classes=10).type(torch.FloatTensor).to(device)

    pred = model(audios)
    loss = loss_function(pred, labels_one_hot)
    test_loss += loss
    test_acc += model.accuracy(pred, labels)

total_test_loss = test_loss / len(test_data_loader)
total_test_acc = test_acc / len(test_data_loader)
print(f"Accuracy: {total_test_acc}, Loss: {total_test_loss}")

In [None]:
## save weights

In [None]:
## inference