In [102]:
import torch, torchaudio, os
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
from tqdm import tqdm

In [103]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device} used")

cpu used


In [104]:
dataset_path = "/content/drive/MyDrive/prepared_dataset"

### hyperparameter

In [105]:
epochs = 40
lr = 0.001
batch_size = 3

### model

In [147]:
class Model(nn.Module):
    def __init__(self, n_input=1, n_output=10, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)
        x = F.softmax(x, dim=1)
        return x


    def accuracy(self, preds, labels):
        maxs, indices = torch.max(preds, 1)
        acc = torch.sum(indices == labels) / len(preds)
        return acc.cpu()


### dataset

In [148]:
class AudioData(Dataset):
    def __init__(self, root):
        self.directory_path = root
        self.classes = os.listdir(self.directory_path)
        print(self.classes)
        self.data_paths = []
        self.labels = []

        for root, dirs, files in os.walk(self.directory_path):
            for file in files:
                if file != ".DS_Store":
                    label = os.path.basename(root)
                    data_path = os.path.join(root, file)
                    self.data_paths.append(data_path)
                    self.labels.append(self.classes.index(label))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        data_path = self.data_paths[index]
        label = self.labels[index]
        signal, sample_rate = torchaudio.load(data_path)
        signal = torch.mean(signal, dim=0, keepdim=True)

        new_sample_rate = 8000
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
        signal = transform(signal)

        return signal, label

In [149]:
dataset = AudioData(dataset_path)

['Parisa', 'Zeinab', 'Alireza', 'Maryam', 'Morteza', 'Nahid', 'MohammadAli', 'Zahra', 'Sajjad', 'Hossein']


### split train and test data

In [150]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

### model summary

In [151]:
model = Model(n_output=10).to(device)
model

Model(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(16,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=10, bias=True)
)

In [152]:
n = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: %s" % n)

Number of parameters: 25290


### optimizer and loss function

In [153]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

### Train


In [155]:
model.train()

for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for audios, labels in tqdm(train_data_loader):
        audios, labels = audios.to(device), labels.to(device)
        labels_one_hot = F.one_hot(labels, num_classes=10).type(torch.FloatTensor).to(device)
        preds = model(audios)
        loss = loss_function(preds, labels_one_hot)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss
        train_acc += model.accuracy(preds, labels)
    
    total_loss = train_loss / len(train_data_loader)
    total_acc = train_acc / len(train_data_loader)

    print(f"Epoch: {epoch}, Loss: {total_loss}, Acc: {total_acc}")


100%|██████████| 284/284 [00:06<00:00, 44.72it/s]


Epoch: 0, Loss: 2.180433988571167, Acc: 0.27934280037879944


100%|██████████| 284/284 [00:05<00:00, 48.24it/s]


Epoch: 1, Loss: 2.1603405475616455, Acc: 0.2981221079826355


100%|██████████| 284/284 [00:05<00:00, 51.90it/s]


Epoch: 2, Loss: 2.1155929565429688, Acc: 0.34272301197052


100%|██████████| 284/284 [00:05<00:00, 49.35it/s]


Epoch: 3, Loss: 2.081749439239502, Acc: 0.3896714746952057


100%|██████████| 284/284 [00:05<00:00, 51.10it/s]


Epoch: 4, Loss: 2.0595991611480713, Acc: 0.40962451696395874


100%|██████████| 284/284 [00:05<00:00, 51.35it/s]


Epoch: 5, Loss: 2.021286964416504, Acc: 0.45422524213790894


100%|██████████| 284/284 [00:05<00:00, 51.67it/s]


Epoch: 6, Loss: 2.0239503383636475, Acc: 0.4424884021282196


100%|██████████| 284/284 [00:05<00:00, 49.82it/s]


Epoch: 7, Loss: 2.000952959060669, Acc: 0.4788733422756195


100%|██████████| 284/284 [00:05<00:00, 50.31it/s]


Epoch: 8, Loss: 1.9858179092407227, Acc: 0.47652575373649597


100%|██████████| 284/284 [00:05<00:00, 50.29it/s]


Epoch: 9, Loss: 1.9611495733261108, Acc: 0.5140844583511353


100%|██████████| 284/284 [00:05<00:00, 52.39it/s]


Epoch: 10, Loss: 1.958609938621521, Acc: 0.5082160234451294


100%|██████████| 284/284 [00:06<00:00, 47.32it/s]


Epoch: 11, Loss: 1.932957649230957, Acc: 0.5434272885322571


100%|██████████| 284/284 [00:05<00:00, 52.66it/s]


Epoch: 12, Loss: 1.9456262588500977, Acc: 0.5258216261863708


100%|██████████| 284/284 [00:05<00:00, 51.34it/s]


Epoch: 13, Loss: 1.921541690826416, Acc: 0.5528169870376587


100%|██████████| 284/284 [00:05<00:00, 49.32it/s]


Epoch: 14, Loss: 1.905726671218872, Acc: 0.5633803606033325


100%|██████████| 284/284 [00:05<00:00, 52.25it/s]


Epoch: 15, Loss: 1.886316180229187, Acc: 0.5903758406639099


100%|██████████| 284/284 [00:05<00:00, 51.12it/s]


Epoch: 16, Loss: 1.9129743576049805, Acc: 0.5504698157310486


100%|██████████| 284/284 [00:05<00:00, 50.33it/s]


Epoch: 17, Loss: 1.8773809671401978, Acc: 0.5892019867897034


100%|██████████| 284/284 [00:05<00:00, 52.91it/s]


Epoch: 18, Loss: 1.8563823699951172, Acc: 0.6173710227012634


100%|██████████| 284/284 [00:05<00:00, 50.12it/s]


Epoch: 19, Loss: 1.8653682470321655, Acc: 0.6009390354156494


100%|██████████| 284/284 [00:05<00:00, 51.97it/s]


Epoch: 20, Loss: 1.8638546466827393, Acc: 0.6044601798057556


100%|██████████| 284/284 [00:05<00:00, 51.74it/s]


Epoch: 21, Loss: 1.8516072034835815, Acc: 0.6173710227012634


100%|██████████| 284/284 [00:05<00:00, 50.42it/s]


Epoch: 22, Loss: 1.8685723543167114, Acc: 0.5927230715751648


100%|██████████| 284/284 [00:05<00:00, 47.65it/s]


Epoch: 23, Loss: 1.8473562002182007, Acc: 0.6267606019973755


100%|██████████| 284/284 [00:06<00:00, 47.09it/s]


Epoch: 24, Loss: 1.8422163724899292, Acc: 0.6244131922721863


100%|██████████| 284/284 [00:05<00:00, 49.24it/s]


Epoch: 25, Loss: 1.8472533226013184, Acc: 0.61854487657547


100%|██████████| 284/284 [00:06<00:00, 44.81it/s]


Epoch: 26, Loss: 1.8432416915893555, Acc: 0.6279346942901611


100%|██████████| 284/284 [00:05<00:00, 51.27it/s]


Epoch: 27, Loss: 1.831842303276062, Acc: 0.6396716237068176


100%|██████████| 284/284 [00:05<00:00, 48.96it/s]


Epoch: 28, Loss: 1.8251991271972656, Acc: 0.6443665027618408


100%|██████████| 284/284 [00:05<00:00, 48.45it/s]


Epoch: 29, Loss: 1.7915722131729126, Acc: 0.6842724084854126


100%|██████████| 284/284 [00:05<00:00, 48.41it/s]


Epoch: 30, Loss: 1.7692394256591797, Acc: 0.7018783092498779


100%|██████████| 284/284 [00:05<00:00, 47.56it/s]


Epoch: 31, Loss: 1.778700828552246, Acc: 0.693662166595459


100%|██████████| 284/284 [00:05<00:00, 50.39it/s]


Epoch: 32, Loss: 1.7832374572753906, Acc: 0.681925356388092


100%|██████████| 284/284 [00:05<00:00, 51.35it/s]


Epoch: 33, Loss: 1.7648580074310303, Acc: 0.710094153881073


100%|██████████| 284/284 [00:05<00:00, 50.54it/s]


Epoch: 34, Loss: 1.780847191810608, Acc: 0.6854461431503296


100%|██████████| 284/284 [00:05<00:00, 47.38it/s]


Epoch: 35, Loss: 1.760385513305664, Acc: 0.707746684551239


100%|██████████| 284/284 [00:05<00:00, 52.00it/s]


Epoch: 36, Loss: 1.758996844291687, Acc: 0.7147892713546753


100%|██████████| 284/284 [00:05<00:00, 50.24it/s]


Epoch: 37, Loss: 1.7589055299758911, Acc: 0.7100942730903625


100%|██████████| 284/284 [00:05<00:00, 53.95it/s]


Epoch: 38, Loss: 1.7381020784378052, Acc: 0.7335683703422546


100%|██████████| 284/284 [00:05<00:00, 50.20it/s]

Epoch: 39, Loss: 1.759501576423645, Acc: 0.7042257189750671





### test

In [156]:
model.eval()

test_acc = 0.0
test_loss = 0.0
for audios, labels in tqdm(test_data_loader):
    audios = audios.to(device)
    labels = labels.to(device)
    labels_one_hot = F.one_hot(labels, num_classes=10).type(torch.FloatTensor).to(device)

    pred = model(audios)
    loss = loss_function(pred, labels_one_hot)
    test_loss += loss
    test_acc += model.accuracy(pred, labels)

total_test_loss = test_loss / len(test_data_loader)
total_test_acc = test_acc / len(test_data_loader)
print(f"Accuracy: {total_test_acc}, Loss: {total_test_loss}")

100%|██████████| 72/72 [00:01<00:00, 46.50it/s]

Accuracy: 0.7083333730697632, Loss: 1.7473279237747192





### save weights

In [157]:
torch.save(model.state_dict(), "weights.pth")

### inference

In [159]:
names = ['Parisa', 'Zeinab', 'Alireza', 'Maryam', 'Morteza', 'Nahid', 'MohammadAli', 'Zahra', 'Sajjad', 'Hossein']

signal, sample_rate = torchaudio.load("/content/drive/MyDrive/Datasets/voices_dataset/Maryam/Maryam_2.ogg")

# preprocess
signal = torch.mean(signal, dim=0, keepdim=True)
new_sample_rate = 8000
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
signal = transform(signal)
signal = signal[:, 32000:40000]
signal = signal.unsqueeze(0).to(device)

# process
preds = model(signal)

# postprocess
preds = preds.cpu().detach().numpy()
output = np.argmax(preds)
print(names[output])

Maryam
