In [None]:
import torch, torchaudio, os
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
from tqdm import tqdm

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device} used")

cuda used


In [None]:
dataset_path = "/content/drive/MyDrive/prepared_dataset"

### hyperparameter

In [36]:
epochs = 60
lr = 0.0001
batch_size = 16

### model

In [37]:
class Model(nn.Module):
    def __init__(self, n_input=1, n_output=10, stride=4, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)
        x = F.softmax(x, dim=1)
        return x


    def accuracy(self, preds, labels):
        maxs, indices = torch.max(preds, 1)
        acc = torch.sum(indices == labels) / len(preds)
        return acc.cpu()


### dataset

In [38]:
class AudioData(Dataset):
    def __init__(self, root):
        self.directory_path = root
        self.classes = os.listdir(self.directory_path)
        print(self.classes)
        self.data_paths = []
        self.labels = []

        for root, dirs, files in os.walk(self.directory_path):
            for file in files:
                if file != ".DS_Store":
                    label = os.path.basename(root)
                    data_path = os.path.join(root, file)
                    self.data_paths.append(data_path)
                    self.labels.append(self.classes.index(label))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        data_path = self.data_paths[index]
        label = self.labels[index]
        signal, sample_rate = torchaudio.load(data_path)
        signal = torch.mean(signal, dim=0, keepdim=True)

        new_sample_rate = 8000
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
        signal = transform(signal)

        return signal, label

In [39]:
dataset = AudioData(dataset_path)

['Parisa', 'Zeinab', 'Alireza', 'Maryam', 'Morteza', 'Nahid', 'MohammadAli', 'Zahra', 'Sajjad', 'Hossein']


### split train and test data

In [40]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

### model summary

In [41]:
model = Model(n_output=10).to(device)
model

Model(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=10, bias=True)
)

In [42]:
n = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: %s" % n)

Number of parameters: 25290


### optimizer and loss function

In [43]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

### Train


In [44]:
model.train()

for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for audios, labels in tqdm(train_data_loader):
        audios, labels = audios.to(device), labels.to(device)
        labels_one_hot = F.one_hot(labels, num_classes=10).type(torch.FloatTensor).to(device)
        preds = model(audios)
        loss = loss_function(preds, labels_one_hot)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss
        train_acc += model.accuracy(preds, labels)
    
    total_loss = train_loss / len(train_data_loader)
    total_acc = train_acc / len(train_data_loader)

    print(f"Epoch: {epoch}, Loss: {total_loss}, Acc: {total_acc}")


100%|██████████| 54/54 [01:50<00:00,  2.05s/it]


Epoch: 0, Loss: 2.2717509269714355, Acc: 0.17129629850387573


100%|██████████| 54/54 [00:02<00:00, 22.77it/s]


Epoch: 1, Loss: 2.2145535945892334, Acc: 0.29050925374031067


100%|██████████| 54/54 [00:02<00:00, 22.71it/s]


Epoch: 2, Loss: 2.1863083839416504, Acc: 0.37037035822868347


100%|██████████| 54/54 [00:02<00:00, 22.71it/s]


Epoch: 3, Loss: 2.1636433601379395, Acc: 0.40393519401550293


100%|██████████| 54/54 [00:02<00:00, 22.40it/s]


Epoch: 4, Loss: 2.1376473903656006, Acc: 0.45949074625968933


100%|██████████| 54/54 [00:02<00:00, 22.29it/s]


Epoch: 5, Loss: 2.1141560077667236, Acc: 0.49768519401550293


100%|██████████| 54/54 [00:02<00:00, 22.34it/s]


Epoch: 6, Loss: 2.077705144882202, Acc: 0.5520833134651184


100%|██████████| 54/54 [00:02<00:00, 22.58it/s]


Epoch: 7, Loss: 2.049978017807007, Acc: 0.5601851940155029


100%|██████████| 54/54 [00:02<00:00, 22.55it/s]


Epoch: 8, Loss: 2.0235326290130615, Acc: 0.6064814925193787


100%|██████████| 54/54 [00:02<00:00, 22.55it/s]


Epoch: 9, Loss: 1.9920735359191895, Acc: 0.6145833134651184


100%|██████████| 54/54 [00:02<00:00, 22.74it/s]


Epoch: 10, Loss: 1.9688833951950073, Acc: 0.6377314925193787


100%|██████████| 54/54 [00:02<00:00, 22.20it/s]


Epoch: 11, Loss: 1.9379459619522095, Acc: 0.6828703880310059


100%|██████████| 54/54 [00:02<00:00, 22.44it/s]


Epoch: 12, Loss: 1.920078992843628, Acc: 0.6851851940155029


100%|██████████| 54/54 [00:02<00:00, 22.33it/s]


Epoch: 13, Loss: 1.8847516775131226, Acc: 0.7175925970077515


100%|██████████| 54/54 [00:02<00:00, 22.44it/s]


Epoch: 14, Loss: 1.8739506006240845, Acc: 0.7245370149612427


100%|██████████| 54/54 [00:02<00:00, 22.13it/s]


Epoch: 15, Loss: 1.8562361001968384, Acc: 0.7303240895271301


100%|██████████| 54/54 [00:02<00:00, 22.39it/s]


Epoch: 16, Loss: 1.839751124382019, Acc: 0.7407407164573669


100%|██████████| 54/54 [00:02<00:00, 22.45it/s]


Epoch: 17, Loss: 1.8276658058166504, Acc: 0.7523148059844971


100%|██████████| 54/54 [00:02<00:00, 22.32it/s]


Epoch: 18, Loss: 1.8244470357894897, Acc: 0.7407407164573669


100%|██████████| 54/54 [00:02<00:00, 22.44it/s]


Epoch: 19, Loss: 1.8108556270599365, Acc: 0.7581018805503845


100%|██████████| 54/54 [00:02<00:00, 22.75it/s]


Epoch: 20, Loss: 1.7989838123321533, Acc: 0.7604166865348816


100%|██████████| 54/54 [00:02<00:00, 22.42it/s]


Epoch: 21, Loss: 1.79977548122406, Acc: 0.7569444179534912


100%|██████████| 54/54 [00:02<00:00, 22.31it/s]


Epoch: 22, Loss: 1.7818048000335693, Acc: 0.7662037014961243


100%|██████████| 54/54 [00:02<00:00, 22.13it/s]


Epoch: 23, Loss: 1.777308464050293, Acc: 0.7696759104728699


100%|██████████| 54/54 [00:02<00:00, 21.63it/s]


Epoch: 24, Loss: 1.7721483707427979, Acc: 0.7696759104728699


100%|██████████| 54/54 [00:02<00:00, 21.75it/s]


Epoch: 25, Loss: 1.770725131034851, Acc: 0.7662037014961243


100%|██████████| 54/54 [00:02<00:00, 22.20it/s]


Epoch: 26, Loss: 1.7645727396011353, Acc: 0.7777777910232544


100%|██████████| 54/54 [00:02<00:00, 21.82it/s]


Epoch: 27, Loss: 1.7592642307281494, Acc: 0.7708333134651184


100%|██████████| 54/54 [00:02<00:00, 22.45it/s]


Epoch: 28, Loss: 1.7515428066253662, Acc: 0.7673611044883728


100%|██████████| 54/54 [00:02<00:00, 22.34it/s]


Epoch: 29, Loss: 1.750941276550293, Acc: 0.7766203880310059


100%|██████████| 54/54 [00:02<00:00, 21.65it/s]


Epoch: 30, Loss: 1.7453484535217285, Acc: 0.7685185074806213


100%|██████████| 54/54 [00:02<00:00, 21.81it/s]


Epoch: 31, Loss: 1.7440770864486694, Acc: 0.7731481194496155


100%|██████████| 54/54 [00:02<00:00, 22.03it/s]


Epoch: 32, Loss: 1.7381985187530518, Acc: 0.7696759104728699


100%|██████████| 54/54 [00:02<00:00, 22.37it/s]


Epoch: 33, Loss: 1.7337232828140259, Acc: 0.7754629850387573


100%|██████████| 54/54 [00:02<00:00, 22.49it/s]


Epoch: 34, Loss: 1.7172967195510864, Acc: 0.8136574029922485


100%|██████████| 54/54 [00:02<00:00, 22.28it/s]


Epoch: 35, Loss: 1.712289571762085, Acc: 0.8275462985038757


100%|██████████| 54/54 [00:02<00:00, 22.09it/s]


Epoch: 36, Loss: 1.6988141536712646, Acc: 0.8321759104728699


100%|██████████| 54/54 [00:02<00:00, 22.33it/s]


Epoch: 37, Loss: 1.6899316310882568, Acc: 0.8495370149612427


100%|██████████| 54/54 [00:02<00:00, 22.42it/s]


Epoch: 38, Loss: 1.6827523708343506, Acc: 0.8587962985038757


100%|██████████| 54/54 [00:02<00:00, 22.48it/s]


Epoch: 39, Loss: 1.6824681758880615, Acc: 0.8472222089767456


100%|██████████| 54/54 [00:02<00:00, 22.75it/s]


Epoch: 40, Loss: 1.6792562007904053, Acc: 0.8460648059844971


100%|██████████| 54/54 [00:02<00:00, 22.14it/s]


Epoch: 41, Loss: 1.6760082244873047, Acc: 0.8553240895271301


100%|██████████| 54/54 [00:02<00:00, 22.44it/s]


Epoch: 42, Loss: 1.662243366241455, Acc: 0.8634259104728699


100%|██████████| 54/54 [00:02<00:00, 21.81it/s]


Epoch: 43, Loss: 1.66257643699646, Acc: 0.8692129850387573


100%|██████████| 54/54 [00:02<00:00, 22.53it/s]


Epoch: 44, Loss: 1.6588891744613647, Acc: 0.8622685074806213


100%|██████████| 54/54 [00:02<00:00, 22.09it/s]


Epoch: 45, Loss: 1.6555386781692505, Acc: 0.8657407164573669


100%|██████████| 54/54 [00:02<00:00, 21.87it/s]


Epoch: 46, Loss: 1.6552187204360962, Acc: 0.8611111044883728


100%|██████████| 54/54 [00:02<00:00, 22.66it/s]


Epoch: 47, Loss: 1.6534523963928223, Acc: 0.8622685074806213


100%|██████████| 54/54 [00:02<00:00, 21.95it/s]


Epoch: 48, Loss: 1.6431070566177368, Acc: 0.8715277910232544


100%|██████████| 54/54 [00:02<00:00, 22.31it/s]


Epoch: 49, Loss: 1.6547362804412842, Acc: 0.8611111044883728


100%|██████████| 54/54 [00:02<00:00, 21.73it/s]


Epoch: 50, Loss: 1.6382527351379395, Acc: 0.8807870149612427


100%|██████████| 54/54 [00:02<00:00, 22.55it/s]


Epoch: 51, Loss: 1.6408653259277344, Acc: 0.8726851940155029


100%|██████████| 54/54 [00:02<00:00, 22.33it/s]


Epoch: 52, Loss: 1.6422386169433594, Acc: 0.8668981194496155


100%|██████████| 54/54 [00:02<00:00, 22.55it/s]


Epoch: 53, Loss: 1.6359260082244873, Acc: 0.8726851940155029


100%|██████████| 54/54 [00:02<00:00, 22.52it/s]


Epoch: 54, Loss: 1.633500576019287, Acc: 0.8692129850387573


100%|██████████| 54/54 [00:02<00:00, 22.37it/s]


Epoch: 55, Loss: 1.6381109952926636, Acc: 0.8692129850387573


100%|██████████| 54/54 [00:02<00:00, 21.97it/s]


Epoch: 56, Loss: 1.6297805309295654, Acc: 0.8807870149612427


100%|██████████| 54/54 [00:02<00:00, 22.73it/s]


Epoch: 57, Loss: 1.6322335004806519, Acc: 0.875


100%|██████████| 54/54 [00:02<00:00, 22.40it/s]


Epoch: 58, Loss: 1.6303901672363281, Acc: 0.8761574029922485


100%|██████████| 54/54 [00:02<00:00, 22.50it/s]

Epoch: 59, Loss: 1.6290189027786255, Acc: 0.8784722089767456





### test

In [45]:
model.eval()

test_acc = 0.0
test_loss = 0.0
for audios, labels in tqdm(test_data_loader):
    audios = audios.to(device)
    labels = labels.to(device)
    labels_one_hot = F.one_hot(labels, num_classes=10).type(torch.FloatTensor).to(device)

    pred = model(audios)
    loss = loss_function(pred, labels_one_hot)
    test_loss += loss
    test_acc += model.accuracy(pred, labels)

total_test_loss = test_loss / len(test_data_loader)
total_test_acc = test_acc / len(test_data_loader)
print(f"Accuracy: {total_test_acc}, Loss: {total_test_loss}")

100%|██████████| 14/14 [00:39<00:00,  2.82s/it]

Accuracy: 0.9032737612724304, Loss: 1.6169768571853638





### save weights

In [46]:
torch.save(model.state_dict(), "/content/drive/MyDrive/models/voice_classification_weights.pth")

### inference

In [None]:
names = ['Parisa', 'Zeinab', 'Alireza', 'Maryam', 'Morteza', 'Nahid', 'MohammadAli', 'Zahra', 'Sajjad', 'Hossein']

signal, sample_rate = torchaudio.load("/content/drive/MyDrive/Datasets/voices_dataset/Maryam/Maryam_2.ogg")

# preprocess
signal = torch.mean(signal, dim=0, keepdim=True)
new_sample_rate = 8000
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
signal = transform(signal)
signal = signal[:, 32000:40000]
signal = signal.unsqueeze(0).to(device)

# process
preds = model(signal)

# postprocess
preds = preds.cpu().detach().numpy()
output = np.argmax(preds)
print(names[output])

Maryam
