In [None]:
!pip install pytelegrambotapi

In [39]:
# Imports

import os
import os.path as ops
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
import torchaudio # for speech
import numpy as np
from tqdm import tqdm

In [40]:
# Directories

source = "/content/drive/MyDrive/04-AudioClassification"

In [None]:
# Device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

if device == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

In [42]:
# Hyperparameters

epochs = 120
lr = 0.0001
batch_size = 16

### Prepairing dataset

In [8]:
class AudioDataset(Dataset):
    def __init__(self, dataset_path):
        self.dir_path = dataset_path
        self.classes = os.listdir(self.dir_path)
        self.data_paths = []
        self.labels = []

        for root, dirs, files in os.walk(self.dir_path):
            for file in files:
                label = os.path.basename(root)
                data_path = os.path.join(root, file)
                self.data_paths.append(data_path)
                self.labels.append(self.classes.index(label))

        print(f"{len(self.labels)} data loaded from {len(set(self.labels))} classes")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        data_path = self.data_paths[index]
        label = self.labels[index]

        signal, sample_rate = torchaudio.load(data_path)
        signal = torch.mean(signal, dim=0, keepdim=True)

        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=8000)
        signal_mono_transformed = transform(signal)

        return signal_mono_transformed, label

In [46]:
dataset = AudioDataset(ops.join(source, "final_dataset"))
class_names = dataset.classes
np.save('label_map.npy', class_names) 
num_classes = len(class_names)


train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_data_loader = torch.utils.data.DataLoader(train_dataset, 
                                                batch_size=batch_size, 
                                                shuffle=True, 
                                                # collate_fn=collate_fn,
                                                # num_workers=num_workers, 
                                                # pin_memory=pin_memory
                                                )

test_data_loader = torch.utils.data.DataLoader(test_dataset, 
                                                batch_size=batch_size, 
                                                shuffle=False,
                                                # collate_fn=collate_fn,
                                                # num_workers=num_workers, 
                                                # pin_memory=pin_memory
                                               )

1242 data loaded from 10 classes


### Define Model

In [10]:
class M5(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=8, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=48, stride=stride) # 1*80 array
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(2 * n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(2 * n_channel, 4 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(4 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(4 * n_channel, 8 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(8 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(8 * n_channel, n_output)
        # self.dropout1 = nn.Dropout(0.2)
        # self.fc2 = nn.Linear(4 * n_channel, n_output)
        # self.dropout2 = nn.Dropout(0.2)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = torch.flatten(x, start_dim=1)
        # x = self.dropout1(x)
        x = self.fc1(x)
        # x = self.dropout2(x)
        # x = self.fc2(x)
        x = F.softmax(x, dim=1)
        return x

    def accuracy(self, preds, labels):
        maxs, indices = torch.max(preds, 1)
        acc = torch.sum(indices == labels) / len(preds)
        return acc.cpu()

In [None]:
model = M5(n_output=num_classes).to(device)
print(model)

# count_parameters
n = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: %s" % n)

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

### Train, Test and Enference

In [None]:
# train

model.train()

for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for audios, labels in tqdm(train_data_loader):
        audios, labels = audios.to(device), labels.to(device)
        labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=num_classes).type(torch.FloatTensor).to(device)

        preds = model(audios)
        loss = loss_function(preds, labels_one_hot)
        
        # optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss
        train_acc += model.accuracy(preds, labels)
    
    total_loss = train_loss / len(train_data_loader)
    total_acc = train_acc / len(train_data_loader)

    print(f"Epoch: {epoch}, Loss: {total_loss}, Acc: {total_acc}")

In [12]:
# Evaluation
model.eval()

test_loss = 0.0
test_acc = 0.0

for audios, labels in tqdm(test_data_loader):
    audios, labels = audios.to(device), labels.to(device)
    labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=num_classes).type(torch.FloatTensor).to(device)

    preds = model(audios)
    loss = loss_function(preds, labels_one_hot)

    test_loss += loss
    test_acc += model.accuracy(preds, labels)

total_loss = test_loss / len(test_data_loader)
total_acc = test_acc / len(test_data_loader)

print(f"Loss: {total_loss}, Acc: {total_acc}")

100%|██████████| 16/16 [00:00<00:00, 18.25it/s]

Loss: 1.5264931917190552, Acc: 0.93359375





In [14]:
# Save model
torch.save(model.state_dict(), "/content/drive/MyDrive/04-AudioClassification/weights.pth")

### Telegram Bot

In [None]:
%cd /content/drive/MyDrive/04-AudioClassification

In [None]:
import numpy as np
import telebot
from keras.models import load_model
from model import M5

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

if device == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

# Load model
label_map = np.load('label_map.npy',allow_pickle='TRUE')
num_classes = len(label_map)
model = M5(n_output = num_classes).to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/04-AudioClassification/audio_classifier_weights_best.pth"))
model.eval()

bot = telebot.TeleBot(" ")

@bot.message_handler(commands=['start'])
def start(messages):
    bot.send_message(messages.chat.id, f'welcome dear {messages.from_user.first_name} ')
    bot.send_message(messages.chat.id, f'***Audio Classification***')
    bot.send_message(messages.chat.id, f'Please send me your voice😊')

@bot.message_handler(content_types=['voice'])
def voice(message):
    audio_info = bot.get_file(message.voice.file_id)
    downloaded_file = bot.download_file(audio_info.file_path)
    src = audio_info.file_path

    with open(src, 'wb') as audio_file:
        audio_file.write(downloaded_file)

    signal, sample_rate = torchaudio.load(src)

    # preprocess
    signal = torch.mean(signal, dim=0, keepdim=True)
    transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=8000)
    signal = transform(signal)
    signal = signal.unsqueeze(0).to(device)

    # process
    preds = model(signal)

    # postprocess
    preds = preds.cpu().detach().numpy()
    output = np.argmax(preds)
    print(label_map[output])
    bot.reply_to(message, label_map[output])


bot.polling()