In [3]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [4]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor
from pydub import AudioSegment
import pandas as pd
from sklearn.metrics import classification_report

In [5]:
TRAIN_DIR = '/content/drive/MyDrive/train'
TEST_DIR = '/content/drive/MyDrive/test'

In [6]:
CLASSES = sorted([cls for cls in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, cls))])
print("Classes and Labels:")
for idx, class_name in enumerate(CLASSES):
    print(f"Class: {class_name}, Label: {idx}")

Classes and Labels:
Class: 0, Label: 0
Class: 1, Label: 1
Class: 2, Label: 2
Class: 3, Label: 3
Class: 4, Label: 4
Class: 5, Label: 5
Class: 6, Label: 6
Class: 7, Label: 7
Class: 8, Label: 8
Class: 9, Label: 9
Class: none, Label: 10


In [7]:
def audio_to_melspectrogram(file_path, max_len=128):
    audio, sr = librosa.load(file_path, sr=16000)
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    if log_mel_spec.shape[1] > max_len:
        log_mel_spec = log_mel_spec[:, :max_len]
    else:
        log_mel_spec = np.pad(log_mel_spec, ((0, 0), (0, max_len - log_mel_spec.shape[1])), mode='constant')

    return log_mel_spec

In [8]:
def convert_to_wav(file_path):
    if file_path.lower().endswith(".mp3"):
        wav_path = file_path.rsplit(".", 1)[0] + ".wav"
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        print(f"Converted {file_path} to {wav_path}")
        return wav_path
    return file_path

In [9]:
class AudioDataset(Dataset):
    def __init__(self, data_dir, classes, max_len=128, transform=None):
        self.file_paths = []
        self.labels = []
        self.max_len = max_len
        self.transform = transform
        self.classes = classes

        for label, class_name in enumerate(classes):
            class_dir = os.path.join(data_dir, class_name)
            for file_name in os.listdir(class_dir):
                file_path = os.path.join(class_dir, file_name)
                file_path = convert_to_wav(file_path)  # بررسی و تبدیل mp3 به wav
                self.file_paths.append(file_path)
                self.labels.append(label)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        mel_spec = audio_to_melspectrogram(file_path, max_len=self.max_len)

        if self.transform:
            mel_spec = self.transform(mel_spec)

        return mel_spec, label, os.path.basename(file_path)

In [10]:
train_dataset = AudioDataset(TRAIN_DIR, CLASSES, transform=ToTensor())
test_dataset = AudioDataset(TEST_DIR, CLASSES, transform=ToTensor())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [11]:
class AudioClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AudioClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 16 * 16, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = self.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioClassifier(num_classes=len(CLASSES)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [15]:
from tqdm import tqdm

best_accuracy = 0.0
num_epochs = 40
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    print(f"Epoch {epoch + 1}/{num_epochs}")
    progress_bar = tqdm(train_loader, desc="Training", unit="batch")

    for mel_specs, labels, _ in progress_bar:
        mel_specs = mel_specs.to(device).unsqueeze(1).squeeze(2)
        labels = labels.to(device)

        outputs = model(mel_specs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        progress_bar.set_postfix({"Loss": loss.item(), "Accuracy": 100. * correct / total})

    accuracy = 100. * correct / total
    print(f"Epoch {epoch + 1}/{num_epochs} Completed, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), "best_model.pth")
        print(f"New Best Model Saved with Accuracy: {best_accuracy:.2f}%")


Epoch 1/40


Training: 100%|██████████| 677/677 [02:39<00:00,  4.24batch/s, Loss=1.69, Accuracy=37.9]


Epoch 1/40 Completed, Loss: 1.6986, Accuracy: 37.88%
New Best Model Saved with Accuracy: 37.88%
Epoch 2/40


Training: 100%|██████████| 677/677 [02:28<00:00,  4.55batch/s, Loss=1.58, Accuracy=42]


Epoch 2/40 Completed, Loss: 1.5997, Accuracy: 41.97%
New Best Model Saved with Accuracy: 41.97%
Epoch 3/40


Training: 100%|██████████| 677/677 [02:27<00:00,  4.58batch/s, Loss=1.71, Accuracy=45.3]


Epoch 3/40 Completed, Loss: 1.5039, Accuracy: 45.34%
New Best Model Saved with Accuracy: 45.34%
Epoch 4/40


Training: 100%|██████████| 677/677 [02:24<00:00,  4.68batch/s, Loss=1.19, Accuracy=49.1]


Epoch 4/40 Completed, Loss: 1.4231, Accuracy: 49.12%
New Best Model Saved with Accuracy: 49.12%
Epoch 5/40


Training: 100%|██████████| 677/677 [02:25<00:00,  4.64batch/s, Loss=1.24, Accuracy=52.3]


Epoch 5/40 Completed, Loss: 1.3315, Accuracy: 52.27%
New Best Model Saved with Accuracy: 52.27%
Epoch 6/40


Training: 100%|██████████| 677/677 [02:25<00:00,  4.66batch/s, Loss=1.3, Accuracy=55]


Epoch 6/40 Completed, Loss: 1.2448, Accuracy: 55.03%
New Best Model Saved with Accuracy: 55.03%
Epoch 7/40


Training: 100%|██████████| 677/677 [02:25<00:00,  4.64batch/s, Loss=1.16, Accuracy=58.3]


Epoch 7/40 Completed, Loss: 1.1591, Accuracy: 58.27%
New Best Model Saved with Accuracy: 58.27%
Epoch 8/40


Training: 100%|██████████| 677/677 [02:25<00:00,  4.65batch/s, Loss=0.705, Accuracy=61.3]


Epoch 8/40 Completed, Loss: 1.0707, Accuracy: 61.31%
New Best Model Saved with Accuracy: 61.31%
Epoch 9/40


Training: 100%|██████████| 677/677 [02:24<00:00,  4.68batch/s, Loss=0.992, Accuracy=64.8]


Epoch 9/40 Completed, Loss: 0.9950, Accuracy: 64.78%
New Best Model Saved with Accuracy: 64.78%
Epoch 10/40


Training: 100%|██████████| 677/677 [02:27<00:00,  4.58batch/s, Loss=0.762, Accuracy=66.9]


Epoch 10/40 Completed, Loss: 0.9112, Accuracy: 66.87%
New Best Model Saved with Accuracy: 66.87%
Epoch 11/40


Training: 100%|██████████| 677/677 [02:31<00:00,  4.46batch/s, Loss=1.17, Accuracy=70.9]


Epoch 11/40 Completed, Loss: 0.8251, Accuracy: 70.86%
New Best Model Saved with Accuracy: 70.86%
Epoch 12/40


Training: 100%|██████████| 677/677 [02:36<00:00,  4.32batch/s, Loss=0.489, Accuracy=72.9]


Epoch 12/40 Completed, Loss: 0.7594, Accuracy: 72.90%
New Best Model Saved with Accuracy: 72.90%
Epoch 13/40


Training: 100%|██████████| 677/677 [02:29<00:00,  4.53batch/s, Loss=0.392, Accuracy=76]


Epoch 13/40 Completed, Loss: 0.6925, Accuracy: 75.99%
New Best Model Saved with Accuracy: 75.99%
Epoch 14/40


Training: 100%|██████████| 677/677 [02:30<00:00,  4.49batch/s, Loss=0.959, Accuracy=77.8]


Epoch 14/40 Completed, Loss: 0.6436, Accuracy: 77.81%
New Best Model Saved with Accuracy: 77.81%
Epoch 15/40


Training: 100%|██████████| 677/677 [02:32<00:00,  4.44batch/s, Loss=0.276, Accuracy=80.2]


Epoch 15/40 Completed, Loss: 0.5677, Accuracy: 80.18%
New Best Model Saved with Accuracy: 80.18%
Epoch 16/40


Training: 100%|██████████| 677/677 [02:33<00:00,  4.40batch/s, Loss=0.418, Accuracy=81.5]


Epoch 16/40 Completed, Loss: 0.5283, Accuracy: 81.53%
New Best Model Saved with Accuracy: 81.53%
Epoch 17/40


Training: 100%|██████████| 677/677 [02:26<00:00,  4.63batch/s, Loss=0.411, Accuracy=83.5]


Epoch 17/40 Completed, Loss: 0.4779, Accuracy: 83.48%
New Best Model Saved with Accuracy: 83.48%
Epoch 18/40


Training: 100%|██████████| 677/677 [02:26<00:00,  4.62batch/s, Loss=0.792, Accuracy=84.9]


Epoch 18/40 Completed, Loss: 0.4369, Accuracy: 84.92%
New Best Model Saved with Accuracy: 84.92%
Epoch 19/40


Training: 100%|██████████| 677/677 [02:26<00:00,  4.61batch/s, Loss=0.149, Accuracy=86.1]


Epoch 19/40 Completed, Loss: 0.4071, Accuracy: 86.10%
New Best Model Saved with Accuracy: 86.10%
Epoch 20/40


Training: 100%|██████████| 677/677 [02:27<00:00,  4.58batch/s, Loss=0.55, Accuracy=88.2]


Epoch 20/40 Completed, Loss: 0.3557, Accuracy: 88.19%
New Best Model Saved with Accuracy: 88.19%
Epoch 21/40


Training: 100%|██████████| 677/677 [02:27<00:00,  4.58batch/s, Loss=0.679, Accuracy=88.6]


Epoch 21/40 Completed, Loss: 0.3404, Accuracy: 88.55%
New Best Model Saved with Accuracy: 88.55%
Epoch 22/40


Training: 100%|██████████| 677/677 [02:23<00:00,  4.73batch/s, Loss=0.112, Accuracy=89]


Epoch 22/40 Completed, Loss: 0.3227, Accuracy: 89.04%
New Best Model Saved with Accuracy: 89.04%
Epoch 23/40


Training: 100%|██████████| 677/677 [02:23<00:00,  4.71batch/s, Loss=0.414, Accuracy=90.1]


Epoch 23/40 Completed, Loss: 0.2934, Accuracy: 90.12%
New Best Model Saved with Accuracy: 90.12%
Epoch 24/40


Training: 100%|██████████| 677/677 [02:26<00:00,  4.61batch/s, Loss=0.0633, Accuracy=90.8]


Epoch 24/40 Completed, Loss: 0.2704, Accuracy: 90.84%
New Best Model Saved with Accuracy: 90.84%
Epoch 25/40


Training: 100%|██████████| 677/677 [02:26<00:00,  4.63batch/s, Loss=0.205, Accuracy=90.7]


Epoch 25/40 Completed, Loss: 0.2727, Accuracy: 90.73%
Epoch 26/40


Training: 100%|██████████| 677/677 [02:22<00:00,  4.75batch/s, Loss=0.0239, Accuracy=92.1]


Epoch 26/40 Completed, Loss: 0.2399, Accuracy: 92.11%
New Best Model Saved with Accuracy: 92.11%
Epoch 27/40


Training: 100%|██████████| 677/677 [02:23<00:00,  4.70batch/s, Loss=0.215, Accuracy=93]


Epoch 27/40 Completed, Loss: 0.2176, Accuracy: 93.03%
New Best Model Saved with Accuracy: 93.03%
Epoch 28/40


Training: 100%|██████████| 677/677 [02:24<00:00,  4.68batch/s, Loss=0.407, Accuracy=92.9]


Epoch 28/40 Completed, Loss: 0.2122, Accuracy: 92.88%
Epoch 29/40


Training: 100%|██████████| 677/677 [02:25<00:00,  4.66batch/s, Loss=0.929, Accuracy=93.6]


Epoch 29/40 Completed, Loss: 0.1974, Accuracy: 93.55%
New Best Model Saved with Accuracy: 93.55%
Epoch 30/40


Training: 100%|██████████| 677/677 [02:27<00:00,  4.60batch/s, Loss=0.14, Accuracy=93.9]


Epoch 30/40 Completed, Loss: 0.1985, Accuracy: 93.87%
New Best Model Saved with Accuracy: 93.87%
Epoch 31/40


Training: 100%|██████████| 677/677 [02:31<00:00,  4.48batch/s, Loss=0.049, Accuracy=94.2]


Epoch 31/40 Completed, Loss: 0.1842, Accuracy: 94.20%
New Best Model Saved with Accuracy: 94.20%
Epoch 32/40


Training: 100%|██████████| 677/677 [02:32<00:00,  4.44batch/s, Loss=0.172, Accuracy=94.4]


Epoch 32/40 Completed, Loss: 0.1690, Accuracy: 94.38%
New Best Model Saved with Accuracy: 94.38%
Epoch 33/40


Training: 100%|██████████| 677/677 [02:33<00:00,  4.42batch/s, Loss=0.153, Accuracy=94.8]


Epoch 33/40 Completed, Loss: 0.1636, Accuracy: 94.81%
New Best Model Saved with Accuracy: 94.81%
Epoch 34/40


Training: 100%|██████████| 677/677 [02:34<00:00,  4.37batch/s, Loss=0.542, Accuracy=94.9]


Epoch 34/40 Completed, Loss: 0.1570, Accuracy: 94.85%
New Best Model Saved with Accuracy: 94.85%
Epoch 35/40


Training: 100%|██████████| 677/677 [02:34<00:00,  4.39batch/s, Loss=0.247, Accuracy=95.5]


Epoch 35/40 Completed, Loss: 0.1475, Accuracy: 95.49%
New Best Model Saved with Accuracy: 95.49%
Epoch 36/40


Training: 100%|██████████| 677/677 [02:32<00:00,  4.44batch/s, Loss=0.195, Accuracy=95.1]


Epoch 36/40 Completed, Loss: 0.1471, Accuracy: 95.09%
Epoch 37/40


Training: 100%|██████████| 677/677 [02:30<00:00,  4.49batch/s, Loss=0.127, Accuracy=95.9]


Epoch 37/40 Completed, Loss: 0.1298, Accuracy: 95.89%
New Best Model Saved with Accuracy: 95.89%
Epoch 38/40


Training: 100%|██████████| 677/677 [02:32<00:00,  4.43batch/s, Loss=0.406, Accuracy=95.9]


Epoch 38/40 Completed, Loss: 0.1329, Accuracy: 95.87%
Epoch 39/40


Training: 100%|██████████| 677/677 [02:32<00:00,  4.45batch/s, Loss=0.0261, Accuracy=95.6]


Epoch 39/40 Completed, Loss: 0.1293, Accuracy: 95.63%
Epoch 40/40


Training: 100%|██████████| 677/677 [02:34<00:00,  4.40batch/s, Loss=0.0542, Accuracy=96.2]

Epoch 40/40 Completed, Loss: 0.1197, Accuracy: 96.24%
New Best Model Saved with Accuracy: 96.24%





In [16]:
model.eval()
all_preds = []
all_labels = []
file_names = []

with torch.no_grad():
    for mel_specs, labels, files in test_loader:
        mel_specs = mel_specs.to(device).unsqueeze(1).squeeze(2)
        labels = labels.to(device)

        outputs = model(mel_specs)
        _, predicted = outputs.max(1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        file_names.extend(files)

In [None]:
output_df = pd.DataFrame({
    0: file_names,
    1: [CLASSES[pred] for pred in all_preds]})
output_df.to_csv("model_output.csv", index=False, header=False)


In [18]:
torch.save(model.state_dict(), "audio_classifier.pth")

In [None]:
torch.save(model.state_dict(), "best_model.pth")