<a href="https://colab.research.google.com/github/JayTiptown/Music108-Sound-Classification/blob/main/Music108_sound_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install soundata

Collecting soundata
  Downloading soundata-1.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting jams>=0.3.4 (from soundata)
  Downloading jams-0.3.4.tar.gz (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr>=0.16.0 (from soundata)
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting sortedcontainers>=2.0.0 (from jams>=0.3.4->soundata)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting mir_eval>=0.5 (from jams>=0.3.4->soundata)
  Downloading mir_eval-0.7.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting texttable (from p

In [16]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import librosa
import soundata
from tqdm import tqdm
import shutil
import requests
from pathlib import Path

In [17]:
# Load dataset
def prepare_urbansound8k(fold_number=None):
    # Initialize the dataset
    dataset = soundata.initialize('urbansound8k')

    # Download the dataset if not already downloaded
    dataset.download()

    # Get all clip IDs
    clip_ids = dataset.clip_ids

    if fold_number is None:
        # Original behavior - return all data
        file_paths = []
        labels = []

        for clip_id in clip_ids:
            clip = dataset.clip(clip_id)
            file_paths.append(clip.audio_path)
            labels.append(clip.tags.class_id)

        return file_paths, labels
    else:
        # Split by fold
        train_files, train_labels = [], []
        val_files, val_labels = [], []

        for clip_id in clip_ids:
            clip = dataset.clip(clip_id)
            if clip.tags.fold == fold_number:
                val_files.append(clip.audio_path)
                val_labels.append(clip.tags.class_id)
            else:
                train_files.append(clip.audio_path)
                train_labels.append(clip.tags.class_id)

        return train_files, val_files, train_labels, val_labels

In [18]:
# Audio processing

class AudioProcessor:
    def __init__(self, sr=22050, duration=4, n_mels=128):
        self.sr = sr
        self.duration = duration
        self.n_mels = n_mels
        self.n_samples = sr * duration

    def load_audio(self, file_path):
        audio, _ = librosa.load(file_path, sr=self.sr, duration=self.duration)
        if len(audio) < self.n_samples:
            audio = np.pad(audio, (0, self.n_samples - len(audio)))
        return audio[:self.n_samples]

    def get_melspectrogram(self, audio):
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sr,
            n_mels=self.n_mels
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        return mel_spec_db

In [19]:
class SoundDataset(Dataset):
    def __init__(self, file_paths, labels, processor):
        self.file_paths = file_paths
        self.labels = labels
        self.processor = processor

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        audio = self.processor.load_audio(self.file_paths[idx])
        mel_spec = self.processor.get_melspectrogram(audio)
        return torch.FloatTensor(mel_spec), torch.tensor(self.labels[idx])

In [20]:
# CNN model for Sound classification

class AudioCNN(nn.Module):
    def __init__(self, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 16 * 16, 512)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [21]:
# training cell

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    best_val_acc = 0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0

        for specs, labels in tqdm(train_loader):
            specs, labels = specs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(specs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for specs, labels in val_loader:
                specs, labels = specs.to(device), labels.to(device)
                outputs = model(specs)
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()

        val_acc = 100. * val_correct / val_total
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss/len(train_loader):.3f}')
        print(f'Train Acc: {100.*correct/total:.2f}%')
        print(f'Val Acc: {val_acc:.2f}%')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')

In [22]:
# Test Cell: Verify data loading
def test_data_loading():
    # Test without fold specification
    try:
        file_paths, labels = prepare_urbansound8k()
        print("Successfully loaded all data:")
        print(f"Total samples: {len(file_paths)}")
        print(f"Sample file path: {file_paths[0]}")
        print(f"Sample label: {labels[0]}")
    except Exception as e:
        print(f"Error loading all data: {e}")

    # Test with fold specification
    try:
        train_files, val_files, train_labels, val_labels = prepare_urbansound8k(fold_number=0)
        print("\nSuccessfully loaded fold-based data:")
        print(f"Training samples: {len(train_files)}")
        print(f"Validation samples: {len(val_files)}")
        print(f"Sample training file: {train_files[0]}")
        print(f"Sample training label: {train_labels[0]}")
    except Exception as e:
        print(f"Error loading fold-based data: {e}")

# Run test
test_data_loading()

Error loading all data: 'Tags' object has no attribute 'class_id'
Error loading fold-based data: 'Tags' object has no attribute 'fold'


In [14]:
# execution

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load dataset using soundata
    print("Loading dataset...")
    file_paths, labels = prepare_urbansound8k()

    # Initialize processor
    processor = AudioProcessor()

    # Split data
    train_files, val_files, train_labels, val_labels = train_test_split(
        file_paths, labels, test_size=0.2, random_state=42
    )

    print(f"Training samples: {len(train_files)}")
    print(f"Validation samples: {len(val_files)}")

    # Create datasets
    train_dataset = SoundDataset(train_files, train_labels, processor)
    val_dataset = SoundDataset(val_files, val_labels, processor)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    # Initialize model
    model = AudioCNN(num_classes=10).to(device)  # 10 classes for UrbanSound8K

    # Training setup
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train model
    print("Starting training...")
    train_model(model, train_loader, val_loader, criterion, optimizer,
                num_epochs=50, device=device)

if __name__ == '__main__':
    main()

Using device: cpu
Loading dataset...


KeyboardInterrupt: 