# Dataset 

In [None]:
from datasets.datasets import UrbanSound8KDataset, get_data_loaders

# UrbanSound8K on disk: the dataset root directory and the metadata CSV inside it.
root_dir = '/data/urbansound8k'
data_csv = '/data/urbansound8k/UrbanSound8K.csv'

# Fold split: folds 1-8 for training, fold 10 for validation, fold 9 for testing.
train_fold = list(range(1, 9))
val_fold = [10]
test_fold = [9]

# NOTE(review): mode="attack" is forwarded to the project loader factory; presumably it
# selects raw-waveform samples (as in ESC50Dataset further down) - confirm.
loaders = get_data_loaders(
    data_csv,
    root_dir,
    train_fold,
    val_fold,
    test_fold,
    batch_size=32,
    mode="attack",
)
train_loader, val_loader, test_loader = loaders

In [None]:
from collections import Counter

# Flatten every label batch of the test loader into one list, then tally per class.
all_labels = [
    label
    for _, batch_labels in test_loader
    for label in batch_labels.cpu().numpy()
]
label_counts = Counter(all_labels)
label_counts

In [None]:
import matplotlib.pyplot as plt
import os
# Inspect only the first batch of the train_loader: print its shapes, look up the
# metadata row for the first example, and save a plot of that example to disk.
batch_idx = 0
data, labels = next(iter(train_loader))

print(f"Batch {batch_idx+1}")
# [batch_size, 128, 84] is the shape expected for mel-spectrogram batches.
# NOTE(review): the loaders above are built with mode="attack"; if that mode yields
# raw waveforms, the shape differs and imshow below will not get a 2-D array - confirm.
print(f"Data shape: {data.shape}")
print(f"Labels shape: {labels.shape}")

# Take the first example in the batch
example_spectrogram = data[0].numpy()
example_label = labels[0].item()

# Look the annotation row up by POSITION. The annotations frame may be a filtered
# subset that keeps its original CSV index labels, so feeding an index label into
# .iloc (as was done before) can select the wrong row or raise IndexError.
# NOTE(review): this row only describes data[0] if the loader walks the dataset in
# order (no shuffling) - confirm against get_data_loaders.
annotations = train_loader.dataset.annotations
annotation_idx = batch_idx * len(data)
file_name = annotations.iloc[annotation_idx, 0]    # column 0: audio file name
fold_number = annotations.iloc[annotation_idx, 5]  # column 5: fold number

print(f"File: {file_name}")
print(f"Fold: {fold_number}")
print(f"Label: {example_label}")

# Plot the mel-spectrogram on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(example_spectrogram, aspect='auto', origin='lower')
ax.set_title(f"Mel-Spectrogram Example - Label: {example_label}")
fig.colorbar(image, ax=ax, format='%+2.0f dB')
ax.set_xlabel('Time')
ax.set_ylabel('Mel Frequency Bands')

# Save the plot with a filename that encodes fold, label and source clip.
save_path = f'/home/ilias/projects/adversarial_thesis/data/mel_spectrogram_fold{fold_number}_label{example_label}_{os.path.splitext(file_name)[0]}.png'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
fig.savefig(save_path)
plt.close(fig)

# Model

In [1]:
from models.models import BaselineCNN
import torch 

# Build the 10-class baseline CNN and restore its trained weights.
model = BaselineCNN(num_classes=10)

# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only machine
# (the evaluation further down runs on CPU). weights_only=True restricts unpickling
# to tensors/plain containers, which is all a state_dict needs, and avoids executing
# arbitrary pickled code from the checkpoint file.
state_dict = torch.load(
    '/home/ilias/projects/adversarial_thesis/src/models/baseline_cnn.pth',
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)

# eval() disables dropout for inference; as the last expression it also displays the model.
model.eval()

  from torch.distributed.optim import ZeroRedundancyOptimizer
  model.load_state_dict(torch.load('/home/ilias/projects/adversarial_thesis/src/models/baseline_cnn.pth'))


BaselineCNN(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=10240, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Local imports so this cell also works before the later cells that import these names.
import librosa
import numpy as np
from utils.utils import extract_mel_spectrogram

audio_file = "/data/urbansound8k/fold10/15544-5-0-8.wav"

# The model cannot be called without an input (model() raises TypeError), so build one
# from audio_file using the same steps ESC50Dataset applies: load at 22050 Hz,
# pad/trim to 4 seconds, then extract the mel-spectrogram.
waveform, sample_rate = librosa.load(audio_file, sr=22050)
target_length = 4 * sample_rate
if len(waveform) < target_length:
    waveform = np.pad(waveform, (0, target_length - len(waveform)), mode='constant')
else:
    waveform = waveform[:target_length]

# NOTE(review): assumes extract_mel_spectrogram returns a [1, n_mels, frames] array/tensor
# (the ESC-50 sample output below reports torch.Size([1, 128, 84])) - confirm.
mel_spectrogram = torch.as_tensor(extract_mel_spectrogram(waveform, sample_rate), dtype=torch.float32)

# Add the batch dimension and place the input on the same device as the model weights.
model_device = next(model.parameters()).device
model_input = mel_spectrogram.unsqueeze(0).to(model_device)

# Inference only: no gradients needed.
with torch.no_grad():
    output1 = model(model_input)

In [None]:
from loops.trainer import test

# Evaluate the loaded model on the test loader, running on the CPU.
cpu_device = torch.device("cpu")
test(test_loader=test_loader, model=model, device=cpu_device)

# Attacks

### PSO Attack example

Load audio

In [None]:
import librosa
import IPython

# Example clip used throughout the attack walkthrough.
audio_file = "/data/urbansound8k/fold10/15544-5-0-8.wav"

# Build the inline audio player first, then hand it to the notebook display machinery.
example_player = IPython.display.Audio(audio_file)
IPython.display.display(example_player)

Plot original waveform

In [None]:
import matplotlib.pyplot as plt

# Decode the example clip; no sr argument is given, so librosa's default rate applies
# and is returned as `sr`.
original_audio, sr = librosa.load(audio_file)

# Draw the clean waveform on its own axes.
fig, ax = plt.subplots(figsize=(10, 3))
librosa.display.waveshow(original_audio, sr=sr, ax=ax)

ax.set_title('Waveform of Example Audio File')
ax.label_outer()
plt.show()

Create noise & plot noise

In [None]:
import numpy as np

# Maximum noise amplitude, expressed as a fraction of the local signal amplitude.
epsilon = 0.3

# Seeded generator so the noise (and everything derived from it: SNR, perturbed
# audio, L2 norm) is reproducible across kernel restarts.
NOISE_SEED = 0
noise_rng = np.random.default_rng(NOISE_SEED)

# Per-sample uniform noise in [-|x|, |x|], scaled by epsilon, so each noise sample
# is bounded by epsilon times the magnitude of the corresponding audio sample.
amplitude_bound = np.abs(original_audio)
noise = noise_rng.uniform(-amplitude_bound, amplitude_bound) * epsilon

# Plot the noise waveform
fig, ax = plt.subplots(figsize=(10, 3))
librosa.display.waveshow(noise, sr=sr, ax=ax)

# Title names the noise; it previously repeated the title of the original-audio plot.
ax.set(title='Waveform of Random Noise')
ax.label_outer()
plt.show()

Compute SNR

In [None]:
from utils.utils import calculate_snr

# Signal-to-noise ratio between the clean clip and the generated noise, computed by the
# project helper calculate_snr(signal, noise).
# NOTE(review): units are presumably dB - confirm against utils.utils.calculate_snr.
snr = calculate_snr(original_audio, noise)
print(f"SNR = {snr}")

Perturbed audio

In [None]:
# Adversarial-style example: the clean clip with the scaled noise superimposed.
perturbed = original_audio + noise

# Draw the perturbed waveform on its own axes.
fig, ax = plt.subplots(figsize=(10, 3))
librosa.display.waveshow(perturbed, sr=sr, ax=ax)

ax.set_title('Waveform of perturbed example')
ax.label_outer()
plt.show()

In [None]:
# Play back at the rate the audio was loaded with (`sr` from librosa.load); the
# previous hard-coded 22010 was a typo and made playback slightly slow and flat.
IPython.display.Audio(perturbed, rate=sr)

L2 Norm

In [None]:
# L2 norm of the perturbation, i.e. the distance between the perturbed and the clean
# waveform. (Subtracting original_audio from itself, as before, always yields 0.)
l2_penalty = np.linalg.norm(perturbed - original_audio)
l2_penalty

# ESC-50

In [4]:
import os
import pandas as pd
import numpy as np
import torch
import librosa
from torch.utils.data import Dataset, DataLoader
from utils.utils import extract_mel_spectrogram

# Maps each of the 50 fine-grained ESC-50 category names (as written in the `category`
# column of esc50.csv) to one of the 5 higher-level classes, 10 categories per class.
# Keys must match the CSV spelling exactly (underscores, not spaces); any category
# missing here is labelled "Unknown" by ESC50Dataset.
CATEGORY_MAPPING = {
    # Animals
    "dog": "Animals",
    "rooster": "Animals",
    "pig": "Animals",
    "cow": "Animals",
    "frog": "Animals",
    "cat": "Animals",
    "hen": "Animals",
    "insects": "Animals",
    "sheep": "Animals",
    "crow": "Animals",
    # Natural soundscapes & water sounds
    "rain": "Natural soundscapes & water sounds",
    "sea_waves": "Natural soundscapes & water sounds",
    "crackling_fire": "Natural soundscapes & water sounds",
    "crickets": "Natural soundscapes & water sounds",
    "chirping_birds": "Natural soundscapes & water sounds",
    "water_drops": "Natural soundscapes & water sounds",
    "wind": "Natural soundscapes & water sounds",
    "pouring_water": "Natural soundscapes & water sounds",
    "toilet_flush": "Natural soundscapes & water sounds",
    "thunderstorm": "Natural soundscapes & water sounds",
    # Human, non-speech sounds
    "crying_baby": "Human, non-speech sounds",  # was "crying baby", which never matched the CSV
    "sneezing": "Human, non-speech sounds",
    "clapping": "Human, non-speech sounds",
    "breathing": "Human, non-speech sounds",
    "coughing": "Human, non-speech sounds",
    "footsteps": "Human, non-speech sounds",
    "laughing": "Human, non-speech sounds",
    "brushing_teeth": "Human, non-speech sounds",
    "snoring": "Human, non-speech sounds",
    "drinking_sipping": "Human, non-speech sounds",
    # Interior/domestic sounds
    "door_wood_knock": "Interior/domestic sounds",
    "mouse_click": "Interior/domestic sounds",
    "keyboard_typing": "Interior/domestic sounds",
    "door_wood_creaks": "Interior/domestic sounds",
    "can_opening": "Interior/domestic sounds",
    "washing_machine": "Interior/domestic sounds",
    "vacuum_cleaner": "Interior/domestic sounds",
    "clock_alarm": "Interior/domestic sounds",
    "clock_tick": "Interior/domestic sounds",
    "glass_breaking": "Interior/domestic sounds",
    # Exterior/urban noises
    "helicopter": "Exterior/urban noises",
    "chainsaw": "Exterior/urban noises",
    "siren": "Exterior/urban noises",
    "car_horn": "Exterior/urban noises",
    "engine": "Exterior/urban noises",
    "train": "Exterior/urban noises",
    "church_bells": "Exterior/urban noises",
    "airplane": "Exterior/urban noises",
    "fireworks": "Exterior/urban noises",
    "hand_saw": "Exterior/urban noises",
}

class ESC50Dataset(Dataset):
    """ESC-50 audio clips served as a PyTorch ``Dataset`` with coarse category labels.

    Every clip is loaded at 22050 Hz and padded/trimmed to exactly 4 seconds. The label
    is the higher-level class looked up in ``CATEGORY_MAPPING``.
    """

    # Modes understood by __getitem__. Anything else is rejected in __init__ instead of
    # failing later with an UnboundLocalError on the first item access.
    SUPPORTED_MODES = ('train', 'evaluate', 'attack', 'AudioCLIP')

    def __init__(self, annotations_file, root_dir, folds, mode='train', transform=None):
        """
        ESC-50 dataset class.

        Args:
            annotations_file (str): Path to the annotations CSV file.
            root_dir (str): Root directory containing audio files.
            folds (list): List of fold numbers to include in the dataset.
            mode (str): 'train' or 'evaluate' for mel-spectrograms, 'attack' for raw
                waveforms (numpy array), 'AudioCLIP' for waveforms as float32 tensors.
            transform (callable, optional): Transformation applied to the mel-spectrogram
                features (only used in 'train'/'evaluate' mode).

        Raises:
            ValueError: If ``mode`` is not one of ``SUPPORTED_MODES``.
        """
        if mode not in self.SUPPORTED_MODES:
            raise ValueError(
                f"Unsupported mode {mode!r}; expected one of {self.SUPPORTED_MODES}"
            )

        self.root_dir = root_dir
        self.transform = transform
        self.folds = folds
        self.mode = mode
        self.sr = 22050
        self.target_length = 4 * self.sr  # 4 seconds at 22050 Hz

        # Keep only the rows belonging to the requested folds.
        all_annotations = pd.read_csv(annotations_file)
        self.annotations = all_annotations[all_annotations['fold'].isin(self.folds)]

    def __len__(self):
        """Number of clips in the selected folds."""
        return len(self.annotations)

    def _fix_length(self, audio):
        """Zero-pad at the end, or truncate, so the clip has exactly ``target_length`` samples."""
        missing = self.target_length - len(audio)
        if missing > 0:
            return np.pad(audio, (0, missing), mode='constant')
        return audio[:self.target_length]

    def __getitem__(self, idx):
        """
        Retrieves an item from the dataset.

        Args:
            idx (int): Positional index of the item within the selected folds.

        Returns:
            tuple: (features, label, file path). ``features`` is the output of
            ``extract_mel_spectrogram`` (optionally transformed) in 'train'/'evaluate'
            mode, the numpy waveform in 'attack' mode, or a float32 tensor of the
            waveform in 'AudioCLIP' mode. ``label`` is the higher-level category
            string, or "Unknown" when the category is missing from CATEGORY_MAPPING.

        Raises:
            ValueError: If ``self.mode`` was changed to an unsupported value after
                construction.
        """
        # Column 0 holds the audio file name, column 3 the fine-grained category.
        file_name = self.annotations.iloc[idx, 0]
        category = self.annotations.iloc[idx, 3]
        audio_file_path = os.path.join(self.root_dir, file_name)

        # Load (resampled to self.sr) and force a uniform 4-second length.
        audio, sample_rate = librosa.load(audio_file_path, sr=self.sr)
        audio = self._fix_length(audio)

        # Map the fine-grained category to its higher-level class.
        label = CATEGORY_MAPPING.get(category, "Unknown")

        # Depending on the mode, return the appropriate features
        if self.mode in ('train', 'evaluate'):
            features = extract_mel_spectrogram(audio, sample_rate)
            if self.transform:
                features = self.transform(features)
        elif self.mode == 'attack':
            features = audio
        elif self.mode == 'AudioCLIP':
            features = torch.tensor(audio, dtype=torch.float32)
        else:
            raise ValueError(
                f"Unsupported mode {self.mode!r}; expected one of {self.SUPPORTED_MODES}"
            )

        return features, label, audio_file_path

def get_esc50_data_loaders(annotations_file, root_dir, train_folds, val_folds, test_folds, batch_size=32, transform=None, mode="train"):
    """Build the train/validation/test ``DataLoader`` triple for ESC-50.

    One ``ESC50Dataset`` is created per fold list, all sharing the same ``mode`` and
    ``transform``. Only the training loader shuffles its samples.

    Returns:
        tuple: (train_loader, val_loader, test_loader)
    """
    # All three datasets are constructed first, in train/val/test order.
    split_datasets = [
        ESC50Dataset(annotations_file, root_dir, split_folds, mode=mode, transform=transform)
        for split_folds in (train_folds, val_folds, test_folds)
    ]

    # Shuffle the training split only; validation and test keep dataset order.
    shuffle_flags = (True, False, False)
    train_loader, val_loader, test_loader = [
        DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle)
        for split_dataset, shuffle in zip(split_datasets, shuffle_flags)
    ]

    return train_loader, val_loader, test_loader


In [None]:
# Where the ESC-50 metadata CSV and the audio clips live.
annotations_file = "/data/ESC-50-master/meta/esc50.csv"
audio_root_dir = "/data/ESC-50-master/audio"

# Example fold split (only the training folds are used in this cell).
train_folds = [1, 2, 3]
val_folds = [4]
test_folds = [5]

# Build the training dataset in mel-spectrogram mode and pull its first sample.
dataset = ESC50Dataset(annotations_file, audio_root_dir, folds=train_folds, mode='train')
features, label, file_path = dataset[0]

# Tensors report their shape; anything else falls back to its length.
if isinstance(features, torch.Tensor):
    feature_shape = features.shape
else:
    feature_shape = len(features)

print("Sample File Path:", file_path)
print("Label (Higher Class):", label)
print("Feature Shape:", feature_shape)


Sample File Path: /data/ESC-50-master/audio/1-100032-A-0.wav
Label (Higher Class): Animals
Feature Shape: torch.Size([1, 128, 84])


  mel_tensor = torch.tensor(mel_spectrogram_db, dtype=torch.float32).unsqueeze(0).to(device)
