In [1]:
import io
import os
import glob
import json
import time
import tqdm
import signal
import argparse
import numpy as np

import torch
import torch.utils.data

import torchvision as tv

import ignite.engine as ieng
import ignite.metrics as imet
import ignite.handlers as ihan

from typing import Any
from typing import Dict
from typing import List
from typing import Type
from typing import Union
from typing import Optional

from termcolor import colored

from collections import defaultdict
from collections.abc import Iterable

from ignite_trainer import _utils
from ignite_trainer import _visdom
from ignite_trainer import _interfaces
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model = '/home/ilias/projects/AudioCLIP/assets/UrbanSound8K_Multimodal-Audio-x2_ACLIP-CV01/UrbanSound8K_Multimodal-Audio-x2_ACLIP-CV01_ACLIP-CV01_performance=0.9188.pt'


  from torch.distributed.optim import ZeroRedundancyOptimizer


In [2]:
from PIL import Image
Dataset: Type = _utils.load_class('utils.datasets.UrbanSound8K')
dataset_args = {
    "root": "/data/urbansound8k",
    "sample_rate": 44100, 
    "fold": 10, 
    "mono": False,
    "training": {"key": "train", "yes": True, "no": False}
}
batch_train = 32
batch_test = 32
workers_train = 4
workers_test = 4

transforms = [
    {
      "class": "utils.transforms.ToTensor1D",
      "args": {}
    },
    {
      "class": "utils.transforms.RandomFlip",
      "args": {"p": 0.5},
      "test": False
    },
    {
      "class": "utils.transforms.RandomScale",
      "args": {"max_scale": 1.50},
      "test": False
    },
    {
      "class": "utils.transforms.RandomPadding",
      "args": {"out_len": 176400},
      "test": False
    },
    {
      "class": "utils.transforms.RandomCrop",
      "args": {"out_len": 176400},
      "test": False
    },
    {
      "class": "utils.transforms.RandomNoise",
      "args": {"snr_min_db": 10.0, "snr_max_db": 120.0, "p": 0.25},
      "test": False
    },
    {
      "class": "utils.transforms.RandomPadding",
      "args": {"out_len": 176400, "train": False},
      "train": False
    },
    {
      "class": "utils.transforms.RandomCrop",
      "args": {"out_len": 176400, "train": False},
      "train": False
    }
  ]

transforms_train = list()
transforms_test = list()

for idx, transform in enumerate(transforms):
    use_train = transform.get('train', True)
    use_test = transform.get('test', True)

    transform = _utils.load_class(transform['class'])(**transform['args'])

    if use_train:
        transforms_train.append(transform)
    if use_test:
        transforms_test.append(transform)

    transforms[idx]['train'] = use_train
    transforms[idx]['test'] = use_test

transforms_train = tv.transforms.Compose(transforms_train)
transforms_test = tv.transforms.Compose(transforms_test)
print(transforms_test)

Compose(
    ToTensor1D()
    RandomPadding()
    RandomCrop()
)


In [3]:
train_loader, eval_loader = _utils.get_data_loaders(
    Dataset,
    dataset_args,
    batch_train,
    batch_test,
    workers_train,
    workers_test,
    transforms_train,
    transforms_test
)

NOT RANDOM FOLD
Loading UrbanSound8K (train=True)
NOT RANDOM FOLD
Loading UrbanSound8K (train=False)


In [4]:
from model.audioclip import AudioCLIP

model = AudioCLIP(pretrained=pretrained_model,
                  multilabel=False)
model = model.to(device=device)

In [5]:
# disable all parameters
for p in model.parameters():
    p.requires_grad = False

# enable only audio-related parameters
for p in model.audio.parameters():
    p.requires_grad = True

model.logit_scale_ai.requires_grad = True
model.logit_scale_at.requires_grad = True

optimizer = torch.optim.SGD(model.parameters(), lr=1e-5, momentum=0.9, nesterov=True, weight_decay=5e-4)

In [6]:
num_params_total = sum(p.numel() for p in model.parameters())
num_params_train = sum(p.numel() for grp in optimizer.param_groups for p in grp['params'])

params_total_label = ''
params_train_label = ''
if num_params_total > 1e6:
    num_params_total /= 1e6
    params_total_label = 'M'
elif num_params_total > 1e3:
    num_params_total /= 1e3
    params_total_label = 'k'

if num_params_train > 1e6:
    num_params_train /= 1e6
    params_train_label = 'M'
elif num_params_train > 1e3:
    num_params_train /= 1e3
    params_train_label = 'k'
tqdm.write('Total number of parameters: {:.2f}{}'.format(num_params_total, params_total_label))
tqdm.write('Number of trainable parameters: {:.2f}{}'.format(num_params_train, params_train_label))


Total number of parameters: 134.11M
Number of trainable parameters: 134.11M


In [None]:
model.to(device)
model.eval()

from sklearn.metrics import accuracy_score, classification_report

# Ensure y_true and y_pred are lists
y_true = []
y_pred_list = []

with torch.no_grad():
    for batch in tqdm(eval_loader, desc="Evaluating"):
        audio, _, text = batch

        ((audio_features, _, _), _), _ = model(
            audio=audio,
            batch_indices=torch.arange(audio.shape[0], dtype=torch.int64, device=device)
        )
        audio_features = audio_features.unsqueeze(1)

        ((_, _, text_features), _), _ = model(
            text=[
                [eval_loader.dataset.class_idx_to_label[class_idx]]
                for class_idx in sorted(eval_loader.dataset.class_idx_to_label.keys())
            ],
            batch_indices=torch.arange(
                len(eval_loader.dataset.class_idx_to_label), dtype=torch.int64, device=device
            )
        )
        text_features = text_features.unsqueeze(1).transpose(0, 1)

        logit_scale_at = torch.clamp(model.logit_scale_at.exp(), min=1.0, max=100.0)
        logits = (logit_scale_at * audio_features @ text_features.transpose(-1, -2)).squeeze(1)

        y = torch.zeros(
            audio.shape[0], len(eval_loader.dataset.class_idx_to_label), dtype=torch.int8, device=device
        )
        for item_idx, labels in enumerate(text):
            class_ids = list(sorted([
                eval_loader.dataset.label_to_class_idx[lb] for lb in labels
            ]))
            y[item_idx][class_ids] = 1

        y_pred_batch = torch.softmax(logits, dim=-1)
        y_pred_batch = y_pred_batch.argmax(dim=-1)  # Get predicted class index
        y = y.argmax(dim=-1)  # Convert true labels to class indices for comparison

        y_true.extend(y.cpu().numpy())
        y_pred_list.extend(y_pred_batch.cpu().numpy())

# Compute metrics
y_true_np = np.array(y_true)
y_pred_np = np.array(y_pred_list)

# Accuracy
accuracy = accuracy_score(y_true_np, y_pred_np)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report
print("Classification Report:")
print(classification_report(y_true_np, y_pred_np, target_names=list(eval_loader.dataset.class_idx_to_label.values())))


In [9]:
import torch
import torchaudio
import librosa

def pad_audio(waveform, target_length):
    """
    Pads or truncates the waveform to the target length.

    Args:
        waveform (torch.Tensor): The audio waveform (C, L), where C is the channel and L is the length.
        target_length (int): The target length in samples.

    Returns:
        torch.Tensor: The padded or truncated waveform.
    """
    current_length = waveform.shape[-1]

    if current_length < target_length:
        # Pad waveform to the right to reach the target length
        padding = target_length - current_length
        waveform = torch.nn.functional.pad(waveform, (0, padding))
    elif current_length > target_length:
        # Truncate waveform to the target length
        waveform = waveform[:, :target_length]

    return waveform

def infer_audio_class(audio_path, model, device, label_mapping, transform_audio=None):
    model.eval()

    # Load and preprocess audio
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = pad_audio(waveform, sample_rate*4)
    waveform = waveform.numpy()  # Ensure compatibility with `transforms_test``
   
    for t in transforms_test.transforms:
        waveform = t(waveform)

    # Move to GPU and free memory
    torch.cuda.empty_cache()
    waveform = waveform.to(device)




    with torch.no_grad():
        # Model inference
        ((audio_features, _, _), _), _ = model(
            audio=waveform,
            batch_indices=torch.arange(waveform.shape[0], dtype=torch.int64, device=device)
        )
        audio_features = audio_features.unsqueeze(1)

        # Generate text features
        ((_, _, text_features), _), _ = model(
            text=[
                [label_mapping[class_idx]]
                for class_idx in sorted(label_mapping.keys())
            ],
            batch_indices=torch.arange(len(label_mapping), dtype=torch.int64, device=device)
        )
        text_features = text_features.unsqueeze(1).transpose(0, 1)

        # Compute similarity scores
        logit_scale_at = torch.clamp(model.logit_scale_at.exp(), min=1.0, max=100.0)
        y_pred = (logit_scale_at * audio_features @ text_features.transpose(-1, -2)).squeeze(1)
        y = torch.zeros(
                    waveform.shape[0], len(eval_loader.dataset.class_idx_to_label), dtype=torch.int8, device=device
                )
           
        predictions = torch.softmax(y_pred, dim=-1)
        y = y.argmax(dim=-1)

                
        predictions = predictions.cpu().numpy().squeeze()
        predicted_idx = np.argmax(predictions)
        predicted_label = label_mapping[predicted_idx]

    return {
         "predictions": predictions.tolist(),
         "predicted_label": predicted_label
     }


# Example Usage:
# Assuming label_mapping is a dictionary {0: "class1", 1: "class2", ...}
audio_path = "/home/ilias/projects/adversarial_thesis/data/orig_1_siren.wav"
result = infer_audio_class(audio_path, model, device, eval_loader.dataset.class_idx_to_label, transform_audio=transforms_test)
print(result)


{'predictions': [0.19977906346321106, 0.06820614635944366, 0.010029755532741547, 0.3603510558605194, 0.03957076370716095, 0.07147389650344849, 0.1348404586315155, 0.010213349014520645, 0.09584540128707886, 0.009690063074231148], 'predicted_label': 'dog bark'}
