Import Modules for Zero-Shot Classification

In [1]:
from msclap import CLAP
import torch.nn.functional as F
import pandas as pd
import os

Load and Initialize a CLAP model

In [2]:
# Build the CLAP wrapper for the '2023' model version; use_cuda=False keeps it off the GPU.
clap_model = CLAP(use_cuda=False, version='2023')

Set Up Class Labels, Audio File Paths, and Ground Truths

In [3]:
# Class descriptions used as the zero-shot text prompts.
# NOTE(review): the order appears to follow the UrbanSound8K `classID` column
# ("dog bark" sits at index 3, matching the recorded Label ID) - confirm.
labels = [
    "air conditioner",
    "car horn",
    "children playing",
    "dog bark",
    "drilling",
    "engine idling",
    "gunshot",
    "jackhammer",
    "siren",
    "street music"]

# Audio clips to classify; add more paths here to look up several files at once.
audio_paths = ["../dataset/UrbanSound8K/audio/fold2/4201-3-0-0.wav"]

# The metadata CSV is keyed by bare file name, so strip the directory part.
file_names = [os.path.basename(audio_path) for audio_path in audio_paths]

# Load the metadata
metadata_path = '../dataset/UrbanSound8K/metadata/UrbanSound8K.csv'
metadata = pd.read_csv(metadata_path)

# One filtered frame per requested file (empty when the file is not listed).
file_metadatas = [metadata[metadata['slice_file_name'] == file_name] for file_name in file_names]

# Ground truth for every entry of `audio_paths`, in the same order.
# A file that is missing from the metadata contributes None so the lists stay aligned.
true_labels = []
true_label_ids = []

for file_metadata, file_name in zip(file_metadatas, file_names):
    if not file_metadata.empty:
        label = file_metadata['class'].values[0]  # Human-readable label
        label_id = file_metadata['classID'].values[0]  # Numerical label
        print(f"File: {file_name}, Label: {label}, Label ID: {label_id}")
    else:
        label = None
        label_id = None
        print(f"File: {file_name} not found in metadata.")
    true_labels.append(label)
    true_label_ids.append(label_id)

# Scalar ground truth for the first audio file. The results cell reports
# `similarity[0]`, i.e. the first clip, so these must describe that same clip
# (previously they ended up holding whichever file was matched last, or an
# empty list when nothing matched).
true_label = true_labels[0] if true_labels else None
true_label_id = true_label_ids[0] if true_label_ids else None

File: 4201-3-0-0.wav, Label: dog_bark, Label ID: 3


Get Audio and Text Embeddings

In [4]:
# Embed the clips listed in `audio_paths` and the class prompts in `labels`
# with the CLAP model so they can be compared against each other.
# NOTE(review): presumably one embedding per input, in input order - confirm against msclap.
audio_embeddings = clap_model.get_audio_embeddings(audio_paths)
text_embeddings = clap_model.get_text_embeddings(labels)

Calculate Similarities and Show Results

In [5]:
# Score the audio embeddings against the text (class prompt) embeddings.
similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)

# Normalise along dim=1 so each row sums to 1. The code below indexes `labels`
# with positions taken from a row, so dim 1 is treated as the class axis.
similarity = F.softmax(similarities, dim=1)

# Show up to five classes for the first audio file. topk() raises when k is
# larger than the number of candidates, so clamp k to the number of labels.
top_k = min(5, len(labels))
values, indices = similarity[0].topk(top_k)

# Print the results
print(f"Ground truth: {true_label}")
print("Top predictions:")
# tolist() yields plain Python floats/ints, which format and index cleanly.
for value, index in zip(values.tolist(), indices.tolist()):
    print(f"{labels[index]:>16s}: {100 * value:.2f}%")

Ground truth: dog_bark
Top predictions:
        dog bark: 99.42%
         gunshot: 0.37%
           siren: 0.07%
 air conditioner: 0.05%
      jackhammer: 0.04%
