In [11]:
"""
This is an example of using CLAP to perform zero-shot
classification on ESC-50 (https://github.com/karolpiczak/ESC-50).
"""

from msclap import CLAP
from esc50_dataset import ESC50
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. esc50_dataset.py) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
# Build the ESC-50 dataset and one text prompt per class
root_path = "root_path"  # directory that holds ESC-50-master/
# With download=False the code assumes base_folder='ESC-50-master' in esc50_dataset.py
dataset = ESC50(download=True, root=root_path)
prompt = 'this is the sound of '
y = [prompt + class_name for class_name in dataset.classes]

Using downloaded and verified file: root_path/ESC-50-master.zip
Loading audio files


2000it [00:00, 9458.87it/s]


In [17]:
# Show the text prompts (one per class) that will be embedded by CLAP
print(y)


['this is the sound of airplane', 'this is the sound of breathing', 'this is the sound of brushing teeth', 'this is the sound of can opening', 'this is the sound of car horn', 'this is the sound of cat', 'this is the sound of chainsaw', 'this is the sound of chirping birds', 'this is the sound of church bells', 'this is the sound of clapping', 'this is the sound of clock alarm', 'this is the sound of clock tick', 'this is the sound of coughing', 'this is the sound of cow', 'this is the sound of crackling fire', 'this is the sound of crickets', 'this is the sound of crow', 'this is the sound of crying baby', 'this is the sound of dog', 'this is the sound of door wood creaks', 'this is the sound of door wood knock', 'this is the sound of drinking sipping', 'this is the sound of engine', 'this is the sound of fireworks', 'this is the sound of footsteps', 'this is the sound of frog', 'this is the sound of glass breaking', 'this is the sound of hand saw', 'this is the sound of helicopter', 

In [15]:
# Instantiate the '2023' CLAP model with CUDA disabled
clap_model = CLAP(use_cuda=False, version='2023')

# Embed the class prompts in `y` once, up front
text_embeddings = clap_model.get_text_embeddings(y)

In [22]:
# Computing audio embeddings and per-class scores for every item in the dataset.
# Results are collected per item and concatenated later for the accuracy computation.
y_preds, y_labels = [], []

for i in tqdm(range(len(dataset))):
    # Each item is a 3-tuple; the middle field is not needed here.
    x, _, one_hot_target = dataset[i]

    # get_audio_embeddings takes a list, so the single item is wrapped in one.
    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
    # Softmax over dim=1 normalises the similarity scores across the text prompts.
    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
    y_preds.append(y_pred)
    y_labels.append(one_hot_target.detach().cpu().numpy())
    # NOTE: no early `break` here — the whole dataset must be scored, otherwise
    # the reported accuracy would be based on a single item.

100%|██████████| 2000/2000 [02:16<00:00, 14.63it/s]


In [25]:
# Sanity check: number of collected predictions, then the length of the first label array
print(len(y_preds), len(y_labels[0]), sep="\n")

2000
1


In [None]:
# Join the per-item arrays along axis 0. The results are bound to new names
# (instead of overwriting y_labels / y_preds) so this cell can be re-run safely:
# concatenating the already-concatenated arrays a second time would flatten them
# and break the argmax over axis=1 below.
labels_all = np.concatenate(y_labels, axis=0)
preds_all = np.concatenate(y_preds, axis=0)

# The class index is the argmax over axis=1 for both the one-hot targets and the scores.
acc = accuracy_score(np.argmax(labels_all, axis=1), np.argmax(preds_all, axis=1))

# Print as a percentage so the output matches the documented "ESC50 Accuracy: 93.9%" form.
print('ESC50 Accuracy: {:.1%}'.format(acc))

"""
The output:

ESC50 Accuracy: 93.9%

"""