In [24]:
"""
This is an example using CLAP to perform zeroshot
    classification on ESC50 (https://github.com/karolpiczak/ESC-50).
"""

from msclap import CLAP
import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import pandas as pd
import os

In [63]:
# Load dataset
root_path = "/work/user_data/jtaylor/data/acoustic_scene_classification/data/CochlScene"
val_df = pd.read_csv(f'{root_path}/val_shuffled_tiny.tsv', sep='\t')
classes = val_df['scene_label'].unique()
display(val_df)

class_id_map = {name: i for i, name in enumerate(classes, 0)}
print(class_id_map)

prompt = 'this is the sound of '
y = [prompt + x for x in classes]
print(y)

Unnamed: 0,filename,scene_label
0,Kitchen/Kitchen_user0500_14825861_004.wav,Car
1,Kitchen/Kitchen_user0365_14829983_000.wav,Kitchen
2,Kitchen/Kitchen_user0445_14861824_000.wav,Kitchen
3,Cafe/Cafe_user0159_14987442_005.wav,Cafe
4,Bus/Bus_user0138_14982391_008.wav,Bus
...,...,...
226,Bus/Bus_user0530_14878229_004.wav,Bus
227,CrowdedIndoor/CrowdedIndoor_user0811_14981731_...,CrowdedIndoor
228,CrowdedIndoor/CrowdedIndoor_user0001_14876853_...,CrowdedIndoor
229,Kitchen/Kitchen_user0585_14834807_000.wav,Kitchen


{'Car': 0, 'Kitchen': 1, 'Cafe': 2, 'Bus': 3, 'ResidentialArea': 4, 'Restaurant': 5, 'Restroom': 6, 'Street': 7, 'CrowdedIndoor': 8, 'Subway': 9, 'SubwayStation': 10, 'Elevator': 11, 'Park': 12}
['this is the sound of Car', 'this is the sound of Kitchen', 'this is the sound of Cafe', 'this is the sound of Bus', 'this is the sound of ResidentialArea', 'this is the sound of Restaurant', 'this is the sound of Restroom', 'this is the sound of Street', 'this is the sound of CrowdedIndoor', 'this is the sound of Subway', 'this is the sound of SubwayStation', 'this is the sound of Elevator', 'this is the sound of Park']


In [64]:
# Load and initialize CLAP
clap_model = CLAP(version = '2023', use_cuda=False)

# Computing text embeddings
text_embeddings = clap_model.get_text_embeddings(y)

In [68]:
# Computing audio embeddings
y_preds, y_labels = [], []
print(os.getcwd())

for i in tqdm(range(len(val_df))):
    path, label = val_df.iloc[i]
    path = f'{root_path}/Val/{path}'
    idx = class_id_map[label]
    #print(idx)
    one_hot_target = torch.nn.functional.one_hot(torch.tensor([idx]), num_classes=len(classes)+1)

    audio_embeddings = clap_model.get_audio_embeddings([path], resample=True)
    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)

    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()

    y_preds.append(y_pred)
    y_labels.append(one_hot_target.detach().cpu().numpy())


/mounts/ud-data/jtaylor/a5f2c23d-7fb6-41a6-ac0c-3a2ffc02b992/data/acoustic_scene_classification/CLAP-main/examples


100%|██████████| 231/231 [00:19<00:00, 11.91it/s]


In [66]:
print(y_preds)
print(y_labels)

[array([[2.3342727e-06, 9.9877101e-01, 6.2616064e-06, 4.8860937e-07,
        9.3500428e-05, 7.9232705e-05, 9.3976600e-04, 4.3500545e-06,
        1.2932780e-05, 1.9139382e-07, 3.2675572e-07, 1.9265119e-05,
        7.0437294e-05]], dtype=float32), array([[1.1373779e-05, 9.9446702e-01, 3.4023742e-05, 1.9009992e-06,
        3.2453315e-04, 3.2638613e-04, 4.7429637e-03, 1.1701374e-05,
        1.2022599e-06, 3.2149990e-07, 2.6058984e-07, 5.1041989e-05,
        2.7247119e-05]], dtype=float32), array([[2.4114372e-06, 9.9098653e-01, 1.4744124e-04, 9.2183427e-07,
        8.8739573e-05, 2.3134758e-03, 6.3867308e-03, 2.9005878e-06,
        2.4908975e-06, 2.3651734e-07, 4.8722751e-07, 6.2111591e-05,
        5.5750238e-06]], dtype=float32), array([[1.2702657e-05, 5.3329865e-04, 8.3285302e-01, 7.7392033e-05,
        8.7633291e-03, 9.9677123e-02, 4.9501625e-03, 1.5035517e-03,
        3.2160338e-03, 9.2121674e-04, 4.5314673e-02, 5.1669759e-04,
        1.6607062e-03]], dtype=float32), array([[3.4716073e-

In [69]:
y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
print('ESC50 Accuracy {}'.format(acc))

ESC50 Accuracy 0.8614718614718615


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

DCASE_ClassNames = ['airport', 'bus', 'metro', 'metro_station', 'park', 'public_square',
 'shopping_mall', 'street_pedestrian', 'street_traffic', 'tram']

# get confusion matrix
conf_matrix = confusion_matrix(y_labels,y_preds)
conf_matrix = np.delete(conf_matrix, np.s_[-3:], axis=1)
conf_mat_norm_recall = conf_matrix.astype('float32')/conf_matrix.sum(axis=1)[:,np.newaxis]
recall_by_class = np.diagonal(conf_mat_norm_recall)
mean_recall = np.mean(recall_by_class)

# Calculate row sums to use for percentages
row_sums = conf_matrix.sum(axis=1)

# Calculate percentages
percentages = (conf_matrix.T / row_sums).T  # Transpose for division, then transpose back

# Format percentages as strings with '%' symbol
annot_data = [['{:.2f}'.format(val) for val in row] for row in percentages]
annot_data = [['' if float(val <= 0.04) else '{:.2f}'.format(val) for val in row] for row in percentages]

plt.figure(dpi=200)

annot_kws = {'size': 6}
ax = sns.heatmap(percentages, xticklabels=DCASE_ClassNames, yticklabels=ClassNames, annot=annot_data, fmt='', square=True, cmap='Blues', annot_kws=annot_kws)
ax.set_xlabel('Predicted Label')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.set_ylabel('True Label')

#plt.subplots_adjust(left=0.2, bottom=0.35)
plt.show()