In [None]:
from torch.utils.data import DataLoader
from clap import  ClapAudioClassifier
from clap.evaluate import eval_fine_tuned_classification
from clap.datasets import ClapDataset
from clap.utils import get_target_device, load_clap_config

# Evaluate fine-tuned audio classification performance on the ESC-50 dataset

In [None]:
# Pick the encoder pair and the config / checkpoint versions used by this notebook
audio_encoder, text_encoder = "htsat-tiny", "gpt2"
cfg_version = ckpt_version = 1

# Load the matching CLAP config and ask the helper which device to run on
config = load_clap_config(
    audio_encoder=audio_encoder,
    text_encoder=text_encoder,
    version=cfg_version,
)
device = get_target_device()

In [None]:
# Build the ESC50 dataset (requesting the train, val and test kinds) and wrap it
# in a DataLoader whose batch size comes from the training section of the config.
# NOTE(review): if the classifier was fine-tuned on the "train" kind, including it
# here would inflate the reported accuracy — confirm this split choice is intended.
esc50_dataset = ClapDataset(
    config=config,
    kinds=["train", "val", "test"],
    datasets=["ESC50"],
)
esc50_dataloader = DataLoader(
    dataset=esc50_dataset,
    batch_size=config["training"]["batch_size"],
    shuffle=False,  # no reshuffling: samples are visited in dataset order
)

In [None]:
# Load the fine-tuned classifier checkpoint and move it to the target device.
# Eval mode is set explicitly so that dropout / batch-norm layers (if the model
# has any) run in inference mode regardless of what the evaluation helper does.
# Assumes ClapAudioClassifier is a torch.nn.Module (it already exposes `.to()`),
# in which case `.eval()` returns the module itself and is safe to call twice.
clf = ClapAudioClassifier.from_ckpt(
    audio_encoder=audio_encoder,
    text_encoder=text_encoder,
    clap_cfg_version=cfg_version,
    clf_ckpt_version=ckpt_version,
).to(device).eval()

In [None]:
# Run the classifier over the ESC-50 loader and keep the value the helper returns
acc = eval_fine_tuned_classification(
    model=clf,
    eval_loader=esc50_dataloader,
)

In [None]:
# Report the value returned by the evaluation call in the previous cell
print(f'ESC-50 Accuracy: {acc}')