# ATX Practical Datascience: Audio Similarity Search

_Adapted from https://docs.pinecone.io/docs/audio-search_

**Installs:**
```bash
conda install -c huggingface -c conda-forge datasets numpy tqdm python-annoy
pip install panns-inference
```

## Imports

In [49]:
from pathlib import Path
import typing as t
from functools import cache

import numpy as np
from datasets import load_dataset
from panns_inference import AudioTagging
from tqdm.auto import tqdm
from IPython.display import Audio, Markdown, display
from annoy import AnnoyIndex

## Model & Data

In [39]:
@cache
def load_model(device: str = "cuda"):
    """Create the PANNs ``AudioTagging`` model once and reuse it afterwards.

    Args:
        device: Device string handed to ``AudioTagging``. Defaults to
            ``"cuda"`` (the previous hard-coded value); pass ``"cpu"`` when
            no GPU is available.

    Returns:
        The ``AudioTagging`` instance. Results are memoised per ``device``
        value by ``functools.cache``, so repeated calls with the same
        argument return the same object.
    """
    # checkpoint_path=None asks panns_inference for its default checkpoint
    # (per the original comment; presumably downloaded on first use).
    return AudioTagging(checkpoint_path=None, device=device)

@cache
def load_audios():
    """Return every ESC-50 waveform stacked into a single numpy array.

    The "train" split of ``ashraq/esc50`` is fetched from the Hugging Face
    hub and only the raw sample arrays of the "audio" column are kept.
    The result is memoised, so the dataset is read at most once per kernel.
    """
    esc50 = load_dataset("ashraq/esc50", split="train")
    # Each entry of the "audio" column is a dict; keep only its sample array.
    waveforms = [clip["array"] for clip in esc50["audio"]]
    return np.array(waveforms)

## ETL into ANN Index

In [25]:
# Length of every vector stored in the index (must match the embedding size
# returned by model.inference; 2048 for the model used here).
ndim = 2048
# Single source of truth for the on-disk index location.
index_path = Path("esc50-audio.ann")

annoy_index = AnnoyIndex(ndim, "angular")
if index_path.exists():
    # A finished index is already on disk: memory-map it instead of rebuilding.
    annoy_index.load(str(index_path))
else:
    model = load_model()
    audios = load_audios()

    # we will use batches of 64
    batch_size = 64

    # Build in memory and write the file only after build() succeeds.
    # on_disk_build() would create the file up front, so an interrupted run
    # would leave a partial file that the exists() check above would later
    # load as if it were a complete index.
    with tqdm(total=len(audios)) as pbar:
        for start in range(0, len(audios), batch_size):
            # slicing clamps at the end of the array, so the last batch may be short
            batch = audios[start:start + batch_size]
            # generate embeddings for all the audios in the batch
            _, emb = model.inference(batch)
            # item ids are the dataset row numbers; `item_id` (not `id`) keeps
            # the builtin id() usable in later cells
            for item_id, embedding in enumerate(emb.tolist(), start=start):
                annoy_index.add_item(item_id, embedding)
                pbar.update()
    annoy_index.build(10)
    # save() writes the index to disk and reloads it from that file.
    annoy_index.save(str(index_path))

  0%|          | 0/2000 [00:00<?, ?it/s]

## Query ANN Index

In [54]:
@cache
def _load_esc50():
    """Load the ESC-50 "train" split once and reuse it for every lookup."""
    return load_dataset("ashraq/esc50", split="train")

def get_audio(audio_num: int) -> t.Tuple[np.ndarray, str]:
    """Return the waveform and category label of one dataset row.

    Args:
        audio_num: Row number in the ESC-50 "train" split.

    Returns:
        ``(samples, category)`` where ``samples`` is the row's
        ``["audio"]["array"]`` value and ``category`` its ``"category"`` field.
    """
    # The dataset is fetched through a cached helper instead of a notebook
    # global, so this works on a fresh kernel regardless of cell order.
    record = _load_esc50()[audio_num]
    return record["audio"]["array"], record["category"]

def get_audio_embedding(audio_num: int) -> np.ndarray:
    """Compute the model embedding for one dataset row as a flat vector.

    Args:
        audio_num: Row number in the ESC-50 "train" split.

    Returns:
        The embedding returned by ``model.inference`` flattened to 1-D.
    """
    query_audio, _ = get_audio(audio_num)
    # add a leading batch axis: the model is called on a batch of one clip
    query_audio = query_audio[None, :]
    # load_model() is cached; calling it here avoids relying on a global
    # `model` that only exists when the index was rebuilt in this session.
    _, xq = load_model().inference(query_audio)
    return xq.reshape((-1,))

def display_audio(audio_num: int, label: str = "Query Audio", rate: int = 44100):
    """Render a heading with the clip's category followed by an audio player.

    Args:
        audio_num: Row number of the clip to show.
        label: Bold heading text shown before the category. Defaults to
            ``"Query Audio"``; pass something else (e.g. ``"Match"``) when
            displaying search results rather than the query itself.
        rate: Sample rate in Hz passed to the audio player. Defaults to
            44100, the value previously hard-coded here.
    """
    sound_data, sound_category = get_audio(audio_num)
    display(
        Markdown(f"**{label}**: {sound_category}"),
        Audio(sound_data, rate=rate),
    )

In [55]:
# Show the category and an audio player for clip 400 (the clip used as the query below).
display_audio(400)

**Query Audio**: car_horn

In [56]:
# Embed clip 400 and ask the index for its 3 nearest neighbours (returned as item ids).
annoy_index.get_nns_by_vector(get_audio_embedding(400), 3)

[400, 1667, 1666]

In [57]:
# Listen to neighbour 1667 from the result list above to check it sounds similar.
display_audio(1667)

**Query Audio**: car_horn

In [58]:
# Listen to neighbour 1666 from the result list above to check it sounds similar.
display_audio(1666)

**Query Audio**: car_horn