# audio fingerprint retrivial

> [Audio Fingerprint I: Build a Demo with Towhee & Milvus](https://github.com/towhee-io/examples/blob/f3c11a9f153ed17c1957ff89727e1b5cc9cb741f/audio/audio_fingerprint/audio_fingerprint_beginner.ipynb)

## prepare the data

`pdm run data_gtzan`

In [1]:
import pandas as pd

CSV_FILE = "./data/audio_fp/ground_truth.csv"

df = pd.read_csv(CSV_FILE)
df.head()

Unnamed: 0,query,answer,time,snr,reverb
0,audio_fp/queries/q0079_blues.00078_snr3_stairw...,audio_fp/candidates/blues.00078.wav,10.095964,3.930908,stairway1
1,audio_fp/queries/q0084_blues.00083_snr-10_stai...,audio_fp/candidates/blues.00083.wav,6.786349,-9.423101,stairway4
2,audio_fp/queries/q0001_blues.00000_snr5_stairw...,audio_fp/candidates/blues.00000.wav,2.534059,5.011763,stairway
3,audio_fp/queries/q0064_blues.00063_snr-10_meet...,audio_fp/candidates/blues.00063.wav,7.999637,-9.451896,meeting
4,audio_fp/queries/q0072_blues.00071_snr2_stairw...,audio_fp/candidates/blues.00071.wav,19.31288,2.812108,stairway


In [2]:
df["query"] = df["query"].str.replace("audio_fp", "./data/audio_fp", regex=False)
df["answer"] = df["answer"].str.replace("audio_fp", "./data/audio_fp", regex=False)
df.head()

Unnamed: 0,query,answer,time,snr,reverb
0,./data/audio_fp/queries/q0079_blues.00078_snr3...,./data/audio_fp/candidates/blues.00078.wav,10.095964,3.930908,stairway1
1,./data/audio_fp/queries/q0084_blues.00083_snr-...,./data/audio_fp/candidates/blues.00083.wav,6.786349,-9.423101,stairway4
2,./data/audio_fp/queries/q0001_blues.00000_snr5...,./data/audio_fp/candidates/blues.00000.wav,2.534059,5.011763,stairway
3,./data/audio_fp/queries/q0064_blues.00063_snr-...,./data/audio_fp/candidates/blues.00063.wav,7.999637,-9.451896,meeting
4,./data/audio_fp/queries/q0072_blues.00071_snr2...,./data/audio_fp/candidates/blues.00071.wav,19.31288,2.812108,stairway


In [4]:
import glob
import os
import statistics

import gradio
import IPython
import pandas as pd
import towhee
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    utility,
)
from sklearn.metrics import accuracy_score
from towhee import ops, pipe
from towhee.datacollection import DataCollection

## observe the data

In [5]:
example_query = df["query"][0]
example_candidate = df["answer"][0]

IPython.display.display(
    f"example query: {example_query}",
    IPython.display.Audio(example_query),
    f"example answer: {example_candidate}",
    IPython.display.Audio(example_candidate),
)

'example query: ./data/audio_fp/queries/q0079_blues.00078_snr3_stairway1.wav'

'example answer: ./data/audio_fp/candidates/blues.00078.wav'

In [6]:
def get_gt(query_path):
    filename = query_path.split("/")[-1]
    name = filename.split("_")[1]
    answer = os.path.join("./data/audio_fp", "candidates", name + ".wav")
    return answer

## Create a Milvus Collection

In [7]:
HOST = "localhost"
PORT = "19530"
COLLECTION_NAME = "nnfp"
INDEX_TYPE = "IVF_FLAT"
METRIC_TYPE = "L2"
DIM = 128
TOPK = 10

connections.connect(host=HOST, port=PORT)

# Create Milvus collection
fields = [
    FieldSchema(
        name="id",
        dtype=DataType.INT64,
        description="embedding ids",
        is_primary=True,
        auto_id=True,
    ),
    FieldSchema(
        name="embedding",
        dtype=DataType.FLOAT_VECTOR,
        description="audio embeddings",
        dim=DIM,
    ),
    FieldSchema(
        name="path", dtype=DataType.VARCHAR, description="audio path", max_length=500
    ),
]
schema = CollectionSchema(fields=fields, description="audio fingerprints")

if utility.has_collection(COLLECTION_NAME):
    collection = Collection(COLLECTION_NAME)
    collection.drop()  # drop collection if it exists

collection = Collection(name=COLLECTION_NAME, schema=schema)

# Create index
index_params = {
    "metric_type": METRIC_TYPE,
    "index_type": INDEX_TYPE,
    "params": {"nlist": 2048},
}

status = collection.create_index(field_name="embedding", index_params=index_params)

## Audio Fingerprinting

In [8]:
emb_pipe = (
    pipe.input("url")
    .map("url", "frames", ops.audio_decode.ffmpeg())
    .map("frames", "embedding", ops.audio_embedding.nnfp())
    .output("embedding")
)
DataCollection(emb_pipe(example_candidate)).show()

  from .autonotebook import tqdm as notebook_tqdm


embedding
"[0.0070677847, 0.09711096, 0.017180806, ...] shape=(30, 128)"


In [9]:
insert_pipe = (
    pipe.input("path")
    .map("path", "frames", ops.audio_decode.ffmpeg())
    .flat_map("frames", "fingerprints", ops.audio_embedding.nnfp())
    .map(
        ("fingerprints", "path"),
        "milvus_res",
        ops.ann_insert.milvus_client(
            host=HOST, port=PORT, collection_name=COLLECTION_NAME
        ),
    )
    .output("fingerprints")
)


path = glob.glob("./data/audio_fp/candidates/*.wav")

for i, p in enumerate(path):
    res = insert_pipe(p)

In [17]:
collection.load()
print(f"Total number of embeddings in the collection: {collection.num_entities}")

Total number of embeddings in the collection: 0


In [11]:
def vote(milvus_res):
    votes = {}
    for res in milvus_res:
        path = res[2]
        score = res[1]
        if path not in votes:
            votes[path] = score
        else:
            votes[path] = votes[path] + score
    votes = sorted(votes.items(), key=lambda item: item[1], reverse=True)
    return votes[0]


def select(pred, score):
    preds = {}
    for i, j in zip(pred, score):
        if i not in preds:
            preds[i] = j
        else:
            preds[i] += j

    final_preds = sorted(preds.items(), key=lambda item: item[1], reverse=True)
    return final_preds[0][0]

In [13]:
collection.load()
search_pipe = (
    pipe.input("path")
    .map("path", "frames", ops.audio_decode.ffmpeg())
    .flat_map("frames", "embs", ops.audio_embedding.nnfp())
    .map(
        "embs",
        "milvus_res",
        ops.ann_search.milvus_client(
            host=HOST,
            port=PORT,
            collection_name=COLLECTION_NAME,
            metric_type=METRIC_TYPE,
            limit=TOPK,
            output_fields=["path"],
        ),
    )
    .map("milvus_res", ("pred", "score"), vote)
    .window_all(("pred", "score"), "result", select)
)


query_pipe = search_pipe.output("path", "result")
DataCollection(query_pipe(example_query)).show()

path,result
./data/audio_fp/queries/q0079_blues.00078_snr3_stairway1.wav,./data/audio_fp/candidates/blues.00078.wav


In [15]:
query_pipe = search_pipe.output("path", "result")
DataCollection(query_pipe(df["query"][42])).show()

path,result
./data/audio_fp/queries/q0424_hiphop.00023_snr-7_meeting.wav,./data/audio_fp/candidates/hiphop.00023.wav


In [18]:
query_pipe = search_pipe.output("path", "result")
DataCollection(query_pipe(df["query"][2])).show()

path,result
./data/audio_fp/queries/q0001_blues.00000_snr5_stairway.wav,./data/audio_fp/candidates/blues.00000.wav


In [19]:
import gradio


def query_function(query_path):
    pred = query_pipe(query_path).get()[1]
    return os.path.basename(pred)


interface = gradio.Interface(
    query_function,
    gradio.inputs.Audio(type="filepath", source="upload"),
    gradio.outputs.Label(),
)

interface.launch(inline=True, share=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://6616899076ee2fe3a9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


