In [2]:
from pathlib import Path
from typing import List

from more_itertools import chunked
import pandas as pd
import torch

import sys
sys.path.append("../")

from src.config.project_paths import get_data_file_path, get_model_save_dir, get_project_root_path
from speechbrain.inference.speaker import EncoderClassifier
from src.embedding.create_embedding import batch_create_speechbrain_embedding
from src.embedding.embedded_audio import EmbeddedAudio
from tqdm.auto import tqdm
import pickle

torchvision is not available - cannot save figures


In [3]:
# Torch device on which the embeddings are computed: CUDA when available, CPU otherwise.
DEVICE: torch.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Pretrained model to download and use for embedding creation.
# Must be a speechbrain model compatible with the EncoderClassifier class.
MODEL_NAME: str = "speechbrain/spkrec-ecapa-voxceleb"
# JSON annotation file that lists the relative paths of the audio files.
ANNOTATION_PATH: Path = get_data_file_path("annotations_original.json")
# Number of audio files passed through the model in one go. Must be >= 1.
BATCH_SIZE: int = 1
# Destination file for the pickled embeddings once all of them are computed.
EMBEDDING_PICKLE_PATH: Path = Path("../data/original_audio_embeddings.pkl")

In [6]:
# Load the pretrained speaker encoder onto the configured device; the model
# files are stored under the project's save directory for this model name.
model = EncoderClassifier.from_hparams(
    source=MODEL_NAME,
    savedir=get_model_save_dir(MODEL_NAME),
    run_opts={"device": str(DEVICE)},
)

In [5]:
# Read the annotation records and collect the relative audio paths to embed.
annotation_df = pd.read_json(ANNOTATION_PATH, orient="records")
rel_audio_paths: List[str] = annotation_df["wav_path"].to_list()
# chunked() returns a one-shot iterator. Materialise it so the batches are still
# available when a downstream cell is re-run without re-running this one
# (otherwise the second pass would silently iterate over nothing).
rel_audio_path_batches: List[List[str]] = list(chunked(rel_audio_paths, BATCH_SIZE))

In [6]:
# Ceil-divide the number of audio paths by the batch size: every full batch
# counts once, plus one extra batch when a remainder is left over.
full_batches, leftover = divmod(len(rel_audio_paths), BATCH_SIZE)
number_of_batches = full_batches + (1 if leftover > 0 else 0)
print(f"Number of batches: {number_of_batches}")

Number of batches: 1341


In [11]:
import os

# Embed each batch exactly once. Paths that are missing on disk are reported and
# left out of the model call, so a batch is never embedded repeatedly (once per
# file it contains) and a missing file is never handed to the model.
embeddings = []
for rel_audio_path_batch in tqdm(rel_audio_path_batches, "Creating embeddings", total=number_of_batches):
    existing_rel_paths = []
    for rel_path in rel_audio_path_batch:
        # Prefixing ".." resolves the annotation path from this notebook's folder
        # (assumes annotation paths begin with "/", e.g. "/data/..." - TODO confirm).
        abs_audio_path = ".." + rel_path
        if os.path.exists(abs_audio_path):
            existing_rel_paths.append(rel_path)
        else:
            print(f"File does not exist: {abs_audio_path}")

    if not existing_rel_paths:
        # Nothing usable in this batch; skip the model call entirely.
        continue

    abs_audio_path_batch = [".." + rel_path for rel_path in existing_rel_paths]
    audio_embeddings = batch_create_speechbrain_embedding(model, abs_audio_path_batch)
    # Pair every embedding with the relative path it was computed from
    # (assumes the helper returns one embedding per input path, in order - TODO confirm).
    embeddings.extend(
        EmbeddedAudio(audio_rel_path=audio_rel_path, embedding=embedding)
        for audio_rel_path, embedding in zip(existing_rel_paths, audio_embeddings)
    )



    

Creating embeddings:   0%|          | 0/1341 [00:00<?, ?it/s]

['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00035.wav']
['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00034.wav']
['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00036.wav']
['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00037.wav']
['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00040.wav']
['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00039.wav']
['../data/vox2_test/wav/id08374/6z6TFaKJ3No/00038.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00252.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00253.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00251.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00250.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00254.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00255.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00257.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00256.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00264.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/00258.wav']
['../data/vox2_test/wav/id08374/Yh9O9ETuF_0/0025

In [17]:
# Persist the full list of embedded audio objects to the configured pickle file.
with EMBEDDING_PICKLE_PATH.open("wb+") as pickle_file:
    pickle.dump(embeddings, pickle_file)

In [7]:
import os

# Walk the <speaker id>/<sample folder>/<audio file> tree below the wav root and
# build one annotation record per .wav file.
annotations = []
wav_root = Path("../data/vox2_test/wav/")
# sorted() keeps the record order reproducible; iterdir() yields entries in
# arbitrary order.
for id_path in sorted(wav_root.iterdir()):
    if not id_path.is_dir():
        # Stray files (e.g. .DS_Store) would make the nested iterdir() raise.
        continue
    for audio_path in sorted(id_path.iterdir()):
        if not audio_path.is_dir():
            continue
        for audio in sorted(audio_path.iterdir()):
            if not audio.is_file() or audio.suffix != ".wav":
                # Only real .wav files are recorded; anything else would be
                # written as a ".wav" path that does not exist on disk.
                continue
            annotations.append(
                {
                    "original_wav_path": f"/data/vox2_test/wav/{id_path.name}/{audio_path.name}/{audio.stem}.wav",
                    # .name (not .stem) so the id matches the directory name used in the path
                    "user_id": id_path.name,
                }
            )

len(annotations)

1386

In [8]:
# Write the collected annotation records to disk as a JSON array of objects;
# values that are not JSON-serialisable are converted with str().
pd.DataFrame(annotations).to_json(
    "../data/annotations_original_audio_embeddings.json",
    orient="records",
    default_handler=str,
)