In [24]:
from pathlib import Path
from typing import List

from more_itertools import chunked
import pandas as pd
import torch

import sys
sys.path.append("../")

from src.config.project_paths import get_data_file_path, get_model_save_dir, get_project_root_path
from speechbrain.inference.speaker import EncoderClassifier
from src.embedding.create_embedding import batch_create_speechbrain_embedding
from src.embedding.embedded_audio import EmbeddedAudio
from tqdm.auto import tqdm
import pickle

In [29]:
# Device used for embedding creation: CUDA when available, otherwise CPU.
DEVICE: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained model to download and use for embedding creation. Must be a speechbrain
# model compatible with the EncoderClassifier class.
MODEL_NAME: str = "speechbrain/spkrec-ecapa-voxceleb"
# Path to a JSON file which contains relative paths to audio files.
# Stored as a real Path object so the value matches its annotation.
ANNOTATION_PATH: Path = Path("../data/generated/annotations_reduced_noise.json")
# How many audio files to pass through the model at the same time. Must be >= 1.
BATCH_SIZE: int = 1
# Once all the embeddings are calculated, they are pickled to this path.
EMBEDDING_PICKLE_PATH: Path = Path("../data/reduced_noise_audio_embeddings.pkl")

from typing import Any, Dict

# Enforce the documented precondition up front: a batch size below 1 cannot
# produce meaningful batches.
if BATCH_SIZE < 1:
    raise ValueError(f"BATCH_SIZE must be >= 1, got {BATCH_SIZE}")

# Load the pretrained speaker encoder named by MODEL_NAME, run on DEVICE.
model = EncoderClassifier.from_hparams(
    source=MODEL_NAME,
    savedir=get_model_save_dir(MODEL_NAME),
    run_opts={"device": str(DEVICE)},
)

# Read the annotation records and pull out the reduced-noise audio paths.
annotation_df = pd.read_json(ANNOTATION_PATH, orient="records")
rel_audio_paths: List[str] = annotation_df["reduced_noise_wav_path"].to_list()

# chunked() yields a one-shot iterator; materialise it so that cells consuming
# the batches can be re-run without silently iterating over nothing.
rel_audio_path_batches: List[List[str]] = list(chunked(rel_audio_paths, BATCH_SIZE))

# The batch count is simply the number of chunks (the last one may be shorter).
number_of_batches = len(rel_audio_path_batches)
print(f"Number of batches: {number_of_batches}")

# Placeholder for annotation records; typed with typing.Any (not the builtin any()).
annotations: List[Dict[str, Any]] = []

Number of batches: 1341


In [13]:
import os

# Embed every annotated audio file and collect the results as EmbeddedAudio objects.
# Every path of every batch is visited, so no file is dropped when BATCH_SIZE > 1
# (previously only the first path of each batch was embedded).
embeddings = []
all_rel_paths = (rel_path for batch in rel_audio_path_batches for rel_path in batch)
for rel_path in tqdm(all_rel_paths, desc="Creating embeddings", total=len(rel_audio_paths)):
    # Annotation paths start at the project root; the notebook runs one level below it.
    rel_audio = ".." + str(rel_path)
    if os.path.exists(rel_audio):
        # The helper is called with a single file at a time, exactly as before.
        # NOTE(review): its return layout for multi-file input is not visible here,
        # so true batching is left for a follow-up once that is confirmed.
        audio_embeddings = batch_create_speechbrain_embedding(model, [Path(rel_audio)])
        embeddings.append(EmbeddedAudio(audio_rel_path=rel_audio, embedding=audio_embeddings))
    else:
        # Best effort: report missing files and keep going.
        print(f"File does not exist: {rel_audio}")

len(embeddings)

Creating embeddings:   0%|          | 0/1341 [00:00<?, ?it/s]

1341

In [17]:
# Persist the computed embeddings so later steps can load them without re-running the model.
# The path is a real Path object (matching its annotation) and the file is opened
# write-only, since nothing is read back here.
EMBEDDING_PICKLE_PATH: Path = Path("../data/reduced_noise_audio_embeddings.pkl")
with EMBEDDING_PICKLE_PATH.open("wb") as output_file:
    pickle.dump(embeddings, output_file)

In [30]:
import os

# Walk <root>/<id>/<sample>/ and record one annotation per reduced-noise wav file.
# Entries are visited in sorted order so the resulting list is reproducible, and
# stray non-directory entries are skipped instead of crashing iterdir().
amplitude_root = Path("../data/generated/amplitude_audio_files/")

annotations = []
for id_path in sorted(amplitude_root.iterdir()):
    if not id_path.is_dir():
        continue
    for audio_path in sorted(id_path.iterdir()):
        if not audio_path.is_dir():
            continue
        for audio in sorted(audio_path.iterdir()):
            if not audio.name.endswith("reduced_noise.wav"):
                continue
            # NOTE(review): audio.stem still carries the reduced-noise suffix, so
            # "original_wav_path" reuses the reduced-noise file name under the
            # vox2_test tree - confirm this matches the original file names.
            annotations.append(
                {
                    "reduced_noise_wav_path": f"/data/generated/amplitude_audio_files/{id_path.name}/{audio_path.name}/{audio.stem}.wav",
                    "original_wav_path": f"/data/vox2_test/wav/{id_path.name}/{audio_path.name}/{audio.stem}.wav",
                    "user_id": id_path.stem,
                }
            )

len(annotations)

1341

In [31]:
# Serialise the collected annotation records to JSON, one object per record.
# NOTE(review): this writes to ../data/ while ANNOTATION_PATH reads from
# ../data/generated/ - confirm which location is intended.
generated_annotations_df = pd.DataFrame(annotations)
generated_annotations_df.to_json(
    "../data/annotations_reduced_noise.json",
    orient="records",
    default_handler=str,
)