# Извлечение голосовых эмбеддингов
## SpeechBrain

Указываем путь к датасету и выводим количество спикеров в нём

In [3]:
import os

# Root directory of the dataset (absolute path on the author's machine).
DATASET_DIR = 'C:/Maksim/diploma/dataset'

# Names of all entries directly under the dataset root
# (presumably one sub-directory per speaker — verify against the dataset layout).
speakers = os.listdir(DATASET_DIR)

# Show how many entries were found.
print(len(speakers))

785


## Модель-эмбеддер

Базовый класс-интерфейс:

In [4]:
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, field
import numpy as np


class SpeakerEmbedder(metaclass=ABCMeta):
  """Interface of a speaker-embedding model.

  An implementation is a callable that takes the path of an audio file and
  returns the speaker embedding of that recording as a plain list of floats,
  so the result can be stored in ``AudioData.embedding`` and written to JSON
  without conversion.
  """

  @abstractmethod
  def __call__(self, audio_path: str) -> list[float]:
    """Return the speaker embedding computed for the file at `audio_path`."""


@dataclass
class AudioData:
  """One processed recording: who speaks, which file, and its embedding.

  `speaker` and `name` are shown in the repr; `embedding` is left out of it
  to keep printed records short.
  """

  # Speaker identifier and audio file name — both included in repr by default.
  speaker: str
  name: str
  # Embedding vector of the recording; excluded from repr.
  embedding: list[float] = field(repr=False)

In [5]:
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
from torchaudio.transforms import Resample


class EcapaEmbedder(SpeakerEmbedder):
  """Speaker embedder backed by the pretrained SpeechBrain ECAPA model.

  The model is loaded from ``speechbrain/spkrec-ecapa-voxceleb``. Each call
  loads one audio file, converts it to `SAMPLE_RATE` if needed and returns
  the embedding of its first channel as a list of floats.
  """

  # Sample rate every input is converted to before encoding.
  SAMPLE_RATE = 16000

  def __init__(self, device: str = 'cpu'):
    """Load the pretrained model onto `device` (e.g. 'cpu' or 'cuda')."""
    super().__init__()
    self.model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device":device})
    # Resample transforms keyed by source sample rate, so the transform is
    # built once per distinct rate instead of once per file.
    self._resamplers: dict[int, Resample] = {}


  def __call__(self, audio_path: str) -> list[float]:
    """Return the embedding of the audio file at `audio_path`.

    Args:
      audio_path: path to an audio file readable by `torchaudio.load`.

    Returns:
      The embedding vector as a plain list of floats.
    """
    signal, sr = torchaudio.load(audio_path)

    # Only the embedding of the first channel is returned, so drop the other
    # channels up front instead of encoding them and discarding the result.
    signal = signal[:1]

    # Files already at the target rate need no resampling at all.
    if sr != self.SAMPLE_RATE:
      resampler = self._resamplers.get(sr)
      if resampler is None:
        resampler = Resample(sr, self.SAMPLE_RATE)
        self._resamplers[sr] = resampler
      signal = resampler(signal)

    # encode_batch output is indexed as [batch item][0] to get the vector.
    embedding = self.model.encode_batch(signal)[0][0].cpu().tolist()

    return embedding

In [None]:
import json
from tqdm.auto import tqdm


class DatasetEmbedder:
  """Computes embeddings for a whole dataset and stores them as JSON.

  The dataset is read as ``dataset_dir/<speaker>/<audio file>``: every
  sub-directory of `dataset_dir` is treated as one speaker and every entry
  inside it as one audio file of that speaker.
  """

  def __init__(self, embedder: SpeakerEmbedder,
               dataset_dir: str,
               output_file: str = './embeddings.json'):
    """
    Args:
      embedder: callable that maps an audio file path to its embedding.
      dataset_dir: root directory of the dataset.
      output_file: path of the JSON file written by `to_json`.
    """
    self.embedder = embedder
    self.dataset_dir = dataset_dir
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the output file) reproducible.
    self.speakers = sorted(os.listdir(self.dataset_dir))
    self.output_file = output_file

    self.embd_dataset: list[AudioData] = []


  def process_speaker(self, spk_dir: str) -> None:
    """Embed every file in `spk_dir` and append the results to `embd_dataset`.

    The speaker name is the last path component of `spk_dir`.
    """
    spk = os.path.basename(spk_dir)

    for audio in sorted(os.listdir(spk_dir)):
      audio_path = os.path.normpath(os.path.join(spk_dir, audio))
      embedding = self.embedder(audio_path)
      self.embd_dataset.append(AudioData(
          speaker=spk,
          name=audio,
          embedding=embedding
      ))

  def to_json(self) -> None:
    """Write `self.embd_dataset` to `self.output_file` as JSON of the form:
    {
      "data": [
        {
          "spk": "1",
          "audio": "1.wav",
          "embedding": [...] # list[float]
        },
        ...
      ]
    }
    """
    result = {
        "data": [
            {
                "spk": audio_data.speaker,
                "audio": audio_data.name,
                "embedding": audio_data.embedding
            }
            for audio_data in self.embd_dataset
        ]
    }

    with open(self.output_file, 'w', encoding='utf-8') as f:
      json.dump(result, f, indent=4, ensure_ascii=False)


  def __call__(self) -> list[AudioData]:
    """Process every speaker directory, save the result and return it.

    A failure while writing the JSON file is reported with a message and
    does not prevent the computed records from being returned.
    """
    # Start from a fresh list so that calling the object again does not
    # duplicate the records collected by a previous run.
    self.embd_dataset = []

    for spk in tqdm(self.speakers, desc="Speakers"):
      spk_dir = os.path.normpath(os.path.join(self.dataset_dir, spk))
      # Stray files in the dataset root are not speakers; listing them as a
      # directory would raise and abort the whole run.
      if not os.path.isdir(spk_dir):
        continue
      self.process_speaker(spk_dir)

    try:
      self.to_json()
    except Exception as err:
      print(f"Ошибка при сохранении результатов в файл: {err}")

    return self.embd_dataset

In [None]:
# Build the dataset-level embedder around the ECAPA model (default device).
# NOTE(review): the output file name says "speakernet" although the embedder
# used here is ECAPA — confirm the intended name before relying on this file.
embd = DatasetEmbedder(
    embedder=EcapaEmbedder(),
    dataset_dir=DATASET_DIR,
    output_file='./voxtube-speakernet-embeddings.json'
)

# Run the full pipeline: embed every file, write the JSON, keep the records.
embd_dataset = embd()