# Извлечение голосовых эмбеддингов
## Pyannote

#### Виртуальное окружение и зависимости
```bash
$ conda create -n pyannote-env python=3.12
$ conda activate pyannote-env
$ conda install ffmpeg
```

In [None]:
!pip install pyannote-audio==4.0.1
!pip install omegaconf==2.3.0

Версии библиотек-зависимостей:
- torch==2.9.0
- torchaudio==2.9.0
- torchcodec==0.8.1

Указываем путь до датасета

In [1]:
import os

# Root folder of the dataset (machine-specific path; adjust before running).
DATASET_DIR = 'C:/Maksim/diploma/dataset'

# Every entry directly under the dataset root is counted as one speaker.
speakers = os.listdir(path=DATASET_DIR)

# Quick sanity check: how many entries were found.
print(len(speakers))

785


Задаём huggingface access token:

In [None]:
# Hugging Face access token. This is a placeholder: substitute a real token
# locally and never commit it.
TOKEN = "<YOUR TOKEN HERE>"

# Expose the token to this process under the HF_TOKEN environment variable.
os.environ.update(HF_TOKEN=TOKEN)

In [None]:
# или авторизуйтесь через консольную утилиту Hugging Face (hf)
#!hf auth login --token <token>

## Модель-эмбеддер

Базовый класс-интерфейс:

In [3]:
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, field
import numpy as np


class SpeakerEmbedder(metaclass=ABCMeta):
  @abstractmethod
  def __call__(self, audio_path: str) -> np.ndarray:
    pass



@dataclass
class AudioData:
    """One embedded audio file: who spoke, which file, and its embedding."""

    # Speaker identifier; shown in repr().
    speaker: str
    # Audio file name; shown in repr().
    name: str
    # Embedding values; excluded from repr() to keep it short.
    embedding: list[float] = field(repr=False)

In [4]:
import torch
from pyannote.audio import Model, Inference

class PyannoteEmbedder(SpeakerEmbedder):
    """Speaker embedder backed by the pretrained ``pyannote/embedding`` model."""

    def __init__(self, device: str = "cpu"):
        """Load the pretrained model and prepare whole-file inference.

        Args:
            device: Torch device string (e.g. ``"cpu"`` or ``"cuda"``) the
                inference object is moved to.
        """
        super().__init__()
        # pyannote.audio 4.x (the version pinned in this notebook) takes the
        # Hugging Face token via `token`; the legacy `use_auth_token` keyword
        # is not a recognised parameter there.
        model = Model.from_pretrained("pyannote/embedding", token=TOKEN)
        # window="whole": one embedding for the entire file rather than a
        # sequence of sliding-window embeddings.
        self.inference = Inference(model, window="whole").to(torch.device(device))

    def __call__(self, audio_path: str) -> list[float]:
        """Embed one audio file and return the result as a Python list.

        Args:
            audio_path: Path to the audio file to embed.

        Returns:
            The embedding converted with ``.tolist()`` so that it can be
            serialised to JSON.
            NOTE(review): depending on the pyannote version the result may be
            nested (shape 1 x D) rather than flat — confirm.
        """
        embedding = self.inference(audio_path)
        return embedding.tolist()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import json
from tqdm.auto import tqdm


class DatasetEmbedder:
    """Embed every audio file of a dataset laid out as one folder per speaker.

    Results are accumulated in ``self.embd_dataset`` and, when the instance
    is called, written to ``output_file`` as JSON.
    """

    def __init__(self, embedder: SpeakerEmbedder,
                 dataset_dir: str,
                 output_file: str = './embeddings.json'):
        """Remember the configuration and discover the speaker folders.

        Args:
            embedder: Callable mapping an audio path to its embedding.
            dataset_dir: Dataset root; each sub-directory is one speaker.
            output_file: Path of the JSON file written by ``to_json``.
        """
        self.embedder = embedder
        self.dataset_dir = dataset_dir
        # Only sub-directories are speakers: a stray file in the root would
        # make os.listdir() fail later in process_speaker(). Sorted because
        # os.listdir() returns entries in arbitrary order.
        self.speakers = sorted(
            entry for entry in os.listdir(self.dataset_dir)
            if os.path.isdir(os.path.join(self.dataset_dir, entry))
        )
        self.output_file = output_file

        self.embd_dataset: list[AudioData] = []

    def process_speaker(self, spk_dir: str) -> None:
        """Embed all audio files of one speaker and append them to the dataset.

        Args:
            spk_dir: Path to the speaker's folder; its base name is used as
                the speaker identifier.
        """
        # normpath drops a trailing separator, which would otherwise make
        # basename() return an empty speaker name.
        spk = os.path.basename(os.path.normpath(spk_dir))

        for audio in sorted(os.listdir(spk_dir)):
            audio_path = os.path.normpath(os.path.join(spk_dir, audio))
            # Nested directories are not audio files; do not feed them to
            # the embedder.
            if not os.path.isfile(audio_path):
                continue

            embedding = self.embedder(audio_path)
            # The embedder interface allows array results; convert them to
            # plain lists so that to_json() can serialise them.
            if hasattr(embedding, "tolist"):
                embedding = embedding.tolist()

            self.embd_dataset.append(AudioData(
                speaker=spk,
                name=audio,
                embedding=embedding,
            ))

    def to_json(self) -> None:
        """Write ``self.embd_dataset`` to ``self.output_file`` as JSON.

        The file has the following layout:
        {
          "data": [
            {
              "spk": "1",
              "audio": "1.wav",
              "embedding": [...] # list[float]
            },
            ...
          ]
        }
        """
        result = {
            "data": [
                {
                    "spk": audio_data.speaker,
                    "audio": audio_data.name,
                    "embedding": audio_data.embedding,
                }
                for audio_data in self.embd_dataset
            ]
        }

        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4, ensure_ascii=False)

    def __call__(self) -> list[AudioData]:
        """Embed the whole dataset, save it to JSON and return the records.

        Returns:
            The accumulated list of ``AudioData`` records.
        """
        for spk in tqdm(self.speakers, desc="Speakers"):
            spk_dir = os.path.normpath(os.path.join(self.dataset_dir, spk))
            self.process_speaker(spk_dir)

        # Deliberately best-effort: a failed save must not discard the
        # embeddings that were just computed, so they are still returned.
        try:
            self.to_json()
        except Exception as err:
            print(f"Ошибка при сохранении результатов в файл: {err}")

        return self.embd_dataset

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

dataset_embedder = DatasetEmbedder(
    # Pass the selected device on: without it the embedder silently uses its
    # "cpu" default even when the line above reports "cuda".
    embedder=PyannoteEmbedder(device=device),
    dataset_dir=DATASET_DIR,
    output_file='./voxtube-pyannote-embeddings.json'
)

# Runs the embedder over the dataset and writes the JSON file.
embd_dataset = dataset_embedder()

Device: cpu


c:\Users\maksi\miniconda3\envs\pyannote-env\Lib\site-packages\lightning\pytorch\utilities\migration\utils.py:197: Redirecting import of pytorch_lightning.callbacks.early_stopping.EarlyStopping to lightning.pytorch.callbacks.early_stopping.EarlyStopping
c:\Users\maksi\miniconda3\envs\pyannote-env\Lib\site-packages\lightning\pytorch\utilities\migration\utils.py:197: Redirecting import of pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint to lightning.pytorch.callbacks.model_checkpoint.ModelCheckpoint
c:\Users\maksi\miniconda3\envs\pyannote-env\Lib\site-packages\lightning\pytorch\utilities\migration\migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
Lightning automatically upgraded 