In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd

In [None]:
# torchaudio needs an audio I/O backend installed before it can load files:
#   - on macOS:           pip install sox
#   - on other platforms: pip install PySoundFile

In [None]:
# Show which audio I/O backends torchaudio reports as available in this environment.
torchaudio.list_audio_backends()

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# The processor and the encoder are both loaded from the same pretrained checkpoint.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)
# Move the encoder's weights to the selected device.
model = model.to(device)


In [None]:
def extract_voice_embeddings(audio_file):
    """Return a single wav2vec2 embedding vector for one audio file.

    The clip is loaded with torchaudio, downmixed to one channel, resampled to
    16 kHz when its native rate differs, L2-normalised, and passed through the
    module-level ``processor`` and ``model``. The encoder's
    ``last_hidden_state`` is then averaged over the frame axis.

    Args:
        audio_file: Path (or any other source ``torchaudio.load`` accepts) of
            the clip to embed.

    Returns:
        numpy.ndarray: the mean of ``last_hidden_state`` over the frame axis
        with the batch axis removed, i.e. one vector per clip.
    """
    target_sample_rate = 16000

    waveform, sample_rate = torchaudio.load(audio_file)

    # torchaudio.load yields (channels, time) by default. Averaging over the
    # channel axis gives the same values as squeeze(0) for mono input, and
    # also collapses stereo / multi-channel files to a single 1-D signal
    # (squeeze(0) would leave those 2-D and break the embedding shape).
    if waveform.ndimension() == 2:
        waveform = waveform.mean(dim=0)

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    # Unit L2 norm along the time axis; the axis is explicit because the
    # signal is 1-D at this point.
    waveform = F.normalize(waveform, dim=-1)

    # Hand the processor a NumPy array, which is its documented input type.
    inputs = processor(
        waveform.numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = inputs['input_values'].to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state

    # Average over frames (dim=1), then drop the batch axis of size one.
    voice_embedding = hidden_states.mean(dim=1).squeeze(0).cpu().numpy()
    return voice_embedding

In [None]:
# Sanity check: embed a single clip; the resulting vector is shown as the cell output.
extract_voice_embeddings("data/characters/Albedo/0_audio.wav")

In [None]:
# Root folder that is listed for per-character sub-folders.
data_dir = "data/characters"
# Parallel accumulators: embeddings[i] is the vector whose label is labels[i].
embeddings, labels = [], []

In [None]:
# Collect every entry of data_dir whose name does not contain '.wav'.
char_folder = []
for entry_name in os.listdir(data_dir):
    if '.wav' in entry_name:
        continue
    char_folder.append(entry_name)
char_folder

In [None]:
# Start from empty accumulators so that re-running this cell rebuilds the
# dataset instead of appending a second copy of every embedding and label.
embeddings = []
labels = []

# os.listdir returns entries in arbitrary order; sorting both levels makes the
# row order of the resulting dataset reproducible across runs and machines.
for character in sorted(char_folder):
    character_dir = os.path.join(data_dir, character)
    if not os.path.isdir(character_dir):
        # Stray non-directory entries in data_dir are not characters.
        continue
    print(f"Currently on Character: {character}")
    for file_name in sorted(os.listdir(character_dir)):
        if not file_name.endswith(".wav"):
            continue
        file_path = os.path.join(character_dir, file_name)
        embeddings.append(extract_voice_embeddings(file_path))
        labels.append(character)

In [None]:
# Convert the accumulated lists to NumPy arrays: X from the embeddings, y from the labels.
X, y = np.array(embeddings), np.array(labels)