In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# if you are using mac, pip install sox
# otherwise, pip install PySoundFile

In [3]:
torchaudio.list_audio_backends()

['soundfile']

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def extract_voice_embeddings(audio_file):
    waveform, sample_rate = torchaudio.load(audio_file)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    waveform = F.normalize(waveform)

    if waveform.ndimension() == 2:
        waveform = waveform.squeeze(0)
        
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    input_values = inputs['input_values'].to(device)

    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state
    
    voice_embedding = torch.mean(embeddings, dim=1).squeeze().cpu().numpy()
    return voice_embedding

In [7]:
extract_voice_embeddings("characters/Albedo/79_audio.wav")

  return F.conv1d(input, weight, bias, self.stride,
  attn_output = torch.nn.functional.scaled_dot_product_attention(


array([-0.0357823 ,  0.05431148,  0.34946448,  0.1121379 ,  0.27968332,
       -0.2737367 ,  0.07846474, -0.20731293, -0.17776124, -0.12317158,
       -0.04602772,  0.5656206 ,  0.23186235,  0.16278815,  0.18053694,
       -0.04977191,  0.27083847, -0.00630548, -0.128712  , -0.6694658 ,
       -0.08326234, -0.09285967,  0.0740937 ,  0.42214197,  0.41874737,
       -0.12893163,  0.12028143, -0.22105113, -0.13587776,  0.12663187,
       -0.30808446, -0.04941976, -0.51878434,  0.08194198, -0.00690099,
        0.17276925, -0.16830753,  0.09708958,  0.12068896, -0.13339195,
       -0.24577633,  0.04646227, -0.1270652 , -0.09494544, -0.7655716 ,
       -0.35917178, -0.22518249,  0.14477172,  0.05852978, -0.16208558,
       -0.02091702,  0.36183825,  0.34151512,  0.08962654,  0.28449824,
       -0.02282706,  0.13232182,  0.02939306,  0.08540127, -0.09387352,
       -0.0825976 ,  0.04466575, -0.08380934,  0.10722651, -0.22208484,
       -0.03650331, -0.11089969, -0.3184559 , -0.00740583,  0.05

In [8]:
data_dir = "characters"
embeddings = []
labels = []

In [9]:
char_folder = [i for i in os.listdir(data_dir) if '.wav' not in i]
char_folder

['Albedo',
 'Alhaitham',
 'Aloy',
 'Amber',
 'Arataki Itto',
 'Baizhu',
 'Barbara',
 'Beidou',
 'Bennett',
 'Candace',
 'Charlotte',
 'Childe',
 'Chongyun',
 'Clorinde',
 'Collei',
 'Cyno',
 'Dehya',
 'Diluc',
 'Diona',
 'Dori',
 'Ei',
 'Eula',
 'Faruzan',
 'Fischl',
 'Freminet',
 'Furina',
 'Ganyu',
 'Gorou',
 'Hu Tao',
 'Jean',
 'Kaede',
 'Kaedehara Kazuha',
 'Kaeya',
 'Kamisato Ayaka',
 'Kamisato Ayato',
 'Kaveh',
 'Kazuha',
 'Keqing',
 'Kirara',
 'Klee',
 'Kujou Sara',
 'Kuki Shinobu',
 'Layla',
 'Lisa',
 'Lynette',
 'Lyney',
 'Mika',
 'Mona',
 'Nahida',
 'Navia',
 'Neuvillette',
 'Nilou',
 'Ningguang',
 'Noelle',
 'Paimon',
 'Qiqi',
 'Raiden Shogun',
 'Razor',
 'Rosaria',
 'Sangonomiya Kokomi',
 'Sayu',
 'Shenhe',
 'Shikanoin Heizou',
 'Sucrose',
 'Tartaglia',
 'Thoma',
 'Tighnari',
 'Traveler',
 'Venti',
 'Wanderer',
 'Wriothesley',
 'Xiangling',
 'Xiao',
 'Xingqiu',
 'Xinyan',
 'Yae Miko',
 'Yanfei',
 'Yaoyao',
 'Yelan',
 'Yoimiya',
 'Yun Jin',
 'Zhongli']

In [10]:
for character in char_folder:
    character_dir = os.path.join(data_dir, character)
    print(f"Currently on Character: {character}")
    if os.path.isdir(character_dir):
        for file_name in os.listdir(character_dir):
            file_path = os.path.join(character_dir, file_name)
            if file_path.endswith(".wav"):
                embedding = extract_voice_embeddings(file_path)
                embeddings.append(embedding)
                labels.append(character)

Currently on Character: Albedo
Currently on Character: Alhaitham
Currently on Character: Aloy
Currently on Character: Amber
Currently on Character: Arataki Itto
Currently on Character: Baizhu
Currently on Character: Barbara
Currently on Character: Beidou
Currently on Character: Bennett
Currently on Character: Candace
Currently on Character: Charlotte
Currently on Character: Childe
Currently on Character: Chongyun
Currently on Character: Clorinde
Currently on Character: Collei
Currently on Character: Cyno
Currently on Character: Dehya
Currently on Character: Diluc
Currently on Character: Diona
Currently on Character: Dori
Currently on Character: Ei
Currently on Character: Eula
Currently on Character: Faruzan
Currently on Character: Fischl
Currently on Character: Freminet
Currently on Character: Furina
Currently on Character: Ganyu
Currently on Character: Gorou
Currently on Character: Hu Tao
Currently on Character: Jean
Currently on Character: Kaede
Currently on Character: Kaedehara Kazu

In [11]:
X = np.array(embeddings)
y = np.array(labels)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [13]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
class VoiceClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(VoiceClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = F.leaky_relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.leaky_relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = F.leaky_relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.fc4(x)
        return x

In [15]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor([char_folder.index(label) for label in y_train], dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor([char_folder.index(label) for label in y_test], dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [16]:
input_dim = X_train.shape[1]
num_classes = len(char_folder)
vc_model = VoiceClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vc_model.parameters(), lr=0.001)
num_epochs = 100
patience = 5
best_loss = float('inf')
counter = 0

In [17]:
for epoch in range(num_epochs):
    vc_model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = vc_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    vc_model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = vc_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    val_loss /= len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")
    
    if val_loss < best_loss:
        best_loss = val_loss
        counter = 0
    else:
        counter += 1
    
    if counter >= patience:
        print("Early stopping triggered")
        break

Epoch [1/100], Loss: 4.0622, Val Loss: 3.4643, Accuracy: 0.1836
Epoch [2/100], Loss: 3.3211, Val Loss: 2.7833, Accuracy: 0.3579
Epoch [3/100], Loss: 2.8910, Val Loss: 2.3713, Accuracy: 0.4285
Epoch [4/100], Loss: 2.5927, Val Loss: 2.0397, Accuracy: 0.5097
Epoch [5/100], Loss: 2.3744, Val Loss: 1.8246, Accuracy: 0.5390
Epoch [6/100], Loss: 2.2267, Val Loss: 1.6898, Accuracy: 0.6109
Epoch [7/100], Loss: 2.0640, Val Loss: 1.5473, Accuracy: 0.6121
Epoch [8/100], Loss: 1.9914, Val Loss: 1.4272, Accuracy: 0.6627
Epoch [9/100], Loss: 1.8705, Val Loss: 1.3506, Accuracy: 0.6608
Epoch [10/100], Loss: 1.7848, Val Loss: 1.2822, Accuracy: 0.6802
Epoch [11/100], Loss: 1.7158, Val Loss: 1.2019, Accuracy: 0.6983
Epoch [12/100], Loss: 1.6616, Val Loss: 1.1598, Accuracy: 0.7046
Epoch [13/100], Loss: 1.6104, Val Loss: 1.1303, Accuracy: 0.7064
Epoch [14/100], Loss: 1.5917, Val Loss: 1.1305, Accuracy: 0.7164
Epoch [15/100], Loss: 1.5540, Val Loss: 1.0906, Accuracy: 0.7171
Epoch [16/100], Loss: 1.5065, Val 

In [18]:
def predict_character(audio_file):
    vc_model.eval()
    embedding = extract_voice_embeddings(audio_file)
    embedding_scaled = scaler.transform([embedding])
    embedding_tensor = torch.tensor(embedding_scaled, dtype=torch.float32).to(device)
    
    
    with torch.no_grad():
        output = vc_model(embedding_tensor)
        _, pred = torch.max(output, 1)
    
    return char_folder[pred.item()]




In [19]:
predict_character('characters/Lisa/27_audio.wav')

  return F.conv1d(input, weight, bias, self.stride,


'Lisa'

In [20]:
import sounddevice as sd
from scipy.io.wavfile import write

In [21]:
def record_audio(filename, duration, fs=16000):
    print("Recording...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()  # Wait until the recording is finished
    write(filename, fs, recording)
    print(f"Recording saved to {filename}")

In [113]:
record_audio('output.wav', duration=5)


Recording...
Recording saved to output.wav


In [22]:
predict_character("output.wav")


  return F.conv1d(input, weight, bias, self.stride,


'Zhongli'

In [23]:
predict_character("calvin.wav")


  return F.conv1d(input, weight, bias, self.stride,


'Zhongli'