In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd
import torchaudio.transforms as T
import librosa
import parselmouth
from parselmouth.praat import call

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# if you are using mac, pip install sox
# otherwise, pip install PySoundFile

In [3]:
torchaudio.list_audio_backends()

['soundfile']

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file, sr=16000)
    
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = mfccs.mean(axis=1)
    
    snd = parselmouth.Sound(audio_file)
    pitch = call(snd, "To Pitch", 0.0, 75, 600)
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz")

    formants = call(snd, "To Formant (burg)", 0.0, 5, 5500, 0.025, 50)
    formant1 = call(formants, "Get mean", 1, 0, 0, "Hertz")
    formant2 = call(formants, "Get mean", 2, 0, 0, "Hertz")
    
    features = np.concatenate([mfccs_mean, [mean_pitch, formant1, formant2]])
    
    if np.isnan(features).any():
        print(f"NaN values found in features from {audio_file}")
        features = np.nan_to_num(features)
    
    return features

In [6]:
extract_audio_features("characters/Albedo/79_audio.wav")

array([-161.35746765,   77.45172882,  -20.14035034,   15.95415974,
         -7.5135932 ,  -11.78045082,  -12.19209194,   -3.3014617 ,
        -14.07195854,  -11.5178175 ,   -8.25163078,   -7.53186035,
         -7.88459492,  122.59520166,  782.72179796, 1880.05436535])

In [7]:
data_dir = "characters"
embeddings = []
labels = []

In [8]:
char_folder = [i for i in os.listdir(data_dir) if '.wav' not in i]
char_folder

['Albedo',
 'Alhaitham',
 'Aloy',
 'Amber',
 'Arataki Itto',
 'Baizhu',
 'Barbara',
 'Beidou',
 'Bennett',
 'Candace',
 'Charlotte',
 'Childe',
 'Chongyun',
 'Clorinde',
 'Collei',
 'Cyno',
 'Dehya',
 'Diluc',
 'Diona',
 'Dori',
 'Ei',
 'Eula',
 'Faruzan',
 'Fischl',
 'Freminet',
 'Furina',
 'Ganyu',
 'Gorou',
 'Hu Tao',
 'Jean',
 'Kaede',
 'Kaedehara Kazuha',
 'Kaeya',
 'Kamisato Ayaka',
 'Kamisato Ayato',
 'Kaveh',
 'Kazuha',
 'Keqing',
 'Kirara',
 'Klee',
 'Kujou Sara',
 'Kuki Shinobu',
 'Layla',
 'Lisa',
 'Lynette',
 'Lyney',
 'Mika',
 'Mona',
 'Nahida',
 'Navia',
 'Neuvillette',
 'Nilou',
 'Ningguang',
 'Noelle',
 'Paimon',
 'Qiqi',
 'Raiden Shogun',
 'Razor',
 'Rosaria',
 'Sangonomiya Kokomi',
 'Sayu',
 'Shenhe',
 'Shikanoin Heizou',
 'Sucrose',
 'Tartaglia',
 'Thoma',
 'Tighnari',
 'Traveler',
 'Venti',
 'Wanderer',
 'Wriothesley',
 'Xiangling',
 'Xiao',
 'Xingqiu',
 'Xinyan',
 'Yae Miko',
 'Yanfei',
 'Yaoyao',
 'Yelan',
 'Yoimiya',
 'Yun Jin',
 'Zhongli']

In [9]:
embeddings = []
labels = []

for character in char_folder:
    character_dir = os.path.join(data_dir, character)
    if os.path.isdir(character_dir):
        for file_name in os.listdir(character_dir):
            file_path = os.path.join(character_dir, file_name)
            if file_path.endswith(".wav"):
                features = extract_audio_features(file_path)
                embeddings.append(features)
                labels.append(character)


X = np.array(embeddings)
y = np.array(labels)

NaN values found in features extracted from characters\Alhaitham\18_audio.wav
NaN values found in features extracted from characters\Baizhu\73_audio.wav
NaN values found in features extracted from characters\Candace\74_audio.wav
NaN values found in features extracted from characters\Childe\36_audio.wav
NaN values found in features extracted from characters\Childe\55_audio.wav
NaN values found in features extracted from characters\Cyno\36_audio.wav
NaN values found in features extracted from characters\Cyno\64_audio.wav
NaN values found in features extracted from characters\Dehya\32_audio.wav
NaN values found in features extracted from characters\Freminet\24_audio.wav
NaN values found in features extracted from characters\Freminet\4_audio.wav
NaN values found in features extracted from characters\Kaveh\40_audio.wav
NaN values found in features extracted from characters\Kirara\70_audio.wav
NaN values found in features extracted from characters\Kujou Sara\42_audio.wav
NaN values found in 

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [11]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [12]:
class VoiceClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(VoiceClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = F.leaky_relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.leaky_relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = F.leaky_relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.fc4(x)
        return x

In [13]:
assert not np.isnan(X).any(), "Input data contains NaNs"
assert not np.isinf(X).any(), "Input data contains infinite values"


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor([char_folder.index(label) for label in y_train], dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor([char_folder.index(label) for label in y_test], dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [15]:
input_dim = X_train.shape[1]
num_classes = len(char_folder)
classifier_model = VoiceClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier_model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    classifier_model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = classifier_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")


Epoch [1/20], Loss: 3.9093
Epoch [2/20], Loss: 3.0883
Epoch [3/20], Loss: 2.6918
Epoch [4/20], Loss: 2.4693
Epoch [5/20], Loss: 2.3295
Epoch [6/20], Loss: 2.2380
Epoch [7/20], Loss: 2.1506
Epoch [8/20], Loss: 2.1373
Epoch [9/20], Loss: 2.0732
Epoch [10/20], Loss: 2.0492
Epoch [11/20], Loss: 2.0101
Epoch [12/20], Loss: 1.9612
Epoch [13/20], Loss: 1.9441
Epoch [14/20], Loss: 1.9438
Epoch [15/20], Loss: 1.9481
Epoch [16/20], Loss: 1.8950
Epoch [17/20], Loss: 1.8711
Epoch [18/20], Loss: 1.8808
Epoch [19/20], Loss: 1.8389
Epoch [20/20], Loss: 1.8190


In [16]:
input_dim = X_train.shape[1]
num_classes = len(char_folder)
vc_model = VoiceClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vc_model.parameters(), lr=0.001)
num_epochs = 100
patience = 5
best_loss = float('inf')
counter = 0

In [17]:
for epoch in range(num_epochs):
    vc_model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = vc_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    vc_model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = vc_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    val_loss /= len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")
    
    if val_loss < best_loss:
        best_loss = val_loss
        counter = 0
    else:
        counter += 1
    
    if counter >= patience:
        print("Early stopping triggered")
        break

Epoch [1/100], Loss: 3.9085, Val Loss: 3.1658, Accuracy: 0.3479
Epoch [2/100], Loss: 3.0638, Val Loss: 2.4658, Accuracy: 0.4616
Epoch [3/100], Loss: 2.6566, Val Loss: 2.0787, Accuracy: 0.5397
Epoch [4/100], Loss: 2.4511, Val Loss: 1.8701, Accuracy: 0.5721
Epoch [5/100], Loss: 2.3227, Val Loss: 1.7337, Accuracy: 0.5984
Epoch [6/100], Loss: 2.2103, Val Loss: 1.6926, Accuracy: 0.5909
Epoch [7/100], Loss: 2.1477, Val Loss: 1.5813, Accuracy: 0.6352
Epoch [8/100], Loss: 2.1114, Val Loss: 1.5435, Accuracy: 0.6096
Epoch [9/100], Loss: 2.0759, Val Loss: 1.5453, Accuracy: 0.6209
Epoch [10/100], Loss: 2.0329, Val Loss: 1.4855, Accuracy: 0.6340
Epoch [11/100], Loss: 1.9974, Val Loss: 1.4754, Accuracy: 0.6390
Epoch [12/100], Loss: 2.0033, Val Loss: 1.4547, Accuracy: 0.6402
Epoch [13/100], Loss: 1.9559, Val Loss: 1.4491, Accuracy: 0.6390
Epoch [14/100], Loss: 1.9008, Val Loss: 1.3833, Accuracy: 0.6540
Epoch [15/100], Loss: 1.9206, Val Loss: 1.3813, Accuracy: 0.6596
Epoch [16/100], Loss: 1.9224, Val 

In [18]:
def predict_character(audio_file):
    vc_model.eval()
    features = extract_audio_features(audio_file)
    features_scaled = scaler.transform([features])
    features_tensor = torch.tensor(features_scaled, dtype=torch.float32).to(device)
    
    with torch.no_grad():
        output = vc_model(features_tensor)
        _, pred = torch.max(output, 1)
    
    return char_folder[pred.item()]

In [19]:
predict_character('characters/Lisa/27_audio.wav')

'Lisa'

In [20]:
import sounddevice as sd
from scipy.io.wavfile import write

In [21]:
def record_audio(filename, duration, fs=16000):
    print("Recording...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()  # Wait until the recording is finished
    write(filename, fs, recording)
    print(f"Recording saved to {filename}")

In [22]:
record_audio('output.wav', duration=5)


Recording...
Recording saved to output.wav


In [23]:
predict_character("calvin.wav")


'Razor'