In [9]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd
import torchaudio.transforms as T


In [2]:
# torchaudio needs an audio backend installed before it can load files:
#   - on macOS:          pip install sox
#   - on other systems:  pip install PySoundFile

In [3]:
# Show which audio I/O backends torchaudio reports as available in this environment.
torchaudio.list_audio_backends()

['soundfile']

In [4]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Load the pretrained "facebook/wav2vec2-base" processor and encoder; the encoder is
# moved to `device`.
# NOTE(review): neither `processor` nor `model` is referenced again in the visible
# cells (the classifier below is trained on MFCC features) — confirm they are still
# needed before paying for the checkpoint load and the device memory.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def extract_mfcc_features(audio_file, n_mfcc=13, target_sample_rate=16000):
    """Summarise an audio file as one fixed-length vector of time-averaged MFCCs.

    The clip is loaded, mixed down to a single channel, resampled to
    ``target_sample_rate`` when its native rate differs, converted to MFCCs and
    averaged over the frame axis.

    Args:
        audio_file: Path of the audio file; passed straight to ``torchaudio.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        target_sample_rate: Sample rate in Hz that every clip is brought to before
            the MFCC transform, so features from different files are comparable.

    Returns:
        A 1-D ``numpy.ndarray`` with ``n_mfcc`` entries, regardless of how many
        channels the file has.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    # The waveform is laid out as (channels, frames). Without a downmix a stereo
    # file would come out as a (2, n_mfcc) array, which cannot be stacked with the
    # 1-D vectors of mono clips nor fed to the scaler/classifier. Averaging the
    # channels is a no-op for mono input.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_sample_rate:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    mfcc_transform = T.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
    mfcc = mfcc_transform(waveform)  # (1, n_mfcc, frames)

    # Average over frames, then drop only the channel axis. squeeze(0) instead of a
    # bare squeeze() keeps the result 1-D even when n_mfcc == 1.
    return mfcc.mean(dim=2).squeeze(0).numpy()

In [11]:
# Smoke-test the extractor on a single clip; with the default n_mfcc=13 the cell
# output is one vector of 13 time-averaged coefficients.
extract_mfcc_features("characters/Albedo/79_audio.wav")



array([-178.0971   ,   42.131977 ,  -31.356077 ,   -3.3366582,
        -18.424139 ,  -22.077742 ,  -19.098751 ,   -6.620508 ,
        -11.038792 ,  -12.467265 ,   -5.432606 ,   -6.685038 ,
         -7.3597755], dtype=float32)

In [12]:
# Root folder of the dataset, plus two empty accumulator lists.
data_dir = "characters"
embeddings, labels = [], []

In [13]:
# One class per character sub-folder of `data_dir`. The position of a name in this
# list is the integer class id used for training and for decoding predictions, so:
#   * sort it — os.listdir() returns entries in arbitrary order, which would make
#     the name -> id mapping differ between machines/runs;
#   * keep directories only — stray files without ".wav" in their name (for example
#     .DS_Store) would otherwise become empty, bogus classes.
char_folder = sorted(
    name
    for name in os.listdir(data_dir)
    if '.wav' not in name and os.path.isdir(os.path.join(data_dir, name))
)
char_folder

['Albedo',
 'Alhaitham',
 'Aloy',
 'Amber',
 'Arataki Itto',
 'Baizhu',
 'Barbara',
 'Beidou',
 'Bennett',
 'Candace',
 'Charlotte',
 'Childe',
 'Chongyun',
 'Clorinde',
 'Collei',
 'Cyno',
 'Dehya',
 'Diluc',
 'Diona',
 'Dori',
 'Ei',
 'Eula',
 'Faruzan',
 'Fischl',
 'Freminet',
 'Furina',
 'Ganyu',
 'Gorou',
 'Hu Tao',
 'Jean',
 'Kaede',
 'Kaedehara Kazuha',
 'Kaeya',
 'Kamisato Ayaka',
 'Kamisato Ayato',
 'Kaveh',
 'Kazuha',
 'Keqing',
 'Kirara',
 'Klee',
 'Kujou Sara',
 'Kuki Shinobu',
 'Layla',
 'Lisa',
 'Lynette',
 'Lyney',
 'Mika',
 'Mona',
 'Nahida',
 'Navia',
 'Neuvillette',
 'Nilou',
 'Ningguang',
 'Noelle',
 'Paimon',
 'Qiqi',
 'Raiden Shogun',
 'Razor',
 'Rosaria',
 'Sangonomiya Kokomi',
 'Sayu',
 'Shenhe',
 'Shikanoin Heizou',
 'Sucrose',
 'Tartaglia',
 'Thoma',
 'Tighnari',
 'Traveler',
 'Venti',
 'Wanderer',
 'Wriothesley',
 'Xiangling',
 'Xiao',
 'Xingqiu',
 'Xinyan',
 'Yae Miko',
 'Yanfei',
 'Yaoyao',
 'Yelan',
 'Yoimiya',
 'Yun Jin',
 'Zhongli']

In [14]:
def extract_features_and_labels(data_dir, characters=None):
    """Build the feature matrix and label vector for every ``.wav`` clip in a dataset.

    The dataset layout is ``data_dir/<character>/<clip>.wav``. Each clip is turned
    into one MFCC vector by ``extract_mfcc_features``.

    Args:
        data_dir: Root folder that contains one sub-folder per character.
        characters: Optional iterable of sub-folder names to read. When omitted the
            names are discovered from ``data_dir`` itself (sub-directories whose
            name does not contain ".wav", the same rule the notebook uses to build
            ``char_folder``), so the function no longer depends on a global that
            may have been built from a different folder.

    Returns:
        ``(features, labels)`` as NumPy arrays: one row of ``features`` and one
        character name in ``labels`` per clip, in matching order. Both are empty
        when no clip is found.
    """
    if characters is None:
        characters = sorted(
            name
            for name in os.listdir(data_dir)
            if '.wav' not in name and os.path.isdir(os.path.join(data_dir, name))
        )

    features = []
    labels = []
    for character in characters:
        character_dir = os.path.join(data_dir, character)
        if not os.path.isdir(character_dir):
            continue
        # Sorted so the sample order (and therefore the seeded train/test split
        # downstream) does not depend on the filesystem's listing order.
        for file_name in sorted(os.listdir(character_dir)):
            if not file_name.endswith(".wav"):
                continue
            file_path = os.path.join(character_dir, file_name)
            features.append(extract_mfcc_features(file_path))
            labels.append(character)
    return np.array(features), np.array(labels)

# Walk the whole dataset once: X gets one MFCC vector per clip, y the matching
# character name for each row.
X, y = extract_features_and_labels(data_dir)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Split BEFORE fitting the scaler. Fitting StandardScaler on the full dataset lets
# the mean/variance of the held-out rows leak into the training features and makes
# the reported validation numbers optimistic. The split uses the same random_state
# and sample count as before, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics come from training rows only
X_test = scaler.transform(X_test_raw)        # held-out rows reuse those statistics

# Kept so any cell that still reads `X_scaled` keeps working; it is the whole
# dataset expressed with the training-set statistics.
X_scaled = scaler.transform(X)


In [16]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [17]:
class VoiceClassifier(nn.Module):
    """Fully connected classifier over a fixed-length feature vector.

    Three hidden stages (512, 256 and 128 units), each Linear -> BatchNorm1d ->
    LeakyReLU -> Dropout(0.5), followed by a linear layer that emits one raw
    score (logit) per class.
    """

    def __init__(self, input_dim, num_classes):
        """Create the layers.

        Args:
            input_dim: Number of features in each input row.
            num_classes: Number of output scores produced per row.
        """
        super().__init__()
        wide, mid, narrow = 512, 256, 128

        # Hidden stage 1
        self.fc1 = nn.Linear(input_dim, wide)
        self.bn1 = nn.BatchNorm1d(wide)
        # Hidden stage 2
        self.fc2 = nn.Linear(wide, mid)
        self.bn2 = nn.BatchNorm1d(mid)
        # Hidden stage 3
        self.fc3 = nn.Linear(mid, narrow)
        self.bn3 = nn.BatchNorm1d(narrow)
        # Output head and the dropout module shared by all hidden stages
        self.fc4 = nn.Linear(narrow, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the per-class scores for a batch ``x`` of shape (batch, input_dim)."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(F.leaky_relu(norm(linear(x))))
        return self.fc4(x)

In [18]:
# Character name -> integer class id, built once. Calling char_folder.index(label)
# for every sample rescans the list each time; a dict lookup gives the same ids.
label_to_index = {name: idx for idx, name in enumerate(char_folder)}

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor([label_to_index[label] for label in y_train], dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor([label_to_index[label] for label in y_test], dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 32
# BatchNorm1d cannot normalise a single-row batch in training mode and raises.
# Drop the trailing batch only in the one case where it would contain exactly one
# sample; for every other dataset size the loader behaves exactly as before.
drop_single_row_batch = len(train_dataset) % batch_size == 1

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_single_row_batch,
)
# Evaluation runs in eval() mode, where a one-row batch is fine, so nothing is dropped.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [19]:
# Seed PyTorch's global RNG before the model is built so that weight initialisation,
# dropout masks and DataLoader shuffling are repeatable from a fresh kernel (same
# value as the random_state used for the train/test split).
torch.manual_seed(42)

# Model dimensions follow the data: one input per feature column, one output per class.
input_dim = X_train.shape[1]
num_classes = len(char_folder)
vc_model = VoiceClassifier(input_dim, num_classes).to(device)

# CrossEntropyLoss takes the raw scores produced by the model's last linear layer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vc_model.parameters(), lr=0.001)

# Training schedule and early-stopping state.
num_epochs = 100           # upper bound on epochs
patience = 5               # epochs without a new best validation loss before stopping
best_loss = float('inf')   # lowest validation loss seen so far
counter = 0                # consecutive epochs without improvement

In [20]:
# Train with early stopping on the validation loss, then restore the best weights.
#
# The early-stopping state is (re)initialised here so that re-running only this cell
# starts a clean run instead of inheriting a stale best_loss/counter from a previous
# execution (which would trigger early stopping almost immediately).
# NOTE(review): the "validation" data is the test split, so the reported accuracy is
# also what drives model selection — treat it as optimistic.
best_loss = float('inf')
counter = 0
best_state = None  # snapshot of the weights at the lowest validation loss

for epoch in range(num_epochs):
    # ---- training pass ----
    vc_model.train()
    running_loss = 0.0
    # `targets` rather than `labels`: the loop variable must not overwrite the
    # module-level `labels` list.
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = vc_model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- validation pass (no gradients, dropout off, batch-norm running stats) ----
    vc_model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = vc_model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    val_loss /= len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        counter = 0
        # Clone the tensors: state_dict() returns references that the optimizer
        # keeps updating in place on later epochs.
        best_state = {key: value.detach().clone() for key, value in vc_model.state_dict().items()}
    else:
        counter += 1

    if counter >= patience:
        print("Early stopping triggered")
        break

# Without this the model would keep the weights of the last epoch, i.e. `patience`
# epochs past its best validation loss when early stopping fires.
if best_state is not None:
    vc_model.load_state_dict(best_state)
    vc_model.eval()

Epoch [1/100], Loss: 3.9106, Val Loss: 3.1464, Accuracy: 0.3498
Epoch [2/100], Loss: 3.0857, Val Loss: 2.4992, Accuracy: 0.4653
Epoch [3/100], Loss: 2.6756, Val Loss: 2.1221, Accuracy: 0.5209
Epoch [4/100], Loss: 2.4713, Val Loss: 1.9095, Accuracy: 0.5484
Epoch [5/100], Loss: 2.3316, Val Loss: 1.7989, Accuracy: 0.5597
Epoch [6/100], Loss: 2.2743, Val Loss: 1.6816, Accuracy: 0.5715
Epoch [7/100], Loss: 2.1751, Val Loss: 1.6692, Accuracy: 0.5771
Epoch [8/100], Loss: 2.1654, Val Loss: 1.6044, Accuracy: 0.5878
Epoch [9/100], Loss: 2.1474, Val Loss: 1.5848, Accuracy: 0.5978
Epoch [10/100], Loss: 2.0800, Val Loss: 1.5828, Accuracy: 0.5871
Epoch [11/100], Loss: 2.0782, Val Loss: 1.5551, Accuracy: 0.5890
Epoch [12/100], Loss: 2.0700, Val Loss: 1.4998, Accuracy: 0.6121
Epoch [13/100], Loss: 2.0388, Val Loss: 1.5030, Accuracy: 0.6115
Epoch [14/100], Loss: 1.9968, Val Loss: 1.4729, Accuracy: 0.6234
Epoch [15/100], Loss: 1.9796, Val Loss: 1.4751, Accuracy: 0.6096
Epoch [16/100], Loss: 1.9912, Val 

In [23]:
def predict_character(audio_file):
    """Return the name of the character the classifier assigns to an audio file.

    The clip goes through the same steps as the training data: MFCC extraction,
    scaling with the fitted ``scaler``, then a forward pass of ``vc_model`` in
    evaluation mode. The index of the highest score is looked up in ``char_folder``.

    Args:
        audio_file: Path of the audio clip to classify.

    Returns:
        The entry of ``char_folder`` at the predicted class index.
    """
    vc_model.eval()

    features = extract_mfcc_features(audio_file)
    # The scaler expects a 2-D batch, so wrap the single vector in a list.
    scaled_row = scaler.transform([features])
    batch = torch.tensor(scaled_row, dtype=torch.float32).to(device)

    with torch.no_grad():
        logits = vc_model(batch)

    class_index = logits.argmax(dim=1).item()
    return char_folder[class_index]


In [24]:
# Sanity check on a clip taken from the dataset's own Lisa folder.
predict_character('characters/Lisa/27_audio.wav')



'Lisa'

In [25]:
import sounddevice as sd
from scipy.io.wavfile import write

In [26]:
def record_audio(filename, duration, fs=16000):
    """Record single-channel audio with ``sounddevice`` and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        fs: Sample rate in Hz used for both recording and the written file.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=1)
    sd.wait()  # block here until the capture has finished
    write(filename, fs, samples)
    print(f"Recording saved to {filename}")

In [28]:
# Record 5 seconds at the default 16000 Hz rate and save the result to output.wav.
record_audio('output.wav', duration=5)


Recording...
Recording saved to output.wav


In [27]:
# Classify an external clip.
# NOTE(review): calvin.wav is not created by any visible cell — it has to exist in
# the working directory already for this cell to run.
predict_character("calvin.wav")




'Dehya'

In [29]:
# Classify the recording that the record_audio call above saved to output.wav.
predict_character("output.wav")




'Ei'