In [5]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd
import torchaudio.transforms as T
import librosa
import parselmouth
from parselmouth.praat import call

In [None]:
# if you are using mac, pip install sox
# otherwise, pip install PySoundFile

# Feature Extraction

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the selected device

In [None]:
def extract_audio_features(audio_file):
    """Build a fixed-length acoustic feature vector for one audio file.

    The vector has 16 entries: the 13 MFCCs averaged over time (librosa, audio
    loaded at 16 kHz), followed by the mean pitch and the means of the first and
    second formants (Praat, called through parselmouth on the file as stored).
    If any entry is NaN a message is printed and the vector is cleaned with
    ``np.nan_to_num``.

    Args:
        audio_file: Path of the audio file to analyse.

    Returns:
        1-D NumPy array ``[mfcc_1 .. mfcc_13, mean_pitch, formant1, formant2]``.
    """
    # Spectral part: one mean value per MFCC coefficient.
    waveform, sample_rate = librosa.load(audio_file, sr=16000)
    mfcc_means = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=13).mean(axis=1)

    # Prosodic part: Praat commands are invoked by name through `call`.
    sound = parselmouth.Sound(audio_file)
    pitch_track = call(sound, "To Pitch", 0.0, 75, 600)
    pitch_mean = call(pitch_track, "Get mean", 0, 0, "Hertz")

    formant_track = call(sound, "To Formant (burg)", 0.0, 5, 5500, 0.025, 50)
    formant_means = [
        call(formant_track, "Get mean", formant_number, 0, 0, "Hertz")
        for formant_number in (1, 2)
    ]

    features = np.concatenate([mfcc_means, [pitch_mean, *formant_means]])

    if not np.isnan(features).any():
        return features

    print(f"NaN values found in features from {audio_file}")
    return np.nan_to_num(features)

In [None]:
# Smoke test: extract and display the feature vector of a single clip.
extract_audio_features("characters/Albedo/79_audio.wav")

In [None]:
data_dir = "characters"  # root folder; the loading loop reads one sub-folder per character
embeddings = []  # filled with one feature vector per clip
labels = []  # filled with the character name of each clip

In [None]:
# Character classes are the sub-directories of data_dir.
# - Only real directories are kept, so stray files (e.g. .DS_Store) are not
#   counted as classes and do not inflate num_classes.
# - The list is sorted because os.listdir returns entries in arbitrary order and
#   the position in char_folder is used as the integer class label.
char_folder = sorted(
    entry
    for entry in os.listdir(data_dir)
    if '.wav' not in entry and os.path.isdir(os.path.join(data_dir, entry))
)
char_folder

In [None]:
# Walk every character directory and extract one feature vector per .wav clip.
# Both directory levels are visited in sorted order: os.listdir order is
# arbitrary, and the row order of X decides which samples a seeded
# train_test_split assigns to each subset.
embeddings = []  # one feature vector per clip
labels = []      # character name of the clip at the same position

for character in sorted(char_folder):
    character_dir = os.path.join(data_dir, character)
    if not os.path.isdir(character_dir):
        continue  # char_folder may still contain a non-directory entry
    for file_name in sorted(os.listdir(character_dir)):
        if not file_name.endswith(".wav"):
            continue
        file_path = os.path.join(character_dir, file_name)
        embeddings.append(extract_audio_features(file_path))
        labels.append(character)

# Fail here with a clear message rather than later inside the scaler.
assert embeddings, f"No .wav files found under {data_dir!r}"

X = np.array(embeddings)  # feature matrix, one row per clip
y = np.array(labels)      # character name per row

# Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Split the raw features first, then fit the scaler on the training rows only.
# Fitting it on all of X would leak test-set statistics (mean / variance) into
# training and make the reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept because a later cell splits X_scaled again. With the same test_size and
# random_state that split selects the same rows, so it agrees with X_train / X_test.
X_scaled = scaler.transform(X)


In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class VoiceClassifier(nn.Module):
    """Fully connected classifier over a fixed-length feature vector.

    Three hidden stages of 512, 256 and 128 units, each
    Linear -> BatchNorm1d -> leaky ReLU -> Dropout(0.5), followed by a linear
    layer that produces one raw score (logit) per class.
    """

    def __init__(self, input_dim, num_classes):
        """Create the layers.

        Args:
            input_dim: Number of features in each input vector.
            num_classes: Number of output scores.
        """
        super().__init__()
        # Attribute names and creation order are part of the contract: they fix
        # the state_dict keys and the order in which seeded weights are drawn.
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)  # one module, reused after every hidden stage

    def forward(self, x):
        """Map a batch of feature vectors to class logits.

        Args:
            x: Float tensor of shape (batch, input_dim).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised scores.
        """
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(F.leaky_relu(norm(linear(x))))
        return self.fc4(x)

In [None]:
# Sanity-check the raw feature matrix: no NaN and no +/-inf entries allowed.
assert not np.any(np.isnan(X)), "Input data contains NaNs"
assert not np.any(np.isinf(X)), "Input data contains infinite values"


In [None]:
# 80/20 split of the scaled features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Integer class label = position of the character name in char_folder.
class_index = {name: position for position, name in enumerate(char_folder)}

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor([class_index[name] for name in y_train], dtype=torch.long)
y_test_tensor = torch.tensor([class_index[name] for name in y_test], dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled; evaluation order stays fixed.
# NOTE(review): BatchNorm1d cannot train on a batch of one sample, so a training
# set whose size is 1 modulo 32 would fail on its last batch — confirm sizes.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Baseline run: train a fresh VoiceClassifier for a fixed 20 epochs and report
# the mean training loss per epoch (no validation in this cell).
input_dim = X_train.shape[1]
num_classes = len(char_folder)
classifier_model = VoiceClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier_model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    classifier_model.train()
    running_loss = 0.0
    # The batch variables are deliberately not called `inputs` / `labels`: the
    # notebook already has a module-level `labels` list (character name per clip)
    # and reusing the name here would silently replace it with a tensor.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = classifier_model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")


In [None]:
# Fresh model, loss and optimizer for the run that uses early stopping.
input_dim = X_train.shape[1]
num_classes = len(char_folder)
vc_model = VoiceClassifier(input_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vc_model.parameters(), lr=0.001)

# Early-stopping bookkeeping: stop after `patience` consecutive epochs without a
# new lowest validation loss, and never run more than `num_epochs` epochs.
num_epochs, patience = 100, 5
best_loss, counter = float('inf'), 0

In [None]:
# Train vc_model with early stopping: after each epoch the model is evaluated on
# test_loader, and training stops once the validation loss has failed to improve
# for `patience` consecutive epochs.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the printed accuracy is optimistic — consider a separate validation split.
best_state = None  # copy of the weights from the epoch with the lowest val loss

for epoch in range(num_epochs):
    # ---- training pass ----
    vc_model.train()
    running_loss = 0.0
    # Batch variables are not called `inputs` / `labels` so that the notebook's
    # module-level `labels` list is not overwritten with a tensor.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = vc_model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- validation pass (no gradients, dropout off, BatchNorm running stats) ----
    vc_model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            outputs = vc_model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    val_loss /= len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        counter = 0
        # state_dict() returns references to the live tensors, so clone them;
        # otherwise later optimizer steps would overwrite the snapshot.
        best_state = {key: value.detach().clone() for key, value in vc_model.state_dict().items()}
    else:
        counter += 1

    if counter >= patience:
        print("Early stopping triggered")
        break

# Without this the model would keep the weights of the last epoch, which is
# `patience` epochs past the best one whenever early stopping fired.
if best_state is not None:
    vc_model.load_state_dict(best_state)

The accuracy we ended up getting was 67%. It isn't bad compared to simply using MFCCs, but it isn't as good as Wav2Vec2.

In [None]:
def predict_character(audio_file):
    """Predict which character an audio file's voice is classified as.

    Features are extracted with ``extract_audio_features``, standardised with
    the fitted ``scaler``, and scored by ``vc_model``; the class with the
    highest score wins.

    Args:
        audio_file: Path of the audio file to classify.

    Returns:
        The entry of ``char_folder`` at the predicted class index.
    """
    vc_model.eval()  # inference mode: dropout off, BatchNorm uses running stats

    # scaler.transform expects a 2-D input, hence the single-row list.
    feature_row = scaler.transform([extract_audio_features(audio_file)])
    batch = torch.tensor(feature_row, dtype=torch.float32).to(device)

    with torch.no_grad():
        predicted_index = vc_model(batch).argmax(dim=1).item()

    return char_folder[predicted_index]

In [None]:
# Example: classify one clip from the dataset.
predict_character('characters/Lisa/27_audio.wav')

# Uploading your own voice

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write

In [None]:
def record_audio(filename, duration, fs=16000):
    """Record single-channel audio with sounddevice and save it as a WAV file.

    Args:
        filename: Destination path of the WAV file.
        duration: Recording length in seconds.
        fs: Sampling rate in Hz.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=1)
    sd.wait()  # block until the recording has finished
    write(filename, fs, samples)
    print(f"Recording saved to {filename}")

In [None]:
# Record 5 seconds of audio and save it as output.wav.
record_audio('output.wav', duration=5)


In [None]:
# NOTE(review): the recording cell saves to 'output.wav', but this call reads
# "calvin.wav" — confirm which file is intended.
predict_character("calvin.wav")
