In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd
import torchaudio.transforms as T


**Before you read:** complete the preparation steps described in the `processing_lr.ipynb` notebook to install the required packages.

# Audio Processing / Feature Engineering
Here, we convert our voice lines into MFCCs (Mel-frequency cepstral coefficients) and then feed them into a DNN. We do not expect this to work well, because MFCCs capture the spectral characteristics of speech rather than speaker-specific features like pitch and voice. They also miss temporal dynamics because they are computed over a small number of frames. Finally, we lose finer speaker information because we use the DCT (Discrete Cosine Transform) to reduce the dimensionality!

In [None]:
# torchaudio needs an audio I/O backend installed to read .wav files:
#   - on macOS:   pip install sox
#   - otherwise:  pip install PySoundFile

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Load the pretrained "facebook/wav2vec2-base" processor and model, and move
# the model onto `device`.
# NOTE(review): neither `processor` nor `model` is referenced by any later cell
# visible in this notebook (the classifier below is trained on MFCC features
# and is named `vc_model`) -- confirm whether this load is still needed.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)


In [None]:
def extract_mfcc_features(audio_file, n_mfcc=13):
    """
    Turn one audio file into a fixed-length MFCC feature vector.

    Following the lecture "Cepstra, Pitch Tracking and Voice Activity
    Detection": each frame of sound is mapped into the quefrency domain and
    its dimensionality is then reduced with a Discrete Cosine Transform. This
    turns sounds into arrays that we can use for training. The default of 13
    coefficients is used because, according to the same lecture, usually only
    the first 12-13 coefficients are kept for MFCCs.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to load with torchaudio.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute per frame.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc``: each coefficient averaged over all
        frames of the clip.
    """
    target_sample_rate = 16000
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix multi-channel audio to mono. Without this, a stereo file yields a
    # (channels, n_mfcc) result instead of (n_mfcc,), and the per-file feature
    # vectors can no longer be stacked into one 2-D training matrix.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample the audio to 16000 Hz to ensure consistency with other models
    # and extraction steps.
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    # Use torchaudio to transform the waveform into MFCCs: (1, n_mfcc, frames).
    mfcc_transform = T.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
    mfcc = mfcc_transform(waveform)

    # Average over the frame axis, then drop only the (now size-1) channel axis
    # so the result is always 1-D, whatever the value of n_mfcc.
    return mfcc.mean(dim=2).squeeze(0).numpy()

In [None]:
# Sanity check: display the MFCC feature vector extracted from one sample clip.
extract_mfcc_features("characters/Albedo/79_audio.wav")

In [None]:
# Root folder of the dataset, plus two empty notebook-level lists.
data_dir = "characters"
embeddings, labels = [], []

In [None]:
# One sub-folder per character; the folder names double as the class labels and
# a name's position in this list is its integer class id.
#  - Sorted, because os.listdir returns entries in arbitrary order and the
#    label <-> index mapping must be the same on every run and machine.
#  - Directories only, so stray files (e.g. macOS's .DS_Store) are not counted
#    as extra classes when sizing the network's output layer.
char_folder = sorted(
    i for i in os.listdir(data_dir)
    if '.wav' not in i and os.path.isdir(os.path.join(data_dir, i))
)
char_folder

In [None]:
def extract_features_and_labels(data_dir, characters=None):
    """
    Build the feature matrix and label vector from a folder of character clips.

    For every character folder under `data_dir`, each file whose name ends in
    ".wav" is run through `extract_mfcc_features`, and the resulting vector is
    paired with the character's folder name as its label.

    Parameters
    ----------
    data_dir : str
        Root folder that contains one sub-folder per character.
    characters : sequence of str, optional
        Character folder names to visit, in order. Defaults to the
        notebook-level `char_folder` list.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked feature vectors and the matching character labels.
    """
    if characters is None:
        # Fall back to the notebook-level list of character folders.
        characters = char_folder

    features = []
    labels = []
    for character in characters:
        character_dir = os.path.join(data_dir, character)
        if not os.path.isdir(character_dir):
            continue
        # os.listdir order is arbitrary; sorting keeps the row order -- and so
        # any seeded train/test split made from it -- reproducible.
        for file_name in sorted(os.listdir(character_dir)):
            if not file_name.endswith(".wav"):
                continue
            file_path = os.path.join(character_dir, file_name)
            features.append(extract_mfcc_features(file_path))
            labels.append(character)
    return np.array(features), np.array(labels)

# Build the full dataset: X holds the stacked feature arrays and y the matching
# character labels returned by extract_features_and_labels.
X, y = extract_features_and_labels(data_dir)

# Training
For comments explaining what each step does, see the `processing + nn.ipynb` notebook.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full dataset would let test-set statistics (mean / variance) leak into the
# preprocessing and make the reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any cell still referring to the scaled full dataset keeps working.
X_scaled = scaler.transform(X)


In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class VoiceClassifier(nn.Module):
    """
    Fully connected classifier over fixed-length feature vectors.

    Three hidden stages (512 -> 256 -> 128 units), each made of a linear layer,
    batch normalisation, leaky ReLU and dropout (p=0.5), followed by a final
    linear layer that emits one raw score (logit) per class.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        h1, h2, h3 = 512, 256, 128
        self.fc1, self.bn1 = nn.Linear(input_dim, h1), nn.BatchNorm1d(h1)
        self.fc2, self.bn2 = nn.Linear(h1, h2), nn.BatchNorm1d(h2)
        self.fc3, self.bn3 = nn.Linear(h2, h3), nn.BatchNorm1d(h3)
        self.fc4 = nn.Linear(h3, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, num_classes) logits."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(F.leaky_relu(norm(linear(x))))
        # No softmax here: the output layer returns raw scores.
        return self.fc4(x)

In [None]:
BATCH_SIZE = 32

# Map each character name to its integer class id once, instead of scanning
# char_folder with list.index() for every sample.
label_to_index = {name: idx for idx, name in enumerate(char_folder)}

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor([label_to_index[label] for label in y_train], dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor([label_to_index[label] for label in y_test], dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# BatchNorm1d cannot compute batch statistics from a single sample and raises
# in training mode, so drop the trailing batch only when it would contain
# exactly one sample. Evaluation runs in eval mode and keeps every sample.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=(len(train_dataset) % BATCH_SIZE == 1),
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Size the network from the data: one input per feature column and one output
# per entry in char_folder.
input_dim, num_classes = X_train.shape[1], len(char_folder)
vc_model = VoiceClassifier(input_dim, num_classes).to(device)

# Loss and optimiser.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vc_model.parameters(), lr=0.001)

# Training budget and early-stopping bookkeeping: stop once the validation loss
# has failed to improve for `patience` consecutive epochs.
num_epochs, patience = 100, 5
best_loss, counter = float('inf'), 0

In [None]:
# Start every run of this cell from a clean early-stopping state, so that
# re-running it does not silently reuse best_loss / counter from a previous run.
best_loss = float('inf')
counter = 0
best_state = None  # snapshot of the weights at the lowest validation loss

# NOTE(review): the held-out test split doubles as the validation set for early
# stopping, so the printed accuracy is used for model selection and is likely
# optimistic -- consider carving out a separate validation split.
for epoch in range(num_epochs):
    # ---- training pass ----
    vc_model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = vc_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- evaluation pass (no gradients, dropout/batch-norm in eval mode) ----
    vc_model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = vc_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss /= len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

    # ---- early stopping ----
    if val_loss < best_loss:
        best_loss = val_loss
        counter = 0
        # Clone the tensors: state_dict() returns references that later
        # optimiser steps would otherwise overwrite in place.
        best_state = {name: tensor.detach().clone() for name, tensor in vc_model.state_dict().items()}
    else:
        counter += 1

    if counter >= patience:
        print("Early stopping triggered")
        break

# Without this, the model would be left at the weights of the last epoch, i.e.
# up to `patience` epochs past its best validation loss.
if best_state is not None:
    vc_model.load_state_dict(best_state)

Here, we got an accuracy of around 62%, which isn't great compared to our logistic regression with Wav2Vec2 or our DNN with Wav2Vec2. This is most likely because MFCCs do not capture speaker identity well.

In [None]:
def predict_character(audio_file):
    """
    Predict which character an audio clip sounds most like.

    The clip is converted to MFCC features, standardised with the fitted
    `scaler`, and passed through `vc_model`; the name in `char_folder` at the
    index of the highest output score is returned.
    """
    vc_model.eval()  # inference behaviour for dropout / batch norm

    features = extract_mfcc_features(audio_file)
    scaled_row = scaler.transform([features])  # wrapped in a list: one-row batch
    batch = torch.tensor(scaled_row, dtype=torch.float32).to(device)

    with torch.no_grad():
        logits = vc_model(batch)
    best_index = logits.argmax(dim=1)

    return char_folder[best_index.item()]


In [None]:
# Try the trained classifier on one clip. This file lives under the same
# "characters" folder the dataset was built from, so it may have been part of
# the training split.
predict_character('characters/Lisa/27_audio.wav')

# Recording your own sound

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write

In [None]:
def record_audio(filename, duration, fs=16000):
    """
    Record a single-channel clip with sounddevice and save it as a WAV file.

    Parameters
    ----------
    filename : str
        Path of the WAV file to write.
    duration : float
        Length of the recording in seconds.
    fs : int, default 16000
        Sample rate in Hz used for both recording and the saved file.
    """
    print("Recording...")
    n_frames = int(duration * fs)
    samples = sd.rec(n_frames, samplerate=fs, channels=1)
    sd.wait()  # block until the recording has finished
    write(filename, fs, samples)
    print(f"Recording saved to {filename}")

In [None]:
# Record 5 seconds of audio and save it to output.wav.
record_audio('output.wav', duration=5)


In [None]:
# Classify the clip saved as output.wav.
predict_character("output.wav")


In my experience, this model's guesses are often off. It matched my gentle, soft-spoken friend to a much grittier, tougher character named "Wriothesley", which didn't seem quite right.