In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

**Before you read**, do the preparation steps mentioned in the processing_lr.ipynb notebook to get the proper packages.

# Text Processing / Feature Engineering
Here, we are feeding our voice lines through wav2vec2 and converting their waveforms into embeddings that we can use for training!

We went through the same steps for processing and feature engineering as in processing_lr.ipynb, so you can **skip** through this part. 

In [None]:
# if you are using mac, pip install sox
# otherwise, pip install PySoundFile

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Load the input processor and the encoder from the same pretrained checkpoint,
# then move the encoder onto the selected device.
checkpoint = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2Model.from_pretrained(checkpoint)
model = model.to(device)


In [None]:
"""
Wav2Vec2 takes a raw waveform directly and outputs higher-quality features 
(latent representations), according to the Neural Speech Recognition lecture. 
Here, the processor turns the audio into input tensors and the transformer 
turns those tensors into a voice embedding. Note that in class, 
we used Wav2Vec2 to go from waves to words, but here we go from waves 
to voice embeddings! This high-dimensional embedding captures the speaker 
identity (tone, pitch, and accent), prosody, and phonetic content. 
"""

def extract_voice_embeddings(audio_file):
    """Return one Wav2Vec2 voice embedding for an audio file.

    The file is loaded with torchaudio, downmixed to a single channel,
    resampled to 16 kHz when needed, passed through the notebook-level
    ``processor`` and ``model``, and the encoder's last hidden state is
    averaged over the time axis.

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A NumPy array holding the time-averaged last hidden state for the file.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix multi-channel audio to mono by averaging the channels.
    # squeeze(0) alone only drops a size-1 channel axis, so a stereo file would
    # otherwise stay (2, N) and produce a differently shaped embedding.
    if waveform.ndimension() == 2 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to 16 kHz because that is the rate this notebook feeds Wav2Vec2.
    # By the Nyquist theorem a signal sampled at 16 kHz can only represent
    # frequencies up to 8 kHz (half the sampling rate), so content above 8 kHz
    # in the original recording is lost by this step.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    waveform = F.normalize(waveform)
    # drop the (now size-1) channel axis so the processor receives a 1-D signal
    if waveform.ndimension() == 2:
        waveform = waveform.squeeze(0)

    # process the waveform into model inputs
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    input_values = inputs['input_values'].to(device)

    # inference only: no gradients are needed for feature extraction
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # average over the time axis to get a single vector per voice line
    voice_embedding = torch.mean(embeddings, dim=1).squeeze().cpu().numpy()
    return voice_embedding

In [None]:
# Smoke test: extract the embedding of a single voice line and display it.
extract_voice_embeddings("characters/Albedo/79_audio.wav")

In [None]:
# Root folder of the dataset, plus the accumulators filled during extraction:
# one embedding and one character label per voice line.
data_dir = "characters"
embeddings, labels = [], []

In [None]:
# One class per character sub-directory of data_dir.
# - Only real directories are kept, so stray files (e.g. ".DS_Store") are not
#   counted as classes when num_classes is derived from this list.
# - The list is sorted because os.listdir returns entries in arbitrary order and
#   the position in this list is used as the class index.
char_folder = sorted(
    entry
    for entry in os.listdir(data_dir)
    if '.wav' not in entry and os.path.isdir(os.path.join(data_dir, entry))
)
char_folder

In [None]:
# going through each character 
for character in char_folder:
    character_dir = os.path.join(data_dir, character)
    print(f"Currently on Character: {character}")
    if os.path.isdir(character_dir):
        for file_name in os.listdir(character_dir):
            file_path = os.path.join(character_dir, file_name)
            if file_path.endswith(".wav"):
                # create embedding for each wav file
                embedding = extract_voice_embeddings(file_path)
                embeddings.append(embedding)
                # assign labels aka characters to each one 
                labels.append(character)

In [None]:
# Turn the collected per-file embeddings and character labels into arrays:
# X holds the features, y the matching labels.
X, y = np.array(embeddings), np.array(labels)

# Training

Here, we train a deep neural network on the voice embeddings. Text-independent speaker identification (SID) is a nonlinear task over high-dimensional data, and a deep network can capture the intricate variations in speech patterns that it involves. However, as discussed in class in Neural Networks I and II, we have very little insight into how such networks make their decisions, and they require far more computation than a simpler model.

In [None]:
# Do an 80/20 split between training and test sets BEFORE scaling, so that the
# scaler's mean/variance are estimated from training rows only. Fitting on all of
# X would leak test-set statistics into the features the model trains on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
# fit on the training rows (mean 0, variance 1 on that split) ...
X_train = scaler.fit_transform(X_train_raw)
# ... and apply the same transform to the held-out rows
X_test = scaler.transform(X_test_raw)

# Kept so that any cell still referring to X_scaled keeps working.
X_scaled = scaler.transform(X)


In [None]:
# Defining a neural network for voice classification
class VoiceClassifier(nn.Module):
    """Fully connected classifier over voice embeddings.

    Three hidden stages (512, 256 and 128 units), each a Linear layer followed by
    BatchNorm1d, LeakyReLU and dropout, and a final Linear layer producing one
    raw score (logit) per class.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # hidden stage 1: input_dim -> 512
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        # hidden stage 2: 512 -> 256
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        # hidden stage 3: 256 -> 128
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        # output layer: one score per class
        self.fc4 = nn.Linear(128, num_classes)
        # shared dropout (p=0.5) applied after every hidden stage to limit overfitting
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Run a batch of embeddings through the network and return class logits."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(F.leaky_relu(norm(linear(x))))
        return self.fc4(x)

In [None]:
# convert splits into appropriate tensors 
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor([char_folder.index(label) for label in y_train], dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor([char_folder.index(label) for label in y_test], dtype=torch.long)

# create tensordataset objects
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# dataloader objects process batches and shuffle the training data
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
# setting up initial parameters for neural network
input_dim = X_train.shape[1]
num_classes = len(char_folder)
vc_model = VoiceClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vc_model.parameters(), lr=0.001)
num_epochs = 100
patience = 5
best_loss = float('inf')
counter = 0

In [None]:
# training loop going through num_epochs
for epoch in range(num_epochs):

    vc_model.train()
    running_loss = 0.0

    # iterate over batches of data from train_loader
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        # forward pass and compute the loss
        outputs = vc_model(inputs)
        loss = criterion(outputs, labels)
        # backward pass
        loss.backward()
        optimizer.step()
        
        # accumulate running loss
        running_loss += loss.item()
    
    vc_model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = vc_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            # get predicted class
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    
    # calculate avg validation loss and accuracy for each epoch
    val_loss /= len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")
    
    # early stop logic: we reset counter if there is improvement and otherwise we keep counting
    if val_loss < best_loss:
        best_loss = val_loss
        counter = 0
    else:
        counter += 1
    # if we do not improve in 5 epochs, we stop and we found the best model
    if counter >= patience:
        print("Early stopping triggered")
        break

In [None]:
# A function used to predict what character an audio file sounds like
def predict_character(audio_file):
    """Predict which character an audio file sounds most like.

    The file is embedded with ``extract_voice_embeddings``, scaled with the
    fitted ``scaler`` and classified by ``vc_model``.

    Args:
        audio_file: Path to the audio file to classify.

    Returns:
        The entry of ``char_folder`` at the index of the highest class score.
    """
    # inference mode: dropout off, batch norm uses its running statistics
    vc_model.eval()

    # embed the clip and apply the same scaling used for the training features
    raw_embedding = extract_voice_embeddings(audio_file)
    scaled = scaler.transform([raw_embedding])
    features = torch.tensor(scaled, dtype=torch.float32).to(device)

    with torch.no_grad():
        # forward pass, then pick the class with the largest score
        scores = vc_model(features)
        best_index = scores.max(dim=1).indices

    return char_folder[best_index.item()]

In [None]:
# Example: classify one of the dataset's own voice lines.
predict_character('characters/Lisa/27_audio.wav')

The testing accuracy is found by looking at the accuracy of the last epoch. In our case, we found the accuracy to be about 78%, which is quite impressive, considering that we have over 82 characters to predict from. 

# Uploading your own sound

This is a copy and paste from the processing_lr.ipynb.

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write

In [None]:
def record_audio(filename, duration, fs=16000):
    """Record from the default input device and save the result as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        fs: Sampling rate in Hz (defaults to 16000).
    """
    print("Recording...")
    num_frames = int(duration * fs)
    samples = sd.rec(num_frames, samplerate=fs, channels=1)
    # sd.rec returns immediately; block here until the recording is finished
    sd.wait()
    write(filename, fs, samples)
    print(f"Recording saved to {filename}")

In [None]:
# Record a 5-second clip from the microphone into output.wav.
record_audio('output.wav', duration=5)


In [None]:
# Classify the clip saved as output.wav.
predict_character("output.wav")


In [None]:
# NOTE(review): calvin.wav is not created anywhere in this notebook; this cell
# assumes the file already exists in the working directory - confirm.
predict_character("calvin.wav")


# Saving the DNN Model

In [None]:
import pickle

In [None]:
# Serialize the trained classifier object to disk.
# NOTE(review): this pickles the whole module object, not just its weights, so
# loading it presumably needs the VoiceClassifier class to be defined/importable
# first - confirm against the code that loads 'vc_nnmodel.pkl'.
with open('vc_nnmodel.pkl', 'wb') as model_file:
    pickle.dump(vc_model, model_file)

In [None]:
# Serialize the fitted scaler so new embeddings can be scaled the same way at
# prediction time.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)