In [None]:
import pickle
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchaudio
import torch.nn.functional as F
import sounddevice as sd
from scipy.io.wavfile import write
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.preprocessing import StandardScaler

Feel free to use this notebook to play around: record your own voice, or try different audio files, and see which character you sound like!

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained wav2vec2 processor and encoder used to turn audio into embeddings.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)

# Every entry of the data directory whose name does not contain '.wav' is
# treated as a character label; predict_character indexes this list with the
# predicted class id.
# NOTE(review): os.listdir returns entries in arbitrary order and also lists
# hidden entries, so this list presumably has to match the one used when the
# classifier was trained — confirm against the training notebook.
data_dir = "characters"
char_folder = [entry for entry in os.listdir(data_dir) if '.wav' not in entry]

In [None]:
# Defining a neural network for voice classification
class VoiceClassifier(nn.Module):
    """Feed-forward classifier that maps a feature vector to class scores.

    Three hidden stages of widths 512, 256 and 128, each made of
    Linear -> BatchNorm1d -> LeakyReLU -> Dropout, are followed by a final
    linear layer with one output per class.

    Args:
        input_dim: Number of features in each input vector.
        num_classes: Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()

        # Hidden stage 1: input_dim -> 512 units with batch normalisation.
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)

        # Hidden stage 2: 512 -> 256 units with batch normalisation.
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)

        # Hidden stage 3: 256 -> 128 units with batch normalisation.
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        # Output layer: one score per class.
        self.fc4 = nn.Linear(128, num_classes)

        # A single dropout module (p=0.5) is reused after every hidden stage
        # to reduce overfitting; it is a no-op in eval mode.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Run a batch of feature vectors through the network.

        Returns the raw output of the last linear layer; no softmax is applied.
        """
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, batch_norm in hidden_stages:
            x = self.dropout(F.leaky_relu(batch_norm(linear(x))))
        return self.fc4(x)

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# Only load artefacts that you created yourself or otherwise trust.

# Classifier used by predict_character.
# NOTE(review): presumably a pickled VoiceClassifier instance, in which case
# the class definition must be executed before this cell — confirm.
with open('vc_nnmodel.pkl', 'rb') as model_file:
    vc_model = pickle.load(model_file)

# Scaler applied to each embedding before it is fed to the classifier.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# Move the classifier to the selected device and switch it to inference mode
# (dropout disabled, batch norm using its running statistics). Both calls
# return the module itself, so the cell still displays the model summary.
vc_model.to(device).eval()

In [None]:
def record_audio(filename, duration, fs=16000):
    """Record single-channel audio with sounddevice and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        fs: Sampling rate in Hz used for both recording and the saved file.

    Raises:
        ValueError: If ``duration * fs`` does not amount to at least one
            audio frame (for example a zero or negative duration).
    """
    num_frames = int(duration * fs)
    # Fail early with a clear message instead of writing an empty file that
    # would only break later, when it is loaded for prediction.
    if num_frames <= 0:
        raise ValueError(
            f"Nothing to record: duration={duration!r} s at fs={fs!r} Hz "
            "gives no audio frames."
        )

    print("Recording...")
    recording = sd.rec(num_frames, samplerate=fs, channels=1)
    sd.wait()  # Wait until the recording is finished
    write(filename, fs, recording)
    print(f"Recording saved to {filename}")

In [None]:
def _extract_voice_embedding(audio_file):
    """Compute one wav2vec2 embedding vector for an audio file.

    The audio is loaded with torchaudio, reduced to a single channel,
    resampled to 16 kHz when necessary, normalised and passed through the
    module-level ``processor`` and ``model``. The model's last hidden state is
    averaged over dim 1 to give a single vector.

    Args:
        audio_file: Path of the audio file to embed.

    Returns:
        The averaged embedding as a NumPy array.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    if waveform.numel() == 0:
        raise ValueError(f"{audio_file} contains no audio samples")

    # Down-mix multi-channel audio (e.g. stereo files) to mono. Without this,
    # squeeze(0) below leaves a 2-D tensor and the processor/scaler receive
    # input of the wrong shape. Mono input is left untouched.
    if waveform.ndimension() == 2 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    waveform = F.normalize(waveform)
    if waveform.ndimension() == 2:
        waveform = waveform.squeeze(0)

    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    input_values = inputs['input_values'].to(device)

    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Collapse the per-frame embeddings into one vector for the whole clip.
    return torch.mean(embeddings, dim=1).squeeze().cpu().numpy()


def predict_character(audio_file):
    """Predict which character the voice in ``audio_file`` sounds most like.

    The file is embedded with wav2vec2, scaled with the loaded ``scaler`` and
    classified with ``vc_model``; the highest-scoring class index is looked up
    in ``char_folder``.

    Args:
        audio_file: Path of the audio file to classify. Multi-channel files
            are down-mixed to mono before embedding.

    Returns:
        The entry of ``char_folder`` at the predicted class index.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    vc_model.eval()

    # Extract the voice embedding and scale it with the fitted scaler.
    embedding = _extract_voice_embedding(audio_file)
    embedding_scaled = scaler.transform([embedding])

    # Convert the scaled embedding to a tensor on the classifier's device.
    embedding_tensor = torch.tensor(embedding_scaled, dtype=torch.float32).to(device)

    # Inference only, so no gradients are needed.
    with torch.no_grad():
        output = vc_model(embedding_tensor)
        _, pred = torch.max(output, 1)

    # NOTE(review): assumes char_folder is ordered like the class indices used
    # during training — confirm against the training code.
    return char_folder[pred.item()]

In [None]:
# Record 5 seconds of audio and save it as output.wav.
record_audio('output.wav', duration=5)

In [None]:
# Classify the audio in output.wav; the returned character label is shown as
# the cell output.
predict_character("output.wav")