In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Before running this notebook, create your **environment** from the **environment.yml** file provided in the GitHub repository. That file lists all the packages this notebook needs.

Additionally, if you do not have the characters folders from our preprocessing step, look at our preprocessing notebook or download them from the following link: [Characters.zip](https://drive.google.com/file/d/1q3AdK38yMUIf4CRcbGazdCUVDl8n2RNB/view?usp=sharing)

To perform a text-independent speaker identification (SID) task, our model receives a speech sample X and, through a neural network or other model, determines the speaker of that sample. This notebook processes the wav files for each character, trains a **Logistic Regression** model, and lets you upload your own voice to see which character you sound like.

# Text Processing / Feature Engineering
Here, we feed our voice lines through wav2vec2 and convert them into voice embeddings that we can use for training!

In [None]:
# if you are using mac, pip install sox
# otherwise, pip install PySoundFile

In [None]:
# Show which audio I/O backends torchaudio can use in this environment.
# NOTE(review): presumably an empty list means one of the packages from the
# previous cell still has to be installed -- confirm.
torchaudio.list_audio_backends()

In [None]:
# Run Wav2Vec2 on the GPU when CUDA is available (much faster preprocessing);
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Load the pretrained Wav2Vec2 base checkpoint: the processor prepares raw
# waveforms for the network, the model produces the hidden states.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
# Move the weights to the device selected above (GPU if available).
model = model.to(device)


In [None]:
"""
Wav2Vec2 takes a waveform directly and outputs higher quality features or 
latent representations according to Neural Speech Recognition Lecture. 
Here, we are using Wav2Vec2 to go from acoustics to tensors using the 
processor to voicing embedding by transformers. Note that in class, 
we used Wav2Vec2 to go from waves to words, but here we go from waves 
to voice embeddings! This high dimensional embedding captures the speaker 
identity (tone, pitch, and accent), prosody, and phonetic content. 
"""

def extract_voice_embeddings(audio_file):
    """Turn one audio file into a single fixed-size Wav2Vec2 voice embedding.

    The clip is loaded, resampled to 16 kHz if necessary, down-mixed to mono,
    run through the module-level ``processor`` and ``model``, and the
    per-frame hidden states are averaged over time.

    Args:
        audio_file: Path to the audio file, as accepted by ``torchaudio.load``.

    Returns:
        A NumPy array holding the time-averaged ``last_hidden_state`` of the
        model for this clip (one vector per clip).
    """
    target_rate = 16000
    waveform, sample_rate = torchaudio.load(audio_file)

    # Wav2Vec2 was pretrained on 16 kHz audio, so every clip is brought to that
    # rate. By the Nyquist theorem a signal sampled at 16 kHz can only represent
    # frequencies up to 8 kHz (half the sampling rate); anything above that is
    # lost in the resample, but most of the energy of speech lies below it.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)
        sample_rate = target_rate

    # torchaudio.load returns (channels, samples). Wav2Vec2 expects mono, and
    # squeeze(0) alone leaves a 2-channel tensor untouched, so multi-channel
    # audio is averaged into one channel first. Mono input is not modified.
    if waveform.ndimension() == 2 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # L2-normalise the samples so recording loudness does not matter
    waveform = F.normalize(waveform)
    # drop the (now size-1) channel axis -> 1-D waveform
    if waveform.ndimension() == 2:
        waveform = waveform.squeeze(0)

    # process the waveform into model inputs
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    input_values = inputs['input_values'].to(device)

    # inference only: no gradients needed
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # average over the time axis to get one vector for the whole clip
    voice_embedding = torch.mean(embeddings, dim=1).squeeze().cpu().numpy()
    return voice_embedding

In [None]:
# Sanity check: embed a single clip and display the resulting vector.
extract_voice_embeddings("data/characters/Albedo/0_audio.wav")

In [None]:
# Root folder containing one sub-folder of .wav clips per character
data_dir = "data/characters"
# Filled in parallel by the processing loop: embeddings[i] belongs to labels[i]
embeddings, labels = [], []

In [None]:
# Collect the character folders (one sub-directory of data_dir per character).
# Only real directories are kept, so stray files such as .DS_Store are not
# counted as characters, and the names are sorted because os.listdir returns
# entries in arbitrary order.
char_folder = sorted(
    entry
    for entry in os.listdir(data_dir)
    if '.wav' not in entry and os.path.isdir(os.path.join(data_dir, entry))
)
char_folder

In [None]:
# Number of entries collected in char_folder
len(char_folder)

In [None]:
# Start from empty lists so that re-running this cell does not append every
# clip a second time (the lists themselves are created in an earlier cell).
embeddings.clear()
labels.clear()

# os.listdir returns entries in arbitrary order; iterating in sorted order
# keeps the row order of the data -- and therefore the seeded train/test
# split further down -- the same on every machine.
for character in sorted(char_folder):
    character_dir = os.path.join(data_dir, character)
    if not os.path.isdir(character_dir):
        continue  # stray file sitting next to the character folders
    print(f"Currently on Character: {character}")
    for file_name in sorted(os.listdir(character_dir)):
        file_path = os.path.join(character_dir, file_name)
        if not file_path.endswith(".wav"):
            continue
        # one embedding per wav file, labelled with the character it came from
        embeddings.append(extract_voice_embeddings(file_path))
        labels.append(character)

In [None]:
# Convert the collected per-clip embeddings and character names to NumPy
# arrays: X holds the features, y the matching labels (same order).
X = np.asarray(embeddings)
y = np.asarray(labels)

In [None]:
# Cache the embedding matrix as comma-separated text so the slow embedding
# step does not have to be repeated (the labels are saved in the next cell).
np.savetxt("X.csv", X, delimiter=",")

In [None]:
# Save the character labels as plain strings, in the same order as the rows of X.
np.savetxt("y.csv", y, fmt="%s")

# Training
If you want to skip processing the data, start by extracting the processed data from the respective csv files.

In [None]:
# If you don't want to set up CUDA and process the audio again, download the
# precomputed data from
# https://drive.google.com/file/d/1aMqL2mr9FmrDFtpVe6CoIwlpG33ZJ-XN/view?usp=sharing
# and load the cached embedding matrix from X.csv.
X = np.genfromtxt("X.csv", delimiter=",")
X

In [None]:
# Read the labels back: y.csv has no header row, so the names are in the
# first (positional) column.
y = np.array(pd.read_csv("y.csv", header=None).iloc[:, 0])
y

In [None]:
# Encode each distinct character name as an integer class id. The fitted
# encoder is kept so predicted ids can be mapped back to names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded

In [None]:
# Hold out 20% of the clips for testing; random_state fixes the shuffle so the
# split is repeatable for the same X / y_encoded.
# NOTE(review): the split is not stratified, so characters with few clips may
# be under-represented in the test set -- consider stratify=y_encoded.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Fit a logistic-regression classifier on the training embeddings;
# max_iter=500 is the upper bound on solver iterations.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)
clf

In [None]:
# Predict the encoded character label for every held-out clip.
y_pred = clf.predict(X_test)
y_pred

In [None]:
# Share of held-out clips whose character was predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
def predict_character(audio_file):
    """Predict which character the voice in ``audio_file`` sounds like.

    The clip is embedded with ``extract_voice_embeddings``, classified with
    the trained ``clf``, and the predicted class id is translated back to a
    character name with ``label_encoder``.

    Args:
        audio_file: Path to the audio clip to classify.

    Returns:
        The predicted character name.
    """
    # the classifier expects a 2-D batch, so present the embedding as one row
    features = extract_voice_embeddings(audio_file).reshape(1, -1)
    predicted_ids = clf.predict(features)
    # numeric class id -> character name
    names = label_encoder.inverse_transform(predicted_ids)
    return names[0]

In [None]:
# Try the classifier on one clip.
# NOTE(review): this file lives under data/characters, so it may have been
# part of the training split -- not an independent test.
predict_character("data/characters/Yun Jin/0_audio.wav")

# Uploading your own sound

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write

In [None]:
def record_audio(filename, duration, fs=16000):
    """Record audio with sounddevice and save it as a WAV file.

    Args:
        filename: Destination path of the WAV file.
        duration: Length of the recording in seconds.
        fs: Sample rate in Hz (default 16000).
    """
    print("Recording...")
    frame_count = int(duration * fs)
    # capture a single (mono) channel at the requested sample rate
    samples = sd.rec(frame_count, samplerate=fs, channels=1)
    sd.wait()  # wait until the recording is finished
    write(filename, fs, samples)
    print(f"Recording saved to {filename}")

In [None]:
# Records 5 seconds into output.wav -- start speaking into your microphone
# as soon as this cell runs!
record_audio('output.wav', duration=5)


In [None]:
# A feminine voice like the one in feminine_voice.wav was identified as Razor,
# which is a very animal-like male character. This seems quite off, so
# this model is probably not the best for this task.
# NOTE(review): feminine_voice.wav is not created by this notebook; the
# recording made above is saved as output.wav.
predict_character("feminine_voice.wav")