In [5]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch 
import torchaudio
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Before running this notebook, create your **environment** from the **environment.yml** file provided in the GitHub repository. That file lists all the packages this notebook needs.

Additionally, if you do not have the character folders produced by our preprocessing step, see our preprocessing notebook or download them from the following link: [Characters.zip](https://drive.google.com/file/d/1q3AdK38yMUIf4CRcbGazdCUVDl8n2RNB/view?usp=sharing)

# Text Processing
Here, we feed our voice lines through wav2vec2 and convert them into embeddings that we can use for training!

In [None]:
# torchaudio needs an audio backend installed before it can load .wav files:
#   - on macOS:  pip install sox
#   - otherwise: pip install PySoundFile

In [None]:
# Show which audio backends torchaudio can find in this environment.
torchaudio.list_audio_backends()

In [19]:
# Preprocessing is much faster on a GPU: use CUDA when it is available and
# fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [21]:
# Load the pretrained wav2vec2 base checkpoint: the processor that prepares raw
# audio for the network, and the network itself.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
# keep the model on the same device the inputs will be moved to
model = model.to(device)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def extract_voice_embeddings(audio_file):
    """Turn one audio file into a fixed-size wav2vec2 voice embedding.

    The clip is loaded with torchaudio, resampled to 16 kHz when necessary,
    downmixed to a single channel, run through the module-level ``processor``
    and ``model``, and the model's last hidden state is averaged over time.

    Parameters
    ----------
    audio_file : str or path-like
        Path to an audio file that ``torchaudio.load`` can read.

    Returns
    -------
    numpy.ndarray
        1-D array containing the time-averaged last hidden state of the clip.
    """
    target_sample_rate = 16000

    waveform, sample_rate = torchaudio.load(audio_file)

    # resample the wav file to 16000
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    # Downmix multi-channel audio to mono by averaging the channels.
    # squeeze(0) alone only removes a channel axis of size 1, so a stereo clip
    # would otherwise stay 2-D and yield a wrongly shaped embedding.
    if waveform.ndimension() == 2 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # L2-normalise along dim=1 (the F.normalize default). Kept exactly as before
    # for single-channel input so new embeddings stay comparable with features
    # that were extracted earlier.
    waveform = F.normalize(waveform)

    # drop the (now size-1) channel axis so the processor receives one 1-D signal
    if waveform.ndimension() == 2:
        waveform = waveform.squeeze(0)

    # process the waveform into model inputs (handed over as a NumPy array)
    inputs = processor(waveform.numpy(), sampling_rate=sample_rate, return_tensors="pt", padding=True)
    input_values = inputs['input_values'].to(device)

    # inference only: no gradients needed
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # average over dim=1 and drop the batch axis (a single clip is processed)
    voice_embedding = torch.mean(embeddings, dim=1).squeeze(0).cpu().numpy()
    return voice_embedding

In [None]:
# Quick check on a single clip; the file must exist at this relative path.
extract_voice_embeddings("data/characters/Albedo/0_audio.wav")

In [2]:
# Root folder that is scanned for per-character sub-folders of .wav clips.
data_dir = "data/characters"
# Parallel accumulators filled by the processing loop: one embedding and one
# character label per clip.
embeddings, labels = [], []

In [3]:
# get the character folders: keep only real sub-directories of data_dir (stray
# files are skipped), sorted so the processing order is reproducible
char_folder = sorted(
    name
    for name in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, name))
)
char_folder

['Albedo',
 'Alhaitham',
 'Aloy',
 'Amber',
 'Arataki Itto',
 'Baizhu',
 'Barbara',
 'Beidou',
 'Bennett',
 'Candace',
 'Charlotte',
 'Childe',
 'Chongyun',
 'Clorinde',
 'Collei',
 'Cyno',
 'Dehya',
 'Diluc',
 'Diona',
 'Dori',
 'Ei',
 'Eula',
 'Faruzan',
 'Fischl',
 'Freminet',
 'Furina',
 'Ganyu',
 'Gorou',
 'Hu Tao',
 'Jean',
 'Kaede',
 'Kaedehara Kazuha',
 'Kaeya',
 'Kamisato Ayaka',
 'Kamisato Ayato',
 'Kaveh',
 'Kazuha',
 'Keqing',
 'Kirara',
 'Klee',
 'Kujou Sara',
 'Kuki Shinobu',
 'Layla',
 'Lisa',
 'Lynette',
 'Lyney',
 'Mika',
 'Mona',
 'Nahida',
 'Navia',
 'Neuvillette',
 'Nilou',
 'Ningguang',
 'Noelle',
 'Paimon',
 'Qiqi',
 'Raiden Shogun',
 'Razor',
 'Rosaria',
 'Sangonomiya Kokomi',
 'Sayu',
 'Shenhe',
 'Shikanoin Heizou',
 'Sucrose',
 'Tartaglia',
 'Thoma',
 'Tighnari',
 'Traveler',
 'Venti',
 'Wanderer',
 'Wriothesley',
 'Xiangling',
 'Xiao',
 'Xingqiu',
 'Xinyan',
 'Yae Miko',
 'Yanfei',
 'Yaoyao',
 'Yelan',
 'Yoimiya',
 'Yun Jin',
 'Zhongli']

In [4]:
# Number of character folders found.
len(char_folder)

82

In [None]:
# Start from empty accumulators so that re-running this cell rebuilds the
# dataset instead of appending duplicate rows to the previous run.
embeddings = []
labels = []

for character in char_folder:
    character_dir = os.path.join(data_dir, character)
    print(f"Currently on Character: {character}")
    if not os.path.isdir(character_dir):
        continue
    # sorted() keeps the row order of the resulting arrays deterministic
    for file_name in sorted(os.listdir(character_dir)):
        if not file_name.endswith(".wav"):
            continue
        file_path = os.path.join(character_dir, file_name)
        # create embedding for each wav file
        embeddings.append(extract_voice_embeddings(file_path))
        # assign labels aka characters to each one
        labels.append(character)

In [31]:
# Convert the accumulated lists to arrays: row i of X is the embedding whose
# character label is y[i].
X = np.array(embeddings)
y = np.array(labels)

In [None]:
# Save the feature matrix as comma-separated text so training can start from
# this file without re-running the embedding extraction.
np.savetxt("X.csv", X, delimiter=",")

In [None]:
# Save the labels one per line; fmt="%s" writes each entry as a plain string.
np.savetxt("y.csv", y, fmt="%s")

# Training
If you want to skip processing the data, start by extracting the processed data from the respective csv files.

In [6]:
# if you don't want to set up CUDA and process the audio again,
# use this https://drive.google.com/file/d/1aMqL2mr9FmrDFtpVe6CoIwlpG33ZJ-XN/view?usp=sharing
# Load the saved feature matrix back from X.csv.
X = np.genfromtxt("X.csv", delimiter=",")
X

array([[-0.05574915,  0.10306576,  0.17027552, ...,  0.13353802,
         0.41493431, -0.33430371],
       [ 0.04035579,  0.13937694,  0.04135553, ...,  0.16306001,
         0.43476447, -0.32703039],
       [-0.01938415,  0.12405293,  0.14782554, ...,  0.19527647,
         0.42044291, -0.43925557],
       ...,
       [-0.08063999,  0.02471673,  0.24933422, ...,  0.1882022 ,
         0.34023446, -0.30718702],
       [-0.09640292,  0.05205383,  0.25569594, ...,  0.14748293,
         0.38074917, -0.33967635],
       [-0.0697066 ,  0.0569925 ,  0.1762238 , ...,  0.1987751 ,
         0.37263355, -0.36635476]])

In [7]:
# y.csv has no header row; take its first column as the array of character names.
y = np.array(pd.read_csv("y.csv", header=None).loc[:,0])
y

array(['Albedo', 'Albedo', 'Albedo', ..., 'Zhongli', 'Zhongli', 'Zhongli'],
      dtype=object)

In [8]:
# Learn the set of character names, then map every name to its integer class id.
# The fitted encoder is kept so predictions can be turned back into names later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
y_encoded

array([ 0,  0,  0, ..., 81, 81, 81])

In [9]:
# Hold out 20% of the clips for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
# NOTE(review): the split is not stratified, so per-character proportions may
# differ between train and test — consider stratify=y_encoded; TODO confirm
# every character has enough clips for that.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [10]:
# Fit a logistic-regression classifier on the training embeddings, allowing
# the solver up to 500 iterations.
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

In [11]:
# Predict the encoded class id for every held-out clip.
y_pred = clf.predict(X_test)
y_pred

array([77, 64, 72, ..., 46, 10, 69])

In [12]:
# Fraction of held-out clips whose predicted character matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.78


In [13]:
def predict_character(audio_file):
    """Predict which character an audio file sounds like.

    The file is embedded with ``extract_voice_embeddings``, classified with the
    fitted ``clf``, and the predicted class id is mapped back to a character
    name through ``label_encoder``.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to classify.

    Returns
    -------
    The character label predicted for the clip.
    """
    # the classifier expects a 2-D (n_samples, n_features) input: one row here
    features = extract_voice_embeddings(audio_file).reshape(1, -1)
    predicted_ids = clf.predict(features)
    return label_encoder.inverse_transform(predicted_ids)[0]

In [22]:
# Try the classifier on one clip. This file lives in the dataset folder that
# was scanned to build X, so it may have been part of the training split.
predict_character("data/characters/Yun Jin/0_audio.wav")

  return F.conv1d(input, weight, bias, self.stride,
  attn_output = torch.nn.functional.scaled_dot_product_attention(


'Yun Jin'

# Uploading your own sound

In [23]:
import sounddevice as sd
from scipy.io.wavfile import write

In [24]:
def record_audio(filename, duration, fs=16000):
    """Record audio with sounddevice and save it as a WAV file.

    Parameters
    ----------
    filename : str
        Path of the WAV file to write.
    duration : int or float
        Length of the recording in seconds.
    fs : int, optional
        Sample rate in Hz (default 16000).
    """
    print("Recording...")
    # number of samples to capture for the requested duration
    n_frames = int(duration * fs)
    # record a single (mono) channel at the requested sample rate
    audio = sd.rec(n_frames, samplerate=fs, channels=1)
    # block until the recording has finished before writing it out
    sd.wait()
    write(filename, fs, audio)
    print(f"Recording saved to {filename}")

In [27]:
# start speaking into your microphone!
# Records 5 seconds and saves them to output.wav.
record_audio('output.wav', duration=5)


Recording...
Recording saved to output.wav


In [26]:
# Classify the clip recorded in the previous step.
predict_character("output.wav")

  return F.conv1d(input, weight, bias, self.stride,


'Razor'