In [6]:
import os
import glob
import numpy
import argparse
import torchaudio
from speechbrain.pretrained import EncoderClassifier

import torch
from tqdm import tqdm
import torch.nn.functional as F

In [8]:
# SpeechBrain speaker-embedding checkpoints, keyed by model identifier.
# NOTE(review): the values look like each model's embedding length (f2embed
# compares its output length against a `size_embed` argument) — confirm.
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512, 
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}

def f2embed(wav_file, classifier, size_embed):
    """Extract one L2-normalised speaker embedding from a 16 kHz audio file.

    Args:
        wav_file: Path to an audio file readable by ``torchaudio.load``.
        classifier: Object exposing ``encode_batch(signal)``, e.g. a SpeechBrain
            ``EncoderClassifier``.
        size_embed: Expected length of the returned embedding vector.

    Returns:
        The embedding as a NumPy array with all singleton dimensions removed.

    Raises:
        AssertionError: If the file's sample rate is not 16000 Hz, or if the
            first dimension of the embedding differs from ``size_embed``.
    """
    signal, fs = torchaudio.load(wav_file)
    assert fs == 16000, fs
    # torchaudio yields (channels, time) by default. Passing a multi-channel
    # tensor straight to encode_batch would treat every channel as a separate
    # batch item and produce one embedding per channel, so downmix to mono.
    if signal.dim() > 1 and signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
    return embeddings


In [10]:
### CREATE XVECTOR

# Parameters (normally passed as args)
single_wav_path = 'FEMALE VOICE.wav'  # path to the target speaker's .wav file
spkemb_root = '.'  # output directory for speaker embeddings
speaker_embed = 'speechbrain/spkrec-xvect-voxceleb'  # SpeechBrain model identifier used to load the classifier

# Ensure the output directory exists. makedirs also creates missing parent
# directories and, with exist_ok=True, tolerates the directory appearing
# between the check and the call.
if not os.path.exists(spkemb_root):
    print(f"Create speaker embedding directory: {spkemb_root}")
    os.makedirs(spkemb_root, exist_ok=True)

# Set up device and load model
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = EncoderClassifier.from_hparams(source=speaker_embed, run_opts={"device": device}, savedir=os.path.join('/tmp', speaker_embed))
# Look the embedding size up from the table of supported models instead of
# hard-coding it: 192 is the entry for the ECAPA model, not for the x-vector
# model selected above.
size_embed = spk_model[speaker_embed]

# Embedding extractor that relies on the model's own audio loader.
# NOTE: this redefines `f2embed`, replacing any earlier definition in the kernel.
def f2embed(file_path, model, emb_size):
    """Compute an L2-normalised speaker embedding for one audio file.

    Args:
        file_path: Path handed to ``model.load_audio``.
        model: Object exposing ``load_audio(path)`` and ``encode_batch(signal)``,
            e.g. a SpeechBrain ``EncoderClassifier``.
        emb_size: Currently unused; kept so existing call sites keep working.

    Returns:
        NumPy array holding ``encode_batch``'s output with its leading
        dimension squeezed (presumably shape ``(1, D)`` for a single
        utterance — verify against the model in use).
    """
    signal = model.load_audio(file_path)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embeddings = model.encode_batch(signal)
        # Normalise along the feature axis, as the torchaudio-based `f2embed`
        # defined earlier in this notebook does, so both produce unit-length
        # vectors.
        embeddings = F.normalize(embeddings, dim=-1)

    return embeddings.squeeze(0).detach().cpu().numpy()  # Move tensor to CPU and convert to NumPy array

# Processing the single WAV file
# splitext removes only the final extension, whatever it is; the previous
# str.replace(".wav", "") removed every ".wav" occurrence in the name and left
# other extensions in place.
utt_id = os.path.splitext(os.path.basename(single_wav_path))[0]
utt_emb = f2embed(single_wav_path, classifier, size_embed)
# numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)

# print(f"Processed and saved embeddings for {utt_id}")

hyperparams.yaml:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

Processed and saved embeddings for FEMALE VOICE


In [44]:
def create_target_xvector(single_wav_path, speaker_embed='speechbrain/spkrec-xvect-voxceleb', classifier=None):
    """Compute an L2-normalised speaker embedding for one audio file.

    The function is self-contained: it does not read notebook globals and does
    not create any output directory, because nothing is written to disk.

    Args:
        single_wav_path: Path to the target speaker's audio file; it is passed
            to ``classifier.load_audio``.
        speaker_embed: SpeechBrain model identifier used when ``classifier`` is
            not supplied.
        classifier: Optional already-loaded object exposing ``load_audio(path)``
            and ``encode_batch(signal)``. Supplying it avoids reloading the
            model on every call.

    Returns:
        NumPy array holding ``encode_batch``'s output with its leading
        dimension squeezed (presumably shape ``(1, D)`` for a single
        utterance — verify against the model in use).
    """
    if classifier is None:
        # Set up device and load model
        device = "cuda" if torch.cuda.is_available() else "cpu"
        classifier = EncoderClassifier.from_hparams(
            source=speaker_embed,
            run_opts={"device": device},
            savedir=os.path.join('/tmp', speaker_embed),
        )

    signal = classifier.load_audio(single_wav_path)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        # Unit-length vectors along the feature axis, consistent with the
        # torchaudio-based `f2embed` in this notebook.
        embeddings = F.normalize(embeddings, dim=-1)

    return embeddings.squeeze(0).detach().cpu().numpy()

In [15]:
### LOAD MODEL
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
from datasets import load_dataset

def load_voice_conversion_model():
    """Load the pretrained SpeechT5 voice-conversion model.

    Returns:
        The ``SpeechT5ForSpeechToSpeech`` instance loaded from the
        ``microsoft/speecht5_vc`` checkpoint.
    """
    # Only the model is returned, so nothing else (demo dataset, processor,
    # vocoder) is fetched here; that avoids downloads whose results would be
    # discarded.
    return SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")

In [53]:
### LOAD XVECTOR
import soundfile as sf

def execute_voice_conversion(model, source_array, target_speaker_embeddings, output_name, vocoder=None):
    """Convert source speech to the target voice and write it to an audio file.

    Args:
        model: Object exposing ``generate_speech(input_values, speaker_embeddings,
            vocoder=...)``, e.g. a ``SpeechT5ForSpeechToSpeech`` instance.
        source_array: Mapping with an ``"input_values"`` entry (the processor
            output for the source recording).
        target_speaker_embeddings: Speaker embedding as an array or tensor. A
            1-D vector is promoted to a batch of one.
        output_name: Path of the audio file to write.
        vocoder: Optional vocoder passed to ``generate_speech``. When omitted,
            the ``microsoft/speecht5_hifigan`` checkpoint is loaded, so the
            function no longer depends on a notebook-level ``vocoder`` variable.
    """
    if vocoder is None:
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # as_tensor avoids a copy (and a warning) when a tensor is passed in.
    speaker_embeddings = torch.as_tensor(target_speaker_embeddings, dtype=torch.float32)
    if speaker_embeddings.dim() == 1:
        speaker_embeddings = speaker_embeddings.unsqueeze(0)

    speech = model.generate_speech(source_array["input_values"], speaker_embeddings, vocoder=vocoder)

    # The output is written at a fixed 16 kHz sample rate.
    sf.write(output_name, speech.detach().cpu().numpy(), samplerate=16000)

In [20]:
# NOTE(review): stale call — `execute_voice_conversion` as defined in this notebook
# requires (model, source_array, target_speaker_embeddings, output_name), so this
# three-argument form raises TypeError when the notebook is re-run top to bottom.
execute_voice_conversion(load_voice_conversion_model(), utt_emb, "success.wav")

Some weights of SpeechT5ForSpeechToSpeech were not initialized from the model checkpoint at microsoft/speecht5_vc and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# NOTE(review): `example_speech` is not assigned by any visible cell, so this
# relies on leftover kernel state — presumably from a deleted cell.
example_speech

array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ])

In [24]:
import librosa

# Path to the source recording (an MP3 file here)
file_path = 'SOURCERECORDING.mp3'

# Load the audio file, requesting a 16 kHz sampling rate
audio, sr = librosa.load(file_path, sr=16000)  # sr=None would instead keep the file's original sampling rate

# audio is the numpy array representing the audio signal
# sr is the sampling rate of the audio

print("Audio array:", audio)
print("Sampling rate:", sr)

Audio array: [ 1.2149215e-24 -6.7208425e-25  2.8434334e-24 ... -9.6224318e-04
 -1.0187828e-03 -1.2711850e-03]
Sampling rate: 16000


In [26]:
# `processor` is not created at notebook level by any earlier cell, so build it
# here from the same checkpoint as the voice-conversion model.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
NEW_INPUT = processor(audio=audio, sampling_rate=16000, return_tensors="pt")

In [48]:
# `model` and `vocoder` are not defined at notebook level by any earlier cell;
# load them explicitly so this cell runs on a fresh kernel.
model = load_voice_conversion_model()
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

speaker_embeddings = torch.tensor(utt_emb)

speech = model.generate_speech(NEW_INPUT["input_values"], speaker_embeddings, vocoder=vocoder)
output_name = "TEST ME1.wav"
sf.write(output_name, speech.numpy(), samplerate=16000)

In [55]:
def generate_source_audio_array(file_path, processor=None):
    """Load a recording at 16 kHz and run it through the SpeechT5 processor.

    Args:
        file_path: Path to the source audio file, read with ``librosa.load``.
        processor: Optional callable accepting ``audio=``, ``sampling_rate=`` and
            ``return_tensors=``. When omitted, the ``microsoft/speecht5_vc``
            processor is loaded, so the function no longer depends on a
            notebook-level ``processor`` variable.

    Returns:
        The processor output for the recording (callers read its
        ``"input_values"`` entry).
    """
    if processor is None:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")

    # Load the audio file, requesting a 16 kHz sampling rate
    # (sr=None would instead keep the file's original rate).
    audio, sr = librosa.load(file_path, sr=16000)

    # audio is the numpy array representing the audio signal
    # sr is the sampling rate of the audio
    print("Audio array:", audio)
    print("Sampling rate:", sr)

    # Pass the rate librosa reported rather than repeating the literal.
    return processor(audio=audio, sampling_rate=sr, return_tensors="pt")

In [56]:
# End-to-end run: load the model, prepare the source recording, embed the
# target voice, then write the converted audio.
execute_voice_conversion(
    model=load_voice_conversion_model(),
    source_array=generate_source_audio_array("SOURCERECORDING.mp3"),
    target_speaker_embeddings=create_target_xvector("FEMALE VOICE.wav"),
    output_name="YEEEE.wav",
)

Some weights of SpeechT5ForSpeechToSpeech were not initialized from the model checkpoint at microsoft/speecht5_vc and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Audio array: [ 1.2149215e-24 -6.7208425e-25  2.8434334e-24 ... -9.6224318e-04
 -1.0187828e-03 -1.2711850e-03]
Sampling rate: 16000


In [42]:
# Inspect the 'input_values' entry of the processed source recording.
generate_source_audio_array("SOURCERECORDING.mp3")['input_values']

Audio array: [ 1.2149215e-24 -6.7208425e-25  2.8434334e-24 ... -9.6224318e-04
 -1.0187828e-03 -1.2711850e-03]
Sampling rate: 16000


tensor([[ 1.2149e-24, -6.7208e-25,  2.8434e-24,  ..., -9.6224e-04,
         -1.0188e-03, -1.2712e-03]])