## Voice Cloning
This is the first notebook in a series of three. In this notebook, data collected from the [TIMIT dataset](https://catalog.ldc.upenn.edu/LDC93S1) is used to create fake audio with Microsoft's SpeechT5 model. The generated audio is saved in separate files for future use in the audio classification in notebook three.

# Data Preparation

In [None]:
from google.colab import drive
# Attach Google Drive so the dataset folders under /content/drive are reachable
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


# SpeechT5 Pretrained Model

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
import os
import numpy as np
import torch
import soundfile as sf

# Initialize the SpeechT5 voice-conversion model, its processor and the HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
speech_to_speech_model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Paths to your directories
npy_dir = '/content/drive/MyDrive/MyTTSDataset_6/NPY'
input_dir = "/content/drive/MyDrive/MyTTSDataset_6/wav48/"
output_dir = "/content/drive/MyDrive/MyTTSDataset_6/converted"
os.makedirs(output_dir, exist_ok=True)

# Load speaker embeddings.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the embedding <-> audio pairing below reproducible from run to run.
all_speaker_embeddings = []
for file_name in sorted(os.listdir(npy_dir)):
    if file_name.endswith('.npy'):
        file_path = os.path.join(npy_dir, file_name)
        speaker_embeddings = np.load(file_path)
        # Add a leading batch dimension, as expected by generate_speech
        speaker_embeddings_tensor = torch.tensor(speaker_embeddings).unsqueeze(0)
        all_speaker_embeddings.append(speaker_embeddings_tensor)

# Process each audio file and generate speech using corresponding speaker embeddings.
# Filtering and sorting happen BEFORE pairing so that non-WAV directory entries
# cannot shift which embedding a given audio file receives.
wav_file_names = sorted(name for name in os.listdir(input_dir) if name.endswith(".WAV"))

# zip stops at the shorter list, i.e. audio files without an embedding are not converted
for file_name, speaker_embedding in zip(wav_file_names, all_speaker_embeddings):
    file_path = os.path.join(input_dir, file_name)
    speech_array, sampling_rate = sf.read(file_path)

    inputs = processor(audio=speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_values

    # Generating speech: generate_speech is the voice-conversion entry point of
    # SpeechT5ForSpeechToSpeech; passing the vocoder makes it return a waveform
    # tensor directly instead of a spectrogram.
    generated_speech = speech_to_speech_model.generate_speech(inputs, speaker_embedding, vocoder=vocoder)

    output_file_path = os.path.join(output_dir, f"converted_{file_name}")
    sf.write(output_file_path, generated_speech.numpy(), samplerate=16000)

skipped_count = len(wav_file_names) - len(all_speaker_embeddings)
if skipped_count > 0:
    print(f"Skipped {skipped_count} audio file(s) without a speaker embedding.")


# Initializing the SpeechT5 models

In [None]:
# The processor and the speech-to-speech model are loaded from the same
# voice-conversion checkpoint; the vocoder comes from its own checkpoint.
vc_checkpoint = "microsoft/speecht5_vc"
processor = SpeechT5Processor.from_pretrained(vc_checkpoint)
speech_to_speech_model = SpeechT5ForSpeechToSpeech.from_pretrained(vc_checkpoint)

vocoder_checkpoint = "microsoft/speecht5_hifigan"
vocoder = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint)

# Paths to your directories

In [None]:
# Locations on the mounted drive: stored embeddings, source audio, converted output
npy_dir, input_dir, output_dir = (
    '/content/drive/MyDrive/MyTTSDataset_6/NPY',
    "/content/drive/MyDrive/MyTTSDataset_6/wav48/",
    "/content/drive/MyDrive/MyTTSDataset_6/converted",
)

# Create the output folder up front; an already existing folder is left untouched
os.makedirs(output_dir, exist_ok=True)

# Creating Dataset using Huggingface's datasets module

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow, dill, multiprocess, datasets
  Attempting uninstall: pyarrow
    Found exis

In [None]:
from datasets import load_dataset

# Build a dataset from the audio files in the wav48 folder via the generic
# "audiofolder" loader
audio_folder = "/content/drive/MyDrive/MyTTSDataset_6/wav48"
dataset = load_dataset("audiofolder", data_dir=audio_folder)

Resolving data files:   0%|          | 0/425 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
!pip install Audio

Collecting Audio
  Downloading audio-1.5.0.tar.gz (2.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting audio.bitstream (from Audio)
  Downloading audio.bitstream-2.5.4.tar.gz (1.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting audio.coders (from Audio)
  Downloading audio.coders-5.0.2.tar.gz (931 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m931.5/931.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting audio.filters (from Audio)
  Downloading audio.filters-0.2.2.tar.gz (541 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.1/541.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a su

In [None]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-0.5.16-py3-none-any.whl (630 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m630.6/630.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.7/526.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ruamel.yaml.clib, ruamel.yaml, hyperpyyaml, speechbrain
Successfully i

# Create speaker embeddings

In [None]:
import os
import glob
import numpy as np
import torchaudio
from speechbrain.pretrained import EncoderClassifier
import torch
from tqdm.notebook import tqdm
import torch.nn.functional as F

# Supported speaker-encoder checkpoints, each mapped to the length of the
# embedding vector it produces
spk_model = dict(
    [
        ("speechbrain/spkrec-xvect-voxceleb", 512),
        ("speechbrain/spkrec-ecapa-voxceleb", 192),
    ]
)

# Function to extract embeddings
def f2embed(wav_file, classifier, size_embed):
    """Extract an L2-normalised speaker embedding from a single audio file.

    Args:
        wav_file: Path of the audio file; it is read with ``torchaudio.load``.
        classifier: Object exposing ``encode_batch(signal)`` (in this notebook a
            SpeechBrain ``EncoderClassifier``).
        size_embed: Expected length of the embedding vector.

    Returns:
        The embedding as a NumPy array whose first dimension equals ``size_embed``.

    Raises:
        AssertionError: If the file is not sampled at 16 kHz, or if the
            embedding does not have the expected size.
    """
    signal, fs = torchaudio.load(wav_file)
    assert fs == 16000, "The sampling rate is expected to be 16kHz."

    # torchaudio returns (channels, time) and encode_batch treats the first
    # dimension as the batch. A multi-channel file would therefore yield one
    # embedding per channel and trip the size check below, so it is downmixed
    # to a single channel first. Mono input is passed through unchanged.
    if signal.dim() == 2 and signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        # Scale the embedding to unit length along its feature axis
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, f"Expected embedding size {size_embed}, but got {embeddings.shape[0]}"
    return embeddings

# --- Settings to adapt: source audio folder, embedding output folder, encoder ---
arctic_root = '/content/drive/MyDrive/MyTTSDataset_6/wav48'
output_root = '/content/drive/MyDrive/MyTTSDataset_6/speaker_embeddings_2'
speaker_embed = "speechbrain/spkrec-xvect-voxceleb"

# Make sure there is a folder to write the embeddings into
os.makedirs(output_root, exist_ok=True)

# Prefer the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained speaker encoder; its files are stored under /tmp
model_cache_dir = os.path.join('/tmp', speaker_embed)
classifier = EncoderClassifier.from_hparams(
    source=speaker_embed,
    run_opts={"device": device},
    savedir=model_cache_dir,
)

# Embedding length expected from the selected encoder
size_embed = spk_model[speaker_embed]

# Gather every .WAV file of the source folder
wav_pattern = os.path.join(arctic_root, "*.WAV")
wavlst = glob.glob(wav_pattern)
print(f"Found {len(wavlst)} utterances.")

# Compute one embedding per utterance and store it as <utterance id>.npy
for wav_path in tqdm(wavlst, total=len(wavlst), desc="Extract"):
    utterance_name = os.path.basename(wav_path).replace(".WAV", "")
    embedding_path = os.path.join(output_root, f"{utterance_name}.npy")
    np.save(embedding_path, f2embed(wav_path, classifier, size_embed))

print("Finished extracting and saving embeddings for all speakers.")


Found 425 utterances.


Extract:   0%|          | 0/425 [00:00<?, ?it/s]

Finished extracting and saving embeddings for all speakers.


# Generating speech using corresponding speaker embeddings

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import torch
import soundfile as sf
import os

# Audio is only needed for the cast below, so it is imported next to its use
from datasets import Audio

# Load the dataset
dataset = load_dataset("/content/drive/MyDrive/MyTTSDataset_6/wav48", "default", split="train")
# A freshly loaded audio folder does not fix a sampling rate on its Audio
# feature, so features["audio"].sampling_rate would be None and the processor
# could not validate the rate. Casting the column pins every decoded clip to
# 16 kHz, the same rate the generated files are written with below.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate

# Initialize the models
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Directory for saving the generated speeches
output_dir = "/content/drive/MyDrive/MyTTSDataset_6/generated_speeches"
os.makedirs(output_dir, exist_ok=True)

# Directory containing speaker embeddings
speaker_embedding_dir = "/content/drive/MyDrive/MyTTSDataset_6/speaker_embeddings_2"

# Get a list of all speaker embedding files.
# os.listdir order is arbitrary, so the list is sorted, and an index by file
# stem allows each utterance to be matched with its own embedding by name.
speaker_embedding_files = sorted(f for f in os.listdir(speaker_embedding_dir) if f.endswith('.npy'))
embedding_file_by_utterance = {os.path.splitext(f)[0]: f for f in speaker_embedding_files}

# Loop through the dataset and corresponding speaker embeddings
for i, item in enumerate(dataset):
    audio = item["audio"]
    audio_path = audio["path"]

    if audio_path:
        # Match by utterance name: the embedding files are named after the audio files
        utterance_id = os.path.splitext(os.path.basename(audio_path))[0]
        embedding_name = embedding_file_by_utterance.get(utterance_id)
    elif i < len(speaker_embedding_files):
        # No source path available: fall back to the position in the sorted list
        embedding_name = speaker_embedding_files[i]
    else:
        embedding_name = None

    # Ensure there's a corresponding speaker embedding file
    if embedding_name is None:
        print(f"No corresponding speaker embedding file for dataset item {i+1}. Skipping.")
        continue

    example_speech = audio["array"]
    inputs = processor(audio=example_speech, sampling_rate=sampling_rate, return_tensors="pt")

    # Load the corresponding speaker embedding and add a leading batch dimension
    speaker_embedding_file = os.path.join(speaker_embedding_dir, embedding_name)
    speaker_embeddings = np.load(speaker_embedding_file)
    speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

    # Generate speech; passing the vocoder yields a waveform rather than a spectrogram
    speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)

    # Save the generated speech
    output_file_path = os.path.join(output_dir, f"generated_speech_{i+1}.wav")
    sf.write(output_file_path, speech.numpy(), samplerate=16000)

print("Finished generating speeches for all available speaker embeddings.")

