<h1 align="center">Voice Cloning Prototype</h1>

In [1]:
%%capture

# Install packages that later cells import (pydub, kaggle, librosa, pesq) plus wget.
# The %%capture magic above must stay on the first line of the cell; it hides pip's output.
!pip install pydub wget kaggle librosa pesq

## Data Preparation

In [2]:
import os

# Set the path to the Kaggle API credentials file
kaggle_credentials_path = '/content/kaggle.json'

# Point the Kaggle client at the directory that holds the credentials file
os.environ['KAGGLE_CONFIG_DIR'] = os.path.dirname(kaggle_credentials_path)

# Restrict the credentials file to owner read/write. The path is interpolated from
# the variable (IPython expands {name} in shell escapes) so the chmod target can
# never drift away from the directory exported in KAGGLE_CONFIG_DIR.
!chmod 600 {kaggle_credentials_path}

In [3]:
import kaggle

# Download the VCTK Corpus dataset from Kaggle
kaggle_dataset = 'showmik50/vctk-dataset'
# NOTE: despite its ".zip" suffix this value is passed as `path`, and later cells read
# extracted files from underneath it (e.g. /content/VCTK-Corpus.zip/VCTK-Corpus/...),
# so it ends up being a directory name rather than an archive file.
output_path = '/content/VCTK-Corpus.zip'
kaggle.api.dataset_download_files(kaggle_dataset, path=output_path, unzip=True)

#### Information of Speakers

In [4]:
import pandas as pd

file_path = "/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corpus/speaker-info.txt"

# Labels for the five columns of the speaker table
columns = ["ID", "AGE", "GENDER", "ACCENTS", "REGION"]

# Parse the file: drop the header line, split every remaining line on whitespace,
# keep the first four tokens as they are and fold whatever follows into a single
# space-separated REGION value (it may be several words, or empty).
with open(file_path, "r") as info_file:
    next(info_file)  # the first line is a header, not data
    data = [
        tokens[:4] + [" ".join(tokens[4:])]
        for tokens in map(str.split, info_file)
    ]

# Build the table from the parsed rows
speaker_df = pd.DataFrame(data, columns=columns)

speaker_df

Unnamed: 0,ID,AGE,GENDER,ACCENTS,REGION
0,225,23,F,English,Southern England
1,226,22,M,English,Surrey
2,227,38,M,English,Cumbria
3,228,22,F,English,Southern England
4,229,23,F,English,Southern England
...,...,...,...,...,...
103,362,29,F,American,
104,363,22,M,Canadian,Toronto
105,364,23,M,Irish,Donegal
106,374,28,M,Australian,English


- We have a total of 108 speakers of different ages, genders, and accents.

In [5]:
# Entries of the transcript (txt) and recording (wav48) folders of the corpus
transcript_root = "/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corpus/txt"
recording_root = "/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corpus/wav48"
text, audio = os.listdir(transcript_root), os.listdir(recording_root)

# Show how many entries each folder has
(len(text), len(audio))

(108, 109)

In [6]:
# Print every recording folder that has no transcript folder of the same name
for speaker_folder in audio:
    if speaker_folder in text:
        continue
    print(speaker_folder)

p315


In [7]:
# Show the metadata row of the speaker whose ID is the string '315'
speaker_df.loc[speaker_df["ID"] == '315']

Unnamed: 0,ID,AGE,GENDER,ACCENTS,REGION
82,315,18,M,American,New England


- We can observe that for the speaker with ID '315', the text files are missing although his audio is present.

## Selecting 10 Speakers for building Voice Cloning Prototype

- As I have a limited amount of space in Colab memory, I will randomly select 10 speakers for building the model.

In [8]:
import os
import glob
import random

# Root folder holding one sub-directory of recordings per speaker
vctk_path = '/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corpus/wav48'

# Every speaker directory (names start with "p")
speaker_dirs = glob.glob(os.path.join(vctk_path, 'p*'))

# Ask for 10 speakers, capped at the number of directories actually found
num_speakers = min(10, len(speaker_dirs))

# Random draw without replacement
selected_speaker_dirs = random.sample(speaker_dirs, num_speakers)

# One (wav path, speaker id) pair per recording of the drawn speakers;
# the speaker id is the name of the speaker's directory
audio_files = [
    (wav_path, os.path.basename(chosen_dir))
    for chosen_dir in selected_speaker_dirs
    for wav_path in glob.glob(os.path.join(chosen_dir, '*.wav'))
]

sample_audio_df = pd.DataFrame(audio_files, columns=['audio_path', 'speaker_id'])

# Folder that holds the transcript (.txt) files, one sub-directory per speaker
txt_folder = '/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corpus/txt'

# Function to extract text from .txt files
def extract_text(txt_path):
    """Return the contents of the text file at ``txt_path`` with surrounding whitespace stripped."""
    with open(txt_path, 'r') as transcript_file:
        return transcript_file.read().strip()

# Attach the transcript of every recording as a new 'text' column.
# Not every speaker ships transcripts (p315 has a wav48 folder but no txt folder),
# so a missing .txt file yields None for that row instead of aborting the cell
# whenever such a speaker happens to be drawn.
transcripts = []
for audio_path, speaker_id in zip(sample_audio_df['audio_path'], sample_audio_df['speaker_id']):
    # The transcript shares the recording's base name: <txt_folder>/<speaker>/<name>.txt
    file_name = os.path.splitext(os.path.basename(audio_path))[0]
    txt_path = os.path.join(txt_folder, f'{speaker_id}/{file_name}.txt')
    try:
        transcripts.append(extract_text(txt_path))
    except FileNotFoundError:
        transcripts.append(None)

# A single column assignment replaces the per-row .loc writes and no longer
# rebinds the notebook-level name `text`.
sample_audio_df['text'] = transcripts


sample_audio_df

Unnamed: 0,audio_path,speaker_id,text
0,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p238,It has become a way of life.
1,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p238,"People look, but no one ever finds it."
2,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p238,His background is firmly in venture capital.
3,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p238,"Of course, on Tuesday, United were beaten desp..."
4,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p238,And they removed a Borderer.
...,...,...,...
4076,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p345,I feel sorry for anyone coming in here.
4077,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p345,It is sad and frustrating.
4078,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p345,It is a freedom.
4079,/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corp...,p345,I'm sure you all can relate to that.


In [9]:
# Distinct speaker ids in the sampled table, in order of first appearance
sample_audio_df["speaker_id"].unique().tolist()

['p238',
 'p361',
 'p243',
 'p249',
 'p343',
 'p340',
 'p298',
 'p261',
 'p227',
 'p345']

In [10]:
# Number of recordings per speaker, as a one-column frame named 'Audio_counts'
a = (
    sample_audio_df
    .groupby(["speaker_id"])["audio_path"]
    .count()
    .to_frame()
    .rename(columns={'audio_path': 'Audio_counts'})
)
a

Unnamed: 0_level_0,Audio_counts
speaker_id,Unnamed: 1_level_1
p227,389
p238,454
p243,393
p249,335
p261,474
p298,405
p340,423
p343,388
p345,397
p361,423


### Playing Audios from 5 random Speakers.

In [11]:
import random
from pydub import AudioSegment
from IPython.display import Audio

# Speakers present in the sampled table
unique_speaker_ids = sample_audio_df['speaker_id'].unique()

# Draw five distinct speakers
selected_speaker_ids = random.sample(unique_speaker_ids.tolist(), k=5)

# For each drawn speaker, pick one of their recordings at random
selected_audio_files = [
    [
        sample_audio_df[sample_audio_df['speaker_id'] == chosen_id]
        .sample(n=1)['audio_path']
        .values[0],
        chosen_id,
    ]
    for chosen_id in selected_speaker_ids
]

sample_df = pd.DataFrame(selected_audio_files, columns=['audio_path', 'speaker_id'])

# Re-export each pick as /content/<speaker>.wav and embed a player for it
for audio_path, speaker_id in zip(sample_df['audio_path'], sample_df['speaker_id']):
    exported_path = f'/content/{speaker_id}.wav'
    audio = AudioSegment.from_wav(audio_path)
    audio.export(exported_path, format='wav')
    print(f"Playing Audio of Speaker {speaker_id} : ")
    print()
    display(Audio(exported_path))
    print()
    print()

Output hidden; open in https://colab.research.google.com to view.

## TorToiSe TTS

In [12]:
%%capture
# %%capture is a cell magic and is only recognised on the first line of a cell,
# so it has to come before these comments.
# The scipy version packaged with Colab is not tolerant of misformatted WAV files;
# install the latest version, then clone and install Tortoise TTS.
!pip3 install -U scipy
!git clone https://github.com/jnordberg/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
!pip3 install transformers==4.19.0 einops==0.5.0 rotary_embedding_torch==0.1.5 unidecode==1.3.5
!python3 setup.py install

In [13]:
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
# The resulting `tts` object is the synthesizer that the generation cell calls
# through `tts.tts_with_preset(...)`.
tts = TextToSpeech()

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading autoregressive.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth...


100% (1716988501 of 1716988501) |########| Elapsed Time: 0:00:09 Time:  0:00:09


Done.
Downloading classifier.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/classifier.pth...


100% (60938957 of 60938957) |############| Elapsed Time: 0:00:00 Time:  0:00:00


Done.
Downloading clvp2.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clvp2.pth...


100% (975620731 of 975620731) |##########| Elapsed Time: 0:00:04 Time:  0:00:04


Done.
Downloading cvvp.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth...


100% (151223901 of 151223901) |##########| Elapsed Time: 0:00:00 Time:  0:00:00


Done.
Downloading diffusion_decoder.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth...


100% (1169472627 of 1169472627) |########| Elapsed Time: 0:00:06 Time:  0:00:06


Done.
Downloading vocoder.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth...


100% (391384715 of 391384715) |##########| Elapsed Time: 0:00:01 Time:  0:00:01


Done.
Downloading rlg_auto.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth...


100% (25193729 of 25193729) |############| Elapsed Time: 0:00:00 Time:  0:00:00


Done.
Downloading rlg_diffuser.pth from https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth...


100% (100715777 of 100715777) |##########| Elapsed Time: 0:00:00 Time:  0:00:00


Done.


### Data Preprocessing

In [14]:
import os
from pydub import AudioSegment

def convert_sample_rate(input_folder, output_folder, target_sample_rate=22050, num_files=25):
    """Resample a random subset of WAV files from one folder into another.

    Up to ``num_files`` files ending in ``.wav`` are drawn at random from
    ``input_folder``, resampled to ``target_sample_rate`` and written under the
    same file name to ``output_folder`` (created if it does not exist).
    Progress and errors are reported with ``print``; nothing is returned.

    Args:
        input_folder: Folder containing the source ``.wav`` files.
        output_folder: Folder that receives the converted copies.
        target_sample_rate: Frame rate, in Hz, of the exported files.
        num_files: Maximum number of files to convert. When the folder holds
            fewer files, all of them are converted.
    """
    import random  # imported here because this cell never imports `random` itself

    try:
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)
        candidates = [file for file in os.listdir(input_folder) if file.endswith(".wav")]
    except OSError as e:
        print(f"Error converting the files: {e}")
        return

    # random.sample raises ValueError when asked for more items than exist, which
    # previously meant a folder with fewer than 25 files converted nothing at all.
    sample_size = max(0, min(num_files, len(candidates)))
    if sample_size == 0:
        print(f"No .wav files selected from {input_folder}")
        return

    # Randomly select the audio files that will serve as voice samples
    wav_files = random.sample(candidates, sample_size)

    for file in wav_files:
        input_file_path = os.path.join(input_folder, file)
        output_file_path = os.path.join(output_folder, file)

        # Handle failures per file so one unreadable recording does not stop
        # the remaining conversions.
        try:
            audio = AudioSegment.from_file(input_file_path)
            audio = audio.set_frame_rate(target_sample_rate)
            audio.export(output_file_path, format="wav")
        except Exception as e:
            print(f"Error converting {file}: {e}")
            continue
        print(f"Conversion successful for {file}. New sample rate: {target_sample_rate} Hz")

# Speaker whose voice will be cloned; the same name is used for the voice folder.
CUSTOM_VOICE_NAME = "p248"
# Source recordings of that speaker inside the extracted corpus.
input_folder_path = f"/content/VCTK-Corpus.zip/VCTK-Corpus/VCTK-Corpus/wav48/{CUSTOM_VOICE_NAME}"
# Destination inside the cloned tortoise-tts checkout; the generation cell later
# loads this voice by name with load_voice(CUSTOM_VOICE_NAME).
custom_voice_folder = f"/content/tortoise-tts/tortoise/voices/{CUSTOM_VOICE_NAME}"
# Fill the voice folder with resampled copies of a random subset of the recordings.
convert_sample_rate(input_folder_path, custom_voice_folder)

Conversion successful for p248_376.wav. New sample rate: 22050 Hz
Conversion successful for p248_373.wav. New sample rate: 22050 Hz
Conversion successful for p248_297.wav. New sample rate: 22050 Hz
Conversion successful for p248_198.wav. New sample rate: 22050 Hz
Conversion successful for p248_040.wav. New sample rate: 22050 Hz
Conversion successful for p248_261.wav. New sample rate: 22050 Hz
Conversion successful for p248_300.wav. New sample rate: 22050 Hz
Conversion successful for p248_251.wav. New sample rate: 22050 Hz
Conversion successful for p248_309.wav. New sample rate: 22050 Hz
Conversion successful for p248_094.wav. New sample rate: 22050 Hz
Conversion successful for p248_223.wav. New sample rate: 22050 Hz
Conversion successful for p248_306.wav. New sample rate: 22050 Hz
Conversion successful for p248_055.wav. New sample rate: 22050 Hz
Conversion successful for p248_238.wav. New sample rate: 22050 Hz
Conversion successful for p248_143.wav. New sample rate: 22050 Hz
Conversion

- Here, we have selected speaker 248 to clone her voice, and we display some of her audio samples below.

#### Original Audio Samples of Speaker

In [15]:
# Play three randomly chosen converted samples of the selected speaker (CUSTOM_VOICE_NAME).
from IPython.display import Audio

wav_files = random.sample(
    [name for name in os.listdir(custom_voice_folder) if name.endswith(".wav")],
    3,
)
for sample_name in wav_files:
    sample_path = os.path.join(custom_voice_folder, sample_name)
    print(f"Speaker {CUSTOM_VOICE_NAME} Audio sample {sample_name}:")
    print()
    display(Audio(sample_path))
    print()
    print()

Speaker p248 Audio sample p248_314.wav:





Speaker p248 Audio sample p248_306.wav:





Speaker p248 Audio sample p248_170.wav:







### Defining Text and Preset option

In [16]:
# This is the text that will be spoken.
# NOTE(review): `text` was bound to other values earlier in the notebook (a directory
# listing, then transcripts); it is rebound here to the sentence to synthesize.
# NOTE(review): the bracketed "[I am really angry,]" looks like a Tortoise prompt cue
# rather than words meant to be spoken — confirm against the Tortoise documentation.
text = f"""Hello this is speaker {CUSTOM_VOICE_NAME}. I hope you learned something.
[I am really angry,] I really hate millennials, as one of them just killed my dog!."""

# Pick a "preset mode" to determine quality.
# Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}.
# 'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
# 'fast': Decent quality speech at a decent inference rate. A good choice for mass inference.
# 'standard': Very good quality. This is generally about as good as you are going to get.
# 'high_quality': Use if you want the absolute best. This is not really worth the compute, though.
# The value chosen here is passed as `preset=` to tts.tts_with_preset in the generation cell.
preset = "standard"

In [17]:
# Generate speech with the custom voice.
voice_samples, conditioning_latents = load_voice(CUSTOM_VOICE_NAME)
gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset,
)

# Save the clip (written with a 24000 Hz sample rate) and embed a player for it
generated_wav_path = f'generated-{CUSTOM_VOICE_NAME}.wav'
torchaudio.save(generated_wav_path, gen.squeeze(0).cpu(), 24000)
IPython.display.Audio(generated_wav_path)

Generating autoregressive samples..


100%|██████████| 16/16 [02:39<00:00, 10.00s/it]


Computing best candidates using CLVP and CVVP


100%|██████████| 16/16 [01:33<00:00,  5.82s/it]


Transforming autoregressive outputs into audio..


100%|██████████| 200/200 [01:57<00:00,  1.70it/s]


## Evaluating Model

In [18]:
import pesq
import librosa
import numpy as np
from pydub import AudioSegment

def resample_audio(audio, target_sample_rate):
    """Return ``audio`` at ``target_sample_rate``.

    The object is returned unchanged when its ``frame_rate`` already matches;
    otherwise the result of ``audio.set_frame_rate(target_sample_rate)`` is returned.
    """
    if audio.frame_rate == target_sample_rate:
        return audio
    return audio.set_frame_rate(target_sample_rate)

def calculate_pesq(original_audio_file, cloned_audio_file, target_sample_rate=16000, mode='nb'):
    """Compute a PESQ score between a reference recording and a generated one.

    Both files are loaded with librosa as mono signals and resampled to
    ``target_sample_rate`` while loading, then passed to ``pesq.pesq``.

    Args:
        original_audio_file: Path of the reference recording.
        cloned_audio_file: Path of the generated recording to score.
        target_sample_rate: Common sample rate, in Hz, handed to PESQ
            (default 16000, the rate this notebook has always used).
        mode: PESQ mode string passed through to ``pesq.pesq``; ``'nb'``
            (narrow band) by default.
            NOTE(review): the pesq package is understood to accept 8000 or
            16000 Hz, with ``'wb'`` requiring 16000 — confirm against its README.

    Returns:
        The value returned by ``pesq.pesq``.

    NOTE(review): PESQ is a full-reference metric intended for a reference signal
    and a degraded copy of the same utterance; scores for recordings of different
    sentences should be interpreted with care.
    """
    # librosa.load downmixes to mono and resamples in a single step, which replaces
    # the former load-then-resample sequence and the stereo check that could never
    # trigger (librosa.load returns mono by default).
    original_audio, _ = librosa.load(original_audio_file, sr=target_sample_rate, mono=True)
    cloned_audio, _ = librosa.load(cloned_audio_file, sr=target_sample_rate, mono=True)

    # Calculate PESQ score
    pesq_score = pesq.pesq(target_sample_rate, original_audio, cloned_audio, mode)
    return pesq_score

In [20]:
# Reference recording of the cloned speaker and the generated clip to score.
# Both paths are derived from CUSTOM_VOICE_NAME so that switching the voice cannot
# silently score the new clone against a recording of p248.
# NOTE(review): utterance "170" is hard-coded, but the voice folder is filled with a
# random subset of recordings, so this file may be absent after a re-run — confirm.
original_voice_file = f"/content/tortoise-tts/tortoise/voices/{CUSTOM_VOICE_NAME}/{CUSTOM_VOICE_NAME}_170.wav"
cloned_voice_file = f"/content/tortoise-tts/generated-{CUSTOM_VOICE_NAME}.wav"

pesq_score = calculate_pesq(original_voice_file, cloned_voice_file)
print(f"PESQ score: {pesq_score}")

PESQ score: 1.1445003747940063
