# Bark text-to-speech voice cloning.
Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).
This version of the notebook is made to run on Google Colab; make sure your runtime hardware accelerator is set to GPU.

# Google Colab: Clone the repository

In [1]:
# Fetch the HuBERT-quantizer repository and make it the working directory, so the
# later `bark_hubert_quantizer` imports and the relative `data/models/...` paths resolve.
# NOTE: run this cell once per runtime; running it again would clone inside the checkout.
!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/
%cd bark-voice-cloning-HuBERT-quantizer

Cloning into 'bark-voice-cloning-HuBERT-quantizer'...
remote: Enumerating objects: 1861, done.[K
remote: Counting objects: 100% (226/226), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 1861 (delta 130), reused 204 (delta 115), pack-reused 1635[K
Receiving objects: 100% (1861/1861), 319.75 MiB | 17.85 MiB/s, done.
Resolving deltas: 100% (131/131), done.
/content/bark-voice-cloning-HuBERT-quantizer


## Install packages

In [2]:
%pip install fairseq
%pip install tensorboardX
%pip install audiolm_pytorch
%pip install bark
%pip install -r requirements.txt
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/9.6 MB[0m [31m2.2 MB/s[0m eta [36m0:00:05[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/9.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m8.5/9.6 MB[0m [31m82.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m9.6/9.6 MB[0m [31m88.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing

In [3]:
from bark.generation import load_codec_model, SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic
from bark.api import generate_audio
from transformers import BertTokenizer
from encodec.utils import convert_audio
from encodec import EncodecModel
from encodec.utils import convert_audio
import numpy as np
import torch
import torchaudio

# Device used for the models and tensors in this notebook; set to 'cpu' to run without a GPU.
device = 'cuda'
# Codec model from bark.generation; the GPU is requested only when `device` is 'cuda'.
model = load_codec_model(use_gpu=(device == 'cuda'))

Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:00<00:00, 145MB/s]


## Load models

In [4]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
# HuBERTManager fetches the checkpoints that the next cell loads from data/models/hubert/.
from bark_hubert_quantizer.hubert_manager import HuBERTManager
hubert_manager = HuBERTManager()
# Fetch the HuBERT base model (presumably skipped when it is already on disk -- TODO confirm).
hubert_manager.make_sure_hubert_installed()
# Fetch the custom tokenizer; as the last expression of the cell, the returned
# checkpoint path is displayed as the cell output.
hubert_manager.make_sure_tokenizer_installed()

Downloading HuBERT base model
Downloaded HuBERT
Downloading HuBERT custom tokenizer


Downloading (…)rt_base_ls960_14.pth:   0%|          | 0.00/104M [00:00<?, ?B/s]

Downloaded tokenizer


'data/models/hubert/tokenizer.pth'

In [5]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
# These two models turn the reference audio into Bark semantic tokens.
from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
from bark_hubert_quantizer.customtokenizer import CustomTokenizer

# HuBERT model: build it from its checkpoint, then place it on the selected device.
hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')
hubert_model = hubert_model.to(device)

# CustomTokenizer: restore it from its checkpoint, then place it on the same device.
tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')
tokenizer = tokenizer.to(device)

## Load wav and create speaker history prompt

In [6]:
# Read the reference recording, convert it to the codec model's sample rate and
# channel count, and place the result on the selected device.
audio_filepath = '/content/input.wav'  # the audio you want to clone
wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)

In [7]:
# Run the HuBERT model on the waveform. The sample rate passed is the codec's,
# because `wav` was already converted to model.sample_rate when it was loaded.
semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
# Turn the HuBERT output into the tokens that are later saved as `semantic_prompt`.
semantic_tokens = tokenizer.get_token(semantic_vectors)

In [8]:
# Extract discrete codes from EnCodec
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension before encoding.
    encoded_frames = model.encode(wav.unsqueeze(0))
# Join the codes of all frames along the last (time) axis, then drop only the batch
# dimension. A bare .squeeze() would also remove any other dimension of size 1
# (e.g. for an extremely short clip) and break the 2-D indexing used when saving.
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)  # [n_q, T]

In [9]:
# Convert both prompts to NumPy arrays on the CPU so they can be written with np.savez.
# The tensor checks keep this cell safe to re-run: after the first run the names
# already hold NumPy arrays, which have no .cpu() method.
if torch.is_tensor(codes):
    codes = codes.cpu().numpy()
if torch.is_tensor(semantic_tokens):
    semantic_tokens = semantic_tokens.cpu().numpy()

In [10]:
# Name of the cloned voice; it determines the file name of the saved prompt.
voice_name = 'output'
output_path = '/content/' + voice_name + '.npz'
# The generation cells receive this path as their history prompt.
history_prompt = output_path
# Store the three prompt arrays; the coarse prompt is the first two rows of the codes.
np.savez(
    output_path,
    fine_prompt=codes,
    coarse_prompt=codes[:2, :],
    semantic_prompt=semantic_tokens,
)

In [11]:
# Enter the text to be spoken here. The speaker is not chosen in this cell: it comes
# from `history_prompt`, the .npz file written when the voice was saved.
text_prompt = "Education is the process of facilitating learning, or the acquisition of knowledge, skills, values, beliefs, and habits. Educational methods include teaching, training, storytelling, discussion, and directed research."

In [12]:
# Download and load all Bark models (text, coarse, fine and codec), full-size variants.
# GPU use follows the notebook-wide `device` setting instead of being hard-coded to
# True, so switching `device` to 'cpu' also applies to these models.
bark_use_gpu = device == 'cuda'
preload_models(
    text_use_gpu=bark_use_gpu,
    text_use_small=False,
    coarse_use_gpu=bark_use_gpu,
    coarse_use_small=False,
    fine_use_gpu=bark_use_gpu,
    fine_use_small=False,
    codec_use_gpu=bark_use_gpu,
    force_reload=False,
    # path="models"
)

Downloading text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

Downloading fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

In [13]:
# Stage 1: text -> semantic tokens, conditioned on the cloned voice prompt.
x_semantic = generate_text_semantic(text_prompt, history_prompt, temp=0.7, top_k=50, top_p=0.95)

# Stage 2: semantic tokens -> coarse tokens, using the same sampling settings.
x_coarse_gen = generate_coarse(x_semantic, history_prompt, temp=0.7, top_k=50, top_p=0.95)

# Stage 3: coarse tokens -> fine tokens, sampled at a lower temperature.
x_fine_gen = generate_fine(x_coarse_gen, history_prompt, temp=0.5)

# Decode the fine tokens into the audio array that is played and saved below.
audio_array = codec_decode(x_fine_gen)

100%|██████████| 100/100 [01:29<00:00,  1.12it/s]
100%|██████████| 35/35 [06:50<00:00, 11.74s/it]


In [14]:
from IPython.display import Audio
# Play the generated audio inline; SAMPLE_RATE is the constant imported from bark.generation.
# The Audio object must stay the last expression of the cell for the player to be displayed.
Audio(audio_array, rate=SAMPLE_RATE)

In [15]:
from scipy.io.wavfile import write as write_wav
from google.colab import files  # only available on Google Colab
# Save the generated audio as a WAV file.
# NOTE: the file name here is the literal 'output'; it does not follow `voice_name`.
filepath = '/content/' + 'output' + '.wav' # change this to your desired output path
write_wav(filepath, SAMPLE_RATE, audio_array)
# Trigger a browser download of the saved file from the Colab runtime.
files.download(filepath)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>