# BARK TTS VOICE CLONING

This code is based on the following repositories:

- https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer 

- https://github.com/serp-ai/bark-with-voice-clone.git

In [None]:
# Install the packages listed in requirements.txt.
%pip install -r requirements.txt
# Install torch, torchvision and torchaudio from the PyTorch "cu117" wheel index.
# NOTE(review): this runs after requirements.txt, so it may replace a torch build
# installed by the previous line - confirm the intended order.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
# Install Bark directly from the suno-ai GitHub repository.
%pip install git+https://github.com/suno-ai/bark.git

In [None]:
# Install fairseq from the One-sixth fork on GitHub.
# If the installation fails, refer to the README, which describes some common
# errors that can occur during the pip installation.
%pip install git+https://github.com/One-sixth/fairseq.git

In [None]:
import torch
from encodec import EncodecModel
from encodec.utils import convert_audio
from bark_hubert_quantizer.hubert_manager import HuBERTManager
from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
from bark_hubert_quantizer.customtokenizer import CustomTokenizer

# Device used for every model and tensor in this notebook.
# Prefer the GPU, but fall back to the CPU when CUDA is not available so the
# notebook does not fail at model-load time on machines without a GPU.
# To pin a device, assign it explicitly instead, e.g. 'cuda', 'cpu', 'cuda:0'
# or torch.device('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Loading HuBERT')
# make_sure_hubert_installed() supplies the checkpoint argument for CustomHubert
# (presumably it downloads the checkpoint on first use and returns its path -
# confirm in bark_hubert_quantizer).
hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)

In [None]:
from pathlib import Path

# The Portuguese quantizer checkpoint is not downloaded by this notebook.
# Download the .pth file manually into data/models/hubert (relative to the
# working directory) and execute this cell again:
# https://huggingface.co/MadVoyager/bark-voice-cloning-portuguese-HuBERT-quantizer/blob/main/portuguese-HuBERT-quantizer_24_epoch.pth

# Build the path from components so it is valid on every operating system. The
# previous backslash literal only resolved on Windows and depended on invalid
# string escape sequences ("\m", "\h", "\p").
quantizer_path = Path('data') / 'models' / 'hubert' / 'portuguese-HuBERT-quantizer_24_epoch.pth'
if not quantizer_path.is_file():
    # Fail with an actionable message instead of an opaque loader error.
    raise FileNotFoundError(
        f'Quantizer checkpoint not found at {quantizer_path}. '
        'Download it (see the link in the comment at the top of this cell) and run this cell again.'
    )

print('Loading Quantizer')
quant_model = CustomTokenizer.load_from_checkpoint(str(quantizer_path), device)

# If you want to use the default english quantizer model use the following code

# large_quant_model = False  # Use the larger pretrained model
# model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
# quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)

print('Loading Encodec')
# 24 kHz Encodec model, configured with a target bandwidth of 6.0 and moved to
# the same device as the other models.
encodec_model = EncodecModel.encodec_model_24khz()
encodec_model.set_target_bandwidth(6.0)
encodec_model.to(device)

print('Downloaded and loaded models')

In [5]:
# Path of the speaker recording you want to clone.
wav_file = "../audio/speaker_/input_speaker_01.wav"
# Path where the cloned speaker prompt (.npz) is saved.
out_file = "../audio/speaker_/history_speaker_base.npz"

In [None]:
import torchaudio
import numpy as np

# Load the reference recording. Axis 0 of `wav` is treated as the channel axis
# by the downmix below.
wav, sr = torchaudio.load(wav_file)

# Downmix any multi-channel recording to mono once, before both consumers.
# Previously only exactly-two-channel input was averaged (and only for HuBERT);
# for mono and stereo input the result is unchanged.
if wav.shape[0] > 1:
    wav = wav.mean(0, keepdim=True)

wav_hubert = wav.to(device)

print('Extracting semantics...')
semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)

print('Tokenizing semantics...')
semantic_tokens = quant_model.get_token(semantic_vectors)

print('Creating coarse and fine prompts...')
# Resample to the Encodec sample rate with one channel, then add a leading
# axis of size 1 before encoding.
wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
wav = wav.to(device)

with torch.no_grad():
    encoded_frames = encodec_model.encode(wav)

# Concatenate the code tensors of all frames along the last axis, then drop
# only the leading size-1 axis (assumes encode() keeps the batch axis first -
# TODO confirm). A bare squeeze() would also collapse a length-1 time axis and
# break the 2-D slicing below.
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)

codes = codes.cpu()
semantic_tokens = semantic_tokens.cpu()

# Save the speaker prompt. The coarse prompt is the first two rows of the fine
# prompt. Tensors are converted to NumPy arrays explicitly rather than relying
# on implicit conversion inside np.savez.
np.savez(out_file,
         semantic_prompt=semantic_tokens.numpy(),
         fine_prompt=codes.numpy(),
         coarse_prompt=codes[:2, :].numpy(),
         )

In [None]:
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav

from bark.api import generate_audio
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Set your text prompt here
text_prompt = " "

# Fail early, before any model work starts, while the placeholder prompt is
# still in place; a blank prompt gives Bark nothing to synthesize.
if not text_prompt.strip():
    raise ValueError("text_prompt is empty: set it to the text you want spoken, then run this cell again.")

# Simple generation

# voice_name = "audio/speaker_/history_speaker_base.npz" # Use your custom voice name here if you have one
# By default, use the speaker prompt file written by the extraction cell.
voice_name = out_file

output_path = "../audio/speaker_/voice_output_custom01.wav"  # Define the path to save the output

# Generate speech for the prompt conditioned on the cloned voice, then save it
# as a WAV file at Bark's sample rate.
audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)
write_wav(output_path, SAMPLE_RATE, audio_array)

# Play audio (kept as the last expression so the notebook renders the player)
Audio(audio_array, rate=SAMPLE_RATE)