In [None]:
from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

# Compute device shared by every cell below. Falls back to the CPU when CUDA is
# not available, so the later `.to(device)` calls do not fail on CPU-only
# machines. Replace the expression with 'cpu' to force CPU execution.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Codec model; later cells read `model.sample_rate` / `model.channels` and call
# `model.encode` on the reference audio.
model = load_codec_model(use_gpu=(device == 'cuda'))

In [None]:
# HuBERT setup helper.
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
from hubert.hubert_manager import HuBERTManager
hubert_manager = HuBERTManager()
# Presumably fetches the HuBERT checkpoint when it is not already present
# locally — TODO confirm against HuBERTManager.
hubert_manager.make_sure_hubert_installed()
# Same idea for the quantizer tokenizer. The next cell loads both files from
# 'data/models/hubert/', so these calls need to have run first.
hubert_manager.make_sure_tokenizer_installed()

In [None]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
# Models used to turn the reference audio into semantic tokens.
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer

hubert_checkpoint = 'data/models/hubert/hubert.pt'
tokenizer_checkpoint = 'data/models/hubert/tokenizer.pth'

# HuBERT model, placed on the configured device.
hubert_model = CustomHubert(checkpoint_path=hubert_checkpoint).to(device)

# CustomTokenizer model (automatically uses the right layers), on the same device.
tokenizer = CustomTokenizer.load_from_checkpoint(tokenizer_checkpoint).to(device)

In [None]:
# Reference recording of the voice to clone.
audio_filepath = 'howard-original-voice-10sec.wav'

# Load the file, convert it to the codec model's sample rate and channel
# count, then move it to the configured device.
wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)

In [None]:
# Run the HuBERT model on the reference audio. `wav` was already converted to
# `model.sample_rate` when it was loaded, so that rate is passed as the input rate.
semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
# Map the HuBERT output to tokens with the custom tokenizer; these are saved
# later as the voice's `semantic_prompt`.
semantic_tokens = tokenizer.get_token(semantic_vectors)

In [None]:
# Extract discrete codes from EnCodec.
# A leading axis is added to `wav` before encoding; gradients are not needed.
with torch.no_grad():
    encoded_frames = model.encode(wav.unsqueeze(0))

# Keep the first element of every encoded frame, join the frames along the
# last axis and drop the singleton dimensions.
per_frame_codes = [frame[0] for frame in encoded_frames]
codes = torch.cat(per_frame_codes, dim=-1).squeeze()  # [n_q, T]

In [None]:
# Convert the codec codes and semantic tokens to NumPy arrays on the CPU so
# they can be saved. The `torch.is_tensor` guards make the cell safe to re-run:
# after the first run both names already hold NumPy arrays, which have no
# `.cpu()` method.
if torch.is_tensor(codes):
    codes = codes.detach().cpu().numpy()
if torch.is_tensor(semantic_tokens):
    semantic_tokens = semantic_tokens.detach().cpu().numpy()

In [None]:
import numpy as np

voice_name = 'howard-clone-from-10sec-sample'  # whatever you want the name of the voice to be
output_path = f'bark/assets/prompts/{voice_name}.npz'

# Arrays stored in the voice file. The coarse prompt is the first two rows of
# the codec codes; the fine prompt is all of them.
prompt_arrays = {
    'fine_prompt': codes,
    'coarse_prompt': codes[:2, :],
    'semantic_prompt': semantic_tokens,
}
np.savez(output_path, **prompt_arrays)

In [None]:
# That's it! Now you can head over to generate.ipynb and use your voice_name as the 'history_prompt'.

In [None]:
# Here's the generation code, copy-pasted for convenience

In [None]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Text to synthesize, written one sentence per line for readability.
_raw_text_prompt = """
These men are shepherds, and they raise livestock. They have brought with them their flocks and herds and everything they own.
Then he said: when Pharaoh calls for you and asks you about your occupation, you must tell him, we, your servants, have raised livestock all our lives, as our ancestors have always done.
When you tell him this, he will let you live here in the region of Goshen, for the Egyptians despise shepherds.
"""

# Turn the line breaks into spaces and trim the ends so the prompt is one line.
text_prompt = _raw_text_prompt.replace("\n", " ").strip()

# Name of the saved voice prompt, passed as `history_prompt` during generation.
voice_name = "howard-clone-from-10sec-sample"

In [None]:
# download and load all models
# Every stage (text, coarse, fine, codec) is requested on the GPU with the
# `*_use_small` flags off; `force_reload` is off as well.
preload_options = dict(
    text_use_gpu=True,
    text_use_small=False,
    coarse_use_gpu=True,
    coarse_use_small=False,
    fine_use_gpu=True,
    fine_use_small=False,
    codec_use_gpu=True,
    force_reload=False,
    path="models",
)
preload_models(**preload_options)

In [None]:
import nltk
import numpy

# Sentence-tokenizer data for `nltk.sent_tokenize`. Older NLTK releases load
# the 'punkt' resource, while recent releases (3.8.2 and later, per the NLTK
# release notes) load 'punkt_tab' instead and raise LookupError without it, so
# both are fetched.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

# A quarter of a second of silence, appended after every sentence.
silence = numpy.zeros(int(0.25 * SAMPLE_RATE))

# Generate the text one sentence at a time with the cloned voice and collect
# the audio segments (each followed by its own copy of the pause) in order.
simple_gen_segments = []
for sentence in nltk.sent_tokenize(text_prompt):
    print(f'generating audio for sentence: {sentence}')
    audio_array = generate_audio(sentence, history_prompt=voice_name)
    simple_gen_segments.extend([audio_array, silence.copy()])

In [None]:
from IPython.display import Audio

# Join every generated sentence (and the pauses between them) into one clip.
# To listen to just the last generated sentence instead, use:
#   Audio(audio_array, rate=SAMPLE_RATE)
full_audio = numpy.concatenate(simple_gen_segments)
Audio(full_audio, rate=SAMPLE_RATE)

In [None]:
# generation with more control
# The semantic and coarse stages share the same sampling settings; the fine
# stage only takes a temperature.
sampling_options = dict(temp=0.7, top_k=50, top_p=0.95)

# Stage 1: text -> semantic tokens, conditioned on the cloned voice.
x_semantic = generate_text_semantic(
    text_prompt,
    history_prompt=voice_name,
    **sampling_options,
)

# Stage 2: semantic tokens -> coarse output.
x_coarse_gen = generate_coarse(
    x_semantic,
    history_prompt=voice_name,
    **sampling_options,
)

# Stage 3: coarse output -> fine output.
x_fine_gen = generate_fine(x_coarse_gen, history_prompt=voice_name, temp=0.5)

# Decode both stages so they can be compared by ear. The `coase_audio`
# spelling is kept because a later cell reads that name.
coase_audio = codec_decode(x_coarse_gen)
fine_audio = codec_decode(x_fine_gen)

In [None]:
from IPython.display import Audio

# Player for the audio decoded from the coarse stage.
Audio(data=coase_audio, rate=SAMPLE_RATE)

In [None]:
from IPython.display import Audio

# Player for the audio decoded from the fine stage.
Audio(data=fine_audio, rate=SAMPLE_RATE)