In [51]:
# Imports

from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

# Load the codec model through Bark's loader, requesting GPU placement.
# Later cells read `model.sample_rate` / `model.channels` and call
# `model.encode` on the reference audio.
model = load_codec_model(use_gpu=True)

In [52]:
# Load and pre-process the audio waveform.
# `audio_filepath` is the clip of the voice you want to clone. It will get
# truncated, so 5-10 seconds is probably fine (the existing samples that were
# checked are around 7 seconds).
audio_filepath = 'jose1.wav'
print(torch.cuda.is_available())
# Follow whichever device the codec weights actually live on instead of
# hard-coding 'cuda', so the input tensor can never land on a different device
# than the model that encodes it.
device = next(model.parameters()).device
wav, sr = torchaudio.load(audio_filepath)
# Convert to the sample rate and channel count the codec model reports.
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
# Add a leading batch axis and move the waveform next to the model.
wav = wav.unsqueeze(0).to(device)

True


In [75]:
# Extract discrete codes from EnCodec
with torch.no_grad():
    encoded_frames = model.encode(wav)
# Join the per-frame code tensors along the last (time) axis, then remove only
# the leading batch axis that was added with `unsqueeze(0)`. A bare
# `.squeeze()` would also collapse any other axis that happens to have size 1,
# silently breaking the 2-D layout the later `codes[:2, :]` slice relies on.
# Assumes each frame's codes are shaped [B, n_q, T] - TODO confirm against the
# installed EnCodec version.
codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)  # [n_q, T]

In [76]:
# Text transcription of the reference recording: Jose read this line from the
# LibriSpeech text samples.
text = (
    "HER KNOWLEDGE BEING SUCH AS WITH NO SORT OF STANDING ROOM "
    "OR LENGTH OF LEVER COULD HAVE BEEN EXPECTED TO MOVE THE WORLD"
)

In [77]:
# Length of the reference clip in seconds: sample count on the last axis
# divided by the codec model's sample rate.
seconds = wav.size(-1) / model.sample_rate
# Generate semantic tokens for the transcription, passing the clip length as
# `max_gen_duration_s`.
# NOTE(review): the original author flagged this step as "not 100% sure" -
# the tokens come from the text, not from the recording itself.
semantic_tokens = generate_text_semantic(
    text,
    temp=0.7,
    top_k=50,
    top_p=0.95,
    max_gen_duration_s=seconds,
)


  0%|                                                                                          | 0/100 [00:00<?, ?it/s][A
  2%|█▋                                                                                | 2/100 [00:00<00:08, 11.15it/s][A
  4%|███▎                                                                              | 4/100 [00:00<00:08, 10.70it/s][A
  6%|████▉                                                                             | 6/100 [00:00<00:08, 10.80it/s][A
  8%|██████▌                                                                           | 8/100 [00:00<00:07, 11.52it/s][A
 10%|████████                                                                         | 10/100 [00:00<00:07, 11.49it/s][A
 12%|█████████▋                                                                       | 12/100 [00:01<00:07, 11.07it/s][A
 14%|███████████▎                                                                     | 14/100 [00:01<00:07, 10.75it/s][A
 16%|██████████

In [78]:
# Move codes to the CPU and convert them to a NumPy array.
# Guarded so the cell is idempotent: once `codes` is already an ndarray it has
# no `.cpu()` method, and re-running the unguarded conversion would raise
# AttributeError.
if torch.is_tensor(codes):
    codes = codes.cpu().numpy()

In [88]:
import numpy as np

# Name under which the cloned voice is stored (pick whatever you like).
voice_name = 'jose'
output_path = f'bark/assets/prompts/{voice_name}.npz'
# Write the three prompt arrays into a single .npz archive:
#   semantic_prompt - the semantic tokens generated from the transcription
#   coarse_prompt   - the first two rows of the codes
#   fine_prompt     - the full code array
np.savez(
    output_path,
    fine_prompt=codes,
    coarse_prompt=codes[:2, :],
    semantic_prompt=semantic_tokens,
)

In [81]:
# Here's the generation code, copy-pasted for convenience

In [98]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Enter your prompt and speaker here.
# The prompt previously read "suchas"; the fused word is a typo that the
# text-to-speech model would try to pronounce literally, so it is spelled out
# as "such as".
text_prompt = "Her knowledge being such as with no sort of standing room or length of lever. [laughs]"
voice_name = "jose"  # use your custom voice name here if you have one

# Load the multilingual cased BERT tokenizer by its pretrained model name.
# NOTE(review): no cell visible in this notebook reads `tokenizer` afterwards -
# confirm whether this load is still needed before removing it.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [99]:
# Download (if needed) and load every model up front: the full-size
# (non-"small") text, coarse and fine models plus the codec, all on the GPU.
# Weights are looked up under "models"; nothing is force-reloaded.
preload_models(
    path="models",
    force_reload=False,
    text_use_gpu=True, text_use_small=False,
    coarse_use_gpu=True, coarse_use_small=False,
    fine_use_gpu=True, fine_use_small=False,
    codec_use_gpu=True,
)

In [96]:
# Simple generation: one call from prompt text to audio samples, using the
# saved voice as the history prompt.
audio_array = generate_audio(
    text_prompt,
    history_prompt=voice_name,
    waveform_temp=0.7,
    text_temp=0.7,
)


  0%|                                                                                          | 0/100 [00:00<?, ?it/s][A
  1%|▊                                                                                 | 1/100 [00:00<00:30,  3.19it/s][A
  2%|█▋                                                                                | 2/100 [00:00<00:19,  5.12it/s][A
  4%|███▎                                                                              | 4/100 [00:00<00:12,  7.73it/s][A
  6%|████▉                                                                             | 6/100 [00:00<00:09,  9.47it/s][A
  8%|██████▌                                                                           | 8/100 [00:00<00:09,  9.22it/s][A
 10%|████████                                                                         | 10/100 [00:01<00:08, 10.41it/s][A
 12%|█████████▋                                                                       | 12/100 [00:01<00:08, 10.62it/s][A
 14%|██████████

In [100]:
# Generation with more control: run each stage explicitly so its sampling
# settings can be tuned on its own. Every stage is conditioned on the same
# voice via `history_prompt`.

# Stage 1: prompt text -> semantic tokens.
x_semantic = generate_text_semantic(text_prompt, history_prompt=voice_name, top_k=50, top_p=0.95, temp=0.7)

# Stage 2: semantic tokens -> coarse output.
x_coarse_gen = generate_coarse(x_semantic, history_prompt=voice_name, top_k=50, top_p=0.95, temp=0.7)

# Stage 3: coarse output -> fine output (note the lower temperature here).
x_fine_gen = generate_fine(x_coarse_gen, history_prompt=voice_name, temp=0.5)

# Stage 4: decode the fine output with the codec into an audio array.
audio_array = codec_decode(x_fine_gen)


  0%|                                                                                          | 0/100 [00:00<?, ?it/s][A
  1%|▊                                                                                 | 1/100 [00:00<00:31,  3.16it/s][A
  3%|██▍                                                                               | 3/100 [00:00<00:15,  6.19it/s][A
  5%|████                                                                              | 5/100 [00:00<00:11,  8.06it/s][A
  7%|█████▋                                                                            | 7/100 [00:00<00:09,  9.39it/s][A
  9%|███████▍                                                                          | 9/100 [00:01<00:09, 10.10it/s][A
 11%|████████▉                                                                        | 11/100 [00:01<00:08, 10.81it/s][A
 13%|██████████▌                                                                      | 13/100 [00:01<00:08, 10.62it/s][A
 15%|██████████

In [101]:
from IPython.display import Audio

# Play the generated audio inline; the widget is the cell's last expression,
# so the notebook renders it as the cell output.
Audio(data=audio_array, rate=SAMPLE_RATE)

In [None]:
from scipy.io.wavfile import write as write_wav

# Save the generated audio as a WAV file.
# Change `filepath` to your desired output path.
filepath = "/output/audio.wav"
write_wav(filepath, rate=SAMPLE_RATE, data=audio_array)