<a href="https://colab.research.google.com/github/LucasMatuszewski/Python-colab-notebooks/blob/main/notebooks/Bark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install pip==24.0
!pip install encodec
!pip install funcy
!pip install fairseq
!pip install audiolm-pytorch==1.1.4
!git clone https://github.com/serp-ai/bark-with-voice-clone.git
%cd bark-with-voice-clone

# Imports and model loading

In [None]:
import torchaudio
import torch
import nltk
import numpy as np

from encodec.utils import convert_audio
from IPython.display import Audio
from transformers import BertTokenizer

# huBERT
from hubert.hubert_manager import HuBERTManager
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer

#Bark
from bark.generation import (
    load_codec_model,
    generate_text_semantic
)
from bark.api import (
    generate_audio,
    generate_text_semantic
)
from bark.generation import (
    SAMPLE_RATE,
    preload_models,
    codec_decode,
    generate_coarse,
    generate_fine,
    generate_text_semantic
)

# Init
nltk.download('punkt')  # fetch NLTK's 'punkt' resource

# Two switches drive every model-loading option below.
use_small = False
use_gpu = torch.cuda.is_available()

# All stages share the same device and size choice, so the per-stage keyword
# arguments (text_use_gpu, coarse_use_small, ...) are generated from the flags.
# NOTE(review): force_reload follows use_small - confirm this coupling is intended.
preload_models(
    **{f"{stage}_use_gpu": use_gpu for stage in ("text", "coarse", "fine", "codec")},
    **{f"{stage}_use_small": use_small for stage in ("text", "coarse", "fine")},
    force_reload=use_small,
    path="models",
)

# Pipelines: voice cloning and speech generation

In [None]:
def create_clone(
    path : str,
    source_wav : str,
    voice_name : str,
    custom_hubert : str = 'data/models/hubert/hubert.pt',
    custom_tokenizer : str = 'data/models/hubert/tokenizer.pth'
) -> None:
    """Encode a reference recording into a voice prompt stored as ``.npz``.

    Reads ``<path>/voice_samples/<source_wav>.wav``, converts it to the codec
    model's sample rate and channel count, derives semantic tokens (HuBERT
    features fed through the custom tokenizer) and codec codes, and writes
    them to ``<path>/encoded_voices/<voice_name>.npz``.

    The archive contains three arrays:
        fine_prompt:     the codec codes
        coarse_prompt:   the first two rows of the codec codes
        semantic_prompt: the semantic tokens

    Args:
        path: Root folder that holds ``voice_samples/``. The
            ``encoded_voices/`` output folder is created when missing.
        source_wav: File stem (without ``.wav``) of the reference recording.
        voice_name: File stem (without ``.npz``) of the prompt to write.
        custom_hubert: Checkpoint path handed to ``CustomHubert``.
        custom_tokenizer: Checkpoint path handed to
            ``CustomTokenizer.load_from_checkpoint``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    use_gpu = torch.cuda.is_available()
    device = 'cuda' if use_gpu else 'cpu'
    wav_path = path + '/voice_samples/' + source_wav + '.wav'
    prompt_path = path + '/encoded_voices/' + voice_name + '.npz'

    # Do the cheap, failure-prone filesystem work first: a missing recording or
    # output folder should surface before any model is downloaded or run.
    os.makedirs(os.path.dirname(prompt_path), exist_ok=True)
    wav, source_rate = torchaudio.load(wav_path)

    hubert_manager = HuBERTManager()
    hubert_manager.make_sure_hubert_installed()
    hubert_manager.make_sure_tokenizer_installed()

    hubert_model = CustomHubert(checkpoint_path=custom_hubert).to(device)
    tokenizer = CustomTokenizer.load_from_checkpoint(custom_tokenizer).to(device)
    model = load_codec_model(use_gpu=use_gpu)

    wav = convert_audio(wav, source_rate, model.sample_rate, model.channels).to(device)

    # Pure inference: nothing here is back-propagated, so skip gradient tracking.
    with torch.no_grad():
        semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
        semantic_tokens = tokenizer.get_token(semantic_vectors)
        encoded_frames = model.encode(wav.unsqueeze(0))

    # Each encoded frame is indexed at [0] for its codes; frames are joined
    # along the last axis and singleton dimensions are dropped.
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze().cpu().numpy()

    np.savez(
        prompt_path,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :],
        semantic_prompt=semantic_tokens.cpu().numpy(),
    )

In [None]:
def generate(
    path : str,
    prompt : str,
    voice_name : str,
    temperature : float = 0.6,
    top_k : int = 50,
    top_p : float = 0.95,
) -> np.ndarray:
    """Synthesize ``prompt`` using a previously encoded voice prompt.

    Runs three generation stages in sequence (text -> semantic tokens ->
    coarse tokens -> fine tokens), each conditioned on
    ``<path>/encoded_voices/<voice_name>.npz``, and decodes the fine tokens
    with ``codec_decode``.

    Args:
        path: Root folder that contains ``encoded_voices/``.
        prompt: Text to synthesize.
        voice_name: File stem (without ``.npz``) of the voice prompt to use.
        temperature: Sampling temperature forwarded to all three stages.
            The default used to be ``60``, which is far outside the range a
            sampling temperature is normally used at and looks like a typo
            for ``0.60``; it is now ``0.6``.
        top_k: Top-k value forwarded to the semantic and coarse stages.
        top_p: Top-p value forwarded to the semantic and coarse stages.

    Returns:
        Whatever ``codec_decode`` returns for the generated fine tokens
        (annotated as ``np.ndarray``).
    """
    voice_prompt = path + '/encoded_voices/' + voice_name + '.npz'

    semantic_tokens = generate_text_semantic(
        prompt,
        history_prompt=voice_prompt,
        temp=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=voice_prompt,
        temp=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # The fine stage receives only the temperature; top_k / top_p are not passed.
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=voice_prompt,
        temp=temperature,
    )
    return codec_decode(fine_tokens)

# Inference

In [None]:
create_clone(
    path='/content/drive/MyDrive/DriveBlob/TTS',
    source_wav='us_female_deepvoice_narrator',
    voice_name='us_female_deepvoice_narrator',
)

In [None]:
GEN_TEMP = 0.80
TOP_P = 0.95
TOP_K = 50

In [None]:
wav = generate(
    path='/content/drive/MyDrive/DriveBlob/TTS',
    prompt="Every dwarf who has ever interacted with humans before says the same thing: they look like poorly drawn caricatures, their faces contort in exaggerated expressions and mannerisms.",
    voice_name='us_female_deepvoice_narrator',
    temperature=GEN_TEMP, top_k=TOP_K, top_p=TOP_P,
)
Audio(wav, rate=SAMPLE_RATE)