In [None]:
from pathlib import Path

from IPython.display import Audio
from scipy.io.wavfile import write as write_wav

from bark.api import generate_audio
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

In [None]:
# Checkpoint files for the three generation stages (semantic, coarse, fine).
# These are handed to preload_models() as the per-stage *_model_path arguments.
semantic_path, coarse_path, fine_path = (
    "semantic_output_yennefer/pytorch_model.bin",
    "coarse_output_yennefer/pytorch_model.bin",
    "fine_output_yennefer/pytorch_model.bin",
)

# NOTE(review): not referenced by any of the visible cells; presumably toggles
# an RVC post-processing step elsewhere -- confirm before relying on it.
use_rvc = False

In [None]:
# Load every model stage once, before any generation cell runs.
preload_models(
    # Per-stage checkpoints defined in the configuration cell.
    text_model_path=semantic_path,
    coarse_model_path=coarse_path,
    fine_model_path=fine_path,
    # Do not request the "small" variant for any stage.
    text_use_small=False,
    coarse_use_small=False,
    fine_use_small=False,
    # Request GPU execution for every stage and for the codec.
    text_use_gpu=True,
    coarse_use_gpu=True,
    fine_use_gpu=True,
    codec_use_gpu=True,
    # force_reload=False: presumably reuses models that are already loaded
    # in this kernel -- confirm against preload_models.
    force_reload=False,
    # NOTE(review): presumably the directory holding the base model files --
    # confirm against the preload_models implementation in use.
    path="models",
)


In [None]:
def generate_with_settings(
    text_prompt,
    semantic_temp=0.7,
    semantic_top_k=50,
    semantic_top_p=0.95,
    coarse_temp=0.7,
    coarse_top_k=50,
    coarse_top_p=0.95,
    fine_temp=0.5,
    voice_name=None,
    use_semantic_history_prompt=True,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    output_full=False,
):
    """Run text -> semantic -> coarse -> fine generation and decode the result.

    Each stage receives its own sampling settings, and the voice history
    prompt can be switched on or off independently per stage.

    Args:
        text_prompt: Text handed to ``generate_text_semantic``.
        semantic_temp, semantic_top_k, semantic_top_p: Forwarded to the
            semantic stage as ``temp`` / ``top_k`` / ``top_p``.
        coarse_temp, coarse_top_k, coarse_top_p: Forwarded to the coarse
            stage as ``temp`` / ``top_k`` / ``top_p``.
        fine_temp: Forwarded to the fine stage as ``temp``.
        voice_name: Value passed as ``history_prompt`` to every stage whose
            ``use_*_history_prompt`` flag is truthy.
        use_semantic_history_prompt: If falsy, the semantic stage gets
            ``history_prompt=None``.
        use_coarse_history_prompt: Same switch for the coarse stage.
        use_fine_history_prompt: Same switch for the fine stage.
        output_full: If truthy, also return the intermediate stage outputs.

    Returns:
        The value of ``codec_decode`` applied to the fine-stage output. When
        ``output_full`` is truthy, a tuple ``(full_generation, audio)`` is
        returned instead, where ``full_generation`` is a dict with the keys
        ``'semantic_prompt'``, ``'coarse_prompt'`` and ``'fine_prompt'``.
    """

    def _prompt_if(enabled):
        # A stage only sees the voice prompt when its flag is switched on.
        return voice_name if enabled else None

    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=_prompt_if(use_semantic_history_prompt),
        temp=semantic_temp,
        top_k=semantic_top_k,
        top_p=semantic_top_p,
    )
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=_prompt_if(use_coarse_history_prompt),
        temp=coarse_temp,
        top_k=coarse_top_k,
        top_p=coarse_top_p,
    )
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=_prompt_if(use_fine_history_prompt),
        temp=fine_temp,
    )

    audio = codec_decode(fine_tokens)
    if not output_full:
        return audio

    stage_outputs = {
        'semantic_prompt': semantic_tokens,
        'coarse_prompt': coarse_tokens,
        'fine_prompt': fine_tokens,
    }
    return stage_outputs, audio

In [None]:
text_prompt = "Привет мир!"
filepath = "output/audio_setting_yennefer.wav"

# Make sure the output directory exists *before* the expensive generation
# step; otherwise write_wav would raise FileNotFoundError on a fresh checkout
# only after the audio has already been generated.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)

# The trailing comments record the tuning history of each sampling setting.
audio_array = generate_with_settings(
    text_prompt,
    semantic_temp=0.7,  # was 0.7; sounds fine
    semantic_top_k=50,
    semantic_top_p=0.9,  # was 0.99, then 0.9
    coarse_temp=0.3,  # was 0.7; 0.3 sounds fine
    coarse_top_k=50,
    coarse_top_p=0.9,  # was 0.95
    fine_temp=0.5,  # was 0.5; sounds fine
    voice_name="datasets/yennefer/tokens/yennefer_8.npz",
    use_semantic_history_prompt=True,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    output_full=False,
)

# Save the result to disk at Bark's sample rate.
write_wav(filepath, SAMPLE_RATE, audio_array)

# Last expression of the cell: renders an inline audio player in the notebook.
Audio(audio_array, rate=SAMPLE_RATE)