# Bark text-to-speech voice cloning.
Clone voices to create speaker history prompt files (`.npz`) for [Bark text-to-speech](https://github.com/suno-ai/bark).

## Install packages

In [None]:
# Install the project's Python dependencies listed in requirements.txt
# into the environment of the running kernel.
%pip install -r requirements.txt
# Install torch, torchvision and torchaudio from the PyTorch wheel index
# for CUDA 11.7 (the "cu117" index URL).
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

## Load models

In [7]:
import numpy as np
import torch
import torchaudio
from encodec import EncodecModel
from encodec.utils import convert_audio
from bark_hubert_quantizer.hubert_manager import HuBERTManager
from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
from bark_hubert_quantizer.customtokenizer import CustomTokenizer

# --- Configuration -----------------------------------------------------------
large_quant_model = False  # Use the larger pretrained model
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA. Replace with an explicit
# device (e.g. 'cuda:0' or 'cpu') to pin it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# (checkpoint name, local file name) of the quantizer to fetch; passed below as
# `model=` and `local_file=` respectively.
model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')

# --- Model loading -----------------------------------------------------------
# The HuBERTManager helpers return the checkpoint paths handed to the loaders.
print('Loading HuBERT...')
hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)
print('Loading Quantizer...')
quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
print('Loading Encodec...')
encodec_model = EncodecModel.encodec_model_24khz()
# Target bandwidth handed to Encodec before encoding the speaker audio.
encodec_model.set_target_bandwidth(6.0)
encodec_model.to(device)

print('Downloaded and loaded models!')

Loading HuBERT...


  checkpoint = torch.load(checkpoint_path, map_location=device)
  state = torch.load(f, map_location=torch.device("cpu"))
  WeightNorm.apply(module, name, dim)


Loading Quantizer...


  model.load_state_dict(torch.load(path, map_location=map_location))


Loading Encodec...
Downloaded and loaded models!


In [4]:
!pip install librosa



    PyYAML (>=5.1.*)
            ~~~~~~^


## Load wav and create speaker history prompt

In [5]:
import librosa
import soundfile as sf

source_wav = "2023-2024_38_trabzonspor_mke-ankaragucu-iyileştirilmiş-v2.wav"
trimmed_wav = "s_trimmed.wav"

# Load the recording; sr=None is passed so librosa does not resample it
y, sr = librosa.load(source_wav, sr=None)

# Automatically trim silence, using a 20 dB threshold (top_db)
yt, index = librosa.effects.trim(y, top_db=20)

# Save the trimmed audio as a new file at the same sample rate
sf.write(trimmed_wav, yt, sr)


In [13]:
wav_file = '2023-2024_38_trabzonspor_mke-ankaragucu-iyileştirilmiş-v2 (mp3cut.net).wav'  # Put the path of the speaker you want to use here.
out_file = 'speaker_son.npz'  # Put the path to save the cloned speaker to here.

# Load the reference recording and its sample rate.
wav, sr = torchaudio.load(wav_file)

wav_hubert = wav.to(device)

# Downmix any multi-channel recording (not only stereo) to mono before HuBERT;
# axis 0 is treated as the channel axis.
if wav_hubert.shape[0] > 1:
    wav_hubert = wav_hubert.mean(0, keepdim=True)

# Both models are used for inference only, so run them without autograd.
print('Extracting semantics...')
with torch.no_grad():
    semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)
print('Tokenizing semantics...')
with torch.no_grad():
    semantic_tokens = quant_model.get_token(semantic_vectors)

print('Creating coarse and fine prompts...')
# Resample to Encodec's sample rate, force a single channel, and add a leading
# batch dimension for the encoder.
wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)

wav = wav.to(device)

with torch.no_grad():
    encoded_frames = encodec_model.encode(wav)
# Join the codes of every encoded frame along the last axis and drop
# singleton dimensions.
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()

# Move the results to host memory before saving.
codes = codes.cpu()
semantic_tokens = semantic_tokens.cpu()

# Convert to NumPy explicitly instead of relying on np.savez's implicit tensor
# conversion. The coarse prompt is the first two rows of the fine prompt.
np.savez(out_file,
         semantic_prompt=semantic_tokens.numpy(),
         fine_prompt=codes.numpy(),
         coarse_prompt=codes[:2, :].numpy()
         )

print('Done!')

Extracting semantics...
Tokenizing semantics...
Creating coarse and fine prompts...
Done!


In [14]:
import numpy as np

# Open the saved speaker prompt archive and list the arrays it contains.
preset_path = "speaker_son.npz"
preset = np.load(preset_path)
print(preset.files)



['semantic_prompt', 'fine_prompt', 'coarse_prompt']


In [15]:
from bark import generate_audio, preload_models
from bark.generation import SAMPLE_RATE
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.io.wavfile` is loaded on every SciPy version.
import scipy.io.wavfile

# Load the Bark models (downloaded on the first run)
preload_models()

# Turkish text (spell it carefully to get the intended emphasis)
text = "<|tr|> GOOOOOL! İnanılmaz bir vuruş, stadyum çılgınca coşuyor!"

# Voice preset (your .npz file). The arrays are read while the archive is open
# so the file handle is closed again before generation starts.
with np.load("speaker_son.npz") as preset:
    history_prompt = {
        "semantic_prompt": preset["semantic_prompt"],
        "coarse_prompt": preset["coarse_prompt"],
        "fine_prompt": preset["fine_prompt"]
    }

# Generate speech with Bark, conditioned on the cloned speaker
audio_array = generate_audio(
    text,
    history_prompt=history_prompt
)

# Save as WAV
scipy.io.wavfile.write("tts_son_output.wav", SAMPLE_RATE, audio_array)


100%|████████████████████████████████████████████████████████████████████████████████| 694/694 [00:26<00:00, 26.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [01:09<00:00,  1.99s/it]


In [12]:
import numpy as np

# Open the .npz speaker prompt archive
preset = np.load("speaker1.npz")

# Print the keys (the arrays stored in the archive)
print("Dosya içeriği:", preset.files)

# Print the shape of each prompt component
for prompt_name in ("semantic_prompt", "coarse_prompt", "fine_prompt"):
    print(f"{prompt_name} shape:", preset[prompt_name].shape)


Dosya içeriği: ['semantic_prompt', 'fine_prompt', 'coarse_prompt']
semantic_prompt shape: (17465,)
coarse_prompt shape: (2, 26200)
fine_prompt shape: (8, 26200)
