# Imports and Setups

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["USER"] = "me" # TODO change this to your username

import torch
import torchaudio
import numpy as np
import random
from argparse import Namespace

from data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
)
from huggingface_hub import hf_hub_download

import shutil

  from .autonotebook import tqdm as notebook_tqdm


# Initialise VoiceCraft

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
voicecraft_name="830M_TTSEnhanced.pth" # or giga330M.pth, 330M_TTSEnhanced.pth, giga830M.pth

from models import voicecraft
model = voicecraft.VoiceCraft.from_pretrained(f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
phn2num = model.args.phn2num
config = vars(model.args)
model.to(device)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


VoiceCraft(
  (text_embedding): TokenEmbedding(
    (dropout): Dropout(p=0.0, inplace=False)
    (word_embeddings): Embedding(121, 2048)
  )
  (audio_embedding): ModuleList(
    (0-3): 4 x TokenEmbedding(
      (dropout): Dropout(p=0.0, inplace=False)
      (word_embeddings): Embedding(2052, 2048)
    )
  )
  (text_positional_embedding): SinePositionalEmbedding(
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (audio_positional_embedding): SinePositionalEmbedding(
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (decoder): TransformerEncoder(
    (layers): ModuleList(
      (0-15): 16 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=2048, out_features=2048, bias=True)
        )
        (linear1): Linear(in_features=2048, out_features=8192, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (linear2): Linear(in_features=8192, out_features=2048, bias=True)
        (dropout1): Dropout

# Initialize Tokenizers

In [3]:
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
if not os.path.exists(encodec_fn):
    print("Downloading encodec model...")
    os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
    shutil.move("encodec_4cb2048_giga.th", "./pretrained_models/encodec_4cb2048_giga.th") # for windows
    
audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu

text_tokenizer = TextTokenizer(backend="espeak")

    PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.2.2)
    Python  3.9.13 (you have 3.9.20)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
  _torch_pytree._register_pytree_node(
Dora directory: /tmp/audiocraft_me


# Initialize Whisper

In [4]:
import whisper

whisper_model = whisper.load_model("base", device="cuda")

In [5]:
orig_audio = "./demo/5895_34622_000026_000002.wav"
orig_audio = "./demo/paeall.wav"

result = whisper_model.transcribe("./demo/paeall.wav")
text = result['text']
lang = result['language']

if lang == "en":
    orig_transcript = text
elif lang == "th":
    from pythainlp.transliterate import romanize
    engine = "thai2rom"
    orig_transcript = romanize(text, engine)

print(f'Languague: {lang}')
print(f'Transcript: {orig_transcript}')

  a = scaled_dot_product_attention(


Languague: en
Transcript:  I love Thai culture. I would like to try cooking Thai food myself. Could you let comment a good restaurant for me? What do you like to do in your free time? Which Thai dish is your absolute favorite?


# Inference

In [6]:
# move the audio and transcript to temp folder
temp_folder = "./demo/temp"
os.makedirs(temp_folder, exist_ok=True)
shutil.copy(orig_audio, temp_folder)

filename = os.path.splitext(orig_audio.split("/")[-1])[0]
with open(f"{temp_folder}/{filename}.txt", "w") as f:
    f.write(orig_transcript)
    
# run MFA to get the alignment
align_temp = f"{temp_folder}/mfa_alignments"
!source ~/.bashrc && \
    conda activate voicecraft && \
    mfa align -v --clean -j 1 --output_format csv {temp_folder} \
        english_us_arpa english_us_arpa {align_temp}

'source' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
input_text = "The quick brown fox jumps over the lazy dog." # the text to be concatenated to the original transcript
target_transcript = f"{orig_transcript} {input_text}"

audio_fn = f"{temp_folder}/{filename}.wav"
info = torchaudio.info(audio_fn)
audio_dur = info.num_frames / info.sample_rate

cut_off_sec = audio_dur - 1e-9

assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
prompt_end_frame = int(cut_off_sec * info.sample_rate)

# run the model to get the output
# hyperparameters for inference
codec_audio_sr = 16000
codec_sr = 50
top_k = 0
top_p = 0.9 # can also try 0.8, but 0.9 seems to work better
temperature = 1
silence_tokens=[1388,1898,131]
kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model

# NOTE adjust the below three arguments if the generation is not as good
stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1
sample_batch_size = 3 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 4 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.
seed = 1 # change seed if you are still unhappy with the result

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
seed_everything(seed)

decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
from inference_tts_scale import inference_one_sample
concated_audio, gen_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)
        
# save segments for comparison
concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")


# display the audio
from IPython.display import Audio
print("concatenate prompt and generated:")
display(Audio(concated_audio, rate=codec_audio_sr))

print("generated:")
display(Audio(gen_audio, rate=codec_audio_sr))

concatenate prompt and generated:


generated:


In [8]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1
