In [None]:
import librosa
import soundfile as sf
import torch
# from facodec import FACodecEncoderV2, FACodecDecoderV2
from huggingface_hub import hf_hub_download
from oz2.models.facodec import FACodecEncoder, FACodecDecoder


def load_audio(wav_path):
    """Read an audio file at 16 kHz and return it as a float32 torch tensor.

    Two singleton axes are inserted in front of the loaded signal, so a mono
    recording of ``n`` samples comes back with shape ``(1, 1, n)``.
    """
    samples, _ = librosa.load(wav_path, sr=16000)
    return torch.from_numpy(samples).float()[None, None, :]


# Build the FACodec encoder/decoder, load the published NaturalSpeech3 weights
# from the Hugging Face Hub, and move both modules to the inference device.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)

fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],  # mirror image of the encoder's up_ratios
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    # NOTE(review): the recorded module repr shows 1024-entry codebooks, so these
    # sizes are presumably log2 values (2**10) -- confirm against FACodecDecoder.
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

encoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_encoder.bin")
decoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_decoder.bin")

# Load onto CPU first so the checkpoints open on machines without the GPU they
# were saved from, and restrict unpickling to tensors/state-dict containers
# because the files are downloaded artefacts.
fa_encoder.load_state_dict(torch.load(encoder_ckpt, map_location='cpu', weights_only=True))
fa_decoder.load_state_dict(torch.load(decoder_ckpt, map_location='cpu', weights_only=True))

fa_encoder.eval()
fa_decoder.eval()

# Later cells read `accelerator` as the device for every model and tensor.
# Fall back to CPU so the notebook still runs without a GPU.
accelerator = 'cuda:0' if torch.cuda.is_available() else 'cpu'
fa_encoder.to(accelerator)
fa_decoder.to(accelerator)

  WeightNorm.apply(module, name, dim)
  fa_encoder.load_state_dict(torch.load(encoder_ckpt))
  fa_decoder.load_state_dict(torch.load(decoder_ckpt))


FACodecDecoder(
  (quantizer): ModuleList(
    (0): ResidualVQ(
      (layers): ModuleList(
        (0): FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (1): ResidualVQ(
      (layers): ModuleList(
        (0-1): 2 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (2): ResidualVQ(
      (layers): ModuleList(
        (0-2): 3 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
  )
  (model): Sequential(
    (0): Conv1d(256, 

In [None]:
import os
import torch
from oz2 import OZ2
import soundfile as sf
from omegaconf import OmegaConf

# Experiment coordinates on the shared archive volume.
root = '/cm/archive/nghiahnh'
exp_name = 'oz-v3.1-base-500h-libritts'
exp_version = 'version_5'
ckpt_name = 'last'

ckpt_path = os.path.join(
    root,
    f'exp/{exp_name}/lightning_logs/{exp_version}/checkpoints/{ckpt_name}.ckpt',
)

# Codec configuration: point both halves at the device chosen earlier.
codec_cfg = OmegaConf.load(os.path.join(root, 'OZ2/configs/codec.yaml'))
for section in ('encoder', 'decoder'):
    codec_cfg[section]['device'] = accelerator

# Experiment configuration: same device override for the two sub-models.
cfg = OmegaConf.load(os.path.join(root, f'exp/{exp_name}/config.yaml'))
for section in ('flow_matching', 'codes_generator'):
    cfg[section]['device'] = accelerator

# Restore the checkpoint with training_mode disabled, then move it to the device.
model = OZ2.from_pretrained(cfg=cfg, ckpt_path=ckpt_path, device=accelerator, training_mode=False)
model.to(accelerator)

In [3]:
import os
import time
import torch
import librosa
import numpy as np
from omegaconf import DictConfig


def synthesize(
    model,
    text: str,
    acoustic_prompt: str | np.ndarray | torch.Tensor,
    sr: int = 16000,
    codec_cfg: DictConfig = None,
    codec_encoder: torch.nn.Module = None,
    codec_decoder: torch.nn.Module = None,
    temperature: float = 0.02,
    lexicon_path: str = None,
    cleaners: list[str] | None = None,
    ):
    """Synthesize speech for ``text`` conditioned on ``acoustic_prompt``.

    Relies on the notebook-level ``accelerator`` variable for the device.

    Args:
        model: Object exposing ``_get_codec_models``, ``_preprocess_acoustic_prompt``,
            ``_preprocess_english``, ``codes_generator`` and ``flow_matching``.
        text: Text handed to ``model._preprocess_english``.
        acoustic_prompt: Prompt audio handed to ``model._preprocess_acoustic_prompt``.
        sr: Sample rate forwarded together with the prompt.
        codec_cfg: Config used to build the codec models when either one is
            missing. Its ``encoder``/``decoder`` ``device`` entries are
            overwritten in place with ``accelerator``.
        codec_encoder: Codec encoder; built from ``codec_cfg`` when ``None``.
        codec_decoder: Codec decoder; built from ``codec_cfg`` when ``None``.
        temperature: Forwarded to ``model.flow_matching.sampling``.
        lexicon_path: Forwarded to ``model._preprocess_english``.
        cleaners: Forwarded to ``model._preprocess_english``; defaults to
            ``['english_cleaners']``.

    Returns:
        ``(wav, prior_wav, elapsed)``: the waveform decoded from the
        flow-matching logits, the waveform decoded from the codes generator's
        prior logits (both NumPy arrays), and the wall-clock seconds spent
        between prompt preprocessing and the end of decoding. ``elapsed`` is
        not divided by any audio duration.

    Raises:
        ValueError: If a codec model is missing and ``codec_cfg`` is ``None``.
    """
    # Fresh list per call instead of a shared mutable default argument.
    if cleaners is None:
        cleaners = ['english_cleaners']

    if codec_encoder is None or codec_decoder is None:
        if codec_cfg is None:
            raise ValueError('The codec_encoder or codec_decoder is set to None. To initialize the codec encoder or decoder, you need to provide a codec_cfg of type omegaconf.DictConfig.')
        # Only the device entry is overridden; assigning to codec_cfg['encoder']
        # directly would replace the whole sub-config with the device string.
        codec_cfg['encoder']['device'] = accelerator
        codec_cfg['decoder']['device'] = accelerator
        codec_encoder, codec_decoder = model._get_codec_models(codec_cfg)

    # Pure inference: skip autograd bookkeeping for every forward pass below.
    with torch.no_grad():
        # process acoustic prompt
        acoustic_prompt = model._preprocess_acoustic_prompt(acoustic_prompt, sr)

        start = time.time()

        enc_out = codec_encoder(acoustic_prompt)
        _, prompt, _, _, timbre = codec_decoder(enc_out, eval_vq=False, vq=True)
        prompt = prompt.permute(1, 0, 2)

        # process phoneme
        phonemes, _, _ = model._preprocess_english(text, lexicon_path, cleaners)
        phonemes = phonemes.to(accelerator)
        codes_generator_outputs = model.codes_generator(
            texts=phonemes,
            src_lens=torch.zeros(phonemes.size(0), device=accelerator) + phonemes.size(1),
            max_src_len=phonemes.shape[-1],
        )
        prior, prior_logits = codes_generator_outputs[0], codes_generator_outputs[1]

        # flow matching euler solving
        logits = model.flow_matching.sampling(
            prior=prior,
            x_len=torch.zeros(prior.size(0), device=accelerator) + prior.size(2),
            x_max_len=prior.size(2),
            prompts=prompt,
            temperature=temperature,
        )['logits']

        def _logits_to_wav(code_logits):
            # Pick the highest-scoring code along dim 1, reorder the axes for
            # the decoder, then decode with the prompt's timbre.
            codes = code_logits.softmax(1).argmax(1).permute(1, 0, 2)
            embs = codec_decoder.vq2emb(codes)
            return codec_decoder.inference(embs, timbre)[0][0].detach().cpu().numpy()

        # revert codes into waveform (prior first, then the styled output)
        prior_wav = _logits_to_wav(prior_logits)
        wav = _logits_to_wav(logits)

        end = time.time()

    return wav, prior_wav, end - start

In [None]:
from IPython.display import Audio

idx = 0
data_root = '/cm/archive/nghiahnh/data/raw/LibriSpeech/LibriSpeech-Clipped-3s'

# target_audio = os.path.join(data_root, f'gt-{idx}.wav')
# The sample file is '|'-separated; the first two fields are used as the
# ground-truth name and the prompt file name. Read it inside a context manager
# so the file handle is closed deterministically.
with open('/cm/archive/sonnn45/Amphion/test-clean-3s-long.txt', 'r') as fin:
    sample = fin.read().split('|')
gt, prompt = sample[0], sample[1]

# Each transcript ends with `sep`, so the text passed to synthesis carries
# trailing whitespace.
sep = "   "
transcript1 = f"At the forefront of artificial intelligence innovation, FPT Software AI Center is recognized as the premier AI research and product development hub{sep}"
transcript2 = f"Our partnerships with the world’s leading AI institutions, including Landing AI (founded by Andrew Ng), Mila (led by Yoshua Bengio), and AItomatic (by Christopher C. Nguyen), underscore our commitment to excellence and collaboration{sep}"
transcript3 = f"As a proud member of the AI Alliance alongside giants like IBM, Dell, Meta, and esteemed universities and institutes worldwide, we are at the cutting edge of AI advancements{sep}"
transcript4 = f"Our team is a dynamic fusion of experienced experts, visionary scientists, and innovative engineers specializing in AI, Machine Learning, and Generative AI{sep}"
transcript5 = f"Driven by a relentless pursuit of knowledge and application, we are dedicated to conducting groundbreaking research and developing state-of-the-art products that leverage AI to address real-world challenges{sep}"
transcript6 = f"Our contributions to the global AI community are underscored by our publications at top AI conferences, positioning Vietnam as a significant player on the world AI map{sep}"

# prompt_audio = os.path.join(data_root, f'prompt-{idx}.wav')
# filename = '61-70968-0005'
prompt_audio = os.path.join(data_root, prompt)
# prompt_text = open(os.path.join(data_root, f'{filename}.normalized.txt'), 'r').read()

In [5]:
# Listen to the prompt recording referenced by `prompt_audio`.
Audio(filename=prompt_audio)

In [6]:
# Synthesize every transcript with the same acoustic prompt and keep only the
# styled waveform of each call, in transcript order.
transcripts = (transcript1, transcript2, transcript3, transcript4, transcript5, transcript6)
synth_kwargs = dict(
    acoustic_prompt=prompt_audio,
    codec_cfg=codec_cfg,
    codec_encoder=fa_encoder,
    codec_decoder=fa_decoder,
)

wavs = []
for transcript in transcripts:
    wav, non_styled, rtf = synthesize(model, text=transcript, **synth_kwargs)
    wavs.append(wav)

In [7]:
# Stitch the per-transcript waveforms end to end along the first axis.
wav_concat = np.concatenate(wavs)

In [8]:
# Play the concatenated waveform; 16000 is passed as its sample rate.
Audio(data=wav_concat, rate=16000)

In [None]:
# Write the stitched waveform to disk with a 16000 Hz sample rate.
sf.write("/cm/archive/nghiahnh/concatenated_3space.wav", wav_concat, 16000)

In [108]:
# Audio(filename=target_audio)

In [13]:
# Benchmark manifest: one '|'-separated record per line. Exactly six fields
# are expected per record; the fourth and fifth are ignored. Only the first
# 100 parsed records are kept.
DUR_TARGET = 5
with open(f'/cm/archive/nghiahnh/data/raw/LibriSpeech/test-clean-clipped-{DUR_TARGET}s.txt', 'r') as fin:
    records = [line.rstrip().split('|') for line in fin]

data = [
    (target_name, prompt_name, target_transcript, dur)
    for target_name, prompt_name, target_transcript, _, _, dur in records
][:100]

In [14]:
from tqdm import tqdm


# Run synthesis for every manifest entry and record the timing value that
# synthesize() returns for it.
# NOTE(review): synthesize() returns `end - start` (elapsed seconds) as its
# third value; it is stored under the name `rtf` without being divided by any
# audio duration -- confirm this is the intended metric.
prompt_dir = f'/cm/archive/nghiahnh/data/raw/LibriSpeech/LibriSpeech-Clipped-{DUR_TARGET}s'

time_evaluation = []
for target_name, prompt_name, target_text, dur in tqdm(data):
    prompt_audio = os.path.join(prompt_dir, prompt_name)
    wav, non_styled, rtf = synthesize(
        model,
        text=target_text,
        acoustic_prompt=prompt_audio,
        codec_cfg=codec_cfg,
        codec_encoder=fa_encoder,
        codec_decoder=fa_decoder,
    )
    time_evaluation.append([target_name, dur, rtf])

100%|██████████| 100/100 [02:38<00:00,  1.59s/it]


In [15]:
import pandas as pd

# Tabulate the per-utterance measurements and save them as CSV.
csv_path = f'/cm/archive/nghiahnh/exp/oz-v3.1-base-500h-libritts/rft-prompt-{DUR_TARGET}s.csv'
df = pd.DataFrame(data=time_evaluation, columns=['Filename', 'Duration', 'RTF'])
df.to_csv(csv_path)

In [110]:
# `filename` has no live assignment in this notebook (the only one is commented
# out), so derive the output stem from the prompt file that produced `wav`.
# This keeps the cell runnable on a fresh kernel.
filename = os.path.splitext(os.path.basename(prompt_audio))[0]
sf.write(os.path.join(data_root, f"{filename}-synthesized.wav"), wav, 16000)
Audio(data=wav, rate=16000)

In [None]:
# Encode the current prompt and keep the fifth output of the decoder pass as
# the speaker embedding.
# The codec modules were moved to `accelerator`, so the input tensor has to be
# moved there too; load_audio() already does the 16 kHz load and reshaping.
test_wav = load_audio(prompt_audio).to(accelerator)

with torch.no_grad():
    enc_out = fa_encoder(test_wav)
    _, _, _, _, spk_embs = fa_decoder(enc_out, eval_vq=False, vq=True)

In [None]:
# synthesize() returns the prior output as an already-decoded NumPy waveform
# (its second return value). Converting it to integer code indices and decoding
# it again would treat audio samples as codebook entries, so play it directly.
# sf.write(f"libriTTS-test-synthesized/prior-{idx}.wav", non_styled, 16000)

Audio(data=non_styled, rate=16000)

In [None]:
from tqdm import tqdm

# Synthesize prompts 3..19 and save both the styled waveform and the prior
# waveform for each.
# The call matches synthesize() as defined in this notebook: it takes no
# n_timesteps / sampling_strategy arguments and returns
# (wav, prior_wav, elapsed_seconds), with prior_wav already decoded to audio.
os.makedirs('libriTTS-test-synthesized', exist_ok=True)

for idx in tqdm(range(3, 20)):
    prompt_audio = os.path.join(data_root, f'prompt-{idx}.wav')
    # Context manager so the transcript file is closed on every iteration.
    with open(os.path.join(data_root, f'gt-{idx}.txt'), 'r') as fin:
        target_text = fin.read()

    wav, non_styled, _elapsed = synthesize(
        model,
        text=target_text,
        acoustic_prompt=prompt_audio,
        codec_cfg=codec_cfg,
        codec_encoder=fa_encoder,
        codec_decoder=fa_decoder,
    )

    sf.write(f"libriTTS-test-synthesized/synthesized-{idx}.wav", wav, 16000)
    sf.write(f"libriTTS-test-synthesized/prior-{idx}.wav", non_styled, 16000)

In [12]:
# import os
# import json

# with open('dump.json', 'r') as fin:
#     noise = json.load(fin)['quantizer']

In [4]:
import os
import librosa
from tqdm import tqdm
import soundfile as sf

def resample_audio(input_file, output_file, target_sr):
    """Convert an audio file to another sample rate and write the result.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the converted audio is written to.
        target_sr: Sample rate (in Hz) of the written file.
    """
    # sr=None keeps the file's native rate, so the only rate conversion is the
    # explicit resample call below.
    signal, native_sr = librosa.load(input_file, sr=None)
    sf.write(
        output_file,
        librosa.resample(signal, orig_sr=native_sr, target_sr=target_sr),
        target_sr,
    )

# Example usage
# input_audio_file = 'input_audio.wav'  # Replace with your input file
# output_audio_file = 'output_audio.wav'  # Desired output file name
# target_sample_rate = 22050  # Desired sample rate in Hz

# resample_audio(input_audio_file, output_audio_file, 22050)

# Resample every file in the synthesized-output folder to 22050 Hz.
# Create the destination folder first; writing into a missing directory fails.
os.makedirs('reshampled', exist_ok=True)

src = os.listdir('libriTTS-test-synthesized')
for audio in tqdm(src):
    resample_audio(
        os.path.join('libriTTS-test-synthesized', audio),
        os.path.join('reshampled', audio),
        target_sr=22050,
    )

100%|██████████| 40/40 [00:00<00:00, 56.56it/s]


In [2]:
import os
from tqdm import tqdm

# Walk root/<speaker>/<folder>/ and gather the path of every .wav file.
audio_paths = []
root = '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean'
spks = os.listdir(root)

for spk in tqdm(spks):
    spk_dir = os.path.join(root, spk)
    for folder in os.listdir(spk_dir):
        folder_dir = os.path.join(spk_dir, folder)
        audio_paths.extend(
            os.path.join(folder_dir, audio)
            for audio in os.listdir(folder_dir)
            if audio.endswith('.wav')
        )

  0%|          | 0/39 [00:00<?, ?it/s]

100%|██████████| 39/39 [00:00<00:00, 131.56it/s]


In [5]:
# Preview the first ten collected paths.
audio_paths[:10]

['/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/61/70970/61_70970_000007_000001.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000023_000000.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000012_000004.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000011_000002.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000021_000002.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000020_000000.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000024_000001.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000021_000001.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000019_000001.wav',
 '/cm/archive/nghiahnh/data/raw/LibriTTS/test-clean/1995/1837/1995_1837_000022_000003.wav']

In [346]:
import random
import librosa
from IPython.display import Audio

# Pick one speaker at random and rebuild `audio_paths` from that speaker's
# .wav files only.
spk = random.choice(spks)
print(spk)

spk_dir = os.path.join(root, spk)
audio_paths = [
    os.path.join(spk_dir, folder, audio)
    for folder in os.listdir(spk_dir)
    for audio in os.listdir(os.path.join(spk_dir, folder))
    if audio.endswith('.wav')
]

1995


In [347]:
# Draw two clips independently from the selected speaker (the same file may be
# drawn twice) and print the duration librosa reports for each.
prompt = random.choice(audio_paths)
sample = random.choice(audio_paths)
for clip in (prompt, sample):
    print(librosa.get_duration(path=clip))

5.51
8.36


In [348]:
# Listen to the randomly chosen prompt clip.
# NOTE(review): the audio is read from `filename`; `rate` looks relevant only
# when raw sample data is passed -- confirm it has any effect here.
Audio(filename=prompt, rate=16000)

In [349]:
# Listen to the second randomly chosen clip.
# NOTE(review): the audio is read from `filename`; `rate` looks relevant only
# when raw sample data is passed -- confirm it has any effect here.
Audio(filename=sample, rate=16000)