# Imports

In [1]:
import os
import time
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Loading model...")
config = XttsConfig()
config

Loading model...


XttsConfig(output_path='output', logger_uri=None, run_name='run', project_name=None, run_description='🐸Coqui trainer run.', print_step=25, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', save_on_interrupt=True, log_model_step=None, save_step=10000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=0, target_loss=None, print_eval=False, test_delay_epochs=0, run_eval=True, run_eval_steps=None, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=False, precision='fp16', epochs=1000, batch_size=32, eval_batch_size=16, grad_clip=0.0, scheduler_after_epoch=True, lr=0.001, optimizer='radam', optimizer_params=None, lr_scheduler=None, lr_scheduler_params={}, use_grad_scaler=False, allow_tf32=False, cudnn_enable=True, cudnn_deterministic=False, cudnn_benchmark=False, training_seed=54321, model='xtts', num_loader_workers=0, num_eval_loader_workers=0, use_noise_augment=False, audio=XttsAudio

In [3]:
config.load_json("config.json")


In [4]:
model = Xtts.init_from_config(config)
model

Xtts(
  (gpt): GPT(
    (conditioning_encoder): ConditioningEncoder(
      (init): Conv1d(80, 1024, kernel_size=(1,), stride=(1,))
      (attn): Sequential(
        (0): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Identity()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (1): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Identity()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (2): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Ide

In [5]:
model.load_checkpoint(config, checkpoint_dir="/home/jack/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2", use_deepspeed=False)


In [6]:
model.cuda()

Xtts(
  (gpt): GPT(
    (conditioning_encoder): ConditioningEncoder(
      (init): Conv1d(80, 1024, kernel_size=(1,), stride=(1,))
      (attn): Sequential(
        (0): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Identity()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (1): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Identity()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (2): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Ide

In [7]:

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["latent_GSH.wav"])
gpt_cond_latent, speaker_embedding

Computing speaker latents...


  resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq)


(tensor([[[-0.5341, -0.1185,  0.7173,  ..., -0.0443,  1.0063,  1.4897],
          [-1.6895, -0.4350,  0.4050,  ...,  0.3356,  1.0068,  0.9335],
          [-0.7692,  2.0739, -0.7732,  ..., -0.2530,  0.0115,  0.4117],
          ...,
          [ 0.4175,  1.0802, -1.3186,  ..., -0.2953,  0.1736,  1.1033],
          [-1.3716, -0.1179, -0.7175,  ..., -0.5734,  0.5188,  0.7919],
          [ 0.0989, -0.1894,  0.8981,  ...,  1.0205, -0.8232,  1.2470]]],
        device='cuda:0'),
 tensor([[[-1.1738e-02],
          [-6.4834e-03],
          [-3.0302e-02],
          [-2.8703e-02],
          [ 6.8789e-02],
          [-1.0001e-02],
          [-4.8841e-02],
          [-6.8565e-02],
          [ 6.2753e-02],
          [ 4.3769e-02],
          [-2.0328e-02],
          [-1.0037e-02],
          [ 5.7198e-02],
          [ 4.1895e-02],
          [-4.2394e-03],
          [-7.5549e-02],
          [ 1.7578e-01],
          [-1.5549e-01],
          [ 4.4289e-02],
          [ 1.7567e-02],
          [-3.3623e-02],


In [8]:
print("Inference...")
t0 = time.time()
chunks = model.inference_stream(
        """
The revolution will not be right back after a message about a white tornado white lightning or white people,
The revolution will not go better with Coke,
The revolution will be no re-run brothers,
The revolution will be live""",
    "en",
    gpt_cond_latent,
    speaker_embedding
)


Inference...


In [9]:
# #iterate over the tensor chunks
# for chunk in chunks:
#     print(chunk)
#     print(chunk.shape)

In [10]:

wav_chuncks = []
i=0
for chunk in chunks:
    if i == 0:
        print(f"Time to first chunck: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chuncks.append(chunk)
    i+=1
#wav = torch.cat(wav_chuncks, dim=0)
#torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)




Time to first chunck: 1.0819876194000244
Received chunk 0 of audio length 21248
Received chunk 1 of audio length 22272
Received chunk 2 of audio length 22272
Received chunk 3 of audio length 22272
Received chunk 4 of audio length 22272
Received chunk 5 of audio length 22272
Received chunk 6 of audio length 22272
Received chunk 7 of audio length 22272
Received chunk 8 of audio length 22272
Received chunk 9 of audio length 22272
Received chunk 10 of audio length 22272
Received chunk 11 of audio length 22272
Received chunk 12 of audio length 22272
Received chunk 13 of audio length 22528
Received chunk 14 of audio length 22272
Received chunk 15 of audio length 22272
Received chunk 16 of audio length 22272
Received chunk 17 of audio length 22272
Received chunk 18 of audio length 1024


In [11]:
wav_chuncks

[tensor([-0.0120, -0.0113, -0.0098,  ..., -0.0026, -0.0030, -0.0034],
        device='cuda:0'),
 tensor([-0.0035, -0.0029, -0.0036,  ..., -0.0053, -0.0082, -0.0087],
        device='cuda:0'),
 tensor([-0.0078, -0.0062, -0.0064,  ..., -0.1118, -0.0944, -0.0295],
        device='cuda:0'),
 tensor([0.0945, 0.2392, 0.3596,  ..., 0.0071, 0.0099, 0.0118], device='cuda:0'),
 tensor([ 0.0125,  0.0112,  0.0093,  ..., -0.0386,  0.1020,  0.1520],
        device='cuda:0'),
 tensor([ 0.0352, -0.1059, -0.0701,  ..., -0.0028, -0.0022, -0.0028],
        device='cuda:0'),
 tensor([-0.0041, -0.0065, -0.0079,  ...,  0.0327, -0.0348, -0.0940],
        device='cuda:0'),
 tensor([-0.1189, -0.0987, -0.0556,  ..., -0.0009, -0.0009, -0.0009],
        device='cuda:0'),
 tensor([-0.0005, -0.0003, -0.0006,  ..., -0.0265, -0.0606, -0.0648],
        device='cuda:0'),
 tensor([-0.0621, -0.0763, -0.1066,  ...,  0.0533,  0.0691,  0.0870],
        device='cuda:0'),
 tensor([0.1056, 0.1148, 0.1015,  ..., 0.0286, 0.0252,

In [12]:
wav = torch.cat(wav_chuncks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
print(f"Time to completion: {time.time() - t0}")