In [1]:
import sys
sys.path.append("..")

import IPython.display as ipd
import os
import torch
from torch.utils.data import DataLoader
from trainer import get_optimizer
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples, TTSDataset
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor

from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer

from cl_tts.benchmarks.dataset_formatters import vctk

In [3]:
ds_path = "/raid/hhemati/Datasets/Speech/CL-TTS/VCTK/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model

In [5]:
config = Tacotron2Config(
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(ds_path, "phonemes"),
    use_d_vector_file=True,
    d_vector_dim=256,
)

ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

d_vectors_file_path = os.path.join(ds_path, "speaker_embedding_means.json")
speaker_manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)

model = Tacotron2(config, ap, tokenizer, speaker_manager=speaker_manager)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [28]:
model.config

Tacotron2Config(output_path='output', logger_uri=None, run_name='run', project_name=None, run_description='üê∏Coqui trainer run.', print_step=25, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', log_model_step=None, save_step=10000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=10000, target_loss=None, print_eval=False, test_delay_epochs=0, run_eval=True, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=False, epochs=1000, batch_size=32, eval_batch_size=16, grad_clip=5.0, scheduler_after_epoch=True, lr=0.0001, optimizer='RAdam', optimizer_params={'betas': [0.9, 0.998], 'weight_decay': 1e-06}, lr_scheduler='NoamLR', lr_scheduler_params={'warmup_steps': 4000}, use_grad_scaler=False, cudnn_enable=True, cudnn_benchmark=True, torch_seed=54321, model='tacotron2', num_loader_workers=0, num_eval_loader_workers=0, use_noise_augment=False, use_language_weighted_sampler=False, audio=

# Optimizer and Criterion

In [6]:
optimizer = get_optimizer(
    optimizer_name=config.optimizer,
    optimizer_params=config.optimizer_params,
    lr=config.lr,
    model=model,
)
criterion = model.get_criterion()


# Data

In [7]:
dataset_config = BaseDatasetConfig(
    name="vctk",  path=ds_path, meta_file_train="metadata.txt"
)
train_samples, eval_samples = load_tts_samples(dataset_config, formatter=vctk)

current_speakers = ["vctk_p336"]
train_samples2 = [x for x in train_samples if x["speaker_name"] in current_speakers]

print(len(train_samples))

 | > Found 44070 files in /raid/hhemati/Datasets/Speech/CL-TTS/VCTK
43630


In [8]:
samples = train_samples
is_eval = False

def get_dataset(samples, is_eval):
    dataset = TTSDataset(
        outputs_per_step=config.r if "r" in config else 1,
        compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec,
        compute_f0=config.get("compute_f0", False),
        f0_cache_path=config.get("f0_cache_path", None),
        samples=samples,
        ap=ap,
        return_wav=config.return_wav if "return_wav" in config else False,
        batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
        min_text_len=config.min_text_len,
        max_text_len=config.max_text_len,
        min_audio_len=config.min_audio_len,
        max_audio_len=config.max_audio_len,
        phoneme_cache_path=config.phoneme_cache_path,
        precompute_num_workers=config.precompute_num_workers,
        use_noise_augment=False if is_eval else config.use_noise_augment,
        verbose=False,
        # speaker_id_mapping=speaker_id_mapping,
        # d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
        tokenizer=tokenizer,
        start_by_longest=config.start_by_longest,
        # language_id_mapping=language_id_mapping,
    )

    return dataset

dataset = get_dataset(train_samples, False)
dataset.preprocess_samples()
data_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,  # shuffle is done in the dataset.
    collate_fn=dataset.collate_fn,
    drop_last=False,  # setting this False might cause issues in AMP training.
    sampler=None,
    num_workers=0,
    pin_memory=False,
)

print(dataset[0])

{'raw_text': 'I want him to take on Gomez.', 'token_ids': array([  4,  64, 130,  25,  44,  16,  22, 130,  10,  64,  15, 130,  22,
        49, 130,  22,   8,  64,  13, 130,  44,  16, 130,  56,  17,  90,
        15,  51,  28, 126], dtype=int32), 'wav': array([-3.0517578e-05,  3.0517578e-05,  6.1035156e-05, ...,
        4.5471191e-03,  4.5166016e-03,  4.3334961e-03], dtype=float32), 'pitch': None, 'attn': None, 'item_idx': '/raid/hhemati/Datasets/Speech/CL-TTS/VCTK/wavs/p323/p323_424.wav', 'speaker_name': 'vctk_p323', 'language_name': '', 'wav_file_name': 'p323_424.wav'}


In [20]:
batch = next(iter(data_loader))
print(batch["text_input"].shape)

KeyError: 'text_input'

In [10]:
batch = model.format_batch(batch)

speaker_embeddings = [model.speaker_manager.get_d_vectors_by_speaker(spk) for spk in batch["speaker_names"]]
speaker_embeddings = torch.FloatTensor(speaker_embeddings).squeeze(1)

batch["d_vectors"] = speaker_embeddings.to(device)

for k in batch.keys():
    if isinstance(batch[k], torch.Tensor):
        batch[k] = batch[k].to(device)

model.cuda()

Tacotron2(
  (embedding): Embedding(131, 512, padding_idx=0)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (activation): ReLU()
      )
      (1): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (activation): ReLU()
      )
      (2): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=Fa

# Train step

In [12]:
outputs, loss_dict = model.train_step(batch, criterion)

  alignment_lengths = mel_lengths // self.decoder.r
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [13]:
loss_dict["loss"].backward()
# if grad_clip > 0:
#     grad_norm = torch.nn.utils.clip_grad_norm_(self.master_params(optimizer), grad_clip)
optimizer.step()

In [None]:
# model.inference(batch["token_id"][0].unsqueeze(0),
#                 {"d_vectors": speaker_embeddings[0].unsqueeze(0)})