In [1]:
import sys
sys.path.append("..")

import IPython.display as ipd
import os
import torch
from torch.utils.data import DataLoader
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples, TTSDataset
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor

from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer

from cl_tts.benchmarks.formatters import vctk

In [2]:
ds_path = "/raid/hhemati/Datasets/Speech/CL-TTS/VCTK/"
dataset_config = BaseDatasetConfig(
    name="vctk",  path=ds_path, meta_file_train="metadata.txt"
)
train_samples, eval_samples = load_tts_samples(dataset_config, formatter=vctk)

 | > Found 44070 files in /raid/hhemati/Datasets/Speech/CL-TTS/VCTK


In [3]:
train_samples
current_speakers = ["vctk_p336"]
train_samples2 = [x for x in train_samples if x["speaker_name"] in current_speakers]
print(len(train_samples))

43630


In [4]:
config = Tacotron2Config(
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(ds_path, "phonemes"),
    use_d_vector_file=True,
    d_vector_dim=256,
)

ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

d_vectors_file_path = os.path.join(ds_path, "speaker_embedding_means.json")
speaker_manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)

model = Tacotron2(config, ap, tokenizer, speaker_manager=speaker_manager)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [5]:
samples = train_samples
is_eval = False

def get_dataset(samples, is_eval):
    dataset = TTSDataset(
        outputs_per_step=config.r if "r" in config else 1,
        compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec,
        compute_f0=config.get("compute_f0", False),
        f0_cache_path=config.get("f0_cache_path", None),
        samples=samples,
        ap=ap,
        return_wav=config.return_wav if "return_wav" in config else False,
        batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
        min_text_len=config.min_text_len,
        max_text_len=config.max_text_len,
        min_audio_len=config.min_audio_len,
        max_audio_len=config.max_audio_len,
        phoneme_cache_path=config.phoneme_cache_path,
        precompute_num_workers=config.precompute_num_workers,
        use_noise_augment=False if is_eval else config.use_noise_augment,
        verbose=False,
        # speaker_id_mapping=speaker_id_mapping,
        # d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
        tokenizer=tokenizer,
        start_by_longest=config.start_by_longest,
        # language_id_mapping=language_id_mapping,
    )

    return dataset

dataset = get_dataset(train_samples, False)
dataset.preprocess_samples()
data_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,  # shuffle is done in the dataset.
    collate_fn=dataset.collate_fn,
    drop_last=False,  # setting this False might cause issues in AMP training.
    sampler=None,
    num_workers=0,
    pin_memory=False,
)

In [6]:
dataset[0]

{'raw_text': 'I want him to take on Gomez.',
 'token_ids': array([  4,  64, 130,  25,  44,  16,  22, 130,  10,  64,  15, 130,  22,
         49, 130,  22,   8,  64,  13, 130,  44,  16, 130,  56,  17,  90,
         15,  51,  28, 126], dtype=int32),
 'wav': array([-3.0517578e-05,  3.0517578e-05,  6.1035156e-05, ...,
         4.5471191e-03,  4.5166016e-03,  4.3334961e-03], dtype=float32),
 'pitch': None,
 'attn': None,
 'item_idx': '/raid/hhemati/Datasets/Speech/CL-TTS/VCTK/wavs/p323/p323_424.wav',
 'speaker_name': 'vctk_p323',
 'language_name': '',
 'wav_file_name': 'p323_424.wav'}

In [79]:
batch = next(iter(data_loader))

In [82]:
speaker_embeddings = [model.speaker_manager.get_d_vectors_by_speaker(spk) for spk in batch["speaker_names"]]
speaker_embeddings = torch.FloatTensor(speaker_embeddings).squeeze(1)



In [83]:
batch["token_id"] = batch["token_id"].cuda()
batch["token_id_lengths"] = batch["token_id_lengths"].cuda()
batch["mel"] = batch["mel"].cuda()
batch["mel_lengths"] = batch["mel_lengths"].cuda()
speaker_embeddings = speaker_embeddings.cuda()
model.cuda()

Tacotron2(
  (embedding): Embedding(131, 512, padding_idx=0)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (activation): ReLU()
      )
      (1): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (activation): ReLU()
      )
      (2): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=Fa

In [84]:
model.train()
out = model(
    batch["token_id"],
    batch["token_id_lengths"],
    mel_specs=batch["mel"],
    mel_lengths=batch["mel_lengths"],
    aux_input={"d_vectors": speaker_embeddings}
)

In [88]:
model.inference(batch["token_id"][0].unsqueeze(0),
                {"d_vectors": speaker_embeddings[0].unsqueeze(0)})

{'model_outputs': tensor([[[ 1.4087e+00,  9.6026e-01,  3.3456e+00,  2.2402e+00,  3.4868e+00,
            1.5307e-01, -3.2522e-03, -1.9621e-01, -7.3938e-02, -9.5912e-01,
            3.4012e+00, -2.0375e+00, -8.5886e-01, -2.8008e-03, -3.1925e-02,
            8.3162e-01,  1.0890e-01, -1.3711e-01,  2.9591e+00,  2.2140e+00,
           -8.8970e-02, -2.9820e+00,  5.0595e-01,  7.5702e-04,  5.1531e-02,
            1.4500e-01, -1.8699e+00, -6.3906e-02, -5.1753e-02,  1.0715e+00,
           -1.4882e+00, -3.5094e-02, -2.6576e+00,  2.7956e+00, -9.1014e-02,
            6.2020e-02, -9.5604e-02, -9.6077e-02, -6.0496e-02,  1.7436e+00,
           -5.8411e-01,  6.8754e-01,  2.8475e+00,  1.0079e+00,  3.4597e+00,
           -5.5588e-02, -5.2296e-01, -4.0184e-02, -2.5774e+00, -3.1671e-02,
            2.4623e+00,  3.3097e+00,  1.0811e-01,  4.5793e-02,  3.3436e+00,
           -7.7866e-01,  1.5434e-01,  1.7471e-01,  9.7976e-01,  1.1042e+00,
           -7.9637e-02, -9.7339e-03,  6.8717e-02, -1.5057e+00, -1.2055e

In [108]:
class MyClass:
    def __int__(self):
        self.a = 5

    def func1(self, x):
        x += self.a
        return x


In [109]:
mc = MyClass()
f = mc.func1

In [110]:
f(5)

AttributeError: 'MyClass' object has no attribute 'a'