Radtts 1.13 (#5451)
* [TTS] Fixing RADTTS training - removing view buffer and fixing accuracy issue (#5358)
* [TTS] add CI test for RADTTS training recipe.

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Oleksii Kuchaiev <okuchaiev@users.noreply.github.com>
3 people committed on Nov 18, 2022 · 1 parent c170e03 · commit 542ab14
Showing 12 changed files with 238 additions and 251 deletions.
37 changes: 34 additions & 3 deletions Jenkinsfile
@@ -4111,7 +4111,9 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
 validation_datasets=/home/TestData/an4_dataset/an4_val.json \
 sup_data_path=/home/TestData/an4_dataset/beta_priors \
 trainer.devices="[0]" \
-+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
++trainer.limit_train_batches=1 \
++trainer.limit_val_batches=1 \
+trainer.max_epochs=1 \
 trainer.strategy=null \
 model.train_ds.dataloader_params.batch_size=4 \
 model.train_ds.dataloader_params.num_workers=0 \
@@ -4127,14 +4129,41 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
 ~model.text_normalizer_call_kwargs'
 }
 }
+stage('RADTTS') {
+steps {
+sh 'python examples/tts/radtts.py \
+train_dataset=/home/TestData/an4_dataset/an4_train.json \
+validation_datasets=/home/TestData/an4_dataset/an4_val.json \
+sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \
+trainer.devices="[0]" \
++trainer.limit_train_batches=1 \
++trainer.limit_val_batches=1 \
+trainer.max_epochs=1 \
+trainer.strategy=null \
+model.pitch_mean=212.35873413085938 \
+model.pitch_std=68.52806091308594 \
+model.train_ds.dataloader_params.batch_size=4 \
+model.train_ds.dataloader_params.num_workers=0 \
+model.validation_ds.dataloader_params.batch_size=4 \
+model.validation_ds.dataloader_params.num_workers=0 \
+export_dir=/home/TestData/radtts_test \
+model.optim.lr=0.0001 \
+model.modelConfig.decoder_use_partial_padding=True \
+~trainer.check_val_every_n_epoch \
+~model.text_normalizer \
+~model.text_normalizer_call_kwargs'
+}
+}
 stage('Mixer-TTS') {
 steps {
 sh 'python examples/tts/mixer_tts.py \
 train_dataset=/home/TestData/an4_dataset/an4_train.json \
 validation_datasets=/home/TestData/an4_dataset/an4_val.json \
 sup_data_path=/home/TestData/an4_dataset/sup_data \
 trainer.devices="[0]" \
-+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
++trainer.limit_train_batches=1 \
++trainer.limit_val_batches=1 \
+trainer.max_epochs=1 \
 trainer.strategy=null \
 model.train_ds.dataloader_params.batch_size=4 \
 model.train_ds.dataloader_params.num_workers=0 \
@@ -4151,7 +4180,9 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
 train_dataset=/home/TestData/an4_dataset/an4_train.json \
 validation_datasets=/home/TestData/an4_dataset/an4_val.json \
 trainer.devices="[0]" \
-+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 +trainer.max_epochs=1 \
++trainer.limit_train_batches=1 \
++trainer.limit_val_batches=1 \
++trainer.max_epochs=1 \
 trainer.strategy=null \
 model.train_ds.dataloader_params.batch_size=4 \
 model.train_ds.dataloader_params.num_workers=0 \
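Note: the CI commands above use Hydra's override grammar: key=value overrides an existing config entry, +key=value adds an entry the config does not yet contain, and ~key deletes one. Below is a minimal sketch of the same semantics driven directly through OmegaConf (the config library underneath Hydra); the keys are illustrative stand-ins, not NeMo's full schema.

from omegaconf import OmegaConf

# Toy config standing in for NeMo's training config (illustrative keys only).
cfg = OmegaConf.create({"trainer": {"max_epochs": 100}, "model": {"text_normalizer": {"lang": "en"}}})

cfg.trainer.max_epochs = 1                               # trainer.max_epochs=1
OmegaConf.update(cfg, "trainer.limit_train_batches", 1)  # +trainer.limit_train_batches=1 (adds a new key)
del cfg.model["text_normalizer"]                         # ~model.text_normalizer (deletes a key)

print(OmegaConf.to_yaml(cfg))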
2 changes: 1 addition & 1 deletion nemo/collections/common/callbacks/callbacks.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import time
 
-from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.utilities import rank_zero_only
 
 # from sacrebleu import corpus_bleu
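The one-line change above tracks PyTorch Lightning's module reorganization: Callback is no longer importable from pytorch_lightning.callbacks.base as of PTL 1.8. For code that must span both layouts, a version-tolerant import is one option (a sketch, assuming only the import path changed between releases):

try:
    from pytorch_lightning.callbacks import Callback  # current layout (PTL >= 1.8)
except ImportError:
    from pytorch_lightning.callbacks.base import Callback  # legacy layout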
19 changes: 2 additions & 17 deletions nemo/collections/tts/models/radtts.py
@@ -27,7 +27,6 @@
 from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy
 from nemo.collections.tts.losses.radttsloss import AttentionBinarizationLoss, RADTTSLoss
 from nemo.collections.tts.models.base import SpectrogramGenerator
-from nemo.collections.tts.modules.submodules import PartialConv1d
 from nemo.core.classes import Exportable
 from nemo.core.classes.common import typecheck
 from nemo.core.neural_types.elements import Index, MelSpectrogramType, TokenIndex
@@ -159,7 +158,7 @@ def training_step(self, batch, batch_idx):
         loss_outputs['binarization_loss'] = (binarization_loss, 1.0)
 
         for k, (v, w) in loss_outputs.items():
-            self.log("train/" + k, loss_outputs[k][0])
+            self.log("train/" + k, loss_outputs[k][0], on_step=True)
 
         return {'loss': loss}
@@ -229,7 +228,7 @@ def validation_epoch_end(self, outputs):
 
         for k, v in loss_outputs.items():
             if k != "binarization_loss":
-                self.log("val/" + k, loss_outputs[k][0])
+                self.log("val/" + k, loss_outputs[k][0], sync_dist=True, on_epoch=True)
 
         attn = outputs[0]["attn"]
         attn_soft = outputs[0]["attn_soft"]
@@ -407,17 +406,3 @@ def output_module(self):
 
     def forward_for_export(self, text, lens, speaker_id, speaker_id_text, speaker_id_attributes):
         return self.model.forward_for_export(text, lens, speaker_id, speaker_id_text, speaker_id_attributes)
-
-    def get_export_subnet(self, subnet=None):
-        return self.model.get_export_subnet(subnet)
-
-    def _prepare_for_export(self, **kwargs):
-        """
-        Override this method to prepare module for export. This is in-place operation.
-        Base version does common necessary module replacements (Apex etc)
-        """
-        PartialConv1d.forward = PartialConv1d.forward_no_cache
-        super()._prepare_for_export(**kwargs)
-
-    def _export_teardown(self):
-        PartialConv1d.forward = PartialConv1d.forward_with_cache
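The logging changes above switch to explicit Lightning flags: on_step=True records the metric at every training step, while sync_dist=True together with on_epoch=True reduces the validation metric across ranks before logging, which is presumably related to the accuracy issue named in the commit message, since unsynced validation metrics can differ across GPUs. Here is a minimal, self-contained sketch of the two patterns (a hypothetical toy module, not the RADTTS model; the sync_dist call is shown in validation_step for brevity, whereas the diff applies it in validation_epoch_end):

import torch
import pytorch_lightning as pl


class TinyLitModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        loss = self.layer(batch).pow(2).mean()
        self.log("train/loss", loss, on_step=True)  # log the per-step training value
        return loss

    def validation_step(self, batch, batch_idx):
        val_loss = self.layer(batch).pow(2).mean()
        # Average across ranks and aggregate over the epoch before logging.
        self.log("val/loss", val_loss, sync_dist=True, on_epoch=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)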
95 changes: 28 additions & 67 deletions nemo/collections/tts/modules/common.py
@@ -30,7 +30,7 @@
     piecewise_linear_transform,
     unbounded_piecewise_quadratic_transform,
 )
-from nemo.collections.tts.modules.submodules import ConvNorm, LinearNorm
+from nemo.collections.tts.modules.submodules import ConvNorm, LinearNorm, MaskedInstanceNorm1d
 
 
 @torch.jit.script
@@ -45,7 +45,7 @@ def get_mask_from_lengths_and_val(lengths, val):
     max_len = val.shape[-1]
     ids = torch.arange(0, max_len, device=lengths.device)
     mask = ids < lengths.unsqueeze(1)
-    return mask.float()
+    return mask
 
 
 @torch.jit.script
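With return mask, get_mask_from_lengths_and_val now yields a bool tensor, and callers cast it to the activation dtype before multiplying, as masked_conv_to_sequence does later in this file. A quick usage sketch of that pattern with toy shapes:

import torch

lengths = torch.tensor([3, 5])
feats = torch.randn(2, 4, 5)  # [batch, channels, time]

ids = torch.arange(feats.shape[-1], device=lengths.device)
mask = ids < lengths.unsqueeze(1)               # bool mask, [batch, time]
mask = mask.to(dtype=feats.dtype).unsqueeze(1)  # cast, then broadcast over channels
masked = feats * mask                           # zero out padded frames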
@@ -124,30 +124,31 @@ def lstm_tensor(self, context: Tensor, lens: Tensor, enforce_sorted: bool = Fals
         seq = nn.utils.rnn.pack_padded_sequence(
             context, lens.long().cpu(), batch_first=True, enforce_sorted=enforce_sorted
         )
-        if not torch.jit.is_scripting():
+        if not (torch.jit.is_scripting() or torch.jit.is_tracing()):
             self.bilstm.flatten_parameters()
-        ret, _ = self.bilstm(seq)
+        if hasattr(self.bilstm, 'forward'):
+            ret, _ = self.bilstm.forward(seq)
+        else:
+            ret, _ = self.bilstm.forward_1(seq)
         return nn.utils.rnn.pad_packed_sequence(ret, batch_first=True)
 
     @torch.jit.export
     def lstm_sequence(self, seq: PackedSequence) -> Tuple[Tensor, Tensor]:
-        if not torch.jit.is_scripting():
+        if not (torch.jit.is_scripting() or torch.jit.is_tracing()):
             self.bilstm.flatten_parameters()
-        ret, _ = self.bilstm(seq)
+        if hasattr(self.bilstm, 'forward'):
+            ret, _ = self.bilstm.forward(seq)
+        elif hasattr(self.bilstm, 'forward_1'):
+            ret, _ = self.bilstm.forward_1(seq)
         return nn.utils.rnn.pad_packed_sequence(ret, batch_first=True)
 
     @torch.jit.export
     def sort_and_lstm_tensor(self, context: Tensor, lens: Tensor) -> Tensor:
-        lens_sorted, ids_sorted = torch.sort(lens, descending=True)
-        unsort_ids = torch.zeros_like(ids_sorted)
-        for i in range(ids_sorted.shape[0]):
-            unsort_ids[ids_sorted[i]] = i
-        context = context[ids_sorted]
+        context, lens_sorted, unsort_ids = sort_tensor(context, lens)
         seq = nn.utils.rnn.pack_padded_sequence(
             context, lens_sorted.long().cpu(), batch_first=True, enforce_sorted=True
         )
-        ret, _ = self.bilstm(seq)
-        return nn.utils.rnn.pad_packed_sequence(ret, batch_first=True)[0][unsort_ids]
+        return self.lstm_sequence(seq)[0][unsort_ids]
 
 
 class ConvLSTMLinear(BiLSTM):
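The guard change above extends the flatten_parameters() skip from scripting to tracing as well: the call is a cuDNN memory-layout optimization that only makes sense in eager mode, so it is bypassed whenever TorchScript is capturing the module. A standalone sketch of the guard pattern (a toy module, not NeMo's BiLSTM):

import torch
import torch.nn as nn


class GuardedLSTM(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.lstm = nn.LSTM(dim, dim, batch_first=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Run the cuDNN optimization only in eager mode, never while
        # the module is being scripted or traced.
        if not (torch.jit.is_scripting() or torch.jit.is_tracing()):
            self.lstm.flatten_parameters()
        out, _ = self.lstm(x)
        return out


scripted = torch.jit.script(GuardedLSTM(8))  # the guarded branch is skipped at script runtime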
@@ -161,14 +162,14 @@ def __init__(
         p_dropout=0.1,
         use_partial_padding=False,
         norm_fn=None,
-        lstm_norm_fn="spectral",
     ):
         super(ConvLSTMLinear, self).__init__(n_channels, int(n_channels // 2), 1)
         self.out_dim = out_dim
-        self.convolutions = nn.ModuleList()
-
-        if n_layers > 0:
-            self.dropout = nn.Dropout(p=p_dropout)
+        self.dropout = nn.Dropout(p=p_dropout)
+        self.convolutions = nn.ModuleList()
+
+        use_weight_norm = norm_fn is None
 
         for i in range(n_layers):
             conv_layer = ConvNorm(
@@ -179,85 +180,46 @@ def __init__(
                 padding=int((kernel_size - 1) / 2),
                 dilation=1,
                 w_init_gain='relu',
-                use_weight_norm=False,
+                use_weight_norm=use_weight_norm,
                 use_partial_padding=use_partial_padding,
                 norm_fn=norm_fn,
             )
             if norm_fn is not None:
                 print("Applying {} norm to {}".format(norm_fn, conv_layer))
             else:
-                conv_layer = torch.nn.utils.weight_norm(conv_layer.conv)
                 print("Applying weight norm to {}".format(conv_layer))
             self.convolutions.append(conv_layer)
 
         self.dense = None
         if out_dim is not None:
             self.dense = nn.Linear(n_channels, out_dim)
 
-    @torch.jit.export
-    def conv_to_sequence(self, context: Tensor, lens: Tensor, enforce_sorted: bool = False) -> PackedSequence:
-        context_embedded = []
-        bs: int = context.shape[0]
-        b_ind: int = 0
-        for b_ind in range(bs):  # TODO: speed up
-            curr_context = context[b_ind : b_ind + 1, :, : lens[b_ind]].clone()
-            for conv in self.convolutions:
-                curr_context = self.dropout(F.relu(conv(curr_context)))
-            context_embedded.append(curr_context[0].transpose(0, 1))
-        seq = torch.nn.utils.rnn.pack_sequence(context_embedded, enforce_sorted=enforce_sorted)
-        return seq
-
-    @torch.jit.export
-    def conv_to_padded_tensor(self, context: Tensor, lens: Tensor) -> Tensor:
-        context_embedded = []
-        bs: int = context.shape[0]
-        b_ind: int = 0
-        for b_ind in range(bs):  # TODO: speed up
-            curr_context = context[b_ind : b_ind + 1, :, : lens[b_ind]].clone()
-            for conv in self.convolutions:
-                curr_context = self.dropout(F.relu(conv(curr_context)))
-            context_embedded.append(curr_context[0].transpose(0, 1))
-        ret = torch.nn.utils.rnn.pad_sequence(context_embedded, batch_first=True)
-        return ret
-
     @torch.jit.export
     def masked_conv_to_sequence(self, context: Tensor, lens: Tensor, enforce_sorted: bool = False) -> PackedSequence:
         mask = get_mask_from_lengths_and_val(lens, context)
-        mask = mask.unsqueeze(1)
+        mask = mask.to(dtype=context.dtype).unsqueeze(1)
         for conv in self.convolutions:
             context = self.dropout(F.relu(conv(context, mask)))
-            context = torch.mul(context, mask)
 
         context = context.transpose(1, 2)
         seq = torch.nn.utils.rnn.pack_padded_sequence(
             context, lens.long().cpu(), batch_first=True, enforce_sorted=enforce_sorted
         )
         return seq
 
-    def forward(self, context: Tensor, lens: Optional[Tensor] = None) -> Tensor:
-        if lens is None:
-            for conv in self.convolutions:
-                context = self.dropout(F.relu(conv(context)))
-            context = context.transpose(1, 2)
-            context, _ = self.bilstm(context)
-        else:
-            # borisf : does not match ADLR (values, lengths)
-            # seq = self.masked_conv_to_sequence(context, lens, enforce_sorted=False)
-            # borisf : does match ADLR
-            seq = self.conv_to_sequence(context, lens, enforce_sorted=False)
-            context, _ = self.lstm_sequence(seq)
+    def forward(self, context: Tensor, lens: Tensor) -> Tensor:
+        context, lens, unsort_ids = sort_tensor(context, lens)
+        seq = self.masked_conv_to_sequence(context, lens, enforce_sorted=True)
+        context, _ = self.lstm_sequence(seq)
+        context = context[unsort_ids]
 
         if self.dense is not None:
             context = self.dense(context).permute(0, 2, 1)
 
         return context
 
 
-def getRadTTSEncoder(
-    encoder_n_convolutions=3,
-    encoder_embedding_dim=512,
-    encoder_kernel_size=5,
-    norm_fn=nn.BatchNorm1d,
-    lstm_norm_fn=None,
+def get_radtts_encoder(
+    encoder_n_convolutions=3, encoder_embedding_dim=512, encoder_kernel_size=5, norm_fn=MaskedInstanceNorm1d,
 ):
     return ConvLSTMLinear(
         in_dim=encoder_embedding_dim,
@@ -267,15 +229,14 @@ def getRadTTSEncoder(
         p_dropout=0.5,
         use_partial_padding=True,
         norm_fn=norm_fn,
-        lstm_norm_fn=lstm_norm_fn,
     )
 
 
 class Invertible1x1ConvLUS(torch.nn.Module):
     def __init__(self, c):
         super(Invertible1x1ConvLUS, self).__init__()
         # Sample a random orthonormal matrix to initialize weights
-        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+        W, _ = torch.linalg.qr(torch.FloatTensor(c, c).normal_())
         # Ensure determinant is 1.0 not -1.0
         if torch.det(W) < 0:
             W[:, 0] = -1 * W[:, 0]
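The last hunk replaces the deprecated torch.qr with torch.linalg.qr when sampling the orthonormal initialization for the invertible 1x1 convolution. The same initialization as a standalone sketch:

import torch

def random_orthonormal(c: int) -> torch.Tensor:
    # QR of a random Gaussian matrix yields an orthonormal Q;
    # torch.linalg.qr is the replacement for the deprecated torch.qr.
    W, _ = torch.linalg.qr(torch.randn(c, c))
    # Flip one column if needed so det(W) is +1 rather than -1.
    if torch.det(W) < 0:
        W[:, 0] = -1 * W[:, 0]
    return W

W = random_orthonormal(4)
print(torch.allclose(W @ W.T, torch.eye(4), atol=1e-5))  # True: W is orthonormal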