In [1]:
# Copyright 2020 NVIDIA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
import argparse
import math
import os
from functools import partial

from ruamel.yaml import YAML

import nemo
import nemo.collections.asr as nemo_asr
import nemo.collections.tts as nemo_tts
import nemo.utils.argparse as nm_argparse
from nemo.collections.tts import (
    tacotron2_eval_log_to_tb_func,
    tacotron2_log_to_tb_func,
    tacotron2_process_eval_batch,
    tacotron2_process_final_eval,
)

logging = nemo.logging

In [16]:
# Download config files
config_path = '../configs/tacotron2.yaml'


yaml = YAML(typ="safe")
with open(config_path) as file:
    tacotron2_config = yaml.load(file)
    labels = tacotron2_config["labels"]

In [19]:
def create_NMs(tacotron2_config, labels, decoder_infer=False, decoder_force=False):
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **tacotron2_config['AudioToMelSpectrogramPreprocessor']['init_params']
    )
    text_embedding = nemo_tts.TextEmbedding.import_from_config(
        tacotron2_config_file, "TextEmbedding", overwrite_params={"n_symbols": len(labels) + 3}
    )
    t2_enc = nemo_tts.Tacotron2Encoder.import_from_config(tacotron2_config_file, "Tacotron2Encoder")
    if decoder_infer:
        t2_dec = nemo_tts.Tacotron2DecoderInfer.import_from_config(tacotron2_config_file, "Tacotron2DecoderInfer")
    else:
        t2_dec = nemo_tts.Tacotron2Decoder.import_from_config(
            tacotron2_config_file, "Tacotron2Decoder", overwrite_params={"force": decoder_force}
        )
    t2_postnet = nemo_tts.Tacotron2Postnet.import_from_config(tacotron2_config_file, "Tacotron2Postnet")
    t2_loss = nemo_tts.Tacotron2Loss.import_from_config(tacotron2_config_file, "Tacotron2Loss")
    makegatetarget = nemo_tts.MakeGate()

    total_weights = text_embedding.num_weights + t2_enc.num_weights + t2_dec.num_weights + t2_postnet.num_weights

    logging.info('================================')
    logging.info(f"Total number of parameters: {total_weights}")
    logging.info('================================')

    return (
        data_preprocessor,
        text_embedding,
        t2_enc,
        t2_dec,
        t2_postnet,
        t2_loss,
        makegatetarget,
    )

In [20]:
neural_modules = create_NMs(tacotron2_config, labels, decoder_infer=True);

AttributeError: 'NoneType' object has no attribute 'placement'

# Create inference dags

In [7]:
(_, text_embedding, t2_enc, t2_dec, t2_postnet, _, _) = neural_modules

data_layer = nemo_asr.TranscriptDataLayer(
    path=infer_dataset,
    labels=labels,
    batch_size=infer_batch_size,
    num_workers=cpu_per_dl,
    # load_audio=False,
    bos_id=len(labels),
    eos_id=len(labels) + 1,
    pad_id=len(labels) + 2,
    shuffle=False,
)
transcript, transcript_len = data_layer()

transcript_embedded = text_embedding(char_phone=transcript)

transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,)

mel_decoder, gate, alignments, mel_len = t2_dec(
    char_phone_encoded=transcript_encoded, encoded_length=transcript_len,
)

mel_postnet = t2_postnet(mel_input=mel_decoder)

[0;31mInit signature:[0m
[0mnemo_tts[0m[0;34m.[0m[0mTacotron2Decoder[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_mel_channels[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_frames_per_step[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoder_embedding_dim[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m512[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgate_threshold[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprenet_dim[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m256[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_decoder_steps[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecoder_rnn_dim[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1024[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mp_decoder_dropout[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;