In [1]:
# =============================================================================
# Copyright (c) 2020 NVIDIA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
from functools import partial
from os.path import expanduser, join, abspath, dirname, exists
import tarfile

from ruamel.yaml import YAML

import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.helpers import monitor_asr_train_progress
from nemo.core import NeuralGraph, OperationMode, DeviceType
from nemo.utils import logging
from nemo.utils.app_state import AppState

# Create Neural(Module)Factory, use CPU.
nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)



### Tutorial II: The advanced functionality

In this first part of the Neural Graphs (NGs) tutorial we will focus on a more complex example: training of an End-to-End Convolutional Neural Acoustic Model called JASPER. We will build a "model graph" and show how we can nest it into another graphs, how we can freeze/unfreeze modules, use graph configuration and save/load graph checkpoints.

#### This part covers the following:
 * how to nest one graph into another
 * how to serialize and deserialize a graph
 * how to export and import configuration to/from YAML files
 * how to save and load graph checkpoints
 * how to freeze/unfreeze modules in a graph

In order to learn more about graph nesting and input/output binding please refer to the first part of the tutorial.


In [2]:
# Prepare the samples for training JASPER - we will use the data from NeMo tests.
data_folder = abspath("../../tests/data/")
logging.info("Looking up for test ASR data")
if not exists(join(data_folder, "asr")):
    logging.info("Extracting ASR data to: {0}".format(join(data_folder, "asr")))
    tar = tarfile.open(join(data_folder, "asr.tar.gz"), "r:gz")
    tar.extractall(path=data_folder)
    tar.close()
else:
    logging.info("ASR data found in: {0}".format(join(data_folder, "asr")))

[NeMo I 2020-05-18 15:36:22 <ipython-input-2-a64db28628d1>:3] Looking up for test ASR data
[NeMo I 2020-05-18 15:36:22 <ipython-input-2-a64db28628d1>:10] ASR data found in: /Users/tkornuta/workspace/nemo/tests/data/asr


In [3]:
# Set paths to model configuration, manifest and sample files.
model_config_file = abspath("../asr/configs/jasper_an4.yaml")
manifest_path = join(data_folder, 'asr/tarred_an4/tarred_audio_manifest.json')
tarpath = join(data_folder, 'asr/tarred_an4/audio_0.tar')

# Open the model config file and get vocabulary.
yaml = YAML(typ="safe")
with open(expanduser(model_config_file)) as f:
    config = yaml.load(f)
# Get labels (vocabulary).
vocab = config['labels']
vocab_len = len(vocab)

In [4]:
# Instantiate DataLayer that can load the tarred samples.
data_layer = nemo_asr.TarredAudioToTextDataLayer(
    audio_tar_filepaths=tarpath, manifest_filepath=manifest_path, labels=vocab, batch_size=16)

# Create rest of the modules using the Neural Module deserialization feature.
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.deserialize(config["AudioToMelSpectrogramPreprocessor"])

jasper_encoder = nemo_asr.JasperEncoder.deserialize(config["JasperEncoder"])
jasper_decoder = nemo_asr.JasperDecoderForCTC.deserialize(
    config["JasperDecoderForCTC"], overwrite_params={"num_classes": vocab_len}
)
ctc_loss = nemo_asr.CTCLossNM(num_classes=vocab_len)
greedy_decoder = nemo_asr.GreedyCTCDecoder()

[NeMo I 2020-05-18 15:36:22 collections:154] Dataset loaded with 65 files totalling 0.05 hours
[NeMo I 2020-05-18 15:36:22 collections:155] 0 files were filtered totalling 0.00 hours
[NeMo I 2020-05-18 15:36:22 features:144] PADDING: 16
[NeMo I 2020-05-18 15:36:22 features:152] STFT using conv
[NeMo I 2020-05-18 15:36:22 neural_modules:441] Instantiated a new Neural Module named `audiotomelspectrogrampreprocessor0` of type `AudioToMelSpectrogramPreprocessor`
[NeMo I 2020-05-18 15:36:22 neural_modules:441] Instantiated a new Neural Module named `jasperencoder0` of type `JasperEncoder`
[NeMo I 2020-05-18 15:36:22 neural_modules:441] Instantiated a new Neural Module named `jasperdecoderforctc0` of type `JasperDecoderForCTC`


In [5]:
# Create the Jasper "model" graph.
with NeuralGraph(operation_mode=OperationMode.both, name="jasper") as Jasper:
    # Copy one input port definitions - using "user" port names.
    Jasper.inputs["input"] = data_preprocessor.input_ports["input_signal"]
    # Bind selected inputs - bind other using the default port name.
    i_processed_signal, i_processed_signal_len = data_preprocessor(input_signal=Jasper.inputs["input"], length=Jasper)
    i_encoded, i_encoded_len = jasper_encoder(audio_signal=i_processed_signal, length=i_processed_signal_len)
    i_log_probs = jasper_decoder(encoder_output=i_encoded)
    # Bind selected outputs - using "user" port names.
    Jasper.outputs["log_probs"] = i_log_probs
    Jasper.outputs["encoded_len"] = i_encoded_len

# Print the summary.
logging.info(Jasper.summary())

[NeMo I 2020-05-18 15:36:22 <ipython-input-5-ba7109a0a06a>:14] 
    The `jasper` Neural Graph [OperationMode.both] [NOT COMPLETE]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_type: LengthsType
        * 1.jasperencoder0.outputs->2.jasperdecoderforctc0.encoder_output | axes: (batch, dimension, time);  elements_type: AcousticEncodedRepresentation
     * Graph Inputs (2):
        * input->0.audiotomelspectrogramprep

In [6]:
# Serialize the whole graph.
serialized_jasper = Jasper.serialize()
logging.info("Serialized JasperNet:\n {}".format(serialized_jasper))

[NeMo I 2020-05-18 15:36:22 <ipython-input-6-a7ab0364999d>:3] Serialized JasperNet:
     {'header': {'nemo_core_version': '0.10.2b0', 'full_spec': 'nemo.core.neural_graph.NeuralGraph', 'operation_mode': 'both'}, 'modules': {'audiotomelspectrogrampreprocessor0': {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collection_version': None, 'full_spec': 'nemo.collections.asr.audio_preprocessing.AudioToMelSpectrogramPreprocessor'}, 'init_params': {'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'n_window_size': 320, 'n_window_stride': 160, 'window': 'hann', 'normalize': 'per_feature', 'n_fft': 512, 'preemph': 0.97, 'features': 64, 'lowfreq': 0, 'highfreq': None, 'log': True, 'log_zero_guard_type': 'add', 'log_zero_guard_value': 5.960464477539063e-08, 'dither': 1e-05, 'pad_to': 16, 'frame_splicing': 1, 'stft_conv': True, 'pad_value': 0, 'mag_power': 2.0}}, 'jasperencoder0': {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collect

In [7]:
# You can also serialize a single NeuralModule, e.g. a decoder.
logging.info("Serialized Jasper Decoder:\n {}".format(jasper_decoder.serialize()))

[NeMo I 2020-05-18 15:36:22 <ipython-input-7-ad20a66ca46f>:2] Serialized Jasper Decoder:
     {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collection_version': None, 'full_spec': 'nemo.collections.asr.jasper.JasperDecoderForCTC'}, 'init_params': {'feat_in': 1024, 'num_classes': 28, 'init_mode': 'xavier_uniform'}}


In [8]:
# We can also export the serialized configuration to a file.
Jasper.export_to_config("my_jasper.yml")

[NeMo I 2020-05-18 15:36:22 neural_graph:480] Configuration of graph `jasper` (NeuralGraph) exported to 'my_jasper.yml'


In [9]:
# Display the lists of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-18 15:36:22 <ipython-input-9-2c61c243edb7>:2] 
    Registry of graphs:
     * jasper (3) [OperationMode.both]
[NeMo I 2020-05-18 15:36:22 <ipython-input-9-2c61c243edb7>:3] 
    Registry of modules:
     * tarredaudiototextdatalayer0 (TarredAudioToTextDataLayer)
     * greedyctcdecoder0 (GreedyCTCDecoder)
     * jasperdecoderforctc0 (JasperDecoderForCTC)
     * jasperencoder0 (JasperEncoder)
     * ctclossnm0 (CTCLossNM)
     * audiotomelspectrogrampreprocessor0 (AudioToMelSpectrogramPreprocessor)


In [10]:
# Deserialize graph - create a copy of the JASPER "model".
# Please note that the modules exist, so we must enable the graph to "reuse" them.
# (Commenting out reuse_existing_modules will raise a KeyError.)
jasper_copy = NeuralGraph.deserialize(serialized_jasper, reuse_existing_modules=True)
serialized_jasper_copy = jasper_copy.serialize()
assert serialized_jasper == serialized_jasper_copy # THE SAME! Please note name of the graph is not exported.

[NeMo I 2020-05-18 15:36:22 neural_graph:674] Instantiated a new Neural Graph named `neuralgraph0` with mode `OperationMode.both`


In [11]:
# Alternativelly, import a copy of the JASPER "model" from config.
jasper_copy = NeuralGraph.import_from_config("my_jasper.yml", reuse_existing_modules=True, name="jasper_copy")

# Print the summary.
logging.info(jasper_copy.summary())

# Display list of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-18 15:36:22 neural_graph:601] Loading configuration of a new Neural Graph from the `my_jasper.yml` file
[NeMo I 2020-05-18 15:36:22 neural_graph:674] Instantiated a new Neural Graph named `jasper_copy` with mode `OperationMode.both`
[NeMo I 2020-05-18 15:36:22 <ipython-input-11-24a1dbd84e21>:5] 
    The `jasper_copy` Neural Graph [OperationMode.both] [NOT COMPLETE]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,

In [12]:
# Create the "training" graph.
with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
    # Create the "implicit" training graph.
    o_audio_signal, o_audio_signal_len, o_transcript, o_transcript_len = data_layer()
    # Use Jasper module as any other neural module.
    o_log_probs, o_encoded_len = jasper_copy(input=o_audio_signal, length=o_audio_signal_len)
    o_predictions = greedy_decoder(log_probs=o_log_probs)
    o_loss = ctc_loss(
        log_probs=o_log_probs, targets=o_transcript, input_length=o_encoded_len, target_length=o_transcript_len
    )
    # Set the graph output.
    training_graph.outputs["o_loss"] = o_loss

# Print the summary.
logging.info(training_graph.summary())

[NeMo I 2020-05-18 15:36:22 <ipython-input-12-0a2d620d8e6e>:16] 
    The `neuralgraph0` Neural Graph [OperationMode.training] [COMPLETE]:
     * Modules (6):
        * `tarredaudiototextdatalayer0` (TarredAudioToTextDataLayer)
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
        * `greedyctcdecoder0` (GreedyCTCDecoder)
        * `ctclossnm0` (CTCLossNM)
     * Steps (6):
        0. tarredaudiototextdatalayer0
        1. audiotomelspectrogrampreprocessor0
        2. jasperencoder0
        3. jasperdecoderforctc0
        4. greedyctcdecoder0
        5. ctclossnm0
     * Connections (10):
        * 0.tarredaudiototextdatalayer0.audio_signal->1.audiotomelspectrogrampreprocessor0.input_signal | axes: (batch, time);  elements_type: AudioSignal
        * 0.tarredaudiototextdatalayer0.a_sig_length->1.audiotomelspectrogrampreprocessor0.length | axes: (batch,); 

In [13]:
# Create training callback.
tensors_to_evaluate = [o_loss, o_predictions, o_transcript, o_transcript_len]
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=tensors_to_evaluate, print_func=partial(monitor_asr_train_progress, labels=vocab), step_freq=1
)

# Train the graph.
nf.train(
    training_graph=training_graph,
    optimizer="novograd",
    callbacks=[train_callback],
    optimization_params={"max_steps": 5, "lr": 0.01},
)

[NeMo I 2020-05-18 15:36:22 callbacks:187] Starting .....
[NeMo I 2020-05-18 15:36:22 callbacks:199] Starting epoch 0
[NeMo I 2020-05-18 15:36:28 callbacks:224] Step: 0
[NeMo I 2020-05-18 15:36:28 helpers:72] Loss: 605.7487182617188
[NeMo I 2020-05-18 15:36:28 helpers:73] training_batch_WER:  700.00%
[NeMo I 2020-05-18 15:36:28 helpers:74] Prediction: mlcdjmgcg gp pgdlmem tl e ' 'p depepm m pcp p pgfmpmpcgp'cpgipjgpj
[NeMo I 2020-05-18 15:36:28 helpers:75] Reference: yes
[NeMo I 2020-05-18 15:36:28 callbacks:239] Step time: 5.990310907363892 seconds
[NeMo I 2020-05-18 15:36:28 callbacks:207] Finished epoch 0 in 0:00:06.018388
[NeMo I 2020-05-18 15:36:28 callbacks:199] Starting epoch 1
[NeMo I 2020-05-18 15:36:34 callbacks:224] Step: 1
[NeMo I 2020-05-18 15:36:34 helpers:72] Loss: 495.0736389160156
[NeMo I 2020-05-18 15:36:34 helpers:73] training_batch_WER:  797.65%
[NeMo I 2020-05-18 15:36:34 helpers:74] Prediction: mcdjmgcjlgp pgjlmem tl e ' 'p dep pm m p p p pgmpmpcgp'cxgipjgpj
[NeMo

In [14]:
# Finally, I can save the graph checkpoint!
# Note that optionally you can indicate the names of the modules to be saved.
jasper_copy.save_to("my_jasper.chkpt")#, module_names=["jasperencoder0"])
# Please note only "trainable" modules will be saved.

[NeMo I 2020-05-18 15:36:51 neural_graph:1011] Saved  the 'jasper_copy' graph to a checkpoint `my_jasper.chkpt`:
      * Module 'jasperencoder0' (JasperEncoder) params saved 
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params saved 
    


In [15]:
# In this case saving the whole graph should result in the same checkpoint...
training_graph.export_to_config("my_whole_graph.yml")
training_graph.save_to("my_whole_graph.chkpt")

[NeMo I 2020-05-18 15:36:51 neural_graph:480] Configuration of graph `neuralgraph0` (NeuralGraph) exported to 'my_whole_graph.yml'
[NeMo I 2020-05-18 15:36:51 neural_graph:1011] Saved  the 'neuralgraph0' graph to a checkpoint `my_whole_graph.chkpt`:
      * Module 'jasperencoder0' (JasperEncoder) params saved 
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params saved 
    


In [16]:
# Finally, I can load everything and continue training.
new_training_graph = NeuralGraph.import_from_config("my_whole_graph.yml", reuse_existing_modules=True)

# Let's restore only the encoder
new_training_graph.restore_from("my_whole_graph.chkpt", module_names=["jasperencoder0"])

[NeMo I 2020-05-18 15:36:51 neural_graph:601] Loading configuration of a new Neural Graph from the `my_whole_graph.yml` file
[NeMo I 2020-05-18 15:36:51 neural_graph:674] Instantiated a new Neural Graph named `neuralgraph1` with mode `OperationMode.training`
[NeMo I 2020-05-18 15:36:51 neural_graph:1052] Loading modules constituting the 'neuralgraph0' graph from the `my_whole_graph.chkpt` checkpoint :
      * Module 'jasperencoder0' (JasperEncoder) params loaded
    


In [17]:
# Analogically - create a loss callback.
loss_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[new_training_graph.output_tensors["o_loss"]],
    print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'), step_freq=1
)

In [18]:
# So let us freeze the whole graph...
training_graph.freeze() #we can also freeze a subset, using "module_names=[]""
# ... and finetune only the decoder.
training_graph.unfreeze(module_names=["jasperdecoderforctc0"])

# Ok, let us see what the graph looks like now.
logging.info(training_graph.summary())

[NeMo I 2020-05-18 15:36:51 <ipython-input-18-7b917cb48826>:7] 
    The `neuralgraph0` Neural Graph [OperationMode.training] [COMPLETE]:
     * Modules (6):
        * `tarredaudiototextdatalayer0` (TarredAudioToTextDataLayer)
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder) [FROZEN]
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
        * `greedyctcdecoder0` (GreedyCTCDecoder)
        * `ctclossnm0` (CTCLossNM)
     * Steps (6):
        0. tarredaudiototextdatalayer0
        1. audiotomelspectrogrampreprocessor0
        2. jasperencoder0
        3. jasperdecoderforctc0
        4. greedyctcdecoder0
        5. ctclossnm0
     * Connections (10):
        * 0.tarredaudiototextdatalayer0.audio_signal->1.audiotomelspectrogrampreprocessor0.input_signal | axes: (batch, time);  elements_type: AudioSignal
        * 0.tarredaudiototextdatalayer0.a_sig_length->1.audiotomelspectrogrampreprocessor0.length | axes: (b

In [19]:
# And continue training...
nf.reset_trainer()
nf.train(
    training_graph=new_training_graph,
    optimizer="novograd",
    callbacks=[loss_callback],
    optimization_params={"max_steps": 5, "lr": 0.01},
)
# Please note that this will throw an error if you will freeze all the trainable modules!

[NeMo I 2020-05-18 15:36:51 callbacks:187] Starting .....
[NeMo I 2020-05-18 15:36:51 callbacks:199] Starting epoch 0
[NeMo I 2020-05-18 15:36:54 callbacks:224] Step: 0
[NeMo I 2020-05-18 15:36:54 <ipython-input-17-fe5652d86be1>:4] Train Loss: 98.85807800292969
[NeMo I 2020-05-18 15:36:54 callbacks:239] Step time: 2.189332962036133 seconds
[NeMo I 2020-05-18 15:36:54 callbacks:207] Finished epoch 0 in 0:00:02.208335
[NeMo I 2020-05-18 15:36:54 callbacks:199] Starting epoch 1
[NeMo I 2020-05-18 15:36:56 callbacks:224] Step: 1
[NeMo I 2020-05-18 15:36:56 <ipython-input-17-fe5652d86be1>:4] Train Loss: 96.85870361328125
[NeMo I 2020-05-18 15:36:56 callbacks:239] Step time: 2.239922285079956 seconds
[NeMo I 2020-05-18 15:36:56 callbacks:207] Finished epoch 1 in 0:00:02.273441
[NeMo I 2020-05-18 15:36:56 callbacks:199] Starting epoch 2
[NeMo I 2020-05-18 15:36:58 callbacks:224] Step: 2
[NeMo I 2020-05-18 15:36:58 <ipython-input-17-fe5652d86be1>:4] Train Loss: 93.3438720703125
[NeMo I 2020-05