In [1]:
# =============================================================================
# Copyright (c) 2020 NVIDIA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
from functools import partial
from os.path import expanduser

from ruamel.yaml import YAML

import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.helpers import monitor_asr_train_progress
from nemo.core import NeuralGraph, OperationMode, DeviceType
from nemo.utils import logging
from nemo.utils.app_state import AppState

# Create Neural(Module)Factory, use CPU.
nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)



### Tutorial II: The advanced functionality

In this first part of the Neural Graphs (NGs) tutorial we will focus on a more complex example: training of an End-to-End Convolutional Neural Acoustic Model called JASPER. We will build a "model graph" and show how we can nest it into another graphs, how we can freeze/unfreeze modules, use graph configuration and save/load graph checkpoints.

#### This part covers the following:
 * how to nest one graph into another
 * how to serialize and deserialize a graph 
 * how to export and import configuration to/from YAML files
 * how to save and load graph checkpoints
 * how to freeze/unfreeze modules in a graph

In order to learn more about graph nesting and input/output binding please refer to the first part of the tutorial.


In [2]:
# Set paths to "manifests" and model configuration files.
train_manifest = "~/TestData/an4_dataset/an4_train.json"
val_manifest = "~/TestData/an4_dataset/an4_val.json"
model_config_file = "~/workspace/nemo/examples/asr/configs/jasper_an4.yaml"

yaml = YAML(typ="safe")
with open(expanduser(model_config_file)) as f:
    config = yaml.load(f)
# Get vocabulary.
vocab = config['labels']

In [3]:
# Create neural modules using the Neural Module deserialization feature.
data_layer = nemo_asr.AudioToTextDataLayer.deserialize(
    config["AudioToTextDataLayer_train"], overwrite_params={"manifest_filepath": train_manifest, "batch_size": 16},
)

data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.deserialize(config["AudioToMelSpectrogramPreprocessor"])

jasper_encoder = nemo_asr.JasperEncoder.deserialize(config["JasperEncoder"])
jasper_decoder = nemo_asr.JasperDecoderForCTC.deserialize(
    config["JasperDecoderForCTC"], overwrite_params={"num_classes": len(vocab)}
)
ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
greedy_decoder = nemo_asr.GreedyCTCDecoder()

[NeMo I 2020-05-15 11:48:33 collections:154] Dataset loaded with 897 files totalling 1.39 hours
[NeMo I 2020-05-15 11:48:33 collections:155] 0 files were filtered totalling 0.00 hours
[NeMo I 2020-05-15 11:48:33 neural_modules:437] Instantiated a new Neural Module named `audiototextdatalayer0` of type `AudioToTextDataLayer`
[NeMo I 2020-05-15 11:48:33 features:144] PADDING: 16
[NeMo I 2020-05-15 11:48:33 features:152] STFT using conv
[NeMo I 2020-05-15 11:48:33 neural_modules:437] Instantiated a new Neural Module named `audiotomelspectrogrampreprocessor0` of type `AudioToMelSpectrogramPreprocessor`
[NeMo I 2020-05-15 11:48:33 neural_modules:437] Instantiated a new Neural Module named `jasperencoder0` of type `JasperEncoder`
[NeMo I 2020-05-15 11:48:33 neural_modules:437] Instantiated a new Neural Module named `jasperdecoderforctc0` of type `JasperDecoderForCTC`


In [4]:
# Create the Jasper "model" graph.
with NeuralGraph(operation_mode=OperationMode.both, name="jasper") as Jasper:
    # Copy one input port definitions - using "user" port names.
    Jasper.inputs["input"] = data_preprocessor.input_ports["input_signal"]
    # Bind selected inputs - bind other using the default port name.
    i_processed_signal, i_processed_signal_len = data_preprocessor(input_signal=Jasper.inputs["input"], length=Jasper)
    i_encoded, i_encoded_len = jasper_encoder(audio_signal=i_processed_signal, length=i_processed_signal_len)
    i_log_probs = jasper_decoder(encoder_output=i_encoded)
    # Bind selected outputs - using "user" port names.
    Jasper.outputs["log_probs"] = i_log_probs
    Jasper.outputs["encoded_len"] = i_encoded_len

# Print the summary.
logging.info(Jasper.summary())

[NeMo I 2020-05-15 11:48:33 <ipython-input-4-ba7109a0a06a>:14] 
    The `jasper` Neural Graph [OperationMode.both]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_type: LengthsType
        * 1.jasperencoder0.outputs->2.jasperdecoderforctc0.encoder_output | axes: (batch, dimension, time);  elements_type: AcousticEncodedRepresentation
     * Graph Inputs (2):
        * input->0.audiotomelspectrogrampreprocessor0.input

In [5]:
# Serialize graph
serialized_jasper = Jasper.serialize()
logging.info("Serialized JasperNet:\n {}".format(serialized_jasper))

[NeMo I 2020-05-15 11:48:33 <ipython-input-5-1b0206c7cbe6>:3] Serialized JasperNet:
     {'header': {'nemo_core_version': '0.10.2b0', 'full_spec': 'nemo.core.neural_graph.NeuralGraph', 'operation_mode': 'both'}, 'modules': {'audiotomelspectrogrampreprocessor0': {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collection_version': None, 'full_spec': 'nemo.collections.asr.audio_preprocessing.AudioToMelSpectrogramPreprocessor'}, 'init_params': {'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'n_window_size': 320, 'n_window_stride': 160, 'window': 'hann', 'normalize': 'per_feature', 'n_fft': 512, 'preemph': 0.97, 'features': 64, 'lowfreq': 0, 'highfreq': None, 'log': True, 'log_zero_guard_type': 'add', 'log_zero_guard_value': 5.960464477539063e-08, 'dither': 1e-05, 'pad_to': 16, 'frame_splicing': 1, 'stft_conv': True, 'pad_value': 0, 'mag_power': 2.0}}, 'jasperencoder0': {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collect

In [6]:
# Serialize decoder.
logging.info("Serialized Jasper Decoder:\n {}".format(jasper_decoder.serialize()))

[NeMo I 2020-05-15 11:48:33 <ipython-input-6-b0ca699f7a0d>:2] Serialized Jasper Decoder:
     {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collection_version': None, 'full_spec': 'nemo.collections.asr.jasper.JasperDecoderForCTC'}, 'init_params': {'feat_in': 1024, 'num_classes': 28, 'init_mode': 'xavier_uniform'}}


In [7]:
# We can also export the serialized configuration to a file.
Jasper.export_to_config("my_jasper.yml")

[NeMo I 2020-05-15 11:48:33 neural_graph:480] Configuration of graph `jasper` (NeuralGraph) exported to 'my_jasper.yml'


In [8]:
# Display the lists of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-15 11:48:33 <ipython-input-8-2c61c243edb7>:2] 
    Registry of graphs:
     * jasper (3) [OperationMode.both]
[NeMo I 2020-05-15 11:48:33 <ipython-input-8-2c61c243edb7>:3] 
    Registry of modules:
     * ctclossnm0 (CTCLossNM)
     * jasperdecoderforctc0 (JasperDecoderForCTC)
     * audiotomelspectrogrampreprocessor0 (AudioToMelSpectrogramPreprocessor)
     * audiototextdatalayer0 (AudioToTextDataLayer)
     * jasperencoder0 (JasperEncoder)
     * greedyctcdecoder0 (GreedyCTCDecoder)


In [9]:
# Delete everything - aside of jasper encoder, just as a test to show that reusing work! ;)
del Jasper
del data_preprocessor
del jasper_encoder #
del jasper_decoder

# In "pure" python - that will remove ALL existing references (bot registries are Dicts with weak references!)

In [10]:
# Display list of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-15 11:48:33 <ipython-input-10-09741a5af73d>:2] 
    Registry of graphs:
     * jasper (3) [OperationMode.both]
[NeMo I 2020-05-15 11:48:33 <ipython-input-10-09741a5af73d>:3] 
    Registry of modules:
     * ctclossnm0 (CTCLossNM)
     * jasperdecoderforctc0 (JasperDecoderForCTC)
     * audiotomelspectrogrampreprocessor0 (AudioToMelSpectrogramPreprocessor)
     * audiototextdatalayer0 (AudioToTextDataLayer)
     * jasperencoder0 (JasperEncoder)
     * greedyctcdecoder0 (GreedyCTCDecoder)


In [12]:
# Deserialize graph - create a copy of the JASPER "model".
# PLease note that the modules exist, so we must enable the graph to "reuse" them.
jasper_copy = NeuralGraph.deserialize(serialized_jasper)#, reuse_existing_modules=True)
serialized_jasper_copy = jasper_copy.serialize()
assert serialized_jasper == serialized_jasper_copy # THE SAME! Please note name of the graph is not exported.

# Print the summary.
logging.info(jasper_copy.summary())

# Display list of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-15 11:48:48 neural_graph:674] Instantiated a new Neural Graph named `neuralgraph1` with mode `OperationMode.both`
[NeMo I 2020-05-15 11:48:48 <ipython-input-12-79f4663a0e1e>:8] 
    The `neuralgraph1` Neural Graph [OperationMode.both]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_type: LengthsType
        * 1.jasperencoder0.outputs->2.jasperdecoderforctc0.encoder_output | axes: (batch, dimension, t

In [13]:
# Alternativelly, import a copy of the JASPER "model" from config.
jasper_copy = NeuralGraph.import_from_config("my_jasper.yml", reuse_existing_modules=True, name="jasper_copy2")

# Print the summary.
logging.info(jasper_copy.summary())

# Display list of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-15 11:48:53 neural_graph:601] Loading configuration of a new Neural Graph from the `my_jasper.yml` file
[NeMo I 2020-05-15 11:48:53 neural_graph:674] Instantiated a new Neural Graph named `jasper_copy2` with mode `OperationMode.both`
[NeMo I 2020-05-15 11:48:53 <ipython-input-13-57035402f623>:5] 
    The `jasper_copy2` Neural Graph [OperationMode.both]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_

In [14]:
# Create the "training" graph.
with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
    # Create the "implicit" training graph.
    o_audio_signal, o_audio_signal_len, o_transcript, o_transcript_len = data_layer()
    # Use Jasper module as any other neural module.
    o_log_probs, o_encoded_len = jasper_copy(input=o_audio_signal, length=o_audio_signal_len)
    o_predictions = greedy_decoder(log_probs=o_log_probs)
    o_loss = ctc_loss(
        log_probs=o_log_probs, targets=o_transcript, input_length=o_encoded_len, target_length=o_transcript_len
    )
    # Set graph output.
    training_graph.outputs["o_loss"] = o_loss
    # training_graph.outputs["o_predictions"] = o_predictions # DOESN'T WORK!

# Print the summary.
logging.info(training_graph.summary())

[NeMo I 2020-05-15 11:48:56 <ipython-input-14-0a2d620d8e6e>:16] 
    The `neuralgraph1` Neural Graph [OperationMode.training]:
     * Modules (6):
        * `audiototextdatalayer0` (AudioToTextDataLayer)
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
        * `greedyctcdecoder0` (GreedyCTCDecoder)
        * `ctclossnm0` (CTCLossNM)
     * Steps (6):
        0. audiototextdatalayer0
        1. audiotomelspectrogrampreprocessor0
        2. jasperencoder0
        3. jasperdecoderforctc0
        4. greedyctcdecoder0
        5. ctclossnm0
     * Connections (10):
        * 0.audiototextdatalayer0.audio_signal->1.audiotomelspectrogrampreprocessor0.input_signal | axes: (batch, time);  elements_type: AudioSignal
        * 0.audiototextdatalayer0.a_sig_length->1.audiotomelspectrogrampreprocessor0.length | axes: (batch,);  elements_type: LengthsType
        * 0.a

In [15]:
# Create training callback.
tensors_to_evaluate = [o_loss, o_predictions, o_transcript, o_transcript_len]
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=tensors_to_evaluate, print_func=partial(monitor_asr_train_progress, labels=vocab), step_freq=1
)

# Train the graph.
nf.train(
    # tensors_to_optimize=[o_loss, o_predictions], # DOESN'T WORK!
    # tensors_to_optimize=[o_loss],
    training_graph=training_graph,
    optimizer="novograd",
    callbacks=[train_callback],
    optimization_params={"max_steps": 5, "lr": 0.01},
)

[NeMo I 2020-05-15 11:48:59 callbacks:187] Starting .....
[NeMo I 2020-05-15 11:48:59 callbacks:199] Starting epoch 0
[NeMo I 2020-05-15 11:49:02 callbacks:224] Step: 0
[NeMo I 2020-05-15 11:49:02 helpers:72] Loss: 696.0162353515625
[NeMo I 2020-05-15 11:49:02 helpers:73] training_batch_WER:  484.00%
[NeMo I 2020-05-15 11:49:02 helpers:74] Prediction:  y zxce amekzkaf'aeapaparpapaaazc' a'ypacp  sa raep ay nzyh pjypmpna'appo aepacyb c a ge'e p p pn
[NeMo I 2020-05-15 11:49:02 helpers:75] Reference: j u l i e
[NeMo I 2020-05-15 11:49:02 callbacks:239] Step time: 3.7683730125427246 seconds
[NeMo I 2020-05-15 11:49:07 callbacks:224] Step: 1
[NeMo I 2020-05-15 11:49:07 helpers:72] Loss: 655.5051879882812
[NeMo I 2020-05-15 11:49:07 helpers:73] training_batch_WER:  568.27%
[NeMo I 2020-05-15 11:49:07 helpers:74] Prediction: x lz ysy m p e y l pl pe pe z myayl lp eayapazqay p ae namamzamz pa s parpa'pyaepaapaeycaxcapwaza zapxapapwazap'rzagaped cpcd znancpzp zn a gse aoac sbgapaparnlcypya eae 

In [16]:
# Finally, I can save the graph checkpoint!
jasper_copy.save_to("my_jasper.chkpt")#, module_names=["jasperencoder0"])
# Please note only "trainable" modules will be saved.

[NeMo I 2020-05-15 11:49:20 neural_graph:1007] Saved  the 'jasper_copy2' graph to a checkpoint `my_jasper.chkpt`:
      * Module 'jasperencoder0' (JasperEncoder) params saved 
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params saved 
    


In [17]:
# In this case saving the whole graph should result in the same checkpoint...
training_graph.export_to_config("my_whole_graph.yml")
training_graph.save_to("my_whole_graph.chkpt")

# BUT !! class GreedyCTCDecoder(TrainableNM) !! so:

[NeMo I 2020-05-15 11:49:20 neural_graph:480] Configuration of graph `neuralgraph1` (NeuralGraph) exported to 'my_whole_graph.yml'
[NeMo I 2020-05-15 11:49:20 neural_graph:1007] Saved  the 'neuralgraph1' graph to a checkpoint `my_whole_graph.chkpt`:
      * Module 'jasperencoder0' (JasperEncoder) params saved 
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params saved 
      * Module 'greedyctcdecoder0' (GreedyCTCDecoder) params saved 
    


In [18]:
# Finally, I can load everything and continue training.
new_training_graph = NeuralGraph.import_from_config("my_whole_graph.yml", reuse_existing_modules=True)

# Let's restore only the encoder
new_training_graph.restore_from("my_whole_graph.chkpt", module_names=["jasperencoder0"])

[NeMo I 2020-05-15 11:49:20 neural_graph:601] Loading configuration of a new Neural Graph from the `my_whole_graph.yml` file
[NeMo I 2020-05-15 11:49:20 neural_graph:674] Instantiated a new Neural Graph named `neuralgraph2` with mode `OperationMode.training`
[NeMo I 2020-05-15 11:49:20 neural_graph:1048] Loading modules constituting the 'neuralgraph1' graph from the `my_whole_graph.chkpt` checkpoint :
      * Module 'jasperencoder0' (JasperEncoder) params loaded
    


In [19]:
# Or maybe not...
# Let's restore only the encoder
new_training_graph.restore_from("my_whole_graph.chkpt")

[NeMo I 2020-05-15 11:49:20 neural_graph:1048] Loading modules constituting the 'neuralgraph1' graph from the `my_whole_graph.chkpt` checkpoint :
      * Module 'jasperencoder0' (JasperEncoder) params loaded
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params loaded
      * Module 'greedyctcdecoder0' (GreedyCTCDecoder) params loaded
    


In [20]:
# Analogically - create a loss callback.
loss_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[new_training_graph.output_tensors["o_loss"]],
    print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'), step_freq=1
)

In [23]:
# And  what will happen if we will freeze our graph?
training_graph.freeze() #we can also freeze a subset, using "module_names=[]""
# Let us finetune only the decoder.
#training_graph.unfreeze(module_names=["jasperdecoderforctc0"])

# Ok, let us see what the graph looks like now.
logging.info(training_graph.summary())

[NeMo I 2020-05-15 11:49:51 <ipython-input-23-f4cf4031dfa3>:7] 
    The `neuralgraph1` Neural Graph [OperationMode.training]:
     * Modules (6):
        * `audiototextdatalayer0` (AudioToTextDataLayer)
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder) [FROZEN]
        * `jasperdecoderforctc0` (JasperDecoderForCTC) [FROZEN]
        * `greedyctcdecoder0` (GreedyCTCDecoder) [FROZEN]
        * `ctclossnm0` (CTCLossNM)
     * Steps (6):
        0. audiototextdatalayer0
        1. audiotomelspectrogrampreprocessor0
        2. jasperencoder0
        3. jasperdecoderforctc0
        4. greedyctcdecoder0
        5. ctclossnm0
     * Connections (10):
        * 0.audiototextdatalayer0.audio_signal->1.audiotomelspectrogrampreprocessor0.input_signal | axes: (batch, time);  elements_type: AudioSignal
        * 0.audiototextdatalayer0.a_sig_length->1.audiotomelspectrogrampreprocessor0.length | axes: (batch,);  elements_type:

In [24]:
# And continue training...
nf.reset_trainer() # I do not understand why do I have to "reset the trainer" when calling train() function again :]
nf.train(
    training_graph=new_training_graph,
    optimizer="novograd",
    callbacks=[loss_callback],
    optimization_params={"max_steps": 5, "lr": 0.01},
)

# This will throw an error as all trainable modules are frozen!

[NeMo I 2020-05-15 11:49:53 callbacks:187] Starting .....
[NeMo I 2020-05-15 11:49:53 callbacks:199] Starting epoch 0


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

# Neural Graph plans and extensions

## 1. Long-term goal: "training with graphs" (since November 2019;])

### training with training/evaluation graphs
 * train(training_graph=graph1, evaluation_graph=graph2 [OPTIONAL], ...)

### Expanded: training with callbacks 
 * train(training_graph=graph1, training_callbacks=callbacks1 [OPTIONAL], evaluation_graph=graph2 [OPTIONAL], evaluation_callbacks=callbacks2 [OPTIONAL], ...)

### Inference/evaluation
 * infer(evaluation_graph=graph2, ...)

### Expanded: inference with callbacks 
 * infer(evaluation_graph=graph2, evaluation_callbacks=callbacks2 [OPTIONAL], ...)
 

## 2. "Other main" graph actions

 * inputs/outputs binding [DONE]
 * graph nesting [DONE]
 * import_from_config()/export_to_config() [DONE]
 * serialize()/deserialize() [DONE]
 * save_to()/restore_from() [DONE]
 
 
## 2. "Partial" graph actions
### (will be used in the "main actions", but also could be called by the user directly)

 * freeze()/unfreeze() [DONE]
 * is_valid()
 * to(device)
 * graph nesting "with duplication" (@duplicate)
 * get_batch() -> batch
 * forward(batch) # Evelina's "infer with user input" (Complete Dialog Pipeline)
 * backward() (?)
