In [1]:
# =============================================================================
# Copyright (c) 2020 NVIDIA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
from functools import partial
from os.path import expanduser

from ruamel.yaml import YAML

import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.helpers import monitor_asr_train_progress
from nemo.core import NeuralGraph, OperationMode, DeviceType
from nemo.utils import logging
from nemo.utils.app_state import AppState

# Neural module factory, placed on the CPU; it is used further down to run training (`nf.train`).
nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)

# Announce the purpose of this notebook.
logging.info(
    "This example shows how one can build, train, serialize and save a Jasper model using NeMo Neural Graphs."
)

[NeMo I 2020-05-15 07:42:11 <ipython-input-1-cac3d03954de>:31] This example shows how one can build, train, serialize and save a Jasper model using NeMo Neural Graphs.




In [2]:
# Locations of the dataset "manifests" and of the model configuration file.
train_manifest = "~/TestData/an4_dataset/an4_train.json"
val_manifest = "~/TestData/an4_dataset/an4_val.json"
model_config_file = "~/workspace/nemo/examples/asr/configs/jasper_an4.yaml"

# Parse the model configuration with the "safe" YAML loader ("~" is expanded first).
yaml = YAML(typ="safe")
config_path = expanduser(model_config_file)
with open(config_path) as f:
    config = yaml.load(f)

# The vocabulary is stored under the "labels" key of the configuration.
vocab = config["labels"]

In [3]:
# Build the individual neural modules. Most of them are deserialized from the matching
# section of the configuration, optionally overriding selected parameters.
data_layer_overrides = {"manifest_filepath": train_manifest, "batch_size": 16}
data_layer = nemo_asr.AudioToTextDataLayer.deserialize(
    config["AudioToTextDataLayer_train"], overwrite_params=data_layer_overrides
)

data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.deserialize(
    config["AudioToMelSpectrogramPreprocessor"]
)

# The number of classes of the decoder and of the loss follows the vocabulary size.
num_classes = len(vocab)
jasper_encoder = nemo_asr.JasperEncoder.deserialize(config["JasperEncoder"])
jasper_decoder = nemo_asr.JasperDecoderForCTC.deserialize(
    config["JasperDecoderForCTC"], overwrite_params={"num_classes": num_classes}
)

# The loss and the greedy decoder are constructed directly.
ctc_loss = nemo_asr.CTCLossNM(num_classes=num_classes)
greedy_decoder = nemo_asr.GreedyCTCDecoder()

[NeMo I 2020-05-15 07:42:15 collections:154] Dataset loaded with 897 files totalling 1.39 hours
[NeMo I 2020-05-15 07:42:15 collections:155] 0 files were filtered totalling 0.00 hours
[NeMo I 2020-05-15 07:42:15 neural_modules:437] Instantiated a new Neural Module named `audiototextdatalayer0` of type `AudioToTextDataLayer`
[NeMo I 2020-05-15 07:42:15 features:144] PADDING: 16
[NeMo I 2020-05-15 07:42:15 features:152] STFT using conv
[NeMo I 2020-05-15 07:42:15 neural_modules:437] Instantiated a new Neural Module named `audiotomelspectrogrampreprocessor0` of type `AudioToMelSpectrogramPreprocessor`
[NeMo I 2020-05-15 07:42:15 neural_modules:437] Instantiated a new Neural Module named `jasperencoder0` of type `JasperEncoder`
[NeMo I 2020-05-15 07:42:15 neural_modules:437] Instantiated a new Neural Module named `jasperdecoderforctc0` of type `JasperDecoderForCTC`


In [4]:
# Create the Jasper "model" graph out of the preprocessor, encoder and decoder modules.
with NeuralGraph(operation_mode=OperationMode.both, name="jasper") as Jasper:
    # Copy a single input port definition and expose it under the "user" port name "input".
    Jasper.inputs["input"] = data_preprocessor.input_ports["input_signal"]
    # Bind the selected input explicitly; passing the graph itself (`length=Jasper`)
    # binds the other input using the default port name.
    i_processed_signal, i_processed_signal_len = data_preprocessor(input_signal=Jasper.inputs["input"], length=Jasper)
    i_encoded, i_encoded_len = jasper_encoder(audio_signal=i_processed_signal, length=i_processed_signal_len)
    i_log_probs = jasper_decoder(encoder_output=i_encoded)
    # Bind selected outputs - using "user" port names.
    Jasper.outputs["log_probs"] = i_log_probs
    Jasper.outputs["encoded_len"] = i_encoded_len

# Print the summary.
logging.info(Jasper.summary())

[NeMo I 2020-05-15 07:42:43 <ipython-input-4-7991f6f730bf>:14] 
    The `neuralgraph0` Neural Graph [OperationMode.both]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_type: LengthsType
        * 1.jasperencoder0.outputs->2.jasperdecoderforctc0.encoder_output | axes: (batch, dimension, time);  elements_type: AcousticEncodedRepresentation
     * Graph Inputs (2):
        * input->0.audiotomelspectrogrampreprocessor0

In [5]:
# Serialize the graph and log the result; `serialized_jasper` is reused later
# in this notebook to deserialize a copy of the graph.
serialized_jasper = Jasper.serialize()
logging.info("Serialized JasperNet:\n {}".format(serialized_jasper))

[NeMo I 2020-05-15 07:42:48 <ipython-input-5-1b0206c7cbe6>:3] Serialized JasperNet:
     {'header': {'nemo_core_version': '0.10.2b0', 'full_spec': 'nemo.core.neural_graph.NeuralGraph', 'operation_mode': 'both'}, 'modules': {'audiotomelspectrogrampreprocessor0': {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collection_version': None, 'full_spec': 'nemo.collections.asr.audio_preprocessing.AudioToMelSpectrogramPreprocessor'}, 'init_params': {'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'n_window_size': 320, 'n_window_stride': 160, 'window': 'hann', 'normalize': 'per_feature', 'n_fft': 512, 'preemph': 0.97, 'features': 64, 'lowfreq': 0, 'highfreq': None, 'log': True, 'log_zero_guard_type': 'add', 'log_zero_guard_value': 5.960464477539063e-08, 'dither': 1e-05, 'pad_to': 16, 'frame_splicing': 1, 'stft_conv': True, 'pad_value': 0, 'mag_power': 2.0}}, 'jasperencoder0': {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collect

In [6]:
# A single module can be serialized as well - here the decoder.
serialized_decoder = jasper_decoder.serialize()
logging.info("Serialized Jasper Decoder:\n {}".format(serialized_decoder))

[NeMo I 2020-05-15 07:42:55 <ipython-input-6-b0ca699f7a0d>:2] Serialized Jasper Decoder:
     {'header': {'nemo_core_version': '0.10.2b0', 'collection_type': 'asr', 'collection_version': None, 'full_spec': 'nemo.collections.asr.jasper.JasperDecoderForCTC'}, 'init_params': {'feat_in': 1024, 'num_classes': 28, 'init_mode': 'xavier_uniform'}}


In [7]:
# We can also export the serialized configuration to a file
# (it is imported back from "my_jasper.yml" later in this notebook).
Jasper.export_to_config("my_jasper.yml")

[NeMo I 2020-05-15 07:44:26 neural_graph:480] Configuration of graph `neuralgraph0` (NeuralGraph) exported to 'my_jasper.yml'


In [8]:
# Log the content of both global registries: graphs first, then modules.
for registry in (AppState().graphs, AppState().modules):
    logging.info(registry.summary())

[NeMo I 2020-05-15 07:44:41 <ipython-input-8-2c61c243edb7>:2] 
    Registry of graphs:
     * neuralgraph0 (3) [OperationMode.both]
[NeMo I 2020-05-15 07:44:41 <ipython-input-8-2c61c243edb7>:3] 
    Registry of modules:
     * audiotomelspectrogrampreprocessor0 (AudioToMelSpectrogramPreprocessor)
     * audiototextdatalayer0 (AudioToTextDataLayer)
     * greedyctcdecoder0 (GreedyCTCDecoder)
     * jasperencoder0 (JasperEncoder)
     * jasperdecoderforctc0 (JasperDecoderForCTC)
     * ctclossnm0 (CTCLossNM)


In [9]:
# Delete the graph and its modules, just as a test to show that reusing works! ;)
# NOTE(review): this comment originally said the Jasper encoder would be kept, yet
# `jasper_encoder` is deleted below as well - confirm which of the two is intended.
del Jasper
del data_preprocessor
del jasper_encoder
del jasper_decoder

# In "pure" python - that will remove ALL existing references (both registries are Dicts with weak references!)

In [10]:
# Log the content of both global registries again: graphs first, then modules.
for registry in (AppState().graphs, AppState().modules):
    logging.info(registry.summary())

[NeMo I 2020-05-15 07:45:28 <ipython-input-10-09741a5af73d>:2] 
    Registry of graphs:
     * neuralgraph0 (3) [OperationMode.both]
[NeMo I 2020-05-15 07:45:28 <ipython-input-10-09741a5af73d>:3] 
    Registry of modules:
     * audiotomelspectrogrampreprocessor0 (AudioToMelSpectrogramPreprocessor)
     * audiototextdatalayer0 (AudioToTextDataLayer)
     * greedyctcdecoder0 (GreedyCTCDecoder)
     * jasperencoder0 (JasperEncoder)
     * jasperdecoderforctc0 (JasperDecoderForCTC)
     * ctclossnm0 (CTCLossNM)


In [13]:
# Deserialize graph - create a copy of the JASPER "model".
# (The author left an extra `reuse_existing_modules=True` argument commented out on this call.)
jasper_copy = NeuralGraph.deserialize(serialized_jasper)
serialized_jasper_copy = jasper_copy.serialize()
# Both serializations must be THE SAME! Please note that the name of the graph is not exported.
assert serialized_jasper == serialized_jasper_copy

# Print the summary.
logging.info(jasper_copy.summary())

# Display list of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-15 07:46:25 neural_graph:674] Instantiated a new Neural Graph named `neuralgraph3` with mode `OperationMode.both`
[NeMo I 2020-05-15 07:46:25 <ipython-input-13-0105a48a2ff2>:7] 
    The `neuralgraph3` Neural Graph [OperationMode.both]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_type: LengthsType
        * 1.jasperencoder0.outputs->2.jasperdecoderforctc0.encoder_output | axes: (batch, dimension, t

In [14]:
# Alternatively, import a copy of the JASPER "model" from the exported config file.
# Note that this rebinds `jasper_copy` to the newly imported graph named "jasper_copy2".
jasper_copy = NeuralGraph.import_from_config("my_jasper.yml", reuse_existing_modules=True, name="jasper_copy2")

# Print the summary.
logging.info(jasper_copy.summary())

# Display list of graph and modules
logging.info(AppState().graphs.summary())
logging.info(AppState().modules.summary())

[NeMo I 2020-05-15 07:47:35 neural_graph:601] Loading configuration of a new Neural Graph from the `my_jasper.yml` file
[NeMo I 2020-05-15 07:47:35 neural_graph:674] Instantiated a new Neural Graph named `jasper_copy2` with mode `OperationMode.both`
[NeMo I 2020-05-15 07:47:35 <ipython-input-14-57035402f623>:5] 
    The `jasper_copy2` Neural Graph [OperationMode.both]:
     * Modules (3):
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
     * Steps (3):
        0. audiotomelspectrogrampreprocessor0
        1. jasperencoder0
        2. jasperdecoderforctc0
     * Connections (3):
        * 0.audiotomelspectrogrampreprocessor0.processed_signal->1.jasperencoder0.audio_signal | axes: (batch, dimension, time);  elements_type: MelSpectrogramType
        * 0.audiotomelspectrogrampreprocessor0.processed_length->1.jasperencoder0.length | axes: (batch,);  elements_

In [15]:
# Create the "training" graph: data layer -> Jasper graph -> greedy decoder and CTC loss.
with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
    # Create the "implicit" training graph, starting from the outputs of the data layer.
    o_audio_signal, o_audio_signal_len, o_transcript, o_transcript_len = data_layer()
    # Use the Jasper graph just like any other neural module.
    o_log_probs, o_encoded_len = jasper_copy(input=o_audio_signal, length=o_audio_signal_len)
    o_predictions = greedy_decoder(log_probs=o_log_probs)
    o_loss = ctc_loss(
        log_probs=o_log_probs, targets=o_transcript, input_length=o_encoded_len, target_length=o_transcript_len
    )
    # Set graph output - only the loss is exposed.
    training_graph.outputs["o_loss"] = o_loss
    # Exposing the predictions as a second graph output was reported by the author as not working:
    # training_graph.outputs["o_predictions"] = o_predictions # DOESN'T WORK!

# Print the summary.
logging.info(training_graph.summary())

[NeMo I 2020-05-15 07:48:35 <ipython-input-15-ff8cdae70ffc>:16] 
    The `neuralgraph1` Neural Graph [OperationMode.both]:
     * Modules (6):
        * `audiototextdatalayer0` (AudioToTextDataLayer)
        * `audiotomelspectrogrampreprocessor0` (AudioToMelSpectrogramPreprocessor)
        * `jasperencoder0` (JasperEncoder)
        * `jasperdecoderforctc0` (JasperDecoderForCTC)
        * `greedyctcdecoder0` (GreedyCTCDecoder)
        * `ctclossnm0` (CTCLossNM)
     * Steps (6):
        0. audiototextdatalayer0
        1. audiotomelspectrogrampreprocessor0
        2. jasperencoder0
        3. jasperdecoderforctc0
        4. greedyctcdecoder0
        5. ctclossnm0
     * Connections (10):
        * 0.audiototextdatalayer0.audio_signal->1.audiotomelspectrogrampreprocessor0.input_signal | axes: (batch, time);  elements_type: AudioSignal
        * 0.audiototextdatalayer0.a_sig_length->1.audiotomelspectrogrampreprocessor0.length | axes: (batch,);  elements_type: LengthsType
        * 0.audio

In [17]:
# Tensors handed to the training callback, which reports the progress at every step.
tensors_to_evaluate = [o_loss, o_predictions, o_transcript, o_transcript_len]
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=tensors_to_evaluate,
    print_func=partial(monitor_asr_train_progress, labels=vocab),
    step_freq=1,
)

# Run a short training session on the whole training graph.
# Author's note: passing `tensors_to_optimize=[o_loss, o_predictions]` DOESN'T WORK;
# `tensors_to_optimize=[o_loss]` was the other variant that was tried.
nf.train(
    training_graph=training_graph,
    optimizer="novograd",
    callbacks=[train_callback],
    optimization_params={"max_steps": 5, "lr": 0.01},
)


[NeMo I 2020-05-15 07:49:21 callbacks:187] Starting .....
[NeMo I 2020-05-15 07:49:21 callbacks:199] Starting epoch 0
[NeMo I 2020-05-15 07:49:26 callbacks:224] Step: 0
[NeMo I 2020-05-15 07:49:26 helpers:72] Loss: 740.5501098632812
[NeMo I 2020-05-15 07:49:26 helpers:73] training_batch_WER:  102.94%
[NeMo I 2020-05-15 07:49:26 helpers:74] Prediction: qkrkperprskrqkeqerxrhrurkrqrnrbsrf'bkpelrybururqkqeqrekbrork'esehroqoroqrpe'yroqklklgwondonrnrnrbuyb'qgrqeqrkrk'r'roho'koblboefohxeqeoq'b'orqblbr'olerbr'ghbkrbobob'kqrbsrheqeq'oehkwr'vxrxhrhqhrbrbrqrbuoeqbnb'qbfr jkyek'krkrqnrkl
[NeMo I 2020-05-15 07:49:26 helpers:75] Reference: enter six one two five
[NeMo I 2020-05-15 07:49:26 callbacks:239] Step time: 5.018915891647339 seconds
[NeMo I 2020-05-15 07:49:31 callbacks:224] Step: 1
[NeMo I 2020-05-15 07:49:31 helpers:72] Loss: 671.6516723632812
[NeMo I 2020-05-15 07:49:31 helpers:73] training_batch_WER:  109.18%
[NeMo I 2020-05-15 07:49:31 helpers:74] Prediction: 'rkckrerpel emxkre'k'e'r'rq

In [24]:
# Finally, I can save the graph checkpoint!
# (The author left an optional `module_names=["jasperencoder0"]` argument commented out on this call.)
jasper_copy.save_to("my_jasper.chkpt")
# Please note only "trainable" modules will be saved.

[NeMo I 2020-05-15 07:52:12 neural_graph:1004] Saved  the 'jasper_copy2' graph to a checkpoint `my_jasper.chkpt`:
      * Module 'jasperencoder0' (JasperEncoder) params saved 
    


In [19]:
# Export the configuration of the whole training graph and save its checkpoint.
# In this case saving the whole graph should result in the same checkpoint...
training_graph.export_to_config("my_whole_graph.yml")
training_graph.save_to("my_whole_graph.chkpt")

# BUT !! class GreedyCTCDecoder(TrainableNM) !! so the greedy decoder is saved as well.

[NeMo I 2020-05-15 07:50:28 neural_graph:480] Configuration of graph `neuralgraph1` (NeuralGraph) exported to 'my_whole_graph.yml'
[NeMo I 2020-05-15 07:50:28 neural_graph:1004] Saved  the 'neuralgraph1' graph to a checkpoint `my_whole_graph.chkpt`:
      * Module 'jasperencoder0' (JasperEncoder) params saved 
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params saved 
      * Module 'greedyctcdecoder0' (GreedyCTCDecoder) params saved 
    


In [None]:
# The greedy decoder actually has 0 (!) trainable parameters and, moreover, this is its
# `forward` method. It is reproduced as a stand-alone, top-level function so that the cell
# is valid Python (an indented `def` at cell level raises an IndentationError).
def forward(self, log_probs):
    """Return the indices of the largest values along the last axis of `log_probs`.

    Copy of `GreedyCTCDecoder.forward`, shown for illustration only.

    Args:
        self: unused; kept only to mirror the signature of the original method.
        log_probs: tensor of scores; the argmax is taken over its last dimension.

    Returns:
        Tensor of indices with the last dimension of `log_probs` removed.
    """
    import torch  # local import: torch is not imported at the top of this notebook

    with torch.no_grad():  # !!!! the whole computation runs with gradients disabled
        argmx = log_probs.argmax(dim=-1, keepdim=False)
        return argmx


# BTW. This also raises the question of using no_grad()
# in (NonTrainable) modules that are NOT TERMINAL NODES of the graph
# (requires_grad = False VS torch.no_grad())

In [20]:
# Finally, I can load everything and continue training.
# Re-create the training graph from the exported configuration file, reusing existing modules.
new_training_graph = NeuralGraph.import_from_config("my_whole_graph.yml", reuse_existing_modules=True)

# Let's restore only the encoder (selected by its module name).
new_training_graph.restore_from("my_whole_graph.chkpt", module_names=["jasperencoder0"])



[NeMo I 2020-05-15 07:50:38 neural_graph:601] Loading configuration of a new Neural Graph from the `my_whole_graph.yml` file
[NeMo I 2020-05-15 07:50:38 neural_graph:674] Instantiated a new Neural Graph named `neuralgraph3` with mode `OperationMode.both`
[NeMo I 2020-05-15 07:50:38 neural_graph:1045] Loading modules constituting the 'neuralgraph1' graph from the `my_whole_graph.chkpt` checkpoint :
      * Module 'jasperencoder0' (JasperEncoder) params loaded
    


In [21]:
# Or maybe not...
# Without `module_names`, restore all modules stored in the checkpoint (not only the encoder).
new_training_graph.restore_from("my_whole_graph.chkpt")


[NeMo I 2020-05-15 07:50:42 neural_graph:1045] Loading modules constituting the 'neuralgraph1' graph from the `my_whole_graph.chkpt` checkpoint :
      * Module 'jasperencoder0' (JasperEncoder) params loaded
      * Module 'jasperdecoderforctc0' (JasperDecoderForCTC) params loaded
      * Module 'greedyctcdecoder0' (GreedyCTCDecoder) params loaded
    


In [22]:
# Create the loss callback: it logs the loss of the re-imported training graph at every step.
def _log_train_loss(x):
    """Log the scalar value of the first evaluated tensor as the training loss."""
    logging.info(f'Train Loss: {str(x[0].item())}')


loss_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[new_training_graph.output_tensors["o_loss"]],
    print_func=_log_train_loss,
    step_freq=1,
)

# And continue training... The trainer is reset first (the author noted that it is
# unclear why resetting the trainer is required here).
nf.reset_trainer()
nf.train(
    training_graph=new_training_graph,
    optimizer="novograd",
    callbacks=[loss_callback],
    optimization_params={"max_steps": 5, "lr": 0.01},
)


[NeMo I 2020-05-15 07:50:48 callbacks:187] Starting .....
[NeMo I 2020-05-15 07:50:48 callbacks:199] Starting epoch 0
[NeMo I 2020-05-15 07:50:51 callbacks:224] Step: 0
[NeMo I 2020-05-15 07:50:51 <ipython-input-22-c3c0e972cbe6>:4] Train Loss: 404.2879333496094
[NeMo I 2020-05-15 07:50:51 callbacks:239] Step time: 2.6264939308166504 seconds
[NeMo I 2020-05-15 07:50:55 callbacks:224] Step: 1
[NeMo I 2020-05-15 07:50:55 <ipython-input-22-c3c0e972cbe6>:4] Train Loss: 596.5692138671875
[NeMo I 2020-05-15 07:50:55 callbacks:239] Step time: 4.829496145248413 seconds
[NeMo I 2020-05-15 07:50:59 callbacks:224] Step: 2
[NeMo I 2020-05-15 07:50:59 <ipython-input-22-c3c0e972cbe6>:4] Train Loss: 452.711181640625
[NeMo I 2020-05-15 07:50:59 callbacks:239] Step time: 3.9349939823150635 seconds
[NeMo I 2020-05-15 07:51:05 callbacks:224] Step: 3
[NeMo I 2020-05-15 07:51:05 <ipython-input-22-c3c0e972cbe6>:4] Train Loss: 507.37823486328125
[NeMo I 2020-05-15 07:51:05 callbacks:239] Step time: 5.12150001