# Proposed changes

* Drop (empty) multi-backend promise
* Make all Models and Modules PyTorch-compatible out of the box
* Provide training/fine-tuning examples using PyTorch Lightning
* Neural Types are now optional and can do more when used

## 1. NeMo/Jarvis fine-tuning example

Every Jarvis service will have a corresponding script (or several, if it uses more than one model) for fine-tuning its models on user data.

In [2]:
import pytorch_lightning as pl
import nemo.collections.asr as nemo_asr

#### Method 1 - get model and pretrained weights from NVIDIA NGC cloud

In [3]:
# asr_model = nemo_asr.models.ASRConvCTCModel2.from_cloud(name="QuartzNet15x5-En")

#### Method 2 - get model and pretrained weights from local file

In [4]:
# asr_model = nemo_asr.models.ASRConvCTCModel2.restore_from("my_asr.nemo")

#### Method 3 - instantiate model from config (no pre-trained weights)

In [5]:
# asr_model = nemo_asr.models.ASRConvCTCModel2.from_config("quartznet.yaml")

#### Method 4 - instantiate model using constructor

In [6]:
# Method 4: read the model configuration from YAML and pass each section to
# the model constructor explicitly.
from ruamel.yaml import YAML
# typ="safe" selects ruamel's safe loader.
yaml = YAML(typ="safe")
# NOTE(review): hard-coded absolute path on the author's machine -- point this
# at your own copy of the config before running.
with open('/Users/okuchaiev/repos/NeMo/examples/asr/configs/jasper_an4-2.yaml') as f:
    model_config = yaml.load(f)
# Each model component is configured from its own top-level section of the
# loaded config.
asr_model = nemo_asr.models.ASRConvCTCModel2(
    preprocessor_params=model_config['AudioToMelSpectrogramPreprocessor'],
    encoder_params=model_config['JasperEncoder'],
    decoder_params=model_config['JasperDecoder'],
)

[NeMo I 2020-06-17 23:09:44 features:154] PADDING: 16
[NeMo I 2020-06-17 23:09:44 features:162] STFT using conv
[NeMo I 2020-06-17 23:09:44 helpers:66] Instantiated a new Neural Module of type AudioToMelSpectrogramPreprocessor2
[NeMo I 2020-06-17 23:09:44 helpers:66] Instantiated a new Neural Module of type ConvASREncoder
[NeMo I 2020-06-17 23:09:44 helpers:66] Instantiated a new Neural Module of type ConvASRDecoder


In [7]:
# Tell the model where the training and validation data are, using the
# data-layer sections of the config loaded earlier.
asr_model.setup_training_data(model_config['AudioToTextDataLayer'])
asr_model.setup_validation_data(model_config['AudioToTextDataLayer_eval'])
# Set up the optimizer; only the learning rate is passed here.
asr_model.setup_optimizer(optimizer_params={'lr': 0.0003})

[NeMo I 2020-06-17 23:09:45 collections:158] Dataset loaded with 897 files totalling 1.39 hours
[NeMo I 2020-06-17 23:09:45 collections:159] 0 files were filtered totalling 0.00 hours
[NeMo I 2020-06-17 23:09:45 helpers:66] Instantiated a new Neural Module of type AudioToTextDataLayer2
[NeMo I 2020-06-17 23:09:45 collections:158] Dataset loaded with 130 files totalling 0.20 hours
[NeMo I 2020-06-17 23:09:45 collections:159] 0 files were filtered totalling 0.00 hours
[NeMo I 2020-06-17 23:09:45 helpers:66] Instantiated a new Neural Module of type AudioToTextDataLayer2


In [9]:
# Use a PyTorch Lightning Trainer object instead of NeuralModuleFactory.
# Alternative: single GPU, 16-bit precision, 10 epochs.
#trainer = pl.Trainer(val_check_interval=5, amp_level='O1', gpus=1, precision=16, max_epochs=10,)
# Active configuration: CPU training on a MacBook, one epoch.
trainer = pl.Trainer(val_check_interval=5, max_epochs=1)
# Alternative: two GPUs with the 'ddp' distributed backend, 16-bit precision, 50 epochs.
# trainer = pl.Trainer(val_check_interval=5, amp_level='O1', precision=16, gpus=2, max_epochs=50, distributed_backend='ddp')
# The model is handed to the trainer directly; no separate data arguments are passed.
trainer.fit(asr_model)

GPU available: False, used: False
No environment variable for node rank defined. Set as 0.

   | Name                           | Type                               | Params
----------------------------------------------------------------------------------
0  | preprocessor                   | AudioToMelSpectrogramPreprocessor2 | 0     
1  | preprocessor.featurizer        | FilterbankFeatures                 | 0     
2  | preprocessor.featurizer.stft   | STFTPatch                          | 0     
3  | encoder                        | ConvASREncoder                     | 5 M   
4  | encoder.encoder                | Sequential                         | 5 M   
5  | encoder.encoder.0              | JasperBlock                        | 98 K  
6  | encoder.encoder.0.mconv        | ModuleList                         | 90 K  
7  | encoder.encoder.0.mconv.0      | MaskedConv1d                       | 90 K  
8  | encoder.encoder.0.mconv.0.conv | Conv1d                             | 90 K  
9  | 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Detected KeyboardInterrupt, attempting graceful shutdown...





1

### Export for deployment with Jarvis

In [12]:
asr_model.export(out='qn.nemo')

## 2. Pytorch Ecosystem Compatibility

In [13]:
import torch

In [14]:
isinstance(asr_model, torch.nn.Module)

True

In [15]:
isinstance(asr_model.preprocessor, torch.nn.Module)

True

## 3. How to create a new Neural Module?

Inherit from `torch.nn.Module` and `NeuralModuleAPI`, or from `NeuralModulePT` as in the example below.

In [16]:
from nemo.collections.asr.parts.jasper import init_weights
from nemo.core.apis import NeuralModulePT

# Inherit from NeuralModulePT instead of torch.nn.Module
class ConvASRDecoderNOTTYPED(NeuralModulePT):
    """Decoder head without Neural Type declarations.

    Applies a kernel-size-1 ``Conv1d`` that maps ``feat_in`` channels to
    ``num_classes + 1`` channels (the extra one is for the blank character),
    then returns log-softmax scores with the class dimension last.

    Args:
        feat_in: Number of input channels expected by the convolution.
        num_classes: Number of output classes, not counting the blank.
        init_mode: Initialization mode forwarded to ``init_weights``.
        vocabulary: Optional vocabulary object, stored as-is and exposed
            read-only through the ``vocabulary`` property.
    """

    def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None):
        # Zero-argument super(): the previous call named ``ConvASRDecoder``,
        # which is not defined in this cell and raised NameError on
        # instantiation.
        super().__init__()
        self.__vocabulary = vocabulary
        self._feat_in = feat_in
        # Add 1 for blank char
        self._num_classes = num_classes + 1

        self.decoder_layers = torch.nn.Sequential(
            torch.nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True)
        )
        # Must run after decoder_layers is registered so that ``apply`` visits
        # the convolution as well.
        self.apply(lambda x: init_weights(x, mode=init_mode))

    def forward(self, encoder_output):
        """Return log-probabilities for ``encoder_output``.

        Args:
            encoder_output: Input to the ``Conv1d``; assumed to be shaped
                (batch, feat_in, time) -- TODO confirm against the encoder.

        Returns:
            The convolution output with dimensions 1 and 2 swapped and
            log-softmax applied over the last dimension.
        """
        return torch.nn.functional.log_softmax(self.decoder_layers(encoder_output).transpose(1, 2), dim=-1)

    @property
    def vocabulary(self):
        """The vocabulary passed to the constructor (may be ``None``)."""
        return self.__vocabulary

## 4. How to create a new Neural Model?

Instead of importing `NeuralModulePT` from `nemo.core.apis`, import `NeuralModelPT` from `nemo.core.apis`, inherit from it, and implement the methods it requires (the required methods are not listed here yet).

In [None]:
from pytorch_lightning import LightningModule
from nemo.core.apis import NeuralModelAPI
class MyModel(LightningModule, NeuralModelAPI):
    """Skeleton of a new Neural Model.

    Shows only the two base classes to inherit from; the body is
    intentionally empty and adds no behavior of its own.
    """

## 5. How to use Neural Types?

### Step 1.  Define input/output types of your modules/models

In [None]:
from nemo.core import AcousticEncodedRepresentation, LogprobsType, NeuralType
# NOTE(review): NeuralModuleAPI is not imported in the visible cells -- confirm
# it is in scope before running this cell.
class ConvASRDecoderTYPED(torch.nn.Module, NeuralModuleAPI):
    """Decoder head that declares Neural Types for its input and output.

    Same computation as the untyped decoder: a kernel-size-1 ``Conv1d`` from
    ``feat_in`` to ``num_classes + 1`` channels, followed by a transpose and
    log-softmax. The ``input_types`` / ``output_types`` properties describe
    the expected axes and element types.
    """

    def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None):
        super().__init__()
        self.__vocabulary = vocabulary
        self._feat_in = feat_in
        # One extra output class is reserved for the blank character.
        self._num_classes = num_classes + 1

        projection = torch.nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True)
        self.decoder_layers = torch.nn.Sequential(projection)

        def _initialize(module):
            return init_weights(module, mode=init_mode)

        # Visit every registered sub-module (and this module itself).
        self.apply(_initialize)

    def forward(self, encoder_output):
        """Project ``encoder_output`` and return log-softmax over the last axis."""
        projected = self.decoder_layers(encoder_output)
        # Swap dimensions 1 and 2 so the class dimension comes last.
        class_last = projected.transpose(1, 2)
        return torch.nn.functional.log_softmax(class_last, dim=-1)

    @property
    def vocabulary(self):
        """The vocabulary passed to the constructor (may be ``None``)."""
        return self.__vocabulary

    @property
    def input_types(self):
        """Declared input: ``encoder_output`` with axes ('B', 'D', 'T')."""
        encoder_output_type = NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())
        return {"encoder_output": encoder_output_type}

    @property
    def output_types(self):
        """Declared output: ``logprobs`` with axes ('B', 'T', 'D')."""
        logprobs_type = NeuralType(('B', 'T', 'D'), LogprobsType())
        return {"logprobs": logprobs_type}
my_typed_module = ConvASRDecoderTYPED(feat_in=2, num_classes=8)

In [None]:
type(my_typed_module)

In [None]:
isinstance(my_typed_module, torch.nn.Module)

In [None]:
# Run the typed module once on random data with gradient tracking disabled.
with torch.no_grad():
    # Random (2, 2, 2) input; the middle dimension matches feat_in=2 used when
    # my_typed_module was constructed.
    x = torch.rand((2, 2, 2))
    # The keyword matches the "encoder_output" key of the module's input_types.
    y_t = my_typed_module.typed_forward(encoder_output=x)

In [None]:
y_t.neural_type

In [None]:
type(y_t)

#### That's it - now users of your Module will be able to use Type checking!

### Step 2. Use type-checking!

#### Simply use "typed_forward" instead of "forward". It's already there! No need to implement it.

In [None]:
def forward(self, input_signal, input_signal_length):
    """Run the ASR pipeline through the type-checked ``typed_forward`` calls.

    Stages, in order: preprocessor, optional spectrogram augmentation,
    encoder, decoder, then a greedy argmax over the last dimension.

    The untyped equivalent calls each sub-module directly
    (``self.preprocessor(...)``, ``self.spec_augmentation(...)``,
    ``self.encoder(...)``, ``self.decoder(...)``) with the same keyword
    arguments; the typed calls below are the "production-ready" variant.

    Args:
        input_signal: Passed to the preprocessor as ``input_signal``.
        input_signal_length: Passed to the preprocessor as ``length``.

    Returns:
        A tuple ``(log_probs, encoded_len, greedy_predictions)``.
    """
    features, features_len = self.preprocessor.typed_forward(
        input_signal=input_signal, length=input_signal_length,
    )

    # Augmentation is optional; skip it when no augmenter is configured.
    augmenter = self.spec_augmentation
    if augmenter is not None:
        features = augmenter.typed_forward(input_spec=features)

    encoded, encoded_len = self.encoder.typed_forward(audio_signal=features, length=features_len)

    # The decoder must receive the encoder output. Passing the pre-encoder
    # features here instead is presumably the kind of mix-up the type check is
    # meant to catch.
    log_probs = self.decoder.typed_forward(encoder_output=encoded)
    return log_probs, encoded_len, log_probs.argmax(dim=-1, keepdim=False)