In [None]:
# NeMo's "core" package
import nemo
# NeMo's ASR collection: neural modules for speech recognition
import nemo_asr

In [None]:
from ruamel.yaml import YAML

# JSON manifest describing the data to run inference on.
# Replace the placeholder with a real path before running.
inference_manifest = "<path_to_json_manifest>"

# The Jasper model definition is read from a YAML config file.
# This notebook uses the much larger 15x5 model instead of the 12x1 one.
yaml = YAML(typ="safe")
with open("<nemo_git_root>/examples/asr/configs/jasper15x5SEP.yaml") as config_file:
    jasper_model_definition = yaml.load(config_file)

# Label set stored under the config's 'labels' key; later cells pass it to the
# data layer and the decoders.
labels = jasper_model_definition["labels"]

In [None]:
# Build the neural modules that make up the inference pipeline.
data_layer = nemo_asr.AudioToTextDataLayer(
    shuffle=False,
    manifest_filepath=inference_manifest,
    labels=labels,
    batch_size=64,
)
data_preprocessor = nemo_asr.AudioPreprocessing()
jasper_encoder = nemo_asr.JasperEncoder(
    feat_in=64,
    **jasper_model_definition['JasperEncoder']
)
jasper_decoder = nemo_asr.JasperDecoderForCTC(
    feat_in=1024,
    num_classes=len(labels),
)
greedy_decoder = nemo_asr.GreedyCTCDecoder()

# Wire the modules together into the inference DAG:
# data layer -> preprocessor -> encoder -> CTC decoder -> greedy decoder.
audio_signal, audio_signal_len, transcripts, transcripts_len = data_layer()
processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal,
    length=audio_signal_len,
)
encoded, encoded_len = jasper_encoder(
    audio_signal=processed_signal,
    length=processed_signal_len,
)
log_probs = jasper_decoder(encoder_output=encoded)
predictions = greedy_decoder(log_probs=log_probs)

# Tensors to evaluate during inference. Later cells index the results by
# position, so the order here matters.
eval_tensors = [predictions, transcripts, transcripts_len]

In [None]:
# Download checkpoint from here: https://drive.google.com/drive/folders/1b-TQYY7o8_CQgZsVEe-8_2kHWU0lYJ-z?usp=sharing
import os

# Instantiate the beam-search decoder NM, backed by the 6-gram language model
# binary from the checkpoint folder.
# NOTE(review): alpha/beta are presumably the LM weight and word-insertion
# bonus -- confirm against the BeamSearchDecoderWithLM documentation.
beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
    vocab=labels,
    beam_width=128,
    alpha=2.2,
    beta=0.5,
    lm_path="<checkpoint_folder>/6-gram-lm.binary",
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # max(None, 1) would raise TypeError on Python 3, so fall back to 1.
    num_cpus=os.cpu_count() or 1)
beam_predictions = beam_search_with_lm(log_probs=log_probs, log_probs_length=encoded_len)

# Beam-search output goes last in eval_tensors. Note that this mutates the list
# built in the previous cell, so re-running this cell appends another entry.
eval_tensors.append(beam_predictions)

In [None]:
from nemo_asr.helpers import (
    post_process_predictions,
    post_process_transcripts,
    word_error_rate,
)

# Callback handed to infer() below, configured with the tensors to evaluate.
infer_callback = nemo.core.InferenceCallback(eval_tensors=eval_tensors)

neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
)

# The trainer object obtained from the factory (with empty params) runs
# inference, using the checkpoint folder given below.
optimizer = neural_factory.get_trainer(params={})
evaluated_tensors = optimizer.infer(
    callback=infer_callback,
    checkpoint_dir="<checkpoint_folder>",
)

In [None]:
# Gather one hypothesis per utterance from the last entry of evaluated_tensors,
# which is where the beam-search output was appended in eval_tensors.
# NOTE(review): candidates[0][1] looks like the text of the first (presumably
# best-scoring) beam candidate -- confirm against BeamSearchDecoderWithLM.
beam_hypotheses = [
    candidates[0][1]
    for batch in evaluated_tensors[-1]
    for candidates in batch
]

# Reference transcripts come from entries 1 (transcripts) and 2 (their lengths).
references = post_process_transcripts(
    evaluated_tensors[1],
    labels=labels,
    transcript_len_list=evaluated_tensors[2],
)
wer = word_error_rate(hypotheses=beam_hypotheses, references=references)

# Report the word error rate as a percentage.
print("BEAM WER {:.2f}".format(wer * 100))