<a href="https://colab.research.google.com/github/Nick088Official/Canary-1B-Google-Colab/blob/main/Canary_1B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Canary 1B Model from the NVIDIA NeMo ASR (Automatic Speech Recognition) Toolkit for Transcribing Audio

Made by [Nick088](https://linktr.ee/Nick088)

In [None]:
#@title Install NeMo ASR Canary 1B Model
#@markdown If you get a warning popup about restarting runtime, just click cancel, you don't have to restart it
!pip install Cython
!pip install git+https://github.com/NVIDIA/NeMo.git@0bb9e66a6d29b28e8831d1d1dd8a30310173ce46#egg=nemo_toolkit[all] # commit from main when canary buffered inference merged
!pip install ffmpeg
!pip install libsndfile1

import json
import librosa
import os
import soundfile as sf
import tempfile
import uuid
import torch
from IPython.display import clear_output

from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED

# Sample rate that convert_audio() resamples to and writes its wav files at.
SAMPLE_RATE = 16000  # Hz

# Load the pretrained Canary 1B checkpoint and switch it to evaluation mode.
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# make sure beam size always 1 for consistency
# (the decoding strategy is first reset with None, then re-applied from the
# model's own decoding config with beam_size overridden to 1)
model.change_decoding_strategy(None)
decoding_cfg = model.cfg.decoding
decoding_cfg.beam.beam_size = 1
model.change_decoding_strategy(decoding_cfg)

# setup for buffered inference
model.cfg.preprocessor.dither = 0.0
model.cfg.preprocessor.pad_to = 0

# Passed to get_buffered_pred_feat_multitaskAED() by transcribe().
feature_stride = model.cfg.preprocessor['window_stride']
model_stride_in_secs = feature_stride * 8  # 8 = model stride, which is 8 for FastConformer

# Buffered-inference helper; transcribe() uses it when duration >= 40.
# NOTE(review): frame_len / total_buffer are presumably in seconds, matching
# that 40 threshold - confirm against the NeMo API.
frame_asr = FrameBatchMultiTaskAED(
    asr_model=model,
    frame_len=40.0,
    total_buffer=40.0,
    batch_size=16,
)

# dtype handed to torch.cuda.amp.autocast in transcribe()'s buffered path.
amp_dtype = torch.float16

In [None]:
#@title Run Canary 1B
def convert_audio(audio_filepath, tmpdir, utt_id):
    """
    Re-encode an audio file as a mono wav at SAMPLE_RATE.

    The input is loaded at its native sample rate and downmixed to mono,
    resampled only when that rate differs from SAMPLE_RATE, and written to
    ``<tmpdir>/<utt_id>.wav``.

    Returns a tuple of (path of the written wav file, duration of the
    loaded audio as reported by librosa.get_duration before resampling).
    """
    samples, native_sr = librosa.load(audio_filepath, sr=None, mono=True)

    # Measured on the audio as loaded, i.e. before any resampling.
    duration = librosa.get_duration(y=samples, sr=native_sr)

    needs_resample = native_sr != SAMPLE_RATE
    if needs_resample:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=SAMPLE_RATE)

    wav_name = utt_id + '.wav'
    out_filename = os.path.join(tmpdir, wav_name)
    sf.write(out_filename, samples, SAMPLE_RATE)

    return out_filename, duration

def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
    """
    Transcribe or translate one audio file with the loaded Canary model.

    The audio is converted with convert_audio() into a temporary directory,
    a one-line JSON manifest describing the request is written next to it,
    and the model is run on that manifest. Audio with a duration below 40
    goes through model.transcribe(); anything longer uses buffered
    inference via frame_asr.

    Args:
        audio_filepath: Path of the input audio file. Must not be None.
        src_lang: Language spoken in the audio; one of "English",
            "Spanish", "French", "German".
        tgt_lang: Language of the output text; same choices as src_lang.
            When it equals src_lang the task is "asr", otherwise
            "s2t_translation".
        pnc: Truthy to request punctuation and capitalization; written to
            the manifest as "yes" or "no".

    Returns:
        The first result of model.transcribe(), or the ``.text`` of the
        buffered-inference hypothesis.

    Raises:
        ValueError: If audio_filepath is None, or src_lang / tgt_lang is
            not one of the supported language names.
    """
    if audio_filepath is None:
        raise ValueError("Please provide some input audio: either upload an audio file or use the microphone")

    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as tmpdir:
        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))

        # map src_lang and tgt_lang from long versions to short
        LANG_LONG_TO_LANG_SHORT = {
            "English": "en",
            "Spanish": "es",
            "French": "fr",
            "German": "de",
        }
        if src_lang not in LANG_LONG_TO_LANG_SHORT:
            raise ValueError(f"src_lang must be one of {LANG_LONG_TO_LANG_SHORT.keys()}")
        src_lang = LANG_LONG_TO_LANG_SHORT[src_lang]

        if tgt_lang not in LANG_LONG_TO_LANG_SHORT:
            raise ValueError(f"tgt_lang must be one of {LANG_LONG_TO_LANG_SHORT.keys()}")
        tgt_lang = LANG_LONG_TO_LANG_SHORT[tgt_lang]

        # infer taskname from src_lang and tgt_lang
        taskname = "asr" if src_lang == tgt_lang else "s2t_translation"

        # update pnc variable to be "yes" or "no"; this must happen for both
        # tasks, so it sits outside the taskname branch
        pnc = "yes" if pnc else "no"

        # make manifest file and save
        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
        manifest_data = {
            "audio_filepath": converted_audio_filepath,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "taskname": taskname,
            "pnc": pnc,
            "answer": "predict",
            "duration": str(duration),
        }
        with open(manifest_filepath, 'w') as fout:
            fout.write(json.dumps(manifest_data) + '\n')

        # call transcribe, passing in manifest filepath
        if duration < 40:
            output_text = model.transcribe(manifest_filepath)[0]
        else:  # do buffered inference
            with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
                with torch.no_grad():
                    hyps = [get_buffered_pred_feat_multitaskAED(
                        frame_asr,
                        model.cfg.preprocessor,
                        model_stride_in_secs,
                        model.device,
                        manifest=manifest_filepath,
                        filepaths=None,
                    )]

                    output_text = hyps[0].text

    # Wipe the cell output produced while the model ran, so only the caller's
    # own output remains visible.
    clear_output()
    return output_text

# USAGE

#@markdown Upload an audio file in the Files panel, click the three dots next to it, choose "Copy path", and paste the path below.
audio_filepath = "test.wav" #@param {type:"string"}

#@markdown **WARNING: at least one of src_lang and tgt_lang must be set to English.**

#@markdown Input audio is spoken in:
src_lang = "English" #@param ["English", "Spanish", "French", "German"]

#@markdown Transcribe in language:
tgt_lang = "English" #@param ["English", "Spanish", "French", "German"]

#@markdown Punctuation & Capitalization in transcript?
pnc = True #@param {type:"boolean"}

# Run the model on the selected file with the form values above and print the resulting text.
output_text = transcribe(audio_filepath, src_lang, tgt_lang, pnc)

print(output_text)