# Automatic Speech Recognition

In [None]:
# libraries to be installed

# !pip install transformers
# !pip install -U datasets
# !pip install soundfile
# !pip install librosa
# !pip install gradio

In [None]:
from transformers.utils import logging
logging.set_verbosity_error()

## Data Preparation

In [None]:
from datasets import load_dataset

In [None]:
# Load the LibriSpeech "train.clean.100" split as an iterable dataset.
# streaming=True is what makes the result iterable: examples are read as the
# dataset is iterated rather than materialized up front.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to run — confirm this is acceptable in your environment.
dataset = load_dataset("librispeech_asr",
                       split="train.clean.100",
                       streaming=True,
                       trust_remote_code=True)

In [None]:
example = next(iter(dataset))

In [None]:
dataset_head = dataset.take(5)
list(dataset_head)

In [None]:
list(dataset_head)[2]

In [None]:
example

In [None]:
from IPython.display import Audio as IPythonAudio

IPythonAudio(example["audio"]["array"],
             rate=example["audio"]["sampling_rate"])

## Building a data pipeline

In [None]:
from transformers import pipeline

In [None]:
# Automatic speech recognition pipeline.
# The model is referenced by a relative local path (./models/...), so the
# distil-whisper files are presumably expected to already be present next to
# the notebook — verify the directory exists before running this cell.
asr = pipeline(task="automatic-speech-recognition",
               model="./models/distil-whisper/distil-small.en")

In [None]:
asr.feature_extractor.sampling_rate

In [None]:
example['audio']['sampling_rate']

In [None]:
asr(example["audio"]["array"])

In [None]:
example["text"]

## Building a Gradio app

In [None]:
import os
import gradio as gr

In [None]:
demo = gr.Blocks()

In [None]:
def transcribe_speech(filepath):
    """Transcribe the audio file at ``filepath`` with the ``asr`` pipeline.

    Args:
        filepath: Path to the recorded or uploaded audio file, or ``None``
            when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or an empty string
        (after showing a Gradio warning) when ``filepath`` is ``None``.
    """
    if filepath is not None:
        return asr(filepath)["text"]

    # Nothing was recorded/uploaded: tell the user and return no text.
    gr.Warning("No audio found, please retry.")
    return ""

In [None]:
# Interface fed by the microphone; the recording is handed to
# transcribe_speech as a file path and the result shown in a text box.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

In [None]:
# Interface fed by an uploaded audio file; the upload is handed to
# transcribe_speech as a file path and the result shown in a text box.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

In [None]:
# Put the microphone and file-upload interfaces side by side as two tabs of
# the same Blocks app.
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

# PORT1 is only defined in some hosted environments. Reading it with
# os.environ[...] crashed with KeyError everywhere else, so fall back to None,
# which lets Gradio choose a port itself. When PORT1 is set the behaviour is
# unchanged.
# NOTE(review): share=True requests a public share link — confirm that is
# intended before running outside a sandbox.
demo.launch(share=True,
            server_port=int(os.environ["PORT1"]) if os.environ.get("PORT1") else None)

In [None]:
demo.close()

## For longer audio files

In [None]:
import soundfile as sf
import io

In [None]:
audio,sampling_rate=sf.read('narration_example.wav')

In [None]:
sampling_rate

In [None]:
asr.feature_extractor.sampling_rate

In [None]:
# asr(audio)

# could throw error due to multichannel audio input

### Convert the audio from stereo to mono (Using librosa)

In [None]:
audio.shape

In [None]:
import numpy as np

# Swap the axes of the loaded audio so the array is laid out the way the
# librosa mono conversion below consumes it.
# NOTE(review): assumes `audio` is a 2-D (frames, channels) array as returned
# by soundfile for a stereo file — check `audio.shape` in the cell above.
audio_transposed = audio.T

In [None]:
audio_transposed.shape

In [None]:
import librosa

In [None]:
audio_mono = librosa.to_mono(audio_transposed)

In [None]:
IPythonAudio(audio_mono,rate=sampling_rate)

In [None]:
# asr(audio_mono)

# the above may give wrong output due to difference in sampling rate

In [None]:
sampling_rate

In [None]:
asr.feature_extractor.sampling_rate

In [None]:
# Resample to the rate the pipeline's feature extractor reports instead of a
# hard-coded 16000, so the audio always matches the loaded model. The variable
# name reflects the value the original code hard-coded (16 kHz); the cell above
# displays the rate the loaded model actually uses.
audio_16KHz = librosa.resample(audio_mono,
                               orig_sr=sampling_rate,
                               target_sr=asr.feature_extractor.sampling_rate)

In [None]:
asr(
    audio_16KHz,
    chunk_length_s=30, # 30 seconds
    batch_size=4,
    return_timestamps=True,
)["chunks"]

# The long audio is processed in chunks, and the chunks are run in batches.
# Running a batch means that many chunks go through the model together.
# Neighbouring chunks overlap slightly so the pieces can be connected into
# the final result.
# The displayed "chunks" entry holds the transcribed pieces of audio together
# with their timestamps.

### Building Gradio Interface

In [None]:
import gradio as gr
# Start a fresh Blocks container for the long-form demo; this rebinds the
# `demo` name used by the earlier app, which was closed above.
demo = gr.Blocks()

In [None]:
def transcribe_long_form(filepath):
    """Transcribe a (possibly long) audio file using chunked inference.

    Args:
        filepath: Path to the recorded or uploaded audio file, or ``None``
            when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or an empty string
        (after showing a Gradio warning) when ``filepath`` is ``None``.
    """
    if filepath is not None:
        # 30-second chunks, processed 8 at a time.
        result = asr(
            filepath,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )
        return result["text"]

    # Nothing was recorded/uploaded: tell the user and return no text.
    gr.Warning("No audio found, please retry.")
    return ""

In [None]:
# Rebuild both interfaces so they call transcribe_long_form instead of the
# short-form function used earlier; inputs and outputs are otherwise the same.
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

In [None]:
# Put the long-form microphone and file-upload interfaces side by side as two
# tabs of the same Blocks app.
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

# PORT1 is only defined in some hosted environments. Reading it with
# os.environ[...] crashed with KeyError everywhere else, so fall back to None,
# which lets Gradio choose a port itself. When PORT1 is set the behaviour is
# unchanged.
# NOTE(review): share=True requests a public share link — confirm that is
# intended before running outside a sandbox.
demo.launch(share=True,
            server_port=int(os.environ["PORT1"]) if os.environ.get("PORT1") else None)

In [None]:
demo.close()