### **SET B2 - Automatic Speech Recognition**

------

In [None]:
!pip install transformers
!pip install -U datasets
!pip install soundfile
!pip install librosa
!pip install gradio

In [None]:
# Restrict the transformers logger to errors only, so that the library's
# warning messages do not clutter later cell outputs.

from transformers.utils import logging
logging.set_verbosity_error()

**Data Preparation**

In [None]:
from datasets import load_dataset

In [None]:
# Open the "train.clean.100" split of LibriSpeech. streaming=True requests an
# iterable dataset (presumably read lazily rather than downloaded in full).
# NOTE(review): trust_remote_code=True allows the dataset's loading script to run
# locally — confirm the installed `datasets` version still accepts this flag.
dataset = load_dataset("librispeech_asr",
                       split="train.clean.100",
                       streaming=True,
                       trust_remote_code=True)

In [None]:
# Pull the first record from the stream; it is reused as the running example below.
example = next(iter(dataset))

In [None]:
# Preview the first five records of the stream.
dataset_head = dataset.take(5)
list(dataset_head)

In [None]:
# Materialise the five-record preview again and show its third record (index 2).
list(dataset_head)[2]

In [None]:
# Display the example record fetched earlier.
example

In [None]:
# Play back the example recording in the notebook (audio playback, not text-to-speech).

from IPython.display import Audio as IPythonAudio

# The sample array and its sampling rate both come from the dataset record.
IPythonAudio(example["audio"]["array"],
             rate=example["audio"]["sampling_rate"])

**Building the pipeline**

In [None]:
from transformers import pipeline

In [None]:
# Build the speech-recognition pipeline from a model directory on local disk.
# NOTE(review): the relative path assumes the distil-whisper checkpoint has already
# been downloaded under ./models — confirm before running on a fresh machine.
asr = pipeline(task="automatic-speech-recognition",
               model="./models/distil-whisper/distil-small.en")

In [None]:
# Sampling rate the pipeline's feature extractor is configured for.
asr.feature_extractor.sampling_rate

In [None]:
# Sampling rate of the example recording, to compare against the model's rate.
example['audio']['sampling_rate']

In [None]:
# Transcribe the example recording directly from its raw sample array.
asr(example["audio"]["array"])

In [None]:
# Transcript stored with the example record, to compare with the model output.
example["text"]

**Building a Shareable app with Gradio**

In [None]:
import os
import gradio as gr

In [None]:
# Container that the tabbed transcription app is assembled into further down.
demo = gr.Blocks()

In [None]:
def transcribe_speech(filepath):
    """Transcribe the audio file at ``filepath`` with the ``asr`` pipeline.

    Args:
        filepath: Path of the audio file handed over by the Gradio audio
            component, or ``None`` when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or an empty string
        (after showing a Gradio warning) when ``filepath`` is ``None``.
    """
    if filepath is not None:
        result = asr(filepath)
        return result["text"]

    # Nothing was recorded or uploaded: tell the user instead of calling the model.
    gr.Warning("No audio found, please retry.")
    return ""

This interface lets users record audio with their microphone.

In [None]:
# Interface 1: microphone input. type="filepath" means the recording is handed
# to transcribe_speech as a file path; the transcript is shown in a 3-line textbox.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never")

This interface lets users upload an audio file instead.

In [None]:
# Interface 2: file upload. Same handler and output box as the microphone
# interface; only the audio source differs.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never",
)

Combine the two interfaces as tabs in a single Gradio app and start the demo.

In [None]:
# Mount both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

# PORT1 is only defined on some hosted environments. Use it when present;
# otherwise pass None so Gradio picks a free port itself instead of this cell
# failing with a KeyError.
port_env = os.environ.get("PORT1")

# share=True additionally requests a public share link for the app.
demo.launch(share=True,
            server_port=int(port_env) if port_env else None)

In [None]:
# Shut down the demo launched above before building the next one.
demo.close()

Output 1^

------

(Contd.) We keep the same setup, but add some code so that the pipeline can handle longer audio files.

In [None]:
import soundfile as sf
import io

# Load the narration sample; sf.read returns the samples together with the
# file's sampling rate.
audio, sampling_rate = sf.read('narration_example.wav')

In [None]:
# Sampling rate of the narration file as read from disk.
sampling_rate

In [None]:
# Sampling rate the model's feature extractor expects, for comparison.
asr.feature_extractor.sampling_rate

In [None]:
# Deliberate failure demo: `audio` still holds every channel of the file, while
# the pipeline accepts single-channel input only. Catch the expected ValueError
# and display its message so that "Restart & Run All" can continue past this cell.
try:
    stereo_attempt = asr(audio)
except ValueError as err:
    stereo_attempt = f"ValueError: {err}"
stereo_attempt

*Note: Running the cell above will return:*

*ValueError: We expect a single channel audio input for AutomaticSpeechRecognitionPipeline*

Stereo audio stores two separate channels (left and right), which gives the listener a sense of space (spatial audio) and enhances the listening experience.

But for the transformer model it must be in Mono i.e. in a single channel.


**Convert the audio from stereo to mono (Using librosa)**

In [None]:
# A 2-D shape means the clip has more than one channel (e.g. two for stereo).
audio.shape

In [None]:
import numpy as np
# Swap the two axes of the array before the mono conversion below.
# NOTE(review): presumably (frames, channels) -> (channels, frames) — confirm via .shape.
audio_transposed = audio.T

In [None]:
# Confirm that the two axes have been swapped.
audio_transposed.shape

In [None]:
import librosa

In [None]:
# Collapse the channels of the transposed array into a single mono signal.
audio_mono = librosa.to_mono(audio_transposed)

In [None]:
# Listen to the mono signal at the file's original sampling rate.
IPythonAudio(audio_mono,
             rate=sampling_rate)

In [None]:
# Transcribe the mono signal without resampling it first.
# NOTE(review): a bare array carries no sampling rate, so the pipeline presumably
# treats it as already being at the model's rate — confirm; resampling follows below.
asr(audio_mono)

*Warning: The cell above might throw a warning because the sample rate of the audio sample is not the same as the sample rate the model expects.*

In [None]:
# Sampling rate of the narration file ...
sampling_rate

In [None]:
# ... versus the sampling rate the model's feature extractor expects.
asr.feature_extractor.sampling_rate

In [None]:
# Resample the mono signal to the rate the pipeline's feature extractor is
# configured for (16 kHz for the checkpoint used here), reading the value from
# the pipeline instead of repeating it as a magic number.
audio_16KHz = librosa.resample(audio_mono,
                               orig_sr=sampling_rate,
                               target_sr=asr.feature_extractor.sampling_rate)

The Whisper model we selected can only handle 30 seconds of audio at once. We therefore split the audio into chunks, with a certain overlap between the end of one chunk and the start of the next, so that the text reads continuously when the final outputs are put together. Note that the chunks are processed individually.

![image.png](attachment:image.png)

In [None]:
# Transcribe the resampled narration in 30-second chunks and show only the
# "chunks" entry of the result, which is requested via return_timestamps=True.
asr(
    audio_16KHz,
    chunk_length_s=30, # 30 seconds
    batch_size=4,   # Adjust the batch size according to the memory your hardware can handle
    return_timestamps=True,
)["chunks"]

Building the Gradio interface

In [None]:
import gradio as gr
# Fresh container for the long-form app (the earlier demo was closed above).
demo = gr.Blocks()

In [None]:
def transcribe_long_form(filepath):
    """Transcribe a possibly long audio file with the ``asr`` pipeline.

    The pipeline is called with 30-second chunking so that recordings longer
    than a single chunk can be handled.

    Args:
        filepath: Path of the audio file handed over by the Gradio audio
            component, or ``None`` when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or an empty string
        (after showing a Gradio warning) when ``filepath`` is ``None``.
    """
    if filepath is not None:
        # Options forwarded unchanged to the pipeline call.
        long_form_options = dict(
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )
        return asr(filepath, **long_form_options)["text"]

    # Nothing was recorded or uploaded: tell the user instead of calling the model.
    gr.Warning("No audio found, please retry.")
    return ""

In [None]:
# Rebuild both interfaces so that they call the long-form handler.
# Interface 1: microphone input, delivered to the handler as a file path.
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never")

# Interface 2: file upload, same handler and output box.
file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never",
)

In [None]:
# Mount both long-form interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

# PORT1 is only defined on some hosted environments. Use it when present;
# otherwise pass None so Gradio picks a free port itself instead of this cell
# failing with a KeyError.
port_env = os.environ.get("PORT1")

# share=True additionally requests a public share link for the app.
demo.launch(share=True,
            server_port=int(port_env) if port_env else None)

In [None]:
# Shut down the long-form demo launched above.
demo.close()

Output 2^

----------