## Automatic Speech Recognition

In [1]:
from transformers.utils import logging
logging.set_verbosity_error() 

### Data Preparation

In [2]:
from datasets import load_dataset

In [3]:
# Stream the "train.clean.100" split of LibriSpeech instead of downloading it
# up front. NOTE: trust_remote_code=True allows the dataset's own loading
# script to run locally, so only enable it for sources you trust.
dataset = load_dataset(
    "librispeech_asr",
    split="train.clean.100",
    streaming=True,
    trust_remote_code=True,
)

In [4]:
example = next(iter(dataset))

In [5]:
dataset_head  = dataset.take(5)

list(dataset_head)

[{'file': '374-180298-0000.flac',
  'audio': {'path': '374-180298-0000.flac',
   'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
          -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
   'sampling_rate': 16000},
  'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
  'speaker_id': 374,
  'chapter_id': 180298,
  'id': '374-180298-0000'},
 {'file': '374-180298-0001.flac',
  'audio': {'path': '374-180298-0001.flac',
   'array': array([-9.15527344e-05, -1.52587891e-04, -1.52587891e-04, ...,
          -2.13623047e-04, -1.83105469e-04, -2.74658203e-04]),
   'sampling_rate': 16000},
  'text': "MARGUERITE TO BE UNABLE TO LIVE APART FROM ME IT WAS THE DAY AFTER THE EVENING WHEN SHE CAME TO SEE ME THAT I SENT HER MANON LESCAUT FROM THAT TIME SEEING THAT I COULD NOT CHANGE MY MISTRESS'S LIFE I CHANGED MY OWN",
  'speaker_id': 374

In [6]:
example

{'file': '374-180298-0000.flac',
 'audio': {'path': '374-180298-0000.flac',
  'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
         -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
  'sampling_rate': 16000},
 'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
 'speaker_id': 374,
 'chapter_id': 180298,
 'id': '374-180298-0000'}

### Pipeline

In [7]:
from transformers import pipeline

  warn(


In [8]:
# The pipeline object carries its own feature extractor (exposed as
# `asr.feature_extractor`), so no separate preprocessing step is set up here.
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)

In [9]:
asr.feature_extractor.sampling_rate

16000

In [11]:
example["audio"]["sampling_rate"]

16000

In [12]:
asr(example["audio"]["array"])



{'text': ' Chapter 16 I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I too agree to whatever Marguerite wished.'}

In [13]:
example["text"]

'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED'

### Shareable App with Gradio

In [15]:
import os
import gradio as gr

In [16]:
demo = gr.Blocks()

In [17]:
def transcribe_speech(filepath):
    """Transcribe an audio file with the module-level ``asr`` pipeline.

    Args:
        filepath: Path to the audio file to transcribe, or ``None`` when no
            audio was provided.

    Returns:
        The ``"text"`` field of the pipeline output, or an empty string
        (after showing a Gradio warning) when ``filepath`` is ``None``.
    """
    if filepath is not None:
        return asr(filepath)["text"]

    # Nothing was recorded or uploaded: tell the user and return empty text.
    gr.Warning("No audio found, please retry.")
    return ""

def _build_transcribe_interface(source):
    """Return a ``gr.Interface`` that runs ``transcribe_speech`` on audio from ``source``.

    ``source`` is passed straight to ``gr.Audio(sources=...)``; the audio is
    handed to the function as a file path and the result shown in a textbox.
    """
    return gr.Interface(
        fn=transcribe_speech,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Textbox(label="Transcription", lines=3),
        allow_flagging="never",
    )


# Live input recorded from the microphone.
mic_transcribe = _build_transcribe_interface("microphone")

# Input from an uploaded audio file.
file_transcribe = _build_transcribe_interface("upload")
        

In [27]:
# Build the two-tab app: one tab transcribes microphone recordings, the other
# transcribes uploaded audio files. Both tabs call `transcribe_speech`.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Transcribe Microphone"):
            mic_transcribe = gr.Interface(
                fn=transcribe_speech,
                inputs=gr.Audio(sources="microphone", type="filepath"),
                outputs=gr.Textbox(label="Transcription", lines=3),
                allow_flagging="never",
            )

        # Tab label previously read "Transcibe Audio File" (typo).
        with gr.Tab("Transcribe Audio File"):
            file_transcribe = gr.Interface(
                fn=transcribe_speech,
                inputs=gr.Audio(sources="upload", type="filepath"),
                outputs=gr.Textbox(label="Transcription", lines=3),
                allow_flagging="never",
            )

# share=True requests a temporary public URL in addition to the local one.
# The port is read from the PORT1 environment variable, defaulting to 7860.
demo.launch(
    share=True,
    server_port=int(os.environ.get('PORT1', 7860)),
)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://27dcc3fea5a53ab8fe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [28]:
demo.close()

Closing server running on port: 7860


### Audio testing

In [29]:
import soundfile as sf
import io

In [None]:
audio, sampling_rate = sf.read('narration_example.wav')

In [None]:
sampling_rate

In [None]:
asr.feature_extractor.sampling_rate

In [None]:
# The audio has not been converted to mono yet, and the pipeline expects
# single-channel input, so this call is expected to fail (assuming
# narration_example.wav is stereo). Catch the error and display it so that
# "Restart & Run All" can continue to the conversion cells below.
try:
    stereo_result = asr(audio)
except ValueError as err:
    stereo_result = f"ASR rejected the input: {err}"
stereo_result

* The ASR pipeline expects single-channel (mono) input, so convert the audio from stereo to mono using librosa.

In [None]:
audio.shape

In [None]:
import numpy as np

audio_transposed = np.transpose(audio)

In [None]:
audio_transposed.shape

In [None]:
import librosa

audio_mono = librosa.to_mono(audio_transposed)

In [30]:
from IPython.display import Audio as IPythonAudio

In [None]:
IPythonAudio(audio_mono,
             rate=sampling_rate)

In [None]:
# Resample to the rate the model's feature extractor expects rather than
# hard-coding it, so this cell stays correct if the checkpoint changes.
# For the checkpoint loaded above that rate is 16000 Hz, hence the name.
audio_16KHz = librosa.resample(
    audio_mono,
    orig_sr=sampling_rate,
    target_sr=asr.feature_extractor.sampling_rate,
)

In [None]:
# Long-form transcription: process the audio in 30-second chunks with
# batch_size=4, ask for timestamps, and display only the "chunks" entry
# of the result.
asr(audio_16KHz, chunk_length_s=30, batch_size=4, return_timestamps=True)["chunks"]