In [7]:
!pip install gradio SpeechRecognition pydub indic-transliteration

Collecting gradio
  Downloading gradio-4.40.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.2.0 (from gradio)
  Downloading gradio_client-1.2.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting ruff>=0.2.2

In [10]:
import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
from pydub.utils import mediainfo
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

def convert_audio_format(uploaded_file):
    """Re-encode an audio file as WAV and return the path of the WAV file.

    Args:
        uploaded_file: Audio file to read; passed straight to
            ``AudioSegment.from_file``.

    Returns:
        The string ``"converted_audio.wav"``. The output always goes to this
        fixed relative path, so every call reuses the same file name.
    """
    wav_path = "converted_audio.wav"
    AudioSegment.from_file(uploaded_file).export(wav_path, format="wav")
    return wav_path

def transcribe_audio(audio_file, language_code):
    """Transcribe an entire audio file via ``Recognizer.recognize_google``.

    Args:
        audio_file: Audio file opened with ``sr.AudioFile``.
        language_code: Value forwarded as the ``language`` argument of
            ``recognize_google``.

    Returns:
        Whatever ``recognize_google`` returns for the recording (the
        transcript text in the default mode used here).
    """
    engine = sr.Recognizer()
    with sr.AudioFile(audio_file) as clip:
        # No duration/offset given: the whole clip is captured.
        recording = engine.record(clip)
    # The recording is already held in memory, so the file can be closed
    # before it is sent off for recognition.
    return engine.recognize_google(recording, language=language_code)

def get_word_timestamps(transliterated_text, search_word, audio_duration):
    """Estimate when ``search_word`` is spoken, assuming evenly spaced words.

    The audio duration is divided equally among the whitespace-separated
    words of the transcript, so the returned times are rough estimates, not
    aligned timestamps.

    Args:
        transliterated_text: Transcript to search.
        search_word: Word (or word fragment) to look for. Matching is
            case-insensitive and by substring, so ``"cat"`` also matches
            ``"cats"``. Surrounding whitespace is ignored.
        audio_duration: Total length of the audio, in the unit the returned
            times should use.

    Returns:
        A list of ``(start_time, end_time)`` tuples, one per matching word,
        in transcript order. Empty when the transcript has no words or the
        search word is blank.
    """
    words = transliterated_text.split()
    needle = (search_word or "").strip().lower()
    # An empty transcript would divide by zero below; an empty needle is a
    # substring of every word and would "match" the whole transcript.
    if not words or not needle:
        return []

    word_duration = audio_duration / len(words)
    occurrences = []
    for index, word in enumerate(words):
        if needle in word.lower():
            start_time = index * word_duration
            occurrences.append((start_time, start_time + word_duration))
    return occurrences

def process_audio(audio_file, language_code, search_word):
    """Transcribe an audio file, romanize the text, and locate a search word.

    Args:
        audio_file: Path of the uploaded audio file.
        language_code: One of the language codes offered in the UI. It is
            passed to the recognizer and selects the source script for
            transliteration.
        search_word: Word to look for in the transliterated text. When blank,
            no timestamps are computed.

    Returns:
        A tuple ``(transcribed_text, transliterated_text, timestamps)`` where
        ``timestamps`` is the list produced by ``get_word_timestamps`` (empty
        when there is nothing to search for).

    Raises:
        gr.Error: If no audio or language was supplied, or if speech
            recognition fails; Gradio shows the message to the user.
    """
    # Fail early with a readable message instead of an opaque error from
    # deeper inside the audio / recognition libraries.
    if not audio_file:
        raise gr.Error("Please upload an audio file.")
    if not language_code:
        raise gr.Error("Please select a language code.")

    converted_file = convert_audio_format(audio_file)

    try:
        transcribed_text = transcribe_audio(converted_file, language_code)
    except sr.UnknownValueError as exc:
        raise gr.Error("Could not understand any speech in the audio.") from exc
    except sr.RequestError as exc:
        raise gr.Error(f"Speech recognition request failed: {exc}") from exc

    # Source script per language code; codes not listed (e.g. "en") are
    # left untransliterated.
    source_schemes = {
        "te": sanscript.TELUGU,
        "hi": sanscript.DEVANAGARI,
        "ta": sanscript.TAMIL,
        "ml": sanscript.MALAYALAM,
        "kn": sanscript.KANNADA,
    }
    source_scheme = source_schemes.get(language_code)
    if source_scheme is None:
        transliterated_text = transcribed_text
    else:
        transliterated_text = transliterate(transcribed_text, source_scheme, sanscript.ITRANS)

    query = (search_word or "").strip()
    if query and transliterated_text.split():
        # The duration is only needed for the timestamp estimate, so the
        # media probe is skipped when there is nothing to search for.
        audio_info = mediainfo(converted_file)
        audio_duration = float(audio_info['duration'])
        timestamps = get_word_timestamps(transliterated_text, query, audio_duration)
    else:
        timestamps = []

    return transcribed_text, transliterated_text, timestamps

# Language codes offered in the UI; the selected one is passed to
# process_audio as its language_code argument.
language_choices = ["en", "te", "hi", "ta", "ml", "kn"]

# Inputs, in the order process_audio receives them:
# audio file path, language code, search word.
input_components = [
    gr.Audio(type="filepath", label="Upload Audio File"),
    gr.Radio(language_choices, label="Select Language Code"),
    gr.Textbox(label="Enter the word to search for timestamps"),
]

# Outputs, in the order process_audio returns them.
output_components = [
    gr.Textbox(label="Transcribed Text"),
    gr.Textbox(label="Transliterated Text (English)"),
    gr.Textbox(label="Timestamps"),
]

iface = gr.Interface(
    fn=process_audio,
    inputs=input_components,
    outputs=output_components,
    title="Audio Transcription and Word Timestamp Extraction",
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://f2898936a863b24e63.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


