In [None]:
! pip install gradio
!pip install openai-whisper
!pip install sentencepiece

Collecting gradio
  Downloading gradio-5.23.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 



In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import whisper
import gradio as gr

# (processor, model) pairs already loaded in this kernel session, keyed by
# language. Without this every transcription request would re-instantiate the
# checkpoint from disk.
_WAV2VEC_CACHE = {}


def load_wav2vec_model(language):
    """Return the Wav2Vec2 processor and CTC model for English or Hindi.

    The pair is loaded with ``from_pretrained`` the first time a language is
    requested and served from an in-memory cache on later calls, so the
    "Loading Wav2Vec2 model" message is printed only on an actual load.

    Args:
        language: ``"english"`` or ``"hindi"``.

    Returns:
        A ``(processor, model)`` tuple; repeated calls for the same language
        return the same objects.

    Raises:
        ValueError: If ``language`` has no Wav2Vec2 checkpoint configured.
    """
    model_mapping = {
        "english": "facebook/wav2vec2-large-960h",
        "hindi": "theainerd/Wav2Vec2-large-xlsr-hindi"
    }

    if language not in model_mapping:
        raise ValueError("Unsupported language for Wav2Vec2. Use Whisper for Tamil, Telugu, Malayalam.")

    if language not in _WAV2VEC_CACHE:
        model_name = model_mapping[language]
        print(f"Loading Wav2Vec2 model: {model_name}")
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        _WAV2VEC_CACHE[language] = (processor, model)

    return _WAV2VEC_CACHE[language]

def transcribe_wav2vec(audio_path, language):
    """Transcribe an audio file with the Wav2Vec2 model for ``language``.

    Args:
        audio_path: Path of the audio file to read with ``librosa.load``.
        language: Language key forwarded to ``load_wav2vec_model``.

    Returns:
        The decoded text of the first (only) item in the batch.
    """
    processor, model = load_wav2vec_model(language)

    # Load at 16 kHz; the rate librosa hands back is not needed afterwards.
    waveform, _ = librosa.load(audio_path, sr=16000)
    features = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(features.input_values).logits

    # Greedy decoding: take the highest-scoring token id along the last axis.
    best_token_ids = torch.argmax(logits, dim=-1)
    decoded_batch = processor.batch_decode(best_token_ids)
    return decoded_batch[0]

# Whisper checkpoints already loaded in this kernel session, keyed by model
# size, so repeated requests do not reload the weights.
_WHISPER_MODELS = {}


def transcribe_whisper(audio_path, language=None, model_size="medium"):
    """Transcribe speech using a Whisper model (used for Tamil, Telugu, Malayalam).

    Args:
        audio_path: Path of the audio file handed to Whisper.
        language: Optional language name or code forwarded to Whisper's
            ``transcribe``. ``None`` (the default) leaves language selection
            to Whisper, which matches the previous behaviour.
        model_size: Name of the Whisper checkpoint to load; defaults to
            ``"medium"``, the size that was previously hard-coded.

    Returns:
        The ``"text"`` field of Whisper's transcription result.
    """
    if model_size not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_size] = whisper.load_model(model_size)
    model = _WHISPER_MODELS[model_size]

    result = model.transcribe(audio_path, language=language)
    return result["text"]


def transcribe(audio, language):
    """Gradio handler: route the uploaded audio to the right model.

    Args:
        audio: Path of the uploaded audio file, or ``None`` when nothing was
            provided.
        language: The language chosen in the dropdown.

    Returns:
        A user-facing string: the transcription prefixed with
        ``"Transcription: "``, or a short message when the audio is missing
        or the language is not supported.
    """
    if audio is None:
        return "No audio file provided."

    print(f"Received audio file: {audio}")

    if language in ["english", "hindi"]:
        transcription = transcribe_wav2vec(audio, language)
    elif language in ["tamil", "telugu", "malayalam"]:
        # Pass the user's explicit choice on instead of relying on Whisper
        # guessing the language from the audio.
        transcription = transcribe_whisper(audio, language=language)
    else:
        return "Unsupported language."

    return f"Transcription: {transcription}"

# Web UI: one audio input and one language selector feeding `transcribe`,
# with the result shown as plain text.
iface = gr.Interface(
    title="Multilingual Speech-to-Text",
    description="Upload an audio file, select a language, and get the transcribed text.",
    fn=transcribe,
    inputs=[
        # "filepath" makes Gradio hand the handler a path to the uploaded file.
        gr.Audio(type="filepath"),
        gr.Dropdown(
            choices=["english", "hindi", "tamil", "malayalam", "telugu"],
            label="Select Language",
        ),
    ],
    outputs="text",
)

if __name__ == "__main__":
    print("Launching Gradio interface...")
    # debug=True keeps the cell running so errors and logs stay visible.
    iface.launch(debug=True)

Launching Gradio interface...
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://992a34359d8a245c42.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Received audio file: /tmp/gradio/71d5b98e4e1d0186b0e0f9cab4187a8cb6c20fcfbb27bd29c7ad9b78bb16c109/174-50561-0002.flac
Loading Wav2Vec2 model: facebook/wav2vec2-large-960h


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Received audio file: /tmp/gradio/175f6f1ba53563fefa2a97469e52b8a5945991101762a3c2af41ca41e37e9bc4/common_voice_hi_23795242.mp3
Loading Wav2Vec2 model: theainerd/Wav2Vec2-large-xlsr-hindi


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]