In [1]:
!pip install gradio torch torchvision torchaudio transformers pyttsx3 git+https://github.com/openai/whisper.git
!apt-get install -y libportaudio2 espeak


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-8l5isht3
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-8l5isht3
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
espeak is already the newest version (1.48.15+dfsg-3).
libportaudio2 is already the newest version (19.6.0-1.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [4]:
import gradio as gr
import pyttsx3
import torch
import whisper
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipForQuestionAnswering, BlipProcessor

# Load models once, when this cell runs, so every request reuses them.
asr_model = whisper.load_model("small")
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
# "blip-vqa-base" is a visual-question-answering checkpoint, so it must be
# loaded with the question-answering head. The captioning class
# (BlipForConditionalGeneration) has no question encoder and would discard
# those pretrained weights, degrading the answers.
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# TTS
def speak_text(text):
    """Speak ``text`` aloud using a freshly initialised pyttsx3 engine.

    Blocks until ``runAndWait()`` returns. Nothing is returned.

    NOTE(review): playback presumably happens on the machine running the
    kernel, not in the user's browser -- confirm this is what is wanted when
    the app is served from a hosted runtime.
    """
    tts = pyttsx3.init()
    tts.say(text)
    tts.runAndWait()

# Full pipeline
def pipeline(audio, image):
    """Answer a spoken question about an image, then speak the answer.

    Args:
        audio: Path to the recorded question; passed to ``asr_model.transcribe``.
        image: Path to the image file; opened with PIL and converted to RGB.

    Returns:
        A ``(question, answer)`` tuple: the transcribed question with
        surrounding whitespace removed, and the decoded model output.

    Raises:
        gr.Error: If the audio or image is missing, or if the transcription
            is empty. Gradio shows the message to the user.
    """
    # Fail early with a readable message instead of an opaque error from
    # deep inside the ASR or image-loading code.
    if not audio:
        raise gr.Error("Please record or upload a spoken question first.")
    if not image:
        raise gr.Error("Please upload an image first.")

    # Transcribe
    transcription = asr_model.transcribe(audio)
    # Strip padding so the textbox and the model prompt receive clean text.
    question = transcription["text"].strip()
    if not question:
        raise gr.Error("No speech was detected in the recording. Please try again.")

    # Answer
    # The context manager closes the file handle; convert() returns an
    # independent in-memory image that stays valid afterwards.
    with Image.open(image) as img:
        rgb_image = img.convert("RGB")
    inputs = processor(rgb_image, question, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        out = vqa_model.generate(**inputs)
    answer = processor.decode(out[0], skip_special_tokens=True)

    # Speak
    speak_text(answer)

    return question, answer




In [None]:
# Gradio front end: two inputs (spoken question, image) feed `pipeline`, whose
# two return values fill the two text outputs in order.
iface = gr.Interface(
    title="Ask-the-Image",
    description="Speak a question and upload an image. The app will describe or answer based on the image.",
    fn=pipeline,
    inputs=[
        # Both components use type="filepath", so `pipeline` receives file paths.
        gr.Audio(label="Ask a Question (Max 10s)", type="filepath"),
        gr.Image(label="Upload an Image", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Transcribed Question"),
        gr.Text(label="VLM Answer"),
    ],
)

# Recommended for Colab: share=True exposes a public URL, and debug=True keeps
# the cell running so errors and logs stay visible.
iface.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ba60795ace25353e05.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
