In [None]:
# Use the %pip magic rather than !pip so packages are installed into the
# environment of the running kernel.
%pip install git+https://github.com/openai/whisper.git
%pip install torchvision
%pip install gradio
%pip install pyttsx3
%pip install sentencepiece
%pip install torchaudio
# transformers is installed once, from source. The separate PyPI install that
# used to precede this line was immediately replaced by this one.
%pip install git+https://github.com/huggingface/transformers.git


In [None]:
import whisper
import tempfile
import torchaudio

# Load the Whisper "small" model once, when this cell runs; transcribe()
# below reads this module-level object on every call.
model_asr = whisper.load_model("small")

def transcribe(audio):
    """Transcribe a spoken question to text with the module-level Whisper model.

    Args:
        audio: Either a filesystem path (``str`` / ``os.PathLike``) to an
            audio file, or a binary file-like object exposing ``read()``.
            ``None`` (nothing was recorded) yields an empty string.

    Returns:
        The ``"text"`` entry of the result returned by ``model_asr.transcribe``.
    """
    import os  # local import: this cell's import list does not include os

    if audio is None:
        return ""

    # A path can be handed to the model directly; no temporary copy is needed.
    if isinstance(audio, (str, os.PathLike)):
        return model_asr.transcribe(os.fspath(audio))["text"]

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio.read())
    # Leaving the ``with`` block flushes and closes the file, so every byte is
    # on disk before the model opens it by name. Transcribing while the handle
    # was still open could read an empty or truncated file.
    try:
        result = model_asr.transcribe(tmp.name)
    finally:
        # delete=False means nobody else cleans this file up.
        os.remove(tmp.name)
    return result["text"]


In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): confirm this checkpoint id exists on the Hugging Face Hub.
_VQA_CHECKPOINT = "Salesforce/blip2-flan-t5-small"
# Half precision on CUDA, single precision on CPU.
_vqa_dtype = torch.float16 if device == "cuda" else torch.float32

# The processor and the model are loaded from the same checkpoint id.
processor = Blip2Processor.from_pretrained(_VQA_CHECKPOINT)
model_vqa = Blip2ForConditionalGeneration.from_pretrained(
    _VQA_CHECKPOINT,
    torch_dtype=_vqa_dtype,
)
model_vqa.to(device)

def generate_answer(image, question):
    """Answer ``question`` about ``image`` with the module-level BLIP-2 objects.

    The processor output is moved to ``device`` (float16 on CUDA, float32
    otherwise), at most 50 new tokens are generated, and the first generated
    sequence is decoded with special tokens skipped.

    Args:
        image: Image passed to the processor as ``images``.
        question: Text passed to the processor as ``text``.

    Returns:
        The decoded text of the first generated sequence.
    """
    if device == "cuda":
        tensor_dtype = torch.float16
    else:
        tensor_dtype = torch.float32

    encoded = processor(images=image, text=question, return_tensors="pt")
    encoded = encoded.to(device, tensor_dtype)

    generated = model_vqa.generate(**encoded, max_new_tokens=50)
    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)


In [None]:
import pyttsx3
import tempfile
import os

def speak_text(text):
    """Synthesize ``text`` to an audio file with pyttsx3 and return its path.

    Args:
        text: The text to speak.

    Returns:
        Path of a newly created temporary file (``.mp3`` suffix) that the
        engine was asked to write the speech to. The file is not deleted
        here; the caller owns it.
    """
    engine = pyttsx3.init()

    # mkstemp returns an *open* OS-level descriptor together with the path.
    # Only the path is needed, so close the descriptor right away instead of
    # leaking one per call.
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)

    # NOTE(review): the actual audio encoding is chosen by the pyttsx3 driver;
    # confirm it matches the ".mp3" suffix on the target platform.
    engine.save_to_file(text, path)
    engine.runAndWait()  # process the queued save command before returning
    return path


In [None]:
import gradio as gr
# transcribe, generate_answer and speak_text are defined by earlier cells of
# this notebook. The standalone modules are optional: use them when they are
# importable, otherwise keep the cell-level definitions instead of failing
# with ModuleNotFoundError.
try:
    from asr import transcribe
    from qa import generate_answer
    from tts import speak_text
except ImportError:
    pass
from PIL import Image

def process_pipeline(audio, image):
    """Run one spoken-question request through the three stages.

    The audio is transcribed to a question, the question is answered against
    the image, and the answer is synthesized to an audio file.

    Args:
        audio: Recorded question, forwarded unchanged to ``transcribe``.
        image: Image forwarded unchanged to ``generate_answer``.

    Returns:
        A ``(question, answer, audio_path)`` tuple.
    """
    transcript = transcribe(audio)
    reply = generate_answer(image, transcript)
    spoken_reply_path = speak_text(reply)
    return (transcript, reply, spoken_reply_path)

def _run_from_recording(audio_path, image):
    """Adapt Gradio's ``type="filepath"`` audio value to ``process_pipeline``.

    Gradio supplies the recording as a filesystem path, or ``None`` when
    nothing was recorded. The pipeline is handed an open binary file so its
    existing ``read()``-based contract keeps working.

    Args:
        audio_path: Path of the recorded audio, or ``None``.
        image: The uploaded image, forwarded unchanged.

    Returns:
        Whatever ``process_pipeline`` returns.

    Raises:
        gr.Error: If no audio was recorded.
    """
    if audio_path is None:
        raise gr.Error("Please record a question before submitting.")
    with open(audio_path, "rb") as audio_file:
        return process_pipeline(audio_file, image)


with gr.Blocks() as demo:
    with gr.Row():
        # Current Gradio releases take a list of ``sources`` (the old
        # ``source=`` keyword was removed) and support only the "numpy" and
        # "filepath" audio types, so the recording is requested as a path.
        audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak Your Question")
        image_input = gr.Image(type="pil", label="Upload an Image")
    with gr.Row():
        submit_btn = gr.Button("Ask the Image")
    with gr.Row():
        question_output = gr.Textbox(label="Transcribed Question")
        answer_output = gr.Textbox(label="Generated Answer")
        audio_output = gr.Audio(label="Answer (Spoken)")
    # One click runs the whole pipeline and fills all three outputs.
    submit_btn.click(
        fn=_run_from_recording,
        inputs=[audio_input, image_input],
        outputs=[question_output, answer_output, audio_output],
    )

demo.launch()
