In [None]:
pip install torch torchvision faiss-cpu openai sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [None]:
 pip install gradio

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [11]:
import gradio as gr
import torch
import numpy as np
from PIL import Image
import requests
import wikipedia
import faiss
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration

# Load the pretrained checkpoints once at import time.
# CLIP supplies the text/image embeddings; BLIP generates image captions.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# --- Embedding utilities ---

def embed_text(texts):
    """Embed a batch of strings with the CLIP text encoder.

    `texts` is handed to `clip_processor` as its `text` argument with padding
    and truncation enabled. The features returned by
    `clip_model.get_text_features` are moved to the CPU and returned as a
    NumPy array.
    """
    encoded = clip_processor(
        text=texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = clip_model.get_text_features(**encoded)
    features_on_cpu = features.cpu()
    return features_on_cpu.numpy()

def embed_image(image):
    """Embed an image with the CLIP image encoder.

    `image` is handed to `clip_processor` as its `images` argument. The
    features returned by `clip_model.get_image_features` are moved to the CPU
    and returned as a NumPy array.
    """
    encoded = clip_processor(images=image, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = clip_model.get_image_features(**encoded)
    features_on_cpu = features.cpu()
    return features_on_cpu.numpy()

def embed_query_image_text(image, text):
    """Build one query vector from an image and a piece of text.

    The image embedding and the embedding of the single-element batch `[text]`
    are averaged element-wise; the first row of that average is returned.
    """
    image_vec = embed_image(image)
    text_vec = embed_text([text])
    averaged = (image_vec + text_vec) / 2  # simple, unweighted average
    return averaged[0]

# --- Image captioning with BLIP ---
def describe_image(image):
    """Generate a caption for `image` with the BLIP captioning model.

    Returns the first generated sequence decoded to text, with special tokens
    removed.
    """
    encoded = blip_processor(image, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = blip_model.generate(**encoded)
    first_sequence = generated_ids[0]
    return blip_processor.decode(first_sequence, skip_special_tokens=True)

# --- Wikipedia search and embed ---
def fetch_and_embed_wiki_summaries(query, k=5):
    """Search Wikipedia for `query` and embed the summaries that can be fetched.

    Args:
        query: Search string passed to `wikipedia.search`.
        k: Maximum number of search results to request.

    Returns:
        A tuple `(summaries, embeddings, titles)`. `summaries` and `titles`
        are parallel lists covering only the pages whose summary was fetched
        successfully; `embeddings` is the `embed_text` output for `summaries`.
        When no summary could be fetched the result is
        `([], np.array([]), [])`.
    """
    summaries, titles = [], []
    for title in wikipedia.search(query, results=k):
        try:
            summary = wikipedia.summary(title, sentences=2)
        except Exception:
            # Best effort: skip pages whose summary cannot be retrieved.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
            # and SystemExit propagate instead of being silently swallowed.
            continue
        summaries.append(summary)
        titles.append(title)

    if not summaries:
        return [], np.array([]), []

    embeddings = embed_text(summaries)
    return summaries, embeddings, titles

# --- OpenRouter GPT call ---
def call_openrouter_gpt(prompt, api_key, model="openai/gpt-3.5-turbo", timeout=60):
    """Send a single-turn chat completion request to OpenRouter.

    Args:
        prompt: Text sent as the only user message.
        api_key: OpenRouter API key, sent as a Bearer token.
        model: OpenRouter model identifier.
        timeout: Seconds to wait for the HTTP request before `requests`
            raises a timeout error. Without a timeout a stalled connection
            would block the caller indefinitely.

    Returns:
        The content of the first choice's message on success, otherwise a
        string starting with "⚠️ OpenRouter Error" that carries the HTTP
        status code and raw response body.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://your-app-name.com",  # Optional
        "X-Title": "Multimodal RAG Demo"
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}]
    }
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    error_message = f"⚠️ OpenRouter Error {response.status_code}: {response.text}"
    if response.status_code != 200:
        return error_message
    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        # A 200 response whose body is not JSON, or lacks the expected
        # choices/message structure, is reported like any other API error
        # instead of surfacing as a bare KeyError/IndexError.
        return error_message



# --- Main Gradio function ---
def multimodal_rag(user_image, user_prompt, user_api_key, selected_model):
    """Answer a question about an uploaded image using Wikipedia context.

    Pipeline: caption the image, embed image + question into one query vector,
    fetch up to five Wikipedia summaries for the question, select the summary
    whose embedding is nearest (L2 distance) to the query vector, then send
    caption + summary + question to OpenRouter.

    Args:
        user_image: The uploaded image, or None when nothing was uploaded.
        user_prompt: The user's question.
        user_api_key: OpenRouter API key; surrounding whitespace is ignored.
        selected_model: OpenRouter model identifier.

    Returns:
        The model's answer, or a user-facing message (prefixed with "❌" or
        "⚠️") when input is missing, no Wikipedia context is found, or any
        step raises an exception.
    """
    # Strip the key: pasted keys often carry stray whitespace, which would
    # otherwise end up inside the Authorization header.
    api_key = (user_api_key or "").strip()
    if not api_key:
        return "❌ Please provide your OpenRouter API key."

    # Validate the remaining inputs up front so the user gets a clear message
    # instead of an exception string from deep inside a model call.
    if user_image is None:
        return "❌ Please upload an image."
    question = (user_prompt or "").strip()
    if not question:
        return "❌ Please enter a question."

    try:
        # Describe the image
        image_description = describe_image(user_image)

        # Embed image + text as a single-row matrix for the index search
        query_vector = embed_query_image_text(user_image, question).reshape(1, -1)

        # Wikipedia retrieval
        wiki_summaries, wiki_vectors, wiki_titles = fetch_and_embed_wiki_summaries(question, k=5)
        if not wiki_summaries:
            return "⚠️ No relevant Wikipedia context found."

        # Exact nearest-neighbour search over the fetched summaries
        index = faiss.IndexFlatL2(wiki_vectors.shape[1])
        index.add(wiki_vectors)
        _, neighbours = index.search(query_vector, k=1)
        best = int(neighbours[0][0])
        top_summary = wiki_summaries[best]
        top_title = wiki_titles[best]

        # Compose final prompt
        full_prompt = f"""You are a helpful assistant.

Image description: {image_description}

Relevant Wikipedia context from the article titled "{top_title}":
{top_summary}

Answer the user's question:
{question}
"""

        return call_openrouter_gpt(full_prompt, api_key, model=selected_model)

    except Exception as e:
        return f"⚠️ Error: {str(e)}"

# --- Gradio UI ---
# Model identifiers offered in the dropdown.
openrouter_models = [
    "openai/gpt-3.5-turbo",
    "openai/gpt-4",
    "anthropic/claude-3-haiku",
    "anthropic/claude-3-opus",
    "mistralai/mistral-7b-instruct",
    "meta-llama/llama-3-70b-instruct"
]

# Input widgets, listed in the positional order of multimodal_rag's parameters:
# image, question, API key, model.
image_input = gr.Image(type="pil", label="Upload Image")
question_input = gr.Textbox(label="Your Question")
api_key_input = gr.Textbox(label="OpenRouter API Key (kept private)", type="password")
model_input = gr.Dropdown(
    label="Choose OpenRouter Model",
    choices=openrouter_models,
    value="openai/gpt-3.5-turbo"
)

demo = gr.Interface(
    fn=multimodal_rag,
    inputs=[image_input, question_input, api_key_input, model_input],
    outputs=gr.Textbox(label="Answer"),
    title="📷 Multimodal RAG: Image + Wikipedia + OpenRouter",
    description="Ask a question about your uploaded image. CLIP + FAISS finds Wikipedia context. OpenRouter GPT answers it using image + text.",
)
demo.launch()


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aebbc5426408584588.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [7]:
pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=38b9bc7bd8019c172e66e880cd48bd81703e36f3db6bcb2683d5de39ab6d6c75
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
# [REDACTED] An OpenRouter API key was pasted into this cell in plain text.
# Secrets must never be stored in a notebook: revoke that key, generate a new
# one, and supply it at runtime through the app's password field.