In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Pick the device first so the weights can be loaded in a dtype that device
# handles: half precision on CUDA, full precision on CPU (fp16 ops may be
# unsupported or very slow on CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Single source of truth for the checkpoint so processor and model cannot drift.
BLIP2_CHECKPOINT = "Salesforce/blip2-flan-t5-xl"

processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=model_dtype,
)

model.to(device)

def ask_image_question(image_path, question):
    """Ask BLIP-2 a question about an image and return its text answer.

    Args:
        image_path: Path (or file-like object) passed to ``PIL.Image.open``.
        question: Prompt text given to the model together with the image.
            It is forwarded unchanged, so callers that want the
            ``"Question: ... Answer:"`` format must build it themselves.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed and surrounding whitespace stripped.
    """
    # Close the file handle deterministically; convert() returns a new image,
    # so it is safe to use after the context manager exits.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # The prompt must go through ``text=``; an unrecognised keyword such as
    # ``question=`` is not tokenized, leaving the model without the question.
    inputs = processor(images=image, text=question, return_tensors="pt")

    # Move to the model's device AND dtype: the second argument casts the
    # floating-point tensors (pixel values) so they match the model weights,
    # while integer token ids keep their dtype.
    inputs = inputs.to(device, model.dtype)

    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True).strip()

In [None]:
from llama_cpp import Llama
import os

# Location of the quantized OpenHermes GGUF weights. Set the
# OPENHERMES_GGUF_PATH environment variable to load the model from another
# location; the original hard-coded path remains the default.
openhermes_path = os.environ.get(
    "OPENHERMES_GGUF_PATH",
    r"C:\GGUF\TheBloke\OpenHermes-2.5-Mistral-7B-GGUF\openhermes-2.5-mistral-7b.Q4_K_M.gguf",
)

OpenHermes = Llama(
    model_path=openhermes_path,
    n_gpu_layers=20,   # layers offloaded to the GPU; the rest run on CPU
    n_ctx=2048,        # context window, in tokens (prompt + completion)
    n_batch=256,       # prompt-processing batch size
    n_threads=6,       # CPU threads used for generation
    use_mlock=True,    # ask the OS to keep the model resident in RAM
    verbose=True,      # print llama.cpp load/timing diagnostics
)

In [None]:
def blip2_to_openhermes(image_path, question):
    print(f"❓ QnA: {question}")
    
    visual_answer = ask_image_question(image_path, question)
    print("📸 BLIP-2 Answer:", visual_answer)

    hermes_prompt = (
        f"The image was analyzed and the answer to the question "
        f"'{question}' is: '{visual_answer}'. Can you provide a deeper interpretation?"
    )
    full_prompt = f"<|user|>\n{hermes_prompt}\n<|assistant|>\n"

    response = OpenHermes(full_prompt, max_tokens=300, stop=["<|user|>"])
    hermes_text = response["choices"][0]["text"]