# Running Llava: a large multi-modal model on Google Colab

### Test Code

In [None]:
#!pip install bitsandbytes accelerate

In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering
import torch
from PIL import Image

# Load model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

# Load your image
image = Image.open("horse.jpeg").convert("RGB")

# Object classes detected by Detectron2 (replace with your actual output)
object_classes = ["horse"]  # Example — you can pass real Detectron2 classes here

# Ask BLIP about each object's action
for obj_class in object_classes:
    question = f"What is the {obj_class} doing in the image?"

    # Preprocess inputs
    inputs = processor(image, question, return_tensors="pt").to("cuda")

    # Run inference
    with torch.no_grad():
        output = model.generate(**inputs, max_length=15)

    # Decode response
    predicted_action = processor.decode(output[0], skip_special_tokens=True)

    print(f"Action for {obj_class}: {predicted_action}")


Action for horse: eating


In [None]:
## GET

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load BLIP captioning model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

In [None]:
# Load your image
image = Image.open("horse.jpeg").convert("RGB")

# Process image
inputs = processor(image, return_tensors="pt").to("cuda")

# Generate caption
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=30)

generated_caption = processor.decode(out[0], skip_special_tokens=True).strip()
print("Generated Caption:", generated_caption)

Generated Caption: a horse grazing in a field


VQA


In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import torch

# Load model & processor (already done earlier)
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")


In [None]:
# Load image
image = Image.open("horse.jpeg").convert("RGB")

scene_questions = [
    "Where is this photo taken?",
    "What is in the background of the image?",
    "Describe the overall scene.",
]

scene_answers = []
for q in scene_questions:
    inputs = processor(image, q, return_tensors="pt").to("cuda")

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=20)

    answer = processor.decode(out[0], skip_special_tokens=True).strip()
    print(f"Q: {q}\nA: {answer}\n")
    scene_answers.append(answer)

Q: Where is this photo taken?
A: in field

Q: What is in the background of the image?
A: trees

Q: Describe the overall scene.
A: horse in field



ONLY NEED TO RUN CODE BELOW except for imports/installs

In [None]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering, BlipForConditionalGeneration, pipeline
from PIL import Image

# Load BLIP models
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

# Load summarization/instruction-tuned LLM (can be OpenAI, or local like LLaMA, Mixtral, Mistral)


In [None]:
from transformers import pipeline

# Load Falcon3-1B-Instruct
llm = pipeline(
    "text-generation",
    model="tiiuae/Falcon3-1B-Instruct",
    device_map="auto",
    trust_remote_code=True,
    max_new_tokens=512
)

In [None]:
def evaluate_caption_with_llm(image_path, input_caption, detected_objects):
    image = Image.open(image_path).convert("RGB")

    # 1. Get object actions via BLIP-VQA
    object_actions = []
    for obj in detected_objects:
        question = f"What is the {obj} doing?"
        inputs = vqa_processor(image, question, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = vqa_model.generate(**inputs)
        action = vqa_processor.decode(output[0], skip_special_tokens=True)
        object_actions.append(f"The {obj} is {action}.")
    print(f'object actions: {object_actions}')

    # 2. Scene/background VQA
    scene_questions = ["Describe the background of the scene"]
    scene_descriptions = []
    for q in scene_questions:
        inputs = vqa_processor(image, q, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = vqa_model.generate(**inputs)
        scene_descriptions.append(vqa_processor.decode(output[0], skip_special_tokens=True))
    print(f'scene descriptions: {scene_descriptions}')
    # 3. BLIP image caption
    inputs = caption_processor(image, return_tensors="pt").to("cuda")
    with torch.no_grad():
        caption_ids = caption_model.generate(**inputs)
    blip_caption = caption_processor.decode(caption_ids[0], skip_special_tokens=True)
    print(f'blip caption: {blip_caption}')
    # 4. Prepare context for LLM
    llm_input = f"""
Ground truth information:
- BLIP Caption: {blip_caption}
- Detected objects: {', '.join(detected_objects)}
- Object actions: {' '.join(object_actions)}
- Scene: {','.join(scene_descriptions)}
Now evaluate the following caption:
"{input_caption}"
Answer these:
1. Does this caption accurately reflect the ground truth information?
2. Are there any details missing?
3. Are there any inaccuracies in the caption?
4. Provide a score between 0 and 10 to describe how accurate this caption is.
5. Use the ground truth information to generate a new caption if necessary.
"""
    print(f'llm input: {llm_input}')

    llm_response = llm(llm_input)[0]["generated_text"].split(llm_input)[-1].strip()

    return {
        "image_caption": blip_caption,
        "object_actions": object_actions,
        "scene_descriptions": scene_descriptions,
        "llm_evaluation": llm_response
    }

In [None]:
result = evaluate_caption_with_llm(
    image_path="horse.jpeg",
    input_caption="A dog in the street",
    detected_objects=["horse"]
)

from pprint import pprint
pprint(result["llm_evaluation"])

object actions: ['The horse is grazing.']
scene descriptions: ['forest']


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


blip caption: a horse grazing in a field
llm input: 
Ground truth information:
- BLIP Caption: a horse grazing in a field
- Detected objects: horse
- Object actions: The horse is grazing.
- Scene: forest
Now evaluate the following caption:
"A dog in the street"
Answer these:
1. Does this caption accurately reflect the ground truth information?
2. Are there any details missing?
3. Are there any inaccuracies in the caption?
4. Provide a score between 0 and 10 to describe how accurate this caption is.
5. Use the ground truth information to generate a new caption if necessary.

('<|assistant|>\n'
 '1. **Does this caption accurately reflect the ground truth information?**\n'
 '   - No, the ground truth information states that the subject is a horse '
 'grazing in a field, while the caption "A dog in the street" inaccurately '
 'describes the subject as a dog.\n'
 '\n'
 '2. **Are there any details missing?**\n'
 "   - Yes, the ground truth includes the subject's location (forest) and the "
 

In [None]:
import torch
import cv2
import numpy as np
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

In [None]:
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # confidence threshold
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")

predictor = DefaultPredictor(cfg)

model_final_280758.pkl: 167MB [00:01, 107MB/s]                           


In [None]:
# Run on image
image = cv2.imread("bullshit.jpg")
outputs = predictor(image)

# Extract class names
classes = outputs["instances"].pred_classes
metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
class_names = [metadata.thing_classes[i] for i in classes]

# Get unique object labels (no duplicates)
unique_objects = sorted(set(class_names))
print("Detected objects:", unique_objects)
print(class_names)

Detected objects: ['horse']
['horse', 'horse']


### MEMORY EFFICIENT (ONLY RUN BELOW CELLS)


In [None]:
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'


Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-wd7o2j_0
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-wd7o2j_0
  Resolved https://github.com/facebookresearch/detectron2.git to commit 400a49c1ec11a18dd25aea3910507bc3bcd15794
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.8 (from detectron2==0.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting fvcore<0.1.6,>=0.1.5 (from detectron2==0.6)
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6)
  Downloading iopath-0.1.9-py3-none-any.whl.metadata (370 bytes)
Collecting omegaconf<2.

In [1]:
 !pip install -U bitsandbytes # needed for TIFA

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering, BlipForConditionalGeneration, pipeline
# from detectron2.engine import DefaultPredictor
# from detectron2.config import get_cfg
# from detectron2 import model_zoo
# from detectron2.utils.visualizer import Visualizer
# from detectron2.data import MetadataCatalog

def detect_objects(image_path):
    # Load BLIP VQA model
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

    # Load and preprocess image
    image = Image.open(image_path).convert("RGB")

    # Ask a general object detection question
    question = "What/who is in the image?"

    inputs = processor(image, question, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs)

    answer = processor.decode(output[0], skip_special_tokens=True)

    # Try to parse list of objects (assuming comma-separated)
    detected_objects = [obj.strip().lower() for obj in answer.split(",") if obj.strip()]

    print("🔍 Detected objects (via VQA):", detected_objects)
    return detected_objects

# ========== STEP 2: EVALUATE CAPTION ==========
def evaluate_caption_with_llm(image_path, input_caption, detected_objects):
    with Image.open(image_path).convert("RGB") as image:

        # Load processors
        vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        # Run VQA (on GPU)
        vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")
        object_actions = []
        for obj in detected_objects:
            question = f"What is the {obj} doing?"
            inputs = vqa_processor(image, question, return_tensors="pt").to("cuda")
            with torch.no_grad():
                output = vqa_model.generate(**inputs)
            action = vqa_processor.decode(output[0], skip_special_tokens=True)
            object_actions.append(f"The {obj} is {action}.")
            del inputs, output
            torch.cuda.empty_cache()

        # Background description
        scene_descriptions = []
        for q in ["Describe the background of the scene"]:
            inputs = vqa_processor(image, q, return_tensors="pt").to("cuda")
            with torch.no_grad():
                output = vqa_model.generate(**inputs)
            scene_descriptions.append(vqa_processor.decode(output[0], skip_special_tokens=True))
            del inputs, output
            torch.cuda.empty_cache()

        # Unload VQA
        vqa_model.cpu()
        del vqa_model
        torch.cuda.empty_cache()

        # Run Captioning (on GPU)
        caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")
        inputs = caption_processor(image, return_tensors="pt").to("cuda")
        with torch.no_grad():
            caption_ids = caption_model.generate(**inputs)
        blip_caption = caption_processor.decode(caption_ids[0], skip_special_tokens=True)
        del caption_model, inputs, caption_ids
        torch.cuda.empty_cache()

    # Run LLM (on CPU for safety)
    llm = pipeline(
        "text-generation",
        model="tiiuae/Falcon3-1B-Instruct",
        device="cpu",  # Use "cuda:0" if memory is available
        trust_remote_code=True,
        max_new_tokens=512
    )

    llm_input = f"""
Ground truth information:
- BLIP Caption: {blip_caption}
- Detected objects: {', '.join(detected_objects)}
- Object actions: {' '.join(object_actions)}
- Scene: {', '.join(scene_descriptions)}

Now evaluate the following caption:
"{input_caption}"

Answer these:
1. Does this caption accurately reflect the ground truth information?
2. Are there any details missing?
3. Are there any inaccuracies in the caption?
4. Provide a score between 0 and 10 to describe how accurate this caption is.
5. Use the ground truth information to generate a new caption if necessary.
"""

    llm_response = llm(llm_input)[0]["generated_text"].split(llm_input)[-1].strip()

    return {
        "image_caption": blip_caption,
        "object_actions": object_actions,
        "scene_descriptions": scene_descriptions,
        "llm_evaluation": llm_response
    }

# ========== RUN THE FULL PIPELINE ==========
# image_path = "bullshit.jpg"
# input_caption = "A dog in the street"

# detected_objects = detect_objects(image_path)
# result = evaluate_caption_with_llm(
#     image_path=image_path,
#     input_caption=input_caption,
#     detected_objects=detected_objects
# )

# from pprint import pprint
# pprint(result["llm_evaluation"])


In [3]:
def run_tifa_evaluation(image_path, caption):
    import torch
    import re
    from difflib import SequenceMatcher
    from transformers import (
        AutoModelForCausalLM, AutoTokenizer,
        BlipProcessor, BlipForQuestionAnswering,
        BitsAndBytesConfig
    )
    from PIL import Image

    # Load TIFA Question Generator
    model_name = "tifa-benchmark/llama2_tifa_question_generation"
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    tifa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    tifa_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto"
    )

    # Format TIFA prompt
    def create_qg_prompt(caption):
        intro = (
            "Given an image description, generate one or two multiple-choice questions "
            "that verifies if the image description is correct.\n"
            "Classify each concept into a type (object, human, animal, food, activity, "
            "attribute, counting, color, material, spatial, location, shape, other), and "
            "then generate a question for each type.\n"
        )
        return f"<s>[INST] <<SYS>>\n{intro}<</SYS>>\n\nDescription: {caption} [/INST] Entities:"

    prompt = create_qg_prompt(caption)
    inputs = tifa_tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = tifa_model.generate(
            **inputs,
            do_sample=False,
            num_beams=3,
            max_length=256
        )

    tifa_output = tifa_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract questions and expected answers
    questions = re.findall(r'Q:\s(.+?)\nChoices:', tifa_output)
    expected_answers = re.findall(r'A:\s(.+?)\n', tifa_output)

    # Load BLIP VQA model
    vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

    image = Image.open(image_path).convert("RGB")

    def compute_similarity(a, b):
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    correct_count = 0
    similarity_scores = []

    print(f"\n📷 Evaluating Caption: \"{caption}\"")
    print(f"🧠 Generated {len(questions)} TIFA Questions\n")

    for i, (q, expected) in enumerate(zip(questions, expected_answers)):
        vqa_inputs = vqa_processor(image, q, return_tensors="pt").to("cuda")

        with torch.no_grad():
            vqa_output = vqa_model.generate(**vqa_inputs, max_length=10)

        predicted = vqa_processor.decode(vqa_output[0], skip_special_tokens=True)
        similarity = compute_similarity(predicted, expected)
        similarity_scores.append(similarity)
        exact = predicted.lower() == expected.lower()
        if exact:
            correct_count += 1

        print(f"Q{i+1}: {q}")
        print(f"Expected: {expected}")
        print(f"Predicted: {predicted}")
        print(f"Similarity: {similarity:.2f} {'✅' if exact else '❌'}")
        print("-" * 40)

    accuracy = (correct_count / len(questions)) * 100 if questions else 0
    avg_similarity = sum(similarity_scores) / len(questions) if questions else 0

    print(f"🎯 Final Accuracy: {accuracy:.2f}%")
    print(f"🔁 Avg Similarity Score: {avg_similarity:.2f}\n")

    return {
        "questions": questions,
        "expected_answers": expected_answers,
        "accuracy": accuracy,
        "avg_similarity": avg_similarity
    }


In [None]:
image_path = "2.jpg"
input_caption = "A man standing next to sheep"

detected_objects = detect_objects(image_path)
result = evaluate_caption_with_llm(
    image_path=image_path,
    input_caption=input_caption,
    detected_objects=detected_objects
)

from pprint import pprint
pprint(result["llm_evaluation"])

In [7]:
print(detected_objects)
pprint(result)

['jesus']
{'image_caption': 'a man in a blue robe and a sheep',
 'llm_evaluation': '<|assistant|>\n'
                   '1. **Does this caption accurately reflect the ground truth '
                   'information?**\n'
                   '   - **Answer:** No, this caption does not accurately '
                   'reflect the ground truth information. The ground truth '
                   'mentions a man in a blue robe and a sheep, but the caption '
                   'only states "A man standing next to sheep."\n'
                   '\n'
                   '2. **Are there any details missing?**\n'
                   '   - **Answer:** Yes, the ground truth provides specific '
                   "details such as the man's attire (blue robe) and the "
                   'presence of a sheep. The caption lacks these details.\n'
                   '\n'
                   '3. **Are there any inaccuracies in the caption?**\n'
                   '   - **Answer:** Yes, there is an inaccuracy. 

In [None]:
import re

llm_raw = result["llm_evaluation"]
llm_cleaned = llm_raw.encode('utf-8').decode('unicode_escape')

# Extract the caption answer after question 5
match = re.search(
    r'5\..*?\n\s*[-•]?\s*["“]?(.+?)["”]?\s*(?:\n|$)',
    llm_cleaned,
    re.DOTALL
)

if match:
    new_caption = match.group(1).strip(' "\'`*')
    print(f'\n✅ New Caption Suggested by LLM:\n"{new_caption}"')
else:
    print("❌ Could not extract the new caption.")



✅ New Caption Suggested by LLM:
"Answer:** "A man in a blue robe and a sheep standing next to a group of goats."


In [None]:
# new_caption = "A great white shark swimming in the ocean."
import gc
torch.cuda.empty_cache()
gc.collect()
run_tifa_evaluation("2.jpg", input_caption)        # Original caption
run_tifa_evaluation("2.jpg", new_caption)
