In [1]:
# Install required packages (works for both Colab and local Jupyter)
!pip install -U qwen-vl-utils datasets torch torchvision transformers accelerate
!pip install -U ipywidgets widgetsnbextension



In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from datasets import load_dataset
from tqdm import tqdm
import torch

# evaluation
import os
import re
import sys

In [3]:
## setting of important macros
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
SUBSET_SIZE = 50
SHUFFLE_BUFFER_SIZE = SUBSET_SIZE * 10
SEED = 42

In [4]:
# Load VQAv2 dataset - using the mirror version that doesn't require dataset scripts
print("Loading VQAv2 dataset with streaming...")
dataset = load_dataset("Multimodal-Fatima/VQAv2_validation", split="validation", streaming=True)

# Shuffle dataset before taking subset to ensure representative sampling
print(f"Shuffle the dataset, buffer size of {SHUFFLE_BUFFER_SIZE}...")
shuffled_dataset = dataset.shuffle(seed=SEED, buffer_size=SHUFFLE_BUFFER_SIZE)

# Take subset and convert to list (needed for multiple iterations)
print(f"Taking subset of {SUBSET_SIZE} samples...")
samples = list(shuffled_dataset.take(SUBSET_SIZE))
print(f"Loaded {len(samples)} samples")

Loading VQAv2 dataset with streaming...


Resolving data files:   0%|          | 0/90 [00:00<?, ?it/s]

Shuffle the dataset, buffer size of 500...
Taking subset of 50 samples...
Loaded 50 samples


In [None]:
# Preview first sample
samples[0:5]

{'question_type': 'how many',
 'multiple_choice_answer': '1',
 'answers': ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
 'answers_original': [{'answer': '1',
   'answer_confidence': 'yes',
   'answer_id': 1},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 3},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 5},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 7},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 10}],
 'id_image': 232309,
 'answer_type': 'number',
 'question_id': 232309002,
 'question': 'How many kites are in the picture?',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x332>,
 'id': 169159,
 

In [6]:
# Check if running in Colab and set device accordingly
try:
    import google.colab
    IN_COLAB = True
    device_map = "cuda" if torch.cuda.is_available() else "cpu"
    print("Running in Google Colab")
except:
    IN_COLAB = False
    device_map = "auto"
    print("Running in local Jupyter notebook")

print(f"Device map: {device_map}")

# Disable download progress bars to avoid widget errors
# I guess it can be tested since it improves usability by a lot
# try removing these lines and if a layout error appears, then put them back
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
from huggingface_hub import utils as hf_utils
hf_utils.disable_progress_bars()

# Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    dtype="auto",
    device_map=device_map
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )
# Re-enable progress bars for everything else (dataset loading, inference loops, etc.)
del os.environ['HF_HUB_DISABLE_PROGRESS_BARS']
hf_utils.enable_progress_bars()

# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

## ONE OF THE EXPERIMENTS FOR VRAM USAGE ?
# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
#
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

Running in local Jupyter notebook
Device map: auto


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [7]:
## ZERO-SHOT EVALUATION

# Get the appropriate device (works for both Colab and local)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Process each sample
results = []
for idx, sample in enumerate(tqdm(samples, desc="Processing VQAv2")):
    # Prepare message for this sample
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": sample["image"]},
                {"type": "text", "text": sample["question"]},
            ],
        }
    ]
    
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Store results with all metadata needed for VQA evaluation
    # FIXED: Extract answer strings from the list of answer dictionaries
    # VQAv2 dataset format has answers as list of dicts: [{'answer': 'text'}, ...]
    ground_truth_answers = [ans['answer'] if isinstance(ans, dict) else ans for ans in sample["answers"]]
    
    results.append({
        "question": sample["question"],
        "predicted_answer": output_text[0],
        "ground_truth_answers": ground_truth_answers,  # Now contains list of strings
        "question_type": sample.get("question_type", "unknown"),
        "answer_type": sample.get("answer_type", "unknown"),
        "question_id": sample.get("question_id", idx)
    })

print(f"\nProcessed {len(results)} samples")


Using device: cuda


Processing VQAv2: 100%|██████████| 50/50 [00:19<00:00,  2.56it/s]


Processed 50 samples





34min to precess 850 images (accidental keyboard interrupt) ~ around 4-5min each 100 samples

on my (jacopo) italian pc: around 0.5s per sample

In [8]:
# Check the structure of results
print("Sample result:")
print(results[0])
print("\nGround truth answers structure:")
print(results[0]['ground_truth_answers'])

Sample result:
{'question': 'How many kites are in the picture?', 'predicted_answer': 'There is one kite in the picture.', 'ground_truth_answers': ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'], 'question_type': 'how many', 'answer_type': 'number', 'question_id': 232309002}

Ground truth answers structure:
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']


In [16]:
results[:10]

[{'question': 'How many kites are in the picture?',
  'predicted_answer': 'There is one kite in the picture.',
  'ground_truth_answers': ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
  'question_type': 'how many',
  'answer_type': 'number',
  'question_id': 232309002},
 {'question': 'What does the blender have on it?',
  'predicted_answer': 'The blender has duct tape wrapped around it.',
  'ground_truth_answers': ['fruit',
   'fruit',
   'tape',
   'fruit',
   'tape',
   'buttons',
   'duct tape',
   'tape',
   'duct tape',
   'masking tape'],
  'question_type': 'what does the',
  'answer_type': 'other',
  'question_id': 519359000},
 {'question': 'Is this player on an organized team?',
  'predicted_answer': ' player on an organized team(641,191),(749,624)',
  'ground_truth_answers': ['no',
   'no',
   'no',
   'no',
   'no',
   'no',
   'no',
   'no',
   'no',
   'no'],
  'question_type': 'is this',
  'answer_type': 'yes/no',
  'question_id': 232511000},
 {'question': 'What is th

## VQAv2 Official Evaluation

Now we'll use the official VQA evaluation toolkit to compute accuracy.

In [None]:
# Install official VQA evaluation toolkit
!pip install --quiet matplotlib scikit-image
!git clone https://github.com/GT-Vision-Lab/VQA.git
!wget -q https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip
!wget -q https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip
!unzip -q v2_Annotations_Val_mscoco.zip
!unzip -q v2_Questions_Val_mscoco.zip

In [None]:
import json
import sys

# Add VQA toolkit to path
sys.path.append('VQA/PythonEvaluationTools')
sys.path.append('VQA/PythonHelperTools')

# Format results for VQA evaluation
# The official toolkit expects a list of dicts with 'question_id' and 'answer'
vqa_results = [
    {
        'question_id': int(result['question_id']),
        'answer': result['predicted_answer']  # Raw model output
    }
    for result in results
]

# Save results in VQA format
with open('qwen2b_vqa_results.json', 'w') as f:
    json.dump(vqa_results, f)

print(f"Saved {len(vqa_results)} results to qwen2b_vqa_results.json")
print(f"\nExample result format:")
print(json.dumps(vqa_results[0], indent=2))

In [None]:
from vqa import VQA
from vqaEval import VQAEval

# Paths to VQAv2 annotation files
annFile = 'v2_mscoco_val2014_annotations.json'
quesFile = 'v2_OpenEnded_mscoco_val2014_questions.json'
resFile = 'qwen2b_vqa_results.json'

# Load VQA annotations and questions
print("Loading VQA annotations and questions...")
vqa = VQA(annFile, quesFile)

# Load results
print("Loading results...")
vqaRes = vqa.loadRes(resFile, quesFile)

# Create VQAEval object and evaluate
print("\nEvaluating results...")
vqaEval = VQAEval(vqa, vqaRes, n=2)  # n=2 for VQAv2
vqaEval.evaluate()

print("\n" + "="*50)
print("VQAv2 EVALUATION RESULTS")
print("="*50)

In [None]:
# Overall accuracy
print(f"\n{'Overall Accuracy:':<30} {vqaEval.accuracy['overall']:.2f}%")

# Accuracy by answer type
print("\n" + "="*50)
print("ACCURACY BY ANSWER TYPE")
print("="*50)
for ansType in vqaEval.accuracy['perAnswerType']:
    print(f"{ansType:<30} {vqaEval.accuracy['perAnswerType'][ansType]:.2f}%")

# Accuracy by question type (top 10)
print("\n" + "="*50)
print("ACCURACY BY QUESTION TYPE (Top 10)")
print("="*50)
quesTypes = sorted(vqaEval.accuracy['perQuestionType'].items(), 
                   key=lambda x: x[1], reverse=True)[:10]
for quesType, acc in quesTypes:
    print(f"{quesType:<30} {acc:.2f}%")

In [None]:
# Show some example predictions vs ground truth
print("\n" + "="*50)
print("SAMPLE PREDICTIONS")
print("="*50)

import random
random.seed(42)
sample_indices = random.sample(range(len(results)), min(5, len(results)))

for idx in sample_indices:
    result = results[idx]
    print(f"\nQuestion: {result['question']}")
    print(f"Predicted: {result['predicted_answer']}")
    print(f"Ground Truth: {result['ground_truth_answers'][:3]}...")  # Show first 3
    print(f"Type: {result['answer_type']}")

In [None]:
# Save evaluation summary
eval_summary = {
    'model': MODEL_ID,
    'subset_size': len(results),
    'overall_accuracy': vqaEval.accuracy['overall'],
    'accuracy_by_answer_type': vqaEval.accuracy['perAnswerType'],
    'accuracy_by_question_type': vqaEval.accuracy['perQuestionType']
}

with open('qwen2b_evaluation_summary.json', 'w') as f:
    json.dump(eval_summary, f, indent=2)

print("\nEvaluation summary saved to qwen2b_evaluation_summary.json")