In [None]:
# Install required packages (works for both Colab and local Jupyter)
!pip install -U qwen-vl-utils datasets torch torchvision transformers accelerate
!pip install -U ipywidgets widgetsnbextension

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jupyter-nbextension` not found.
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--p

In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from datasets import load_dataset
from tqdm import tqdm
import torch

# evaluation
import os
import re
import sys

In [3]:
## setting of important macros
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
SUBSET_SIZE = 50
SHUFFLE_BUFFER_SIZE = SUBSET_SIZE * 10
SEED = 42

In [4]:
# Load VQAv2 dataset - using the mirror version that doesn't require dataset scripts
print("Loading VQAv2 dataset with streaming...")
dataset = load_dataset("Multimodal-Fatima/VQAv2_validation", split="validation", streaming=True)

# Shuffle dataset before taking subset to ensure representative sampling
print(f"Shuffle the dataset, buffer size of {SHUFFLE_BUFFER_SIZE}...")
shuffled_dataset = dataset.shuffle(seed=SEED, buffer_size=SHUFFLE_BUFFER_SIZE)

# Take subset and convert to list (needed for multiple iterations)
print(f"Taking subset of {SUBSET_SIZE} samples...")
samples = list(shuffled_dataset.take(SUBSET_SIZE))
print(f"Loaded {len(samples)} samples")

Loading VQAv2 dataset with streaming...


Resolving data files:   0%|          | 0/90 [00:00<?, ?it/s]

Shuffle the dataset, buffer size of 500...
Taking subset of 50 samples...
Loaded 50 samples
Loaded 50 samples


In [5]:
# Preview first sample
samples[0]

{'question_type': 'how many',
 'multiple_choice_answer': '1',
 'answers': ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
 'answers_original': [{'answer': '1',
   'answer_confidence': 'yes',
   'answer_id': 1},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 3},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 5},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 7},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 10}],
 'id_image': 232309,
 'answer_type': 'number',
 'question_id': 232309002,
 'question': 'How many kites are in the picture?',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x332>,
 'id': 169159,
 

In [6]:
# Check if running in Colab and set device accordingly
try:
    import google.colab
    IN_COLAB = True
    device_map = "cuda" if torch.cuda.is_available() else "cpu"
    print("Running in Google Colab")
except:
    IN_COLAB = False
    device_map = "auto"
    print("Running in local Jupyter notebook")

print(f"Device map: {device_map}")

# Disable download progress bars to avoid widget errors
# I guess it can be tested since it improves usability by a lot
# try removing these lines and if a layout error appears, then put them back
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
from huggingface_hub import utils as hf_utils
hf_utils.disable_progress_bars()

# Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    dtype="auto",
    device_map=device_map
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )
# Re-enable progress bars for everything else (dataset loading, inference loops, etc.)
del os.environ['HF_HUB_DISABLE_PROGRESS_BARS']
hf_utils.enable_progress_bars()

# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

## ONE OF THE EXPERIMENTS FOR VRAM USAGE ?
# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
#
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

Running in local Jupyter notebook
Device map: auto


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [None]:
## ZERO-SHOT EVALUATION

# Get the appropriate device (works for both Colab and local)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Process each sample
results = []
for idx, sample in enumerate(tqdm(samples, desc="Processing VQAv2")):
    # Prepare message for this sample
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": sample["image"]},
                {"type": "text", "text": sample["question"]},
            ],
        }
    ]
    
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Store results with all metadata needed for VQA evaluation
    # Extract answer strings from the list of answer dictionaries
    #ground_truth_answers = [ans['answer'] if isinstance(ans, dict) else ans for ans in sample["answers"]]
    
    results.append({
        "question": sample["question"],
        "predicted_answer": output_text[0],
        "ground_truth_answers": sample["answers"],
        #"ground_truth_answers": ground_truth_answers,
        "question_type": sample.get("question_type", "unknown"),
        "answer_type": sample.get("answer_type", "unknown"),
        "question_id": sample.get("question_id", idx)
    })

print(f"\nProcessed {len(results)} samples")

Using device: cuda


Processing VQAv2: 100%|██████████| 50/50 [00:19<00:00,  2.60it/s]


Processed 50 samples





34min to precess 850 images (accidental keyboard interrupt) ~ around 4-5min each 100 samples

on my (jacopo) italian pc: around 0.5s per sample

In [12]:
# Check the structure of results
print("Sample result:")
print(results[0])
print("\nGround truth answers structure:")
print(results[0]['ground_truth_answers'])

Sample result:
{'question': 'How many kites are in the picture?', 'predicted_answer': 'There is one kite in the picture.', 'ground_truth_answers': ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'], 'question_type': 'how many', 'answer_type': 'number', 'question_id': 232309002}

Ground truth answers structure:
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']


# backup code / stuff

In [15]:
##############################################
## EVALUATION FOR VQA
## source: https://github.com/GT-Vision-Lab/VQA/blob/master/PythonEvaluationTools/vqaEvaluation/vqaEval.py
##############################################
# coding=utf-8

import re
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys

#####
#	class for evaluating VQA results
#	params: vqa: VQA object with ground truth annotations
#			vqaRes: VQA object with results to be evaluated
#			n: number of decimal places for accuracy
#####
class VQAEval:
	def __init__(self, vqa, vqaRes, n=2):
		# Number of decimal places for rounding accuracy scores
		self.n 			  = n
		# Dictionary to store overall accuracy and breakdown by question/answer type
		self.accuracy     = {}
		# Dictionary to store per-question accuracy scores
		self.evalQA       = {}
		# Dictionary to store accuracy scores grouped by question type (e.g., "what", "where")
		self.evalQuesType = {}
		# Dictionary to store accuracy scores grouped by answer type (e.g., "yes/no", "number")
		self.evalAnsType  = {}
		# Ground truth VQA object containing correct annotations
		self.vqa 		  = vqa
		# Results VQA object containing model predictions to be evaluated
		self.vqaRes       = vqaRes
		# Parameters storing question IDs to evaluate
		self.params		  = {'question_id': vqa.getQuesIds()}

		# Dictionary mapping contracted forms to their expanded versions
		# Used to normalize answers like "cant" -> "can't"
		self.contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", \
							 "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't", \
							 "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've", \
							 "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", \
							 "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's", \
							 "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've", \
							 "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't", \
							 "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've", \
							 "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've", \
							 "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll", \
							 "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've", \
							 "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've", \
							 "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've", \
							 "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've", \
							 "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't", \
							 "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're", \
							 "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've", \
							 "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll", \
							 "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've", \
							 "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've", \
							 "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've", \
							 "youll": "you'll", "youre": "you're", "youve": "you've"}

		# Dictionary to convert number words to digit strings
		# Ensures "three" and "3" are treated as equivalent
		self.manualMap    = { 'none': '0',
							  'zero': '0',
							  'one': '1',
							  'two': '2',
							  'three': '3',
							  'four': '4',
							  'five': '5',
							  'six': '6',
							  'seven': '7',
							  'eight': '8',
							  'nine': '9',
							  'ten': '10'
							}

		# List of articles to be removed during normalization
		# Ensures "the dog" and "dog" are treated as equivalent
		self.articles     = ['a',
							 'an',
							 'the'
							]

		# Regex pattern to remove periods that are not part of decimal numbers
		self.periodStrip  = re.compile(r"(?!<=\d)(\.)(?!\d)")
		# Regex pattern to handle commas in numbers (e.g., "1,000")
		self.commaStrip   = re.compile(r"(\d)(\,)(\d)")
		# List of punctuation marks to be normalized or removed
		self.punct        = [';', r"/", '[', ']', '"', '{', '}',
							 '(', ')', '=', '+', '\\', '_', '-',
							 '>', '<', '@', '`', ',', '?', '!']


	def evaluate(self, quesIds=None):
		# If no specific question IDs provided, evaluate all questions
		if quesIds == None:
			quesIds = [quesId for quesId in self.params['question_id']]

		# Create dictionaries to hold ground truth and result answers for each question
		gts = {}
		res = {}
		for quesId in quesIds:
			gts[quesId] = self.vqa.qa[quesId]
			res[quesId] = self.vqaRes.qa[quesId]

		# =================================================
		# Compute accuracy
		# =================================================
		# List to store accuracy scores for all questions
		accQA       = []
		# Dictionary to accumulate accuracy scores by question type
		accQuesType = {}
		# Dictionary to accumulate accuracy scores by answer type
		accAnsType  = {}
		print("computing accuracy")
		step = 0

		# Iterate through each question to compute accuracy
		for quesId in quesIds:
			# Clean ground truth answers: replace newlines and tabs with spaces, strip whitespace
			for ansDic in gts[quesId]['answers']:
				ansDic['answer'] = ansDic['answer'].replace('\n', ' ')
				ansDic['answer'] = ansDic['answer'].replace('\t', ' ')
				ansDic['answer'] = ansDic['answer'].strip()

			# Clean the predicted answer in the same way
			resAns = res[quesId]['answer']
			resAns = resAns.replace('\n', ' ')
			resAns = resAns.replace('\t', ' ')
			resAns = resAns.strip()

			# List to store accuracy scores when comparing prediction to each ground truth answer
			gtAcc = []
			# Extract all ground truth answer texts
			gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]

			# Only apply normalization if there are multiple different ground truth answers
			# This preserves exact matches when all annotators agree
			if len(set(gtAnswers)) > 1:
				# Normalize ground truth answers: process punctuation and convert digits/articles
				for ansDic in gts[quesId]['answers']:
					ansDic['answer'] = self.processPunctuation(ansDic['answer'])
					ansDic['answer'] = self.processDigitArticle(ansDic['answer'])
				# Normalize the predicted answer in the same way
				resAns = self.processPunctuation(resAns)
				resAns = self.processDigitArticle(resAns)

			# VQA accuracy metric: for each ground truth answer, check how many OTHER annotators
			# gave the same answer as the prediction
			for gtAnsDatum in gts[quesId]['answers']:
				# Get all OTHER ground truth answers (excluding current one)
				otherGTAns = [item for item in gts[quesId]['answers'] if item!=gtAnsDatum]
				# Count how many of the other annotators gave the same answer as prediction
				matchingAns = [item for item in otherGTAns if item['answer']==resAns]
				# Accuracy is min(1, matching_count/3)
				# If 3+ annotators agree with prediction -> 100% accuracy
				# If 2 annotators agree -> 66.7% accuracy
				# If 1 annotator agrees -> 33.3% accuracy
				# If 0 annotators agree -> 0% accuracy
				acc = min(1, float(len(matchingAns))/3)
				gtAcc.append(acc)

			# Get metadata for this question
			quesType    = gts[quesId]['question_type']
			ansType     = gts[quesId]['answer_type']

			# Average the accuracy across all ground truth answers (usually 10 answers per question)
			avgGTAcc = float(sum(gtAcc))/len(gtAcc)

			# Store this question's accuracy in the overall list
			accQA.append(avgGTAcc)

			# Accumulate accuracy by question type (e.g., "what color", "how many")
			if quesType not in accQuesType:
				accQuesType[quesType] = []
			accQuesType[quesType].append(avgGTAcc)

			# Accumulate accuracy by answer type (e.g., "yes/no", "number", "other")
			if ansType not in accAnsType:
				accAnsType[ansType] = []
			accAnsType[ansType].append(avgGTAcc)

			# Store individual question accuracy
			self.setEvalQA(quesId, avgGTAcc)
			# Store accuracy grouped by question type
			self.setEvalQuesType(quesId, quesType, avgGTAcc)
			# Store accuracy grouped by answer type
			self.setEvalAnsType(quesId, ansType, avgGTAcc)

			# Update progress bar every 100 questions
			if step%100 == 0:
				self.updateProgress(step/float(len(quesIds)))
			step = step + 1

		# Calculate final accuracy metrics (overall and per type)
		self.setAccuracy(accQA, accQuesType, accAnsType)
		print("Done computing accuracy")

	def processPunctuation(self, inText):
		# Process punctuation in text for normalization
		outText = inText
		for p in self.punct:
			# If punctuation is surrounded by spaces or appears in comma-separated numbers
			if (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) != None):
				# Remove the punctuation completely
				outText = outText.replace(p, '')
			else:
				# Otherwise replace with space to separate words
				outText = outText.replace(p, ' ')
		# Remove periods that are not part of decimal numbers
		outText = self.periodStrip.sub("",
									  outText,
									  re.UNICODE)
		return outText

	def processDigitArticle(self, inText):
		# Process text to normalize digits, remove articles, and expand contractions
		outText = []
		# Convert to lowercase and split into words
		tempText = inText.lower().split()
		for word in tempText:
			# Convert number words to digits (e.g., "three" -> "3")
			# If word not in manualMap, keep it unchanged
			word = self.manualMap.setdefault(word, word)
			# Remove articles ("a", "an", "the")
			if word not in self.articles:
				outText.append(word)
			else:
				pass
		# Expand contractions (e.g., "can't" -> "cannot")
		for wordId, word in enumerate(outText):
			if word in self.contractions:
				outText[wordId] = self.contractions[word]
		# Join words back into a single string
		outText = ' '.join(outText)
		return outText

	def setAccuracy(self, accQA, accQuesType, accAnsType):
		# Calculate and store overall accuracy as percentage
		self.accuracy['overall']         = round(100*float(sum(accQA))/len(accQA), self.n)
		# Calculate and store average accuracy for each question type
		self.accuracy['perQuestionType'] = {quesType: round(100*float(sum(accQuesType[quesType]))/len(accQuesType[quesType]), self.n) for quesType in accQuesType}
		# Calculate and store average accuracy for each answer type
		self.accuracy['perAnswerType']   = {ansType:  round(100*float(sum(accAnsType[ansType]))/len(accAnsType[ansType]), self.n) for ansType in accAnsType}

	def setEvalQA(self, quesId, acc):
		# Store accuracy for a specific question (converted to percentage)
		self.evalQA[quesId] = round(100*acc, self.n)

	def setEvalQuesType(self, quesId, quesType, acc):
		# Store accuracy for a question grouped by its question type
		if quesType not in self.evalQuesType:
			self.evalQuesType[quesType] = {}
		self.evalQuesType[quesType][quesId] = round(100*acc, self.n)

	def setEvalAnsType(self, quesId, ansType, acc):
		# Store accuracy for a question grouped by its answer type
		if ansType not in self.evalAnsType:
			self.evalAnsType[ansType] = {}
		self.evalAnsType[ansType][quesId] = round(100*acc, self.n)

	def updateProgress(self, progress):
		# Display a text-based progress bar in the terminal
		barLength = 20
		status = ""
		# Convert integer to float if needed
		if isinstance(progress, int):
			progress = float(progress)
		# Validate progress value
		if not isinstance(progress, float):
			progress = 0
			status = "error: progress var must be float\r\n"
		if progress < 0:
			progress = 0
			status = "Halt...\r\n"
		if progress >= 1:
			progress = 1
			status = "Done...\r\n"
		# Calculate how many blocks to fill in progress bar
		block = int(round(barLength*progress))
		# Format progress bar: [####------------] 40%
		text = "\rFinshed Percent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), int(progress*100), status)
		sys.stdout.write(text)
		sys.stdout.flush()

In [16]:
##############################################
## WRAPPER TO USE VQAEval WITH RESULTS ARRAY
##############################################

# Create wrapper classes that adapt our results format to VQAEval's expected format
class SimpleVQA:
    """Wrapper for ground truth data - preserves all metadata"""
    def __init__(self, results):
        self.qa = {}
        for idx, result in enumerate(results):
            # Use question_id from dataset if available, otherwise use index
            qid = result.get('question_id', idx)
            self.qa[qid] = {
                'question': result['question'],
                'answers': [{'answer': ans} for ans in result['ground_truth_answers']],
                'question_type': result.get('question_type', 'unknown'),
                'answer_type': result.get('answer_type', 'unknown')
            }

    def getQuesIds(self):
        return list(self.qa.keys())

class SimpleVQARes:
    """Wrapper for predicted results"""
    def __init__(self, results):
        self.qa = {}
        for idx, result in enumerate(results):
            # Use question_id from dataset if available, otherwise use index
            qid = result.get('question_id', idx)
            self.qa[qid] = {
                'answer': result['predicted_answer']
            }

# Create VQA objects from your results
print("Creating VQA evaluation objects...")
vqa = SimpleVQA(results)
vqaRes = SimpleVQARes(results)

# Run evaluation
print("Running VQA evaluation...")
vqaEval = VQAEval(vqa, vqaRes, n=2)
vqaEval.evaluate()

# Print results
print("\n" + "="*60)
print("VQA EVALUATION RESULTS")
print("="*60)
print(f"Overall Accuracy: {vqaEval.accuracy['overall']:.2f}%")
print(f"\nEvaluated {len(results)} questions")

# Print per question type accuracy if available
if vqaEval.accuracy['perQuestionType']:
    print("\n" + "-"*60)
    print("Accuracy by Question Type:")
    print("-"*60)
    for qtype, acc in sorted(vqaEval.accuracy['perQuestionType'].items()):
        print(f"  {qtype:30s}: {acc:.2f}%")

# Print per answer type accuracy if available
if vqaEval.accuracy['perAnswerType']:
    print("\n" + "-"*60)
    print("Accuracy by Answer Type:")
    print("-"*60)
    for atype, acc in sorted(vqaEval.accuracy['perAnswerType'].items()):
        print(f"  {atype:30s}: {acc:.2f}%")

print("="*60)

Creating VQA evaluation objects...
Running VQA evaluation...
computing accuracy
Finshed Percent: [--------------------] 0% Done computing accuracy

VQA EVALUATION RESULTS
Overall Accuracy: 0.00%

Evaluated 50 questions

------------------------------------------------------------
Accuracy by Question Type:
------------------------------------------------------------
  are the                       : 0.00%
  are these                     : 0.00%
  are they                      : 0.00%
  can you                       : 0.00%
  could                         : 0.00%
  does this                     : 0.00%
  has                           : 0.00%
  how                           : 0.00%
  how many                      : 0.00%
  how many people are           : 0.00%
  is                            : 0.00%
  is that a                     : 0.00%
  is the                        : 0.00%
  is the man                    : 0.00%
  is there                      : 0.00%
  is there a                   

In [None]:
# Detailed debugging specific example
print("=== DEBUGGING EVALUATION ===\n")
print("First result:")
print(f"Question: {results[0]['question']}")
print(f"Predicted: '{results[0]['predicted_answer']}'")
print(f"Ground truth: {results[0]['ground_truth_answers']}")
print(f"\nQuestion type: {results[0]['question_type']}")
print(f"Answer type: {results[0]['answer_type']}")

# Check what the wrapper creates
print("\n=== WRAPPER OUTPUT ===")
vqa_test = SimpleVQA(results[:1])
vqaRes_test = SimpleVQARes(results[:1])
qid = list(vqa_test.qa.keys())[0]
print(f"Question ID: {qid}")
print(f"Ground truth structure: {vqa_test.qa[qid]}")
print(f"Prediction structure: {vqaRes_test.qa[qid]}")

print("\n=== INDIVIDUAL QUESTION ACCURACY ===")
quesId = results[0].get('question_id', 0)
acc = vqaEval.evalQA[quesId]
print(f"Accuracy for question ID {quesId}: {acc:.2f}%")

=== DEBUGGING EVALUATION ===

First result:
Question: How many kites are in the picture?
Predicted: 'There is one kite in the picture.'
Ground truth: ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']

Question type: how many
Answer type: number

=== WRAPPER OUTPUT ===
Question ID: 232309002
Ground truth structure: {'question': 'How many kites are in the picture?', 'answers': [{'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}], 'question_type': 'how many', 'answer_type': 'number'}
Prediction structure: {'answer': 'There is one kite in the picture.'}

=== INDIVIDUAL QUESTION ACCURACY ===
Accuracy for question ID 232309002: 0.00%


In [18]:
# Manual trace through evaluation logic for first question
print("=== MANUAL EVALUATION TRACE ===\n")

# Get first result
result = results[0]
print(f"Question: {result['question']}")
print(f"Predicted: '{result['predicted_answer']}'")
print(f"Ground truth: {result['ground_truth_answers']}")

# Simulate what VQAEval does
qid = result.get('question_id', 0)
vqa_test = SimpleVQA([result])
vqaRes_test = SimpleVQARes([result])

gts = vqa_test.qa[qid]
res = vqaRes_test.qa[qid]

print(f"\n=== AFTER WRAPPER ===")
print(f"Ground truth answers: {gts['answers']}")
print(f"Predicted answer: '{res['answer']}'")

# Simulate VQA evaluation logic
resAns = res['answer'].strip()
gtAnswers = [ans['answer'] for ans in gts['answers']]

print(f"\n=== VQA SCORING LOGIC ===")
print(f"Cleaned prediction: '{resAns}'")
print(f"All GT answers: {gtAnswers}")
print(f"Unique GT answers: {set(gtAnswers)}")

# Check matches for first GT answer
gtAnsDatum = gts['answers'][0]
otherGTAns = [item for item in gts['answers'] if item!=gtAnsDatum]
print(f"\nFor first GT answer '{gtAnsDatum['answer']}':")
print(f"Other GT answers: {[item['answer'] for item in otherGTAns]}")

matchingAns = [item for item in otherGTAns if item['answer']==resAns]
print(f"Matching answers: {[item['answer'] for item in matchingAns]}")
print(f"Match count: {len(matchingAns)}")
print(f"Accuracy contribution: {min(1, len(matchingAns)/3)}")

=== MANUAL EVALUATION TRACE ===

Question: How many kites are in the picture?
Predicted: 'There is one kite in the picture.'
Ground truth: ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']

=== AFTER WRAPPER ===
Ground truth answers: [{'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}, {'answer': '1'}]
Predicted answer: 'There is one kite in the picture.'

=== VQA SCORING LOGIC ===
Cleaned prediction: 'There is one kite in the picture.'
All GT answers: ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
Unique GT answers: {'1'}

For first GT answer '1':
Other GT answers: []
Matching answers: []
Match count: 0
Accuracy contribution: 0.0
