# VQA Evaluation for Verbose Answers - Adapted for Colab

This notebook adapts the official VQA evaluation code from https://github.com/GT-Vision-Lab/VQA to work in Google Colab for evaluating verbose answers from VQAv2.

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install qwen-vl-utils datasets torch torchvision transformers accelerate
!pip install ipywidgets widgetsnbextension

## 2. Download VQA Evaluation Code

In [None]:
# Download VQA evaluation files from GitHub
import os
import urllib.request

# Destination folder for the downloaded evaluation sources
os.makedirs('vqa_eval', exist_ok=True)

# Raw-file locations inside the GT-Vision-Lab/VQA repository
base_url = 'https://raw.githubusercontent.com/GT-Vision-Lab/VQA/master/PythonEvaluationTools/'
vqa_eval_url = base_url + 'vqaEvaluation/vqaEval.py'
vqa_url = 'https://raw.githubusercontent.com/GT-Vision-Lab/VQA/master/PythonHelperTools/vqaTools/vqa.py'

# (progress message, source URL, local destination), fetched in this order:
# the evaluator first, then the helper tools
for message, source, destination in (
    ('Downloading vqaEval.py...', vqa_eval_url, 'vqa_eval/vqaEval.py'),
    ('Downloading vqa.py...', vqa_url, 'vqa_eval/vqa.py'),
):
    print(message)
    urllib.request.urlretrieve(source, destination)

print('Download complete!')

## 3. Fix Python 2 to Python 3 Compatibility

In [None]:
# Convert Python 2 print statements to Python 3
import re

def convert_py2_to_py3(filename):
    """Rewrite Python 2 ``print`` statements in ``filename`` as Python 3 calls.

    The file is read, transformed with regular expressions and written back
    in place. Two statement shapes are handled:

    * ``print '<text>' % (<args>)`` / ``print "<text>" % (<args>)``
    * ``print '<text>'`` / ``print "<text>"``

    Limitations: this is a textual rewrite, not a parser. ``print`` statements
    whose argument is not a quoted string (for example ``print some_value``)
    are left untouched.

    :param filename (str): path of the source file to convert in place
    """
    with open(filename, 'r') as f:
        content = f.read()

    # %-formatted prints must be handled BEFORE the plain-string patterns:
    # otherwise the plain pattern consumes the string part and leaves
    # "print('...')%(...)", which fails at runtime in Python 3.
    # The %-operator is kept as is because the format strings use %-style
    # placeholders, which str.format() would not substitute.
    content = re.sub(r"print '([^']*)'[ ]*%[ ]*\(([^)]*)\)", r"print('\1' % (\2))", content)
    content = re.sub(r'print "([^"]*)"[ ]*%[ ]*\(([^)]*)\)', r'print("\1" % (\2))', content)

    # Plain string prints
    content = re.sub(r"print '([^']*)'", r"print('\1')", content)
    content = re.sub(r'print "([^"]*)"', r'print("\1")', content)

    with open(filename, 'w') as f:
        f.write(content)
    print(f'Converted {filename} to Python 3')

# Convert both downloaded sources in place, evaluator first
for source_path in ('vqa_eval/vqaEval.py', 'vqa_eval/vqa.py'):
    convert_py2_to_py3(source_path)

## 4. VQA Evaluation Classes (Python 3 Compatible)

In [None]:
# VQA Evaluation class adapted for Python 3 and verbose answers
import re
import json

class VQAEval:
    """Compute VQA accuracy of predicted answers against ground-truth annotations.

    For each question the predicted answer is compared, by exact string match
    after normalisation, with the answers given by the annotators. For every
    annotator the score is ``min(1, matches among the OTHER annotators / 3)``;
    the question score is the mean over annotators.

    Results are stored on the instance after :meth:`evaluate`:

    * ``accuracy``      - ``{'overall', 'perQuestionType', 'perAnswerType'}`` in percent
    * ``evalQA``        - per-question accuracy in percent, keyed by question id
    * ``evalQuesType``  - per-question accuracy grouped by question type
    * ``evalAnsType``   - per-question accuracy grouped by answer type
    """

    def __init__(self, vqa, vqaRes, n=2):
        """
        :param vqa: ground-truth object exposing ``getQuesIds()`` and a ``qa`` dict
        :param vqaRes: result object exposing a ``qa`` dict with an ``'answer'`` per question id
        :param n (int): number of decimals kept when rounding accuracies
        """
        self.n = n
        self.accuracy = {}
        self.evalQA = {}
        self.evalQuesType = {}
        self.evalAnsType = {}
        self.vqa = vqa
        self.vqaRes = vqaRes
        self.params = {'question_id': vqa.getQuesIds()}
        # Apostrophe-less spellings mapped to their contracted form
        self.contractions = {
            "aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've",
            "couldnt": "couldn't", "couldn'tve": "couldn't've", "couldnt've": "couldn't've",
            "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't",
            "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't",
            "havent": "haven't", "hed": "he'd", "hed've": "he'd've", "he'dve": "he'd've",
            "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's",
            "Id've": "I'd've", "I'dve": "I'd've", "Im": "I'm", "Ive": "I've",
            "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've",
            "itll": "it'll", "let's": "let's", "maam": "ma'am", "mightnt": "mightn't",
            "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
            "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't",
            "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
            "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at",
            "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've",
            "shes": "she's", "shouldve": "should've", "shouldnt": "shouldn't",
            "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've",
            "somebody'd": "somebodyd", "somebodyd've": "somebody'd've",
            "somebody'dve": "somebody'd've", "somebodyll": "somebody'll",
            "somebodys": "somebody's", "someoned": "someone'd",
            "someoned've": "someone'd've", "someone'dve": "someone'd've",
            "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd",
            "somethingd've": "something'd've", "something'dve": "something'd've",
            "somethingll": "something'll", "thats": "that's", "thered": "there'd",
            "thered've": "there'd've", "there'dve": "there'd've", "therere": "there're",
            "theres": "there's", "theyd": "they'd", "theyd've": "they'd've",
            "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're",
            "theyve": "they've", "twas": "'twas", "wasnt": "wasn't",
            "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've",
            "werent": "weren't", "whatll": "what'll", "whatre": "what're",
            "whats": "what's", "whatve": "what've", "whens": "when's",
            "whered": "where'd", "wheres": "where's", "whereve": "where've",
            "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've",
            "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
            "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've",
            "wouldnt": "wouldn't", "wouldnt've": "wouldn't've", "wouldn'tve": "wouldn't've",
            "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll",
            "yall'd've": "y'all'd've", "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've",
            "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've",
            "youll": "you'll", "youre": "you're", "youve": "you've"
        }
        # Number words (and 'none') mapped to digits
        self.manualMap = {
            'none': '0',
            'zero': '0',
            'one': '1',
            'two': '2',
            'three': '3',
            'four': '4',
            'five': '5',
            'six': '6',
            'seven': '7',
            'eight': '8',
            'nine': '9',
            'ten': '10'
        }
        self.articles = ['a', 'an', 'the']
        # Raw strings: "\d" and "\," are invalid escapes in ordinary string literals.
        # NOTE(review): `(?!<=\d)` looks like a mistyped look-behind `(?<!\d)`. It is
        # left unchanged because altering it would change which periods are stripped,
        # and therefore the scores.
        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [';', r"/", '[', ']', '"', '{', '}',
                      '(', ')', '=', '+', '\\', '_', '-',
                      '>', '<', '@', '`', ',', '?', '!']

    def evaluate(self, quesIds=None):
        """Score predictions and fill ``accuracy`` / ``evalQA`` / ``evalQuesType`` / ``evalAnsType``.

        :param quesIds (list | None): question ids to score. When ``None``, every
            ground-truth question id that has an entry in ``vqaRes.qa`` is scored,
            so a result set covering only part of the annotations can be evaluated.
        :raises KeyError: if an explicitly requested id is missing from the
            ground truth or from the results
        :raises ValueError: if there is no question to score
        """
        if quesIds is None:
            allQuesIds = list(self.params['question_id'])
            quesIds = [quesId for quesId in allQuesIds if quesId in self.vqaRes.qa]
            if len(quesIds) < len(allQuesIds):
                print("Evaluating {} of {} annotated questions (the others have no prediction)".format(
                    len(quesIds), len(allQuesIds)))
        if len(quesIds) == 0:
            raise ValueError("No question ids to evaluate: the results share no question id with the annotations")

        gts = {}
        res = {}
        for quesId in quesIds:
            gts[quesId] = self.vqa.qa[quesId]
            res[quesId] = self.vqaRes.qa[quesId]

        # Compute accuracy
        accQA = []
        accQuesType = {}
        accAnsType = {}
        print("Computing accuracy")

        for quesId in quesIds:
            resAns = res[quesId]['answer']
            resAns = resAns.replace('\n', ' ')
            resAns = resAns.replace('\t', ' ')
            resAns = resAns.strip()

            # Work on shallow copies so that normalisation never rewrites the
            # annotations held by self.vqa (keeps evaluate() repeatable).
            gtAnsData = [dict(ansDic) for ansDic in gts[quesId]['answers']]

            # Normalise only when the annotators disagree
            if len(set(ansDic['answer'] for ansDic in gtAnsData)) > 1:
                for ansDic in gtAnsData:
                    ansDic['answer'] = self.processPunctuation(ansDic['answer'])
                    ansDic['answer'] = self.processDigitArticle(ansDic['answer'])
                resAns = self.processPunctuation(resAns)
                resAns = self.processDigitArticle(resAns)

            gtAcc = []
            for gtAnsDatum in gtAnsData:
                # Leave-one-out: compare against the other annotators' answers
                otherGTAns = [item for item in gtAnsData if item != gtAnsDatum]
                matchingAns = [item for item in otherGTAns if item['answer'] == resAns]
                acc = min(1, float(len(matchingAns)) / 3)
                gtAcc.append(acc)

            quesType = gts[quesId]['question_type']
            ansType = gts[quesId]['answer_type']
            avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
            accQA.append(avgGTAcc)

            accQuesType.setdefault(quesType, []).append(avgGTAcc)
            accAnsType.setdefault(ansType, []).append(avgGTAcc)

            self.setEvalQA(quesId, avgGTAcc)
            self.setEvalQuesType(quesId, quesType, avgGTAcc)
            self.setEvalAnsType(quesId, ansType, avgGTAcc)

        self.setAccuracy(accQA, accQuesType, accAnsType)
        print("Done computing accuracy")

    def processPunctuation(self, inText):
        """Return ``inText`` with punctuation removed or replaced by spaces.

        Each character of ``self.punct`` is deleted when it appears next to a
        space in ``inText`` or when ``inText`` contains a digit,digit comma
        group; otherwise it is replaced by a space. Finally every period
        matched by ``self.periodStrip`` is removed.
        """
        outText = inText
        # Loop-invariant: does the input contain a "digit,digit" group?
        hasNumberComma = self.commaStrip.search(inText) is not None
        for p in self.punct:
            if (p + ' ' in inText or ' ' + p in inText) or hasNumberComma:
                outText = outText.replace(p, '')
            else:
                outText = outText.replace(p, ' ')
        # No third positional argument: it is `count`, not `flags`, so passing
        # re.UNICODE there capped the number of stripped periods at 32.
        outText = self.periodStrip.sub("", outText)
        return outText

    def processDigitArticle(self, inText):
        """Lower-case ``inText``, map number words to digits, drop articles and restore contractions."""
        outText = []
        tempText = inText.lower().split()
        for word in tempText:
            # .get (not .setdefault) so the lookup never adds words to manualMap
            word = self.manualMap.get(word, word)
            if word not in self.articles:
                outText.append(word)
        for wordId, word in enumerate(outText):
            if word in self.contractions:
                outText[wordId] = self.contractions[word]
        outText = ' '.join(outText)
        return outText

    def setAccuracy(self, accQA, accQuesType, accAnsType):
        """Store overall, per-question-type and per-answer-type mean accuracies (percent, rounded to ``n``)."""
        self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA), self.n)
        self.accuracy['perQuestionType'] = {
            quesType: round(100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]), self.n)
            for quesType in accQuesType
        }
        self.accuracy['perAnswerType'] = {
            ansType: round(100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n)
            for ansType in accAnsType
        }

    def setEvalQA(self, quesId, acc):
        """Record the accuracy (percent) of a single question."""
        self.evalQA[quesId] = round(100 * acc, self.n)

    def setEvalQuesType(self, quesId, quesType, acc):
        """Record the accuracy (percent) of a question under its question type."""
        if quesType not in self.evalQuesType:
            self.evalQuesType[quesType] = {}
        self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)

    def setEvalAnsType(self, quesId, ansType, acc):
        """Record the accuracy (percent) of a question under its answer type."""
        if ansType not in self.evalAnsType:
            self.evalAnsType[ansType] = {}
        self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)

In [None]:
# VQA class for loading annotations and questions
import json
import datetime
import copy

class VQA:
    """Helper for loading VQA annotations/questions and indexing them by id.

    Indexes built by :meth:`createIndex`:

    * ``qa``      - question id -> annotation dict
    * ``qqa``     - question id -> question dict
    * ``imgToQA`` - image id -> list of annotation dicts
    """

    def __init__(self, annotation_file=None, question_file=None):
        """
        Constructor of VQA helper class for reading and visualizing questions and answers.
        Nothing is loaded unless both files are given.
        :param annotation_file (str): location of VQA annotation file
        :param question_file (str): location of VQA question file
        """
        # load dataset
        self.dataset = {}
        self.questions = {}
        self.qa = {}
        self.qqa = {}
        self.imgToQA = {}

        if annotation_file is not None and question_file is not None:
            print('Loading VQA annotations and questions into memory...')
            time_t = datetime.datetime.now(datetime.timezone.utc)
            # context managers so the file handles are closed deterministically
            with open(annotation_file, 'r') as f:
                dataset = json.load(f)
            with open(question_file, 'r') as f:
                questions = json.load(f)
            print(datetime.datetime.now(datetime.timezone.utc) - time_t)
            self.dataset = dataset
            self.questions = questions
            self.createIndex()

    def createIndex(self):
        """Build ``qa``, ``qqa`` and ``imgToQA`` from ``self.dataset`` and ``self.questions``.

        Every annotation must carry ``'image_id'`` and ``'question_id'``.
        """
        # create index
        print('Creating index...')
        imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
        qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
        qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
        for ann in self.dataset['annotations']:
            imgToQA[ann['image_id']] += [ann]
            qa[ann['question_id']] = ann
        for ques in self.questions['questions']:
            qqa[ques['question_id']] = ques
        print('Index created!')

        # create class members
        self.qa = qa
        self.qqa = qqa
        self.imgToQA = imgToQA

    def info(self):
        """
        Print information about the VQA annotation file.
        """
        for key, value in self.dataset['info'].items():
            print(f'{key}: {value}')

    def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
        """
        Get question ids that satisfy given filter conditions.
        An empty filter means "no restriction"; a non-list value is treated as a
        single-element list.
        :param imgIds (list): keep questions of these image ids
        :param quesTypes (list): keep questions of these question types
        :param ansTypes (list): keep questions of these answer types
        :return: list of question ids
        """
        imgIds = imgIds if type(imgIds) == list else [imgIds]
        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]

        if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
            anns = self.dataset['annotations']
        else:
            if not len(imgIds) == 0:
                anns = sum([self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA], [])
            else:
                anns = self.dataset['annotations']
            anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]
            anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]
        ids = [ann['question_id'] for ann in anns]
        return ids

    def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
        """
        Get image ids that satisfy given filter conditions.
        An empty filter means "no restriction"; a non-list value is treated as a
        single-element list.
        :param quesIds (list): keep images of these question ids
        :param quesTypes (list): keep images of these question types
        :param ansTypes (list): keep images of these answer types
        :return: list of image ids (one entry per matching annotation)
        """
        quesIds = quesIds if type(quesIds) == list else [quesIds]
        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]

        if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
            anns = self.dataset['annotations']
        else:
            if not len(quesIds) == 0:
                anns = [self.qa[quesId] for quesId in quesIds if quesId in self.qa]
            else:
                anns = self.dataset['annotations']
            anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]
            anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]
        ids = [ann['image_id'] for ann in anns]
        return ids

    def loadQA(self, ids=[]):
        """
        Load questions and answers with the specified question ids.
        :param ids (list | int): question id(s)
        :return: list of annotation dicts (None for any other argument type)
        """
        if type(ids) == list:
            return [self.qa[id] for id in ids]
        elif type(ids) == int:
            return [self.qa[ids]]

    def showQA(self, anns):
        """
        Display the specified annotations.
        :param anns (list): annotation dicts to print
        :return: 0 when ``anns`` is empty, otherwise None
        """
        if len(anns) == 0:
            return 0
        for ann in anns:
            quesId = ann['question_id']
            print(f"Question: {self.qqa[quesId]['question']}")
            for ans in ann['answers']:
                print(f"Answer {ans['answer_id']}: {ans['answer']}")

    def loadRes(self, resFile, quesFile):
        """
        Load result file and return a result object.

        The result file must be a JSON list of ``{'question_id', 'answer'}``
        objects. It may cover only part of the annotated questions, but every
        question id it contains must exist in the annotation file.

        :param resFile (str): location of the result file
        :param quesFile (str): location of the question file
        :return: VQA object whose annotations are the predictions, each enriched
            with the ``image_id``, ``question_type`` and ``answer_type`` of the
            matching ground-truth annotation
        :raises AssertionError: if the results are not a list or contain a
            question id that is absent from the annotation file
        """
        res = VQA()
        with open(quesFile, 'r') as f:
            res.questions = json.load(f)
        for key in ('info', 'task_type', 'data_type', 'data_subtype', 'license'):
            res.dataset[key] = copy.deepcopy(self.questions[key])

        print('Loading and preparing results...')
        time_t = datetime.datetime.now(datetime.timezone.utc)
        with open(resFile, 'r') as f:
            anns = json.load(f)
        assert type(anns) == list, 'results is not an array of objects'

        annsQuesIds = set(ann['question_id'] for ann in anns)
        knownQuesIds = set(self.getQuesIds())
        assert annsQuesIds <= knownQuesIds, \
            'Results do not correspond to current VQA set. There is at least one question id that does not belong to the question ids in the annotation file.'
        numMissing = len(knownQuesIds - annsQuesIds)
        if numMissing > 0:
            print('Note: {} annotated questions have no prediction in the results'.format(numMissing))

        # Always defined, so an empty result list still yields a valid (empty) index
        res.dataset['annotations'] = []
        for ann in anns:
            quesId = ann['question_id']
            qaAnn = self.qa[quesId]
            res.dataset['annotations'].append({
                'question_id': quesId,
                'answer': ann['answer'],
                # createIndex() reads 'image_id'; the filters in getQuesIds/getImgIds
                # read the two type fields
                'image_id': qaAnn['image_id'],
                'question_type': qaAnn['question_type'],
                'answer_type': qaAnn['answer_type']
            })

        print('DONE (t={:0.2f}s)'.format(
            (datetime.datetime.now(datetime.timezone.utc) - time_t).total_seconds()))
        res.createIndex()
        return res

## 5. Download VQAv2 Data

Download the VQAv2 validation annotation and question files (zip archives hosted on the VQA project's S3 bucket) and extract them into `vqa_data/`.

In [None]:
# Download VQAv2 validation annotations and questions
import urllib.request
import zipfile

# Make sure the target folder exists
os.makedirs('vqa_data', exist_ok=True)

# Archive locations on the VQA S3 bucket
ann_url = 'https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip'
ques_url = 'https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip'

# (progress message, source URL, local archive path): annotations first, then questions
archives = (
    ('Downloading VQAv2 validation annotations...', ann_url, 'vqa_data/v2_Annotations_Val_mscoco.zip'),
    ('Downloading VQAv2 validation questions...', ques_url, 'vqa_data/v2_Questions_Val_mscoco.zip'),
)

for message, url, archive_path in archives:
    print(message)
    urllib.request.urlretrieve(url, archive_path)

# Unpack both archives into the data folder, in the same order
print('Extracting files...')
for archive_path in [entry[2] for entry in archives]:
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('vqa_data/')

print('Download complete!')
print('\nAnnotation file: vqa_data/v2_mscoco_val2014_annotations.json')
print('Question file: vqa_data/v2_OpenEnded_mscoco_val2014_questions.json')

## 6. Run Your Model Inference (Qwen-VL)

Generate answers using your model. This section is adapted from your existing notebook.

In [None]:
# Load your Qwen-VL model
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Checkpoint shared by the model weights and the processor
model_id = "Qwen/Qwen2-VL-2B-Instruct"

# Model weights; dtype and device placement are both requested as "auto"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)

# Processor belonging to the same checkpoint
processor = AutoProcessor.from_pretrained(model_id)

print("Model loaded successfully!")

In [None]:
# Generate predictions on VQAv2 validation set
from datasets import load_dataset
from tqdm.auto import tqdm
import json

# Load VQA dataset
vqa_dataset = load_dataset("HuggingFaceM4/VQAv2", split="validation")

# Generate predictions
results = []
num_samples = 1000  # Adjust this number for full evaluation
# Never index past the end of the split
num_samples = min(num_samples, len(vqa_dataset))

print(f"Generating predictions for {num_samples} samples...")
for i in tqdm(range(num_samples)):
    sample = vqa_dataset[i]

    # Prepare messages: one user turn holding the image and the question
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": sample['image']},
                {"type": "text", "text": sample['question']},
            ],
        }
    ]

    # Prepare for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was placed on (it was loaded with
    # device_map="auto") instead of assuming that a CUDA device exists.
    inputs = inputs.to(model.device)

    # Generate
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated answer is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Store result in the format expected by VQA.loadRes
    results.append({
        'question_id': sample['question_id'],
        'answer': output_text.strip()
    })

# Save results to file
with open('vqa_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\nGenerated {len(results)} predictions")
print("Results saved to: vqa_results.json")

## 7. Evaluate Results

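Use the VQA evaluation code to compute accuracy on your verbose answers. Note that the metric is an exact string match between the (normalised) predicted answer and the annotators' answers, so a full-sentence answer only scores if it reduces to one of the short ground-truth answers.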

In [None]:
# Run VQA evaluation
annFile = 'vqa_data/v2_mscoco_val2014_annotations.json'
quesFile = 'vqa_data/v2_OpenEnded_mscoco_val2014_questions.json'
resFile = 'vqa_results.json'

# Create VQA object for ground truth
print("Loading ground truth annotations...")
vqa = VQA(annFile, quesFile)

# Load results
print("\nLoading results...")
vqaRes = vqa.loadRes(resFile, quesFile)

# Create VQAEval object
print("\nEvaluating results...")
vqaEval = VQAEval(vqa, vqaRes, n=2)

# Evaluate only the questions that actually have a prediction: the results
# file may cover just a subset of the annotated validation questions.
predictedQuesIds = list(vqaRes.qa)
print(f"Scoring {len(predictedQuesIds)} predicted questions")
vqaEval.evaluate(quesIds=predictedQuesIds)

# Print results
print("\n" + "="*50)
print("EVALUATION RESULTS")
print("="*50)
print(f"\nOverall Accuracy: {vqaEval.accuracy['overall']:.2f}%")

print("\nAccuracy by Question Type:")
for quesType in sorted(vqaEval.accuracy['perQuestionType']):
    print(f"  {quesType}: {vqaEval.accuracy['perQuestionType'][quesType]:.2f}%")

print("\nAccuracy by Answer Type:")
for ansType in sorted(vqaEval.accuracy['perAnswerType']):
    print(f"  {ansType}: {vqaEval.accuracy['perAnswerType'][ansType]:.2f}%")

## 8. Save Detailed Results

In [None]:
# Bundle the summary metrics and the per-question scores into one dict
detailed_results = dict(
    overall_accuracy=vqaEval.accuracy['overall'],
    per_question_type=vqaEval.accuracy['perQuestionType'],
    per_answer_type=vqaEval.accuracy['perAnswerType'],
    per_question_accuracy=vqaEval.evalQA,
)

# Serialise first, then write the text out in one go
with open('vqa_evaluation_results.json', 'w') as f:
    f.write(json.dumps(detailed_results, indent=2))

print("Detailed results saved to: vqa_evaluation_results.json")

## 9. Visualize Results (Optional)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Two panels side by side: question types (left) and answer types (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: horizontal bars, one per question type
quesTypes = [quesType for quesType in vqaEval.accuracy['perQuestionType']]
quesAccs = [vqaEval.accuracy['perQuestionType'][quesType] for quesType in quesTypes]

ax1.barh(quesTypes, quesAccs, color='steelblue')
ax1.set(xlabel='Accuracy (%)', title='Accuracy by Question Type', xlim=[0, 100])

# Right panel: vertical bars, one per answer type
ansTypes = [ansType for ansType in vqaEval.accuracy['perAnswerType']]
ansAccs = [vqaEval.accuracy['perAnswerType'][ansType] for ansType in ansTypes]

ax2.bar(ansTypes, ansAccs, color='coral')
ax2.set(ylabel='Accuracy (%)', title='Accuracy by Answer Type', ylim=[0, 100])
ax2.tick_params(axis='x', rotation=45)

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig('vqa_accuracy_plots.png', dpi=300, bbox_inches='tight')
plt.show()

print("Plots saved to: vqa_accuracy_plots.png")

## 10. View Example Predictions

In [None]:
# Show some example predictions
import random
from IPython.display import display

num_examples = 5
# Sample from the questions that were actually scored; never ask for more
# examples than exist (random.sample would raise ValueError).
evaluated_ids = list(vqaEval.evalQA.keys())
example_ids = random.sample(evaluated_ids, min(num_examples, len(evaluated_ids)))

print("Example Predictions:")
print("="*80)

for quesId in example_ids:
    # Get question
    question = vqa.qqa[quesId]['question']

    # Get ground truth answers
    gt_answers = [ans['answer'] for ans in vqa.qa[quesId]['answers']]

    # Get predicted answer
    pred_answer = vqaRes.qa[quesId]['answer']

    # Get accuracy
    accuracy = vqaEval.evalQA[quesId]

    print(f"\nQuestion ID: {quesId}")
    print(f"Question: {question}")
    # sorted() gives the distinct answers a stable order from run to run
    print(f"Ground Truth Answers: {', '.join(sorted(set(gt_answers)))}")
    print(f"Predicted Answer: {pred_answer}")
    print(f"Accuracy: {accuracy:.2f}%")
    print("-"*80)