In [7]:
from datasets import load_dataset
import config

import os
os.environ['CUDA_VISIBLE_DEVICE']='0,1,2,3,4,5,6,7'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'
dataset = load_dataset(
    "./testbed/data/vqav2", split="validation", data_dir=config.vqav2_dir, images_dir=config.coco_dir,
    trust_remote_code=True
)

In [21]:
import json

result = json.load(open("./icl_shot1.json", "r"))
new_result = []
for item in result.values():
    new_result.append(
        {"question_id": item["question_id"], "answer": item["prediction"]}
    )

json.dump(new_result, open("./v2_OpenEnded_mscoco_train2014_results.json", "w"))

In [26]:
from tests.vqa_accuracy.vqa import VQA
from tests.vqa_accuracy.vqaEval import VQAEval
import testbed.data.vqav2 as vqav2
import evaluate

VQAv2_FILE_NAME = {
    "questions": {
        "train": "v2_OpenEnded_mscoco_train2014_questions.json",
        "val": "v2_mscoco_val2014_question_subdata.json",
        "test-dev": "v2_OpenEnded_mscoco_test-dev2015_questions.json",
        "test": "v2_OpenEnded_mscoco_test2015_questions.json",
    },
    "annotations": {
        "train": "v2_mscoco_train2014_annotations.json",
        "val": "v2_mscoco_val2014_annotations_subdata.json",
    },
}


def standard_evaluate(questions_path, annotations_path, result_path):
    vqa = VQA(annotations_path, questions_path)
    vqaRes = vqa.loadRes(result_path, questions_path)
    vqaEval = VQAEval(vqa, vqaRes, n=2)
    vqaEval.evaluate()
    return vqaEval.accuracy

import time
def custom_evaluate(questions_path, annotations_path, result_path):
    vqa = VQA(annotations_path, questions_path)
    vqaRes = vqa.loadRes(result_path, questions_path)
    vqaEval = VQAEval(vqa, vqaRes, n=2)

    vqa_acc = evaluate.load("testbed/evaluate/metrics/vqa_accuracy")

    accQA, accQuesType, accAnsType = [], {}, {}
    quesIds = [quesId for quesId in vqaEval.params["question_id"]]
    gts, res = {}, {}
    total_time = 0
    for quesId in quesIds:
        gts[quesId] = vqa.qa[quesId]
        res[quesId] = vqaRes.qa[quesId]
    for quesId in quesIds:
        prediction, reference = vqav2.postprocess_generation(
            res[quesId]["answer"], [ans["answer"] for ans in gts[quesId]["answers"]]
        )
        start_time = time.time()
        acc = vqa_acc.compute(predictions=prediction, references=reference)
        total_time += time.time() - start_time
        quesType = gts[quesId]["question_type"]
        ansType = gts[quesId]["answer_type"]
        avgGTAcc = acc["vqa_accuracy"]
        accQA.append(avgGTAcc)
        if quesType not in accQuesType:
            accQuesType[quesType] = []
        accQuesType[quesType].append(avgGTAcc)
        if ansType not in accAnsType:
            accAnsType[ansType] = []
        accAnsType[ansType].append(avgGTAcc)
        vqaEval.setEvalQA(quesId, avgGTAcc)
        vqaEval.setEvalQuesType(quesId, quesType, avgGTAcc)
        vqaEval.setEvalAnsType(quesId, ansType, avgGTAcc)
    vqaEval.setAccuracy(accQA, accQuesType, accAnsType)
    print(total_time)
    return vqaEval.accuracy

In [25]:
standard_evaluate(
    os.path.join(config.vqav2_dir, VQAv2_FILE_NAME["questions"]["val"]),
    os.path.join(config.vqav2_dir, VQAv2_FILE_NAME["annotations"]["val"]),
    os.path.join("tests", "vqa_accuracy", "v2_OpenEnded_mscoco_val2014_results.json"),
)

loading VQA annotations and questions into memory...
0:00:00.113425
creating index...
index created!
Loading and preparing results...     
DONE (t=0.01s)
creating index...
index created!
computing accuracy
Finshed Percent: [####################] 99% Done computing accuracy


{'overall': 96.37,
 'perQuestionType': {'what is the man': 94.5,
  'are there': 100.0,
  'what kind of': 92.37,
  'is the': 99.95,
  'how many': 97.59,
  'what is': 94.55,
  'do you': 100.0,
  'none of the above': 95.22,
  'what color is the': 97.85,
  'is this an': 100.0,
  'is this a': 99.81,
  'is there a': 100.0,
  'is it': 100.0,
  'what time': 94.17,
  'what is the': 94.28,
  'are the': 100.0,
  'what': 92.18,
  'what animal is': 97.84,
  'what type of': 91.44,
  'how': 76.94,
  'does the': 100.0,
  'where is the': 84.72,
  'how many people are': 95.83,
  'what color': 98.16,
  'what is on the': 92.83,
  'are they': 99.4,
  'is this': 100.0,
  'are': 100.0,
  'what brand': 92.95,
  'is there': 100.0,
  'what is the name': 97.18,
  'is the man': 100.0,
  'are these': 100.0,
  'is': 99.93,
  'what are the': 94.79,
  'what number is': 99.21,
  'what color is': 99.26,
  'what does the': 91.15,
  'which': 91.3,
  'who is': 93.77,
  'is this person': 99.8,
  'is the woman': 100.0,
  'i

In [17]:
print(standard)

{'overall': 15.28, 'perQuestionType': {'what is the man': 39.47, 'are there': 1.77, 'what kind of': 38.05, 'is the': 3.18, 'how many': 1.75, 'what is': 40.55, 'do you': 0.0, 'none of the above': 11.1, 'what color is the': 15.96, 'is this an': 2.35, 'is this a': 2.41, 'is there a': 0.46, 'is it': 3.85, 'what time': 18.19, 'what is the': 40.5, 'are the': 2.18, 'what': 28.87, 'what animal is': 30.81, 'what type of': 36.95, 'how': 15.95, 'does the': 1.08, 'where is the': 26.4, 'how many people are': 1.79, 'what color': 15.39, 'what is on the': 33.1, 'are they': 3.88, 'is this': 1.47, 'are': 0.5, 'what brand': 49.55, 'is there': 0.0, 'what is the name': 32.31, 'is the man': 1.3, 'are these': 2.29, 'is': 1.51, 'what are the': 42.47, 'what number is': 33.95, 'what color is': 22.94, 'what does the': 27.5, 'which': 17.4, 'who is': 29.06, 'is this person': 0.6, 'is the woman': 2.94, 'is the person': 5.76, 'what is the woman': 35.68, 'is he': 4.08, 'does this': 0.94, 'where are the': 20.16, 'what

In [27]:
custom_evaluate(
    os.path.join(config.vqav2_dir, VQAv2_FILE_NAME["questions"]["val"]),
    os.path.join(config.vqav2_dir, VQAv2_FILE_NAME["annotations"]["val"]),
    os.path.join("tests", "vqa_accuracy", "v2_OpenEnded_mscoco_val2014_results.json"),
)

loading VQA annotations and questions into memory...
0:00:00.391851
creating index...
index created!
Loading and preparing results...     
DONE (t=0.01s)
creating index...
index created!
32.573668003082275


{'overall': 37.08,
 'perQuestionType': {'what is the man': 41.68,
  'are there': 30.48,
  'what kind of': 39.61,
  'is the': 41.3,
  'how many': 23.38,
  'what is': 43.82,
  'do you': 37.5,
  'none of the above': 40.91,
  'what color is the': 43.87,
  'is this an': 38.24,
  'is this a': 47.25,
  'is there a': 28.13,
  'is it': 32.82,
  'what time': 19.58,
  'what is the': 45.7,
  'are the': 36.13,
  'what': 33.13,
  'what animal is': 39.73,
  'what type of': 38.32,
  'how': 21.16,
  'does the': 46.89,
  'where is the': 26.58,
  'how many people are': 19.52,
  'what color': 42.37,
  'what is on the': 33.89,
  'are they': 40.6,
  'is this': 39.58,
  'are': 32.58,
  'what brand': 49.55,
  'is there': 31.61,
  'what is the name': 32.31,
  'is the man': 30.16,
  'are these': 41.93,
  'is': 39.21,
  'what are the': 46.92,
  'what number is': 41.84,
  'what color is': 51.18,
  'what does the': 29.58,
  'which': 27.9,
  'who is': 30.94,
  'is this person': 30.8,
  'is the woman': 30.29,
  'is 

In [2]:
from testbed.models import Idefics2
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
device = torch.device("cuda:7")
model = Idefics2(
    config.idefics2_8b_base_path,
    precision="bf16",
    device=device,
    quantization_config=quantization_config,
)

Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00,  1.03s/it]


In [3]:
from testbed.data import prepare_dataloader
import evaluate
from torch.utils.data.sampler import RandomSampler
dataloader = prepare_dataloader(dataset, batch_size=1, num_shots=2, sampler=RandomSampler(dataset))

total_acc = evaluate.load("./testbed/evaluate/metrics/vqa_accuracy")
result = []

In [5]:
from testbed.data.vqav2 import postprocess_generation
from testbed.data import prepare_vqa_input
from tqdm import tqdm
torch.cuda.empty_cache()
for batch in tqdm(dataloader, desc=f"Evaluating {model.model_name}..."):
    images, text = prepare_vqa_input(
        batch, instruction="Provide an answer to the question. Use the image to answer."
    )
    predictions = model.generate(text, images, max_new_tokens=10, num_beams=3)
    for pred, context in zip(predictions, batch):
        last_qa = context[-1]
        gt_answer = [item["answer"] for item in last_qa["answers"]]
        prediction, reference = postprocess_generation(pred, gt_answer)
        total_acc.add_batch(predictions=prediction, references=reference)
        result.append({
            "question_id": last_qa["question_id"],
            "answer" : pred
        })

evaluate.save("./v2_OpenEnded_mscoco_train2014_resutls.json")
print(total_acc.compute())

Evaluating idefics2-8b-base...:   0%|          | 1/147919 [00:02<118:42:34,  2.89s/it]

['no']
[['no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no']]


Evaluating idefics2-8b-base...:   0%|          | 2/147919 [00:05<113:18:58,  2.76s/it]

['1']
[['no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes']]


Evaluating idefics2-8b-base...:   0%|          | 3/147919 [00:08<114:13:51,  2.78s/it]

['2']
[['street', 'no crossing', 'none', 'none', 'none', 'street', 'not possible', 'stop', 'no signs', 'caution']]


Evaluating idefics2-8b-base...:   0%|          | 4/147919 [00:11<114:37:34,  2.79s/it]

['']
[['white', 'white', 'white', 'white', 'white', 'white', 'white', 'white', 'white', 'white']]


Evaluating idefics2-8b-base...:   0%|          | 5/147919 [00:13<114:59:30,  2.80s/it]

['no']
[['no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes']]


Evaluating idefics2-8b-base...:   0%|          | 5/147919 [00:16<132:47:40,  3.23s/it]


KeyboardInterrupt: 