In [7]:
from datasets import load_dataset
import config

import os
# Restrict the process to GPUs 0-7. The variable CUDA actually reads is
# CUDA_VISIBLE_DEVICES (plural); the singular spelling is silently ignored.
# It must be set before the first CUDA call in this kernel to take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'

# Load the validation split through the local dataset script under ./testbed/data/vqav2.
# The data and image directories come from the project-level `config` module.
dataset = load_dataset(
    "./testbed/data/vqav2",
    split="validation",
    data_dir=config.vqav2_dir,
    images_dir=config.coco_dir,
    trust_remote_code=True,  # required because the loader is a local Python script
)

In [14]:
import json
from testbed.data.vqav2 import postprocess_generation

# Convert the stored generations in ./icl_shot1.json into a flat list of
# {"question_id", "answer"} records and write them to the results file.
# Files are opened with context managers so the handles are closed (and the
# output flushed to disk) before any later cell reads the results file.
with open("./icl_shot1.json", "r") as f:
    result = json.load(f)

# Each value carries a "question_id" and a raw "prediction"; the first element
# returned by postprocess_generation is stored as the answer.
new_result = [
    {"question_id": item["question_id"], "answer": postprocess_generation(item["prediction"])[0]}
    for item in result.values()
]

with open("./v2_OpenEnded_mscoco_val2014_results.json", "w") as f:
    json.dump(new_result, f)

In [15]:
from testbed.evaluate.vqa import compute_vqa_accuracy
import os
import config

# Build the three paths up front so the call below reads at a glance.
# The variable names follow the file names; the exact role of each positional
# argument is defined by compute_vqa_accuracy (not visible in this notebook).
results_path = os.path.join(".", "v2_OpenEnded_mscoco_val2014_results.json")
questions_path = os.path.join(config.vqav2_dir, "v2_mscoco_val2014_question_subdata.json")
annotations_path = os.path.join(config.vqav2_dir, "v2_mscoco_val2014_annotations_subdata.json")

# Last expression of the cell: its return value is what the notebook displays.
compute_vqa_accuracy(results_path, questions_path, annotations_path)

loading VQA annotations and questions into memory...
0:00:00.123837
creating index...
index created!
Loading and preparing results...     
DONE (t=0.01s)
creating index...
index created!
computing accuracy
Finshed Percent: [####################] 99% Done computing accuracy


{'overall': 60.33,
 'perQuestionType': {'is the': 74.84,
  'what is the': 56.79,
  'how': 21.16,
  'does the': 80.0,
  'what': 47.59,
  'what is on the': 44.51,
  'how many people are': 50.48,
  'what are the': 51.03,
  'is there': 65.18,
  'is there a': 67.4,
  'are these': 76.21,
  'are the': 74.36,
  'is': 74.74,
  'is it': 77.05,
  'what kind of': 50.89,
  'is this a': 83.61,
  'how many': 40.67,
  'what is': 52.91,
  'are they': 76.42,
  'what color is': 74.71,
  'what color is the': 66.86,
  'is this': 79.98,
  'which': 39.9,
  'where is the': 28.45,
  'what is the man': 55.42,
  'what is this': 69.58,
  'what color are the': 73.24,
  'what type of': 47.31,
  'none of the above': 57.78,
  'is the woman': 74.41,
  'what is the person': 67.27,
  'is this an': 81.37,
  'has': 59.35,
  'can you': 59.77,
  'what is in the': 60.57,
  'what does the': 49.37,
  'what time': 20.97,
  'are': 60.92,
  'where are the': 21.8,
  'what brand': 60.91,
  'are there': 67.21,
  'who is': 36.6,
  'd

In [2]:
from testbed.models import Idefics2
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization, handed to the model wrapper below.
# NOTE(review): the compute dtype here is float16 while the model is constructed
# with precision="bf16" — confirm this mix is intended.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
# Pin the model to a single GPU (index 7).
device = torch.device("cuda:7")
# `config` is not imported in this cell; it relies on the `import config`
# from another cell already having been executed in this kernel.
model = Idefics2(
    config.idefics2_8b_base_path,
    precision="bf16",
    device=device,
    quantization_config=quantization_config,
)

Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00,  1.03s/it]


In [3]:
from testbed.data import prepare_dataloader
import evaluate
from torch.utils.data.sampler import RandomSampler
import torch  # local import so the cell does not depend on another cell having imported torch

# Seed the sampler so the random evaluation order is reproducible across kernel
# restarts. Without an explicit generator RandomSampler draws a fresh seed on
# every run.
SAMPLER_SEED = 0
sampler = RandomSampler(dataset, generator=torch.Generator().manual_seed(SAMPLER_SEED))

# One query per batch, built with num_shots=2 (see prepare_dataloader for how
# the shots are assembled).
dataloader = prepare_dataloader(dataset, batch_size=1, num_shots=2, sampler=sampler)

# Metric accumulator loaded from the local metric script, plus the list that
# collects one {"question_id", "answer"} record per evaluated question.
total_acc = evaluate.load("./testbed/evaluate/metrics/vqa_accuracy")
result = []

In [5]:
from testbed.data.vqav2 import postprocess_generation
from testbed.data import prepare_vqa_input
from tqdm import tqdm
import json  # local import: this cell writes the collected predictions itself

# Release cached, unused CUDA memory before starting the long generation loop.
torch.cuda.empty_cache()
for batch in tqdm(dataloader, desc=f"Evaluating {model.model_name}..."):
    images, text = prepare_vqa_input(
        batch, instruction="Provide an answer to the question. Use the image to answer."
    )
    predictions = model.generate(text, images, max_new_tokens=10, num_beams=3)
    for pred, context in zip(predictions, batch):
        # The last element of each context is the question being scored
        # (earlier elements are presumably the in-context shots — confirm).
        last_qa = context[-1]
        gt_answer = [item["answer"] for item in last_qa["answers"]]
        prediction, reference = postprocess_generation(pred, gt_answer)
        total_acc.add_batch(predictions=prediction, references=reference)
        # The raw generation is stored; post-processing is applied only for scoring.
        result.append({"question_id": last_qa["question_id"], "answer": pred})

# Persist the collected predictions. The previous `evaluate.save(path)` call was
# given no data, so it wrote only run metadata and `result` was never saved.
# The output filename typo ("resutls") is corrected as part of this fix.
with open("./v2_OpenEnded_mscoco_train2014_results.json", "w") as f:
    json.dump(result, f)
print(total_acc.compute())

Evaluating idefics2-8b-base...:   0%|          | 1/147919 [00:02<118:42:34,  2.89s/it]

['no']
[['no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no']]


Evaluating idefics2-8b-base...:   0%|          | 2/147919 [00:05<113:18:58,  2.76s/it]

['1']
[['no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes']]


Evaluating idefics2-8b-base...:   0%|          | 3/147919 [00:08<114:13:51,  2.78s/it]

['2']
[['street', 'no crossing', 'none', 'none', 'none', 'street', 'not possible', 'stop', 'no signs', 'caution']]


Evaluating idefics2-8b-base...:   0%|          | 4/147919 [00:11<114:37:34,  2.79s/it]

['']
[['white', 'white', 'white', 'white', 'white', 'white', 'white', 'white', 'white', 'white']]


Evaluating idefics2-8b-base...:   0%|          | 5/147919 [00:13<114:59:30,  2.80s/it]

['no']
[['no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes']]


Evaluating idefics2-8b-base...:   0%|          | 5/147919 [00:16<132:47:40,  3.23s/it]


KeyboardInterrupt: 