In [1]:
from datasets import load_dataset
import sys

sys.path.insert(0, "..")
from testbed.data import prepare_dataloader
import config

# Settings reused further down: batch size and shot count feed the dataloader,
# `generate_args` is forwarded to `model.generate`, and the whole dict is
# stored next to the evaluation result when saving.
hparams = dict(
    batch_size=4,
    num_shots=2,
    precision="bf16",
    generate_args=dict(num_beams=3, max_new_tokens=15),
)

# Load the COCO captions through the local dataset script; caption and image
# directories come from the project config module.
dataset = load_dataset(
    "../testbed/data/coco",
    data_dir=config.karpathy_coco_caption_dir,
    images_dir=config.coco_dir,
    trust_remote_code=True,
)

# Both splits are handed to the dataloader; `num_per_dataset` asks for
# `num_shots` items from the first split and one item from the second.
splits = [dataset[split_name] for split_name in ("train", "validation")]
dataloader = prepare_dataloader(
    splits,
    batch_size=hparams["batch_size"],
    num_shots=hparams["num_shots"],
    num_per_dataset=[hparams["num_shots"], 1],
    shuffle=True,
)

In [2]:
import sys
sys.path.insert(0, "..")
from testbed.models import Idefics
import torch
import config

# Build the Idefics wrapper in bfloat16 on the second CUDA device (index 1).
device = torch.device("cuda", 1)
model = Idefics(config.idefics_9b_path, dtype=torch.bfloat16, device=device)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [7]:
from testbed.models.model_base import HookType


# Display the name of the TEXT_MODEL_LAYER member of HookType.
HookType.TEXT_MODEL_LAYER.name

'TEXT_MODEL_LAYER'

In [3]:
from testbed.data.coco import postprocess_generation
from testbed.data import prepare_caption_input

# Pull a single batch from the dataloader and print its first context so the
# structure of one context (a list of sample dicts) can be inspected.
batch = next(iter(dataloader))
single_context = batch[0]
print(single_context)
# Manual single-context generation path, kept commented out for reference:
# it builds the prompt, generates, strips the prompt tokens, decodes, and
# compares the post-processed prediction with the ground-truth captions.
# text, images = prepare_caption_input([single_context])
# inputs = model.process_input(text, images, return_tensors="pt", padding=True).to(device)
# seq_len = inputs.input_ids.shape[-1]
# generated_ids = model.generate(**inputs, max_new_tokens=15, num_beams=5)
# generated_ids = generated_ids[:, seq_len:]
# raw_output = model.processor.batch_decode(generated_ids, skip_special_tokens=True)
# prediction = postprocess_generation(raw_output)
# print(prediction)
# print(single_context[-1]["sentences_raw"]) # gt

[{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7F5B0F43ABB0>, 'filepath': 'COCO_train2014_000000057870.jpg', 'sentids': [787980, 789366, 789888, 791316, 794853], 'filename': 'COCO_train2014_000000057870.jpg', 'imgid': 40504, 'split': 'train', 'caption': 'A restaurant has modern wooden tables and chairs.', 'sentences_tokens': [['a', 'restaurant', 'has', 'modern', 'wooden', 'tables', 'and', 'chairs'], ['a', 'long', 'restaurant', 'table', 'with', 'rattan', 'rounded', 'back', 'chairs'], ['a', 'long', 'table', 'with', 'a', 'plant', 'on', 'top', 'of', 'it', 'surrounded', 'with', 'wooden', 'chairs'], ['a', 'long', 'table', 'with', 'a', 'flower', 'arrangement', 'in', 'the', 'middle', 'for', 'meetings'], ['a', 'table', 'is', 'adorned', 'with', 'wooden', 'chairs', 'with', 'blue', 'accents']], 'sentences_raw': ['A restaurant has modern wooden tables and chairs.', 'A long restaurant table with rattan rounded back chairs.', 'a long table with a plant on top of it sur

## Step 4. Evaluate
For the image captioning task, we evaluate with [CIDEr](../testbed/evaluate/metrics/CIDEr/CIDEr.py), which is implemented on top of the Hugging Face [`evaluate`](https://huggingface.co/docs/evaluate/index) library. The implementation is thoroughly tested to ensure full consistency with the [official CIDEr implementation](https://github.com/tylin/coco-caption); see the [test script](../tests/CIDEr/test_CIDEr.py).

Thanks to Hugging Face Spaces, you can also try `CIDEr` online [here](https://huggingface.co/spaces/Kamichanw/CIDEr).

In [4]:
from testbed.data.coco import postprocess_generation
from testbed.data.utils import prepare_caption_input
from tqdm import tqdm
import evaluate

# CIDEr accumulator plus the per-sample records that are saved later on.
total_cider = evaluate.load("Kamichanw/CIDEr")
result = []

# for simplicity, just run 10 batches
for _, batch in zip(range(10), tqdm(dataloader, desc=f"Evaluating {model.model_name} ...")):
    text, images = prepare_caption_input(batch)
    predictions = model.generate(text, images, **hparams["generate_args"])

    for raw_caption, shots in zip(predictions, batch):
        # The final element of each context carries the ground-truth captions
        # that the generated caption is scored against.
        query = shots[-1]
        references = query["sentences_raw"]
        cleaned = postprocess_generation(raw_caption)

        total_cider.add(prediction=cleaned, reference=references)

        record = {
            "cocoid": query["cocoid"],
            "raw_output": raw_caption,
            "filename": query["filename"],
            "sentences": references,
            "prediction": cleaned,
        }
        result.append(record)

# Aggregate the score over everything added above and display it.
eval_result = total_cider.compute()
eval_result

Evaluating idefics2-8b-base ...:   2%|▏         | 9/416 [00:57<43:11,  6.37s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
PTBTokenizer tokenized 2871 tokens at 70512.47 tokens per second.


{'CIDEr': 1.2339626951654092}

In [13]:
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
# Quick check of how PTBTokenizer handles quotes and trailing newlines in a
# caption: the input maps an id to a list of {"caption": ...} dicts.
tokenizer = PTBTokenizer()
sample_captions = {123: [{"caption": "I love \"you\" \n\n"}]}
tokenizer.tokenize(sample_captions)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
PTBTokenizer tokenized 5 tokens at 157.56 tokens per second.


{123: ['i love you']}

In [10]:
import json
import evaluate

# Re-score previously saved generations: keep only the first line of every
# caption, recompute CIDEr over the trimmed captions, and write the trimmed
# records back to the same file.
#
# NOTE(review): this rebinds `result`, which the evaluation cell also defines
# and the save cell passes as `records`; re-run the evaluation cell before
# saving if the records of the current run are wanted.

# Context managers guarantee the handles are closed (and the rewritten file is
# flushed to disk) even if scoring raises in between.
with open("result3.json", "r") as results_file:
    result = json.load(results_file)

cider = evaluate.load("Kamichanw/CIDEr")
for item in result:
    # Drop everything after the first newline in the generated caption.
    item["caption"] = item["caption"].split("\n", 1)[0]
    cider.add(prediction=item["caption"], reference=item["sentences"])

print(cider.compute())

with open("result3.json", "w") as results_file:
    json.dump(result, results_file)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
PTBTokenizer tokenized 2868 tokens at 70648.06 tokens per second.


{'CIDEr': 1.2495354617821606}


## Step 5. Save Results
With the help of `evaluate.save`, we can save the evaluation result, the hyperparameters, and the per-sample records to a JSON file.

In [5]:
# Write the score, the hyperparameters and the per-sample records to a JSON
# file in the current directory; the call returns the path that was written.
evaluate.save(
    "./",
    eval_result=eval_result,
    hparams=hparams,
    records=result,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PosixPath('result-2024_08_15-07_21_01.json')