# !NOTE: THIS NOTEBOOK IS MADE TO RUN ON COLAB. CHECK THE OTHER NOTEBOOK FOR LOCAL RUNS

In [1]:
import random
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset

# A-OKVQA dataset; later cells read its "train" split (image, question, choices, ...).
DATASET_ID = "HuggingFaceM4/A-OKVQA"
dataset = load_dataset(DATASET_ID)

In [2]:
from matplotlib import pyplot as plt

def imshow(img, ax=None, caption = ""):
    """Draw ``img`` on a matplotlib axes, title it, and hide the axis decorations.

    Args:
        img: Image handed straight to ``ax.imshow``.
        ax: Axes to draw on; when ``None`` a new figure/axes pair is created.
        caption: Text used as the axes title.

    Returns:
        The axes that was drawn on.
    """
    target = plt.subplots()[1] if ax is None else ax
    target.imshow(img)
    target.set_title(caption)
    target.axis('off')
    return target

def plot_images_with_captions(images, captions, figsize=(10, 10)):
    """Show images stacked in a single column, each titled with its caption.

    Args:
        images: Sequence of images accepted by ``imshow``.
        captions: Sequence of title strings paired with ``images`` by position
            (items beyond the shorter sequence are ignored, as with ``zip``).
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if len(images) == 0:
        # plt.subplots cannot build a grid with zero rows; nothing to draw.
        return
    # squeeze=False keeps `axes` a 2-D array even for a single image. With the
    # default squeeze, one row yields a bare Axes object and indexing it fails.
    _, axes = plt.subplots(nrows=len(images), ncols=1, figsize=figsize, squeeze=False)
    for ax, img, caption in zip(axes[:, 0], images, captions):
        imshow(img, ax, caption=caption)
    plt.show()

# images = [dataset['train'][i]['image'] for i in range(2)]
# captions = [dataset['train'][i]['question'] for i in range(2)]
# plot_images_with_captions(images, captions)


# Load MiniCPM-V-2 model 🏋

In [3]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

MODEL_ID = 'openbmb/MiniCPM-V-2'

# Where the weights live and in which precision. Alternatives:
#   - Nvidia GPUs that support BF16 (like A100, H100, RTX3090): 'cuda' + torch.bfloat16
#   - Nvidia GPUs that do NOT support BF16 (like V100, T4, RTX2080): 'cuda' + torch.float16
#   - Mac with MPS (Apple silicon or AMD GPUs): 'mps' + torch.float16,
#     run with `PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py`
DEVICE = 'cuda'
DTYPE = torch.bfloat16

# trust_remote_code=True lets transformers run the modelling code shipped in the model repo.
model = AutoModel.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).to(device=DEVICE, dtype=DTYPE)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Switch to inference mode; as the cell's last expression this also displays the module tree.
model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

MiniCPMV(
  (llm): MiniCPMForCausalLM(
    (model): MiniCPMModel(
      (embed_tokens): Embedding(122753, 2304)
      (layers): ModuleList(
        (0-39): 40 x MiniCPMDecoderLayer(
          (self_attn): MiniCPMSdpaAttention(
            (q_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (k_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (v_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (o_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (rotary_emb): MiniCPMRotaryEmbedding()
          )
          (mlp): MiniCPMMLP(
            (gate_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (up_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (down_proj): Linear(in_features=5760, out_features=2304, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): MiniCPMRMSNorm()
          (post_attention_layernorm): Min

In [4]:
# Sum the element counts of all model parameters and report the total in millions.
total_params = 0
for weights in model.parameters():
    total_params += weights.numel()

human_readable_params = "{:.2f}M".format(total_params / 1e6)
print("Total Parameters: " + human_readable_params)


Total Parameters: 3434.97M


# Running inference on A-OKVQA 🏃

In [46]:
import warnings
warnings.filterwarnings('ignore')

In [62]:
import json


def chat(image, question, choices, sampling=True, temperature=0.7):
    """Ask the loaded model a multiple-choice question about an image.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        image: Image passed unchanged to ``model.chat``.
        question: Question text.
        choices: Iterable of option strings; joined with ", " inside the prompt.
        sampling: Forwarded to ``model.chat``. Defaults to ``True``, the value
            this notebook has always used.
        temperature: Forwarded to ``model.chat``. Defaults to ``0.7``.

    Returns:
        tuple: ``(res, user_msg)`` - the model's reply and the exact user
        prompt that was sent.
    """
    # Alternative prompt kept for reference:
    # user_msg = f"Question: {question} Options: {', '.join(choices)}. What is the answer and explain?"
    user_msg = f"Question: {question} Options: {', '.join(choices)}. Let's think step by step and then answer."

    # Both turns use the 'user' role: the first carries the instruction, the
    # second the actual question. The prompt text is left untouched so results
    # stay comparable with earlier runs.
    msgs = [
        {'role' : 'user', 'content' : "You are helpful assistant that will repond with only options provided by user."},
        {'role' : 'user', 'content' : user_msg},
    ]

    # Only the reply is used; the second and third returned values are discarded.
    res, _, _ = model.chat(
        image=image,
        msgs=msgs,
        # NOTE(review): `question` repeats what `image` and `msgs` already carry;
        # presumably ignored by model.chat - confirm against the model code before removing.
        question={"image": image, "question": json.dumps(msgs)},
        context=None,
        tokenizer=tokenizer,
        sampling=sampling,
        temperature=temperature,
    )

    return res, user_msg

# x = dataset['train'][0]
# image = x['image']
# question = x['question']
# choices = x['choices']
# chat(image, question, choices)

# Generating Sheets for Annotation

In [64]:
import os

import pandas as pd
from tqdm.auto import tqdm

RES_PATH = "./results/"
SAVE_PATH = RES_PATH + "img/"

# Create the output folders up front; exist_ok keeps the cell safe to re-run.
for _out_dir in (RES_PATH, SAVE_PATH, RES_PATH + "result/"):
    os.makedirs(_out_dir, exist_ok=True)

meta_prompt = "Please explain the reasoning behind your answer?"
# Maximum number of samples to run. A negative value (or None) means "no limit",
# which is what the original `# -1` hint appears to intend.
N_SAMPLES = 250
N = len(dataset["train"])

# Default: the whole train split. The set-50 section below overrides it;
# comment that section out to run on everything.
infer = dataset["train"].select(range(N))
# # --------- setting set-50 for testing ------------
set_50 = pd.read_csv("set-50-idx.csv")
set_50_idx = set_50["idx"].tolist()
infer = dataset["train"].select(set_50_idx)
# # --------- END setting set-50 for testing ------------


def check(qa_rationale_answer, correct_answer):
    """Return 1 if the correct option text occurs in the model reply, else 0.

    The comparison is a case-insensitive substring test, so a reply that
    mentions several options (including the right one) also counts as correct.
    """
    return 1 if correct_answer.lower() in qa_rationale_answer.lower() else 0


# Number of samples that will actually be processed (also drives the progress bar).
if N_SAMPLES is None or N_SAMPLES < 0:
    n_total = len(infer)
else:
    n_total = min(N_SAMPLES, len(infer))

results = []

for idx, x in enumerate(tqdm(infer, total=n_total)):
    # `>=` stops after exactly n_total samples (the previous `>` ran one extra).
    if idx >= n_total:
        break

    # Save the image so it can be embedded in the annotation sheet later.
    image = x["image"]
    image_path = os.path.join(SAVE_PATH, f"{idx}.jpg")
    image.save(image_path)

    question = x['question']
    choices = x['choices']
    correct_choice = choices[x['correct_choice_idx']]
    res, user_msg = chat(image, question, choices)

    is_correct = check(res, correct_choice)
    print(f"Question: {question} \nOne shot question: {user_msg} \nAnswer: {res} \nCorrect Answer: {correct_choice}")
    print(f"Is Correct: {is_correct}")
    print('-'*100)

    results.append(
        {
            # Position within `infer`, not the index into the original train split.
            "idx": idx,
            "question": question,
            "correct_answer": correct_choice,
            "predicted_answer": res,
            "is_correct": is_correct,
            "groundtruth_rationale": x["rationales"],
            "direct_answer": x["direct_answers"],
            "rationale_prompt": user_msg,
            # The same reply is stored as both the answer and the rationale.
            "generated_rationale": res,
            "image_path": image_path,
        }
    )

  0%|          | 0/50 [00:00<?, ?it/s]

Question: What is the man by the bags awaiting? 
One shot question: Question: What is the man by the bags awaiting? Options: skateboarder, train, delivery, cab. Let's think step by step and then answer. 
Answer: The man is likely awaiting a mode of transportation, such as walking with luggage to board the skateboarder or waiting for his cab. 
Correct Answer: cab
Is Correct: 1
----------------------------------------------------------------------------------------------------
Question: Where does this man eat pizza? 
One shot question: Question: Where does this man eat pizza? Options: office, cafe, motel, outside. Let's think step by step and then answer. 
Answer: Step 1: Identify the location in which pizza is being consumed.
The image shows a man holding up an eating plate with slices of cheese and pepperoni pizza, near his desk at work (office) or computer keyboard area. There are no visible elements suggesting that he's consuming it outside like on a sidewalk or cafe table; nor does

In [None]:
RES_NAME = "test.xlsx"
df = pd.DataFrame(results)

# Use the writer as a context manager so the workbook is saved and closed on
# exit; the original left the writer open (its `writer.save()` was commented out).
with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    # Write the dataframe to the 'Sheet1' worksheet.
    df.to_excel(writer, sheet_name='Sheet1')

    # Get the xlsxwriter workbook and worksheet objects (only usable inside this block).
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    # worksheet.insert_image('R1', f'/content/drive/MyDrive/<hidden>/img/{idx}.jpg')

In [66]:
import xlsxwriter


# Name the workbook after the model (the part of MODEL_ID after the last '/').
RES_NAME = MODEL_ID.split('/')[-1] + "_inference_chat.xlsx"
df = pd.DataFrame(results)
print(RES_PATH, RES_NAME)

with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    # Dump every column of the results table (without the DataFrame index) to 'Sheet1'.
    df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Handles onto the underlying XlsxWriter objects.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Embed each sample's saved image in column 'R' next to its row.
    # Row number = DataFrame index + 2: Excel rows are 1-based and row 1 holds the header.
    for position, record in df.iterrows():
        worksheet.insert_image(f'R{position + 2}', record['image_path'])
        # Scaled alternative:
        # worksheet.insert_image(f'R{position + 2}', record['image_path'], {'x_scale': 0.5, 'y_scale': 0.5})


./results/ MiniCPM-V-2_inference_chat.xlsx
