# !NOTE: THIS NOTEBOOK IS MADE TO RUN ON COLAB. CHECK THE OTHER NOTEBOOK FOR LOCAL RUNS

In [2]:
!nvidia-smi

Sat Apr 20 05:03:20 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.203.03   Driver Version: 450.203.03   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 6000     Off  | 00000000:1A:00.0 Off |                  Off |
| 33%   33C    P8    24W / 260W |      1MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro RTX 6000     Off  | 00000000:1B:00.0 Off |                  Off |
| 33%   30C    P8    15W / 260W |      1MiB / 24220MiB |      0%      Default |
|       

In [22]:
import random
from tqdm.auto import tqdm
from datasets import load_dataset

dataset = load_dataset("HuggingFaceM4/A-OKVQA")

In [4]:
from matplotlib import pyplot as plt

def imshow(img, ax=None, caption = ""):
    if ax is None:
        fig, ax = plt.subplots()
    ax.imshow(img)
    ax.set_title(caption)
    ax.axis('off')
    return ax

def plot_images_with_captions(images, captions):
    fig, axes = plt.subplots(nrows=len(images), ncols=1, figsize=(10, 10))
    for i, (img, caption) in enumerate(zip(images, captions)):
        ax = imshow(img, axes[i])
        ax.set_title(caption)
    plt.show()

# images = [dataset['train'][i]['image'] for i in range(2)]
# captions = [dataset['train'][i]['question'] for i in range(2)]
# plot_images_with_captions(images, captions)


# Load BLIP model 🏋

In [41]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")
torch_dtype = torch.float32 if device.type == "cpu" else torch.float16

MODEL_ID = "Salesforce/blip2-flan-t5-xxl"  # [Salesforce/instructblip-flan-t5-xxl, liuhaotian/llava-v1.5-13b]

processor = Blip2Processor.from_pretrained(MODEL_ID)
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xxl", device_map="auto")
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch_dtype,
).to(device)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [42]:
# prompt: get total model parameters in human readable format

total_params = sum(p.numel() for p in model.parameters())
human_readable_params = f"{total_params / 1e6:.2f}M"
print(f"Total Parameters: {human_readable_params}")


Total Parameters: 12229.60M


# running inference on A-OKVQA 🏃

In [43]:
import warnings
warnings.filterwarnings('ignore')

In [44]:
def inference(image, question, mode = "qa", hyperparams = None, device = "cuda", dtype = torch.float32):
  # inputs = processor(images=image, text=make_prompt({'question': question}), return_tensors="pt").to("cuda", torch.float16)
  inputs = processor(images=image, text=question, return_tensors="pt").to(device, dtype)
  if mode == "qa":
    outputs = model.generate(**inputs,
                             num_beams = 5,
                             length_penalty = -1)
  elif mode == "rationale":
    outputs = model.generate(**inputs,
                             num_beams=5,
                             length_penalty = 1.1 # choose from [1, 1.5, 2]
                             )

  # Decode and print the answer
  answer = processor.decode(outputs[0], skip_special_tokens=True)
  return answer


def distributed_inference(images, questions, mode = "qa", hyperparams = None, device = "cuda", dtype = torch.float32, accelerator=None):
    inputs = accelerator.prepare(processor(images=images, text=questions, return_tensors="pt"))

    # Generate outputs based on the mode
    if mode == "qa":
        outputs = model.generate(**inputs,
                                 num_beams=5,
                                 length_penalty=-1)
    elif mode == "rationale":
        outputs = model.generate(**inputs,
                                 num_beams=5,
                                 length_penalty=1)  # Length penalty options: 1, 1.5, 2

    # Decode the outputs
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    return answer


def make_prompt(x):
  """made so we can do some preprocessing editing to the question. left for the future"""
  return f"{x['question']} Choices: {', '.join(x['choices'])}"

# Helper methods

# Generating Sheets for Annotation

In [60]:
set_50 = pd.read_csv("set-50-idx.csv")
set_50_idx = set_50["idx"].tolist()

In [63]:
infer = dataset['train'].select(set_50_idx)

In [66]:
import pandas as pd
from tqdm.auto import tqdm

RES_PATH = "./results/"
SAVE_PATH = RES_PATH + "img/"

import os

if not os.path.exists(RES_PATH):
    os.makedirs(RES_PATH)

if not os.path.exists(RES_PATH + "img/"):
    os.makedirs(RES_PATH + "img/")

if not os.path.exists(RES_PATH + "result/"):
    os.makedirs(RES_PATH + "result/")

meta_prompt = "Please explain the reasoning behind your answer?"
N_SAMPLES = 250  # -1
N = len(dataset["train"])

# # --------- setting set-50 for testing ------------
set_50 = pd.read_csv("set-50-idx.csv")
set_50_idx = set_50["idx"].tolist()
infer = dataset["train"].select(set_50_idx)
# # --------- END setting set-50 for testing ------------

results = []

for idx, x in enumerate(tqdm(infer, total=len(infer))):
    if idx > N_SAMPLES:
        break

    # save image to make sheet
    image = x["image"]
    image.save(f"{SAVE_PATH}/{idx}.jpg")

    # run inference
    question = make_prompt(x)  # f"{x['question']} Choices: {str(x['choices'])}"
    qa_answer = inference(image, question, mode="qa", device=device, dtype=torch_dtype)
    ra_question = (
        "Question: " + question + ". Answer: " + qa_answer + ". " + meta_prompt
    )
    answer = inference(
        image, ra_question, mode="rationale", device=device, dtype=torch_dtype
    )
    # print(f"question: {ra_question} \n rationale: {answer}")

    results.append(
        {
            "idx": idx,
            "question": question,
            "correct_answer": x["choices"][x["correct_choice_idx"]],
            "predicted_answer": qa_answer,
            "is_correct": (
                1 if qa_answer == x["choices"][x["correct_choice_idx"]] else 0
            ),
            "groundtruth_rationale": x["rationales"],
            "direct_answer": x["direct_answers"],
            "rationale_prompt": ra_question,
            "generated_rationale": answer,
            "image_path": f"{SAVE_PATH}/{idx}.jpg",
        }
    )

    # print('-'*100)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
RES_NAME = f"test.xlsx"
df = pd.DataFrame(results)
writer = pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter')
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet1')

# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book
worksheet = writer.sheets['Sheet1']

# worksheet.insert_image('R1', f'/content/drive/MyDrive/<hidden>/img/{idx}.jpg')

# # Close the Pandas Excel writer and output the Excel file.
# writer.save()

In [68]:
import xlsxwriter

RES_NAME = f"{MODEL_ID.split('/')[-1]}_inference.xlsx"
df = pd.DataFrame(results)
print(RES_PATH, RES_NAME)

with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    # Write the DataFrame to the worksheet, excluding the 'ImagePath' column if you don't want it in the Excel file.
    df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Access the XlsxWriter workbook and worksheet objects from the DataFrame.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Assuming 'ImagePath' is the name of the column containing the image paths.
    # Iterate over the DataFrame to insert images.
    for index, data in df.iterrows():
        # The cell where to insert the image is in column 'R', on the row corresponding to the DataFrame's index + 2
        # (because DataFrame's index starts at 0 and Excel rows start at 1, and there's a header row).
        cell = f'R{index + 2}'
        image_path = data['image_path']

        # Insert the image.
        worksheet.insert_image(cell, image_path)
        # worksheet.insert_image(cell, image_path, {'x_scale': 0.5, 'y_scale': 0.5})


./results/ blip2-flan-t5-xxl_inference.xlsx
