In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write
# files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#  %%capture
!pip install datasets
!pip install accelerate
!pip install xlsxwriter

In [None]:
import random
from datasets import load_dataset

# Load the A-OKVQA dataset by its Hugging Face Hub id; later cells index
# dataset['train'] to pick individual examples.
dataset = load_dataset("HuggingFaceM4/A-OKVQA")

In [None]:
from matplotlib import pyplot as plt

def imshow(img, ax=None, caption = ""):
    """Draw ``img`` on an axes with ``caption`` as its title and the axis frame hidden.

    Args:
        img: Image to display; handed straight to ``ax.imshow``.
        ax: Axes to draw on. When ``None`` a new figure with one axes is created.
        caption: Title text placed above the image.

    Returns:
        The axes the image was drawn on.
    """
    target_ax = ax if ax is not None else plt.subplots()[1]
    target_ax.imshow(img)
    target_ax.set_title(caption)
    target_ax.axis('off')
    return target_ax

In [None]:
def plot_images_with_captions(images, captions):
    """Show images stacked vertically in one figure, each titled with its caption.

    Images and captions are paired positionally; if the two sequences differ in
    length, the extra items of the longer one are ignored.

    Args:
        images: Sequence of images accepted by ``Axes.imshow``.
        captions: Sequence of title strings, one per image.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn when
        there are no (image, caption) pairs.
    """
    pairs = list(zip(images, captions))
    if not pairs:
        # A figure with zero rows cannot be created; there is nothing to show.
        return

    # squeeze=False always yields a 2-D array of axes, so a single image works
    # the same way as several (otherwise a bare Axes is returned and indexing fails).
    fig, axes = plt.subplots(nrows=len(pairs), ncols=1, figsize=(10, 10), squeeze=False)
    for (img, caption), ax in zip(pairs, axes[:, 0]):
        imshow(img, ax, caption=caption)
    plt.show()

# images = [dataset['train'][i]['image'] for i in range(2)]
# captions = [dataset['train'][i]['question'] for i in range(2)]
# plot_images_with_captions(images, captions)


# Load LLaVA model 🏋

In [None]:
import torch
# Load model directly
from transformers import AutoProcessor, AutoModelForPreTraining, LlavaForConditionalGeneration

# Single source of truth for the checkpoint name; it is also used later to name
# the results file, so the model and processor must be loaded from this id.
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# NOTE(review): no dtype is requested above, so the weights are loaded at
# from_pretrained's default precision - confirm the GPU has enough memory.
# The trailing semicolon stops the notebook from echoing the full module repr.
model.to("cuda");

In [None]:
# Report the model size: total element count over all parameter tensors,
# printed in millions with two decimals.

total_params = sum(map(lambda tensor: tensor.numel(), model.parameters()))
human_readable_params = "{:.2f}M".format(total_params / 1e6)
print("Total Parameters: " + human_readable_params)


# running inference on A-OKVQA 🏃

In [None]:
from typing import List, Dict, Any

In [None]:
import warnings
# Suppress every Python warning from this point on.
# NOTE(review): this is a blanket filter, so warnings raised by libraries used
# in later cells are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Beam-search length penalty used for each supported generation mode.
_LENGTH_PENALTY_BY_MODE = {
    "qa": -1,          # plain question answering
    "rationale": 1.1,  # rationale only (original note: choose from [1, 1.5, 2])
    "qa+r": 1.8,       # question answering + rationale in one step
}


def inference(image, question, mode = "qa", hyperparams = None, max_new_tokens=40):
    """Run the global ``model`` on one image/prompt pair and return the decoded text.

    Args:
        image: Image passed to the global ``processor`` as ``images``.
        question: Full text prompt passed to the processor as ``text``.
        mode: One of ``"qa"``, ``"rationale"`` or ``"qa+r"``; selects the
            length penalty used for beam search.
        hyperparams: Optional mapping of extra keyword arguments for
            ``model.generate``. Entries override the defaults chosen here
            (``num_beams``, ``length_penalty``, ``max_new_tokens``).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first generated sequence decoded with special tokens skipped.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    # Validate before doing any work; previously an unknown mode only failed
    # after preprocessing, with an unbound-variable error.
    if mode not in _LENGTH_PENALTY_BY_MODE:
        raise ValueError(
            f"Unknown mode {mode!r}; expected one of {sorted(_LENGTH_PENALTY_BY_MODE)}"
        )

    generation_kwargs = {
        "num_beams": 5,
        "length_penalty": _LENGTH_PENALTY_BY_MODE[mode],
        "max_new_tokens": max_new_tokens,
    }
    if hyperparams:
        generation_kwargs.update(hyperparams)

    inputs = processor(text=question, images=image, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, **generation_kwargs)

    # Decode only the first returned sequence.
    return processor.decode(outputs[0], skip_special_tokens=True)

def make_prompt(x):
    """Build the question text for one example: the question followed by its choices.

    Args:
        x: Mapping with a ``'question'`` string and a ``'choices'`` sequence of strings.

    Returns:
        ``"<question> Choices: <choice1>, <choice2>, ..."``.
    """
    question_text = x['question']
    joined_choices = ', '.join(x['choices'])
    return question_text + " Choices: " + joined_choices


# Helper methods

# Generating Sheets for Annotation

In [None]:
# Positions in the A-OKVQA train split that make up the 50-example subset.
indices_for_set_50 = [
    0, 1, 3, 4, 6, 7, 8, 9, 10, 11,
    12, 13, 16, 22, 23, 24, 25, 26, 27, 28,
    29, 30, 32, 34, 35, 36, 37, 38, 41, 43,
    44, 46, 47, 48, 50, 56, 59, 60, 69, 70,
    73, 76, 85, 87, 90, 92, 94, 97, 112, 154,
]
# Materialise the selected examples as a plain list, in the order listed above.
short_train_dataset = list(
    map(lambda position: dataset['train'][position], indices_for_set_50)
)

In [None]:
import re

def check_answer(response, choices):
    """Pick the multiple-choice option named in the concluding part of a response.

    The concluding part is the last line of ``response`` (ignoring trailing
    whitespace). When the response is a single line, it is narrowed to the last
    non-empty sentence instead. A choice matches when it occurs in that text as
    a case-insensitive substring.

    Args:
        response: Model output text. ``None`` or an empty string is treated as
            containing no answer.
        choices: Candidate answer strings.

    Returns:
        The matching choice when exactly one matches; otherwise a message
        starting with ``"Indeterminate answer"`` (several matches) or
        ``"Incorrect answer"`` (no match).
    """
    # Trailing newlines/spaces would otherwise make the "last line" empty and
    # hide an answer that is clearly present.
    text = (response or "").rstrip()

    # Last non-empty fragment when splitting on sentence ends and newlines.
    fragments = re.split(r'\.|\n', text)
    last_sentence = next((frag for frag in reversed(fragments) if frag.strip()), "")

    last_paragraph = text.split('\n')[-1]
    if last_paragraph == text:
        # Single-line response: look only at its final sentence.
        last_paragraph = last_sentence

    # Collect every choice mentioned in the concluding text, in choice order.
    haystack = last_paragraph.lower()
    matches = [choice for choice in choices if choice.lower() in haystack]

    # Determine the result based on matches
    if len(matches) == 1:
        return f"{matches[0]}"
    elif len(matches) > 1:
        return f"Indeterminate answer: Multiple matches found {matches}"
    else:
        return "Incorrect answer: No matches found"

# Example usage: a response made of a rationale followed by a concluding
# "Thus, ..." line, checked against four candidate choices. Note that the
# triple-quoted literal below both starts and ends with a newline character.
response_text = """
Response: In the image, there is a large China Airlines airplane parked on the tarmac, and several cars are parked nearby. This suggests that the cars belong to airport workers who are responsible for servicing the airplane or performing other tasks related to airport operations. As a result, they have access to designated parking areas within the airport premises.
Thus, the drivers of the cars were able to park here because they are airport workers.
"""
choices = ["firemen", "airport workers", "police", "postal workers"]

# Print whatever check_answer extracts for this response.
result = check_answer(response_text, choices)
print(result)

In [None]:
import os

import pandas as pd
from tqdm.auto import tqdm
from PIL import Image
import numpy as np

RES_PATH = "/content/drive/MyDrive/<hidden>/results/"
SAVE_PATH = "/content/drive/MyDrive/<hidden>/img"

# Create the output folders. os.path.join builds proper sub-folder paths (a
# hard-coded backslash would create folders literally named "img\" on Linux),
# and exist_ok=True keeps the cell safe to re-run.
os.makedirs(os.path.join(RES_PATH, "img"), exist_ok=True)
os.makedirs(os.path.join(RES_PATH, "result"), exist_ok=True)
os.makedirs(SAVE_PATH, exist_ok=True)

# meta_prompt =  "Please explain the reasoning behind your answer?"
# new_meta_prompt = "Identify and describe the visual markers in the image that are critical to your answer."
one_step_prompt = "First, generate the rationale for the question. Follow this with the statement 'Thus, the answer is ' and then provide the answer."

# Lower this to run on only the first few examples of the subset.
N_SAMPLES = len(indices_for_set_50)

# One dict per processed example; turned into a DataFrame by the export cells.
results = []

for idx, x in enumerate(tqdm(short_train_dataset, total = N_SAMPLES)):
    if idx >= N_SAMPLES:
        break

    image = x['image']

    # Single path used both for saving the image and for the results sheet, so
    # the recorded location always points at the file that was written.
    image_path = os.path.join(SAVE_PATH, f"{indices_for_set_50[idx]}.jpg")

    # Show the image
    plt.imshow(image)
    plt.axis('off')
    plt.show()

    # Save the image so it can be embedded in the annotation sheet.
    image.save(image_path)

    # Run inference: rationale and answer are generated in a single step.
    question = make_prompt(x)
    print(question)
    one_step_question = f"<image>\nUSER:Question: {question}.\n{one_step_prompt}\nASSISTANT:"
    decoded_output = inference(image, one_step_question, mode = "qa+r", max_new_tokens=150)
    # The decoded text echoes the prompt. Keep only what follows the assistant
    # tag instead of relying on a character offset into the prompt.
    one_step_answer = decoded_output.split("ASSISTANT:", 1)[-1]
    print("ONE_STEP_ANSWER:")
    print(f"Question: {one_step_question}\n Response: {one_step_answer}")
    qa_answer = check_answer(one_step_answer, x['choices'])
    print(qa_answer)

    correct_answer = x['choices'][x['correct_choice_idx']]
    results.append({
        'question': question,
        'correct_answer' : correct_answer,
        'predicted_answer' : qa_answer,
        'is_correct': 1 if qa_answer.lower() == correct_answer.lower() else 0,
        'groundtruth_rationale' : x['rationales'],
        'direct_answer' : x['direct_answers'],
        'one_step_prompt' : one_step_prompt,
        'generated_rationale' : one_step_answer,
        'image_path': image_path,
    })

In [None]:
# Scratch export: dump the raw results table to test.xlsx in the results folder.
RES_NAME = "test.xlsx"
df = pd.DataFrame(results)

# The context manager saves and closes the workbook on exit; without it the
# file is never finalised.
with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1')

    # Get the xlsxwriter workbook and worksheet objects.
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    # worksheet.insert_image('R1', f'/content/drive/MyDrive/<hidden>/img/{idx}.jpg')

In [None]:
import os

import xlsxwriter

# Name the workbook after the checkpoint, e.g. "<model-name>_inference_one_step.xlsx".
RES_NAME = f"{MODEL_ID.split('/')[-1].strip()}_inference_one_step.xlsx"
df = pd.DataFrame(results)
print(RES_PATH, RES_NAME)

# Spreadsheet column that receives the embedded images (to the right of the data).
IMAGE_COLUMN = 'R'
missing_images = []

with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    # Write the results table, including the 'image_path' column.
    df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Access the XlsxWriter workbook and worksheet objects behind the sheet.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Place each image next to its row. Rows are counted by position (not by
    # DataFrame index): +1 for the header row, +1 because Excel rows start at 1.
    image_paths = df['image_path'] if 'image_path' in df.columns else []
    for position, image_path in enumerate(image_paths):
        if not os.path.isfile(image_path):
            # Collect missing files and report them below rather than letting
            # the image be dropped without any notice.
            missing_images.append(image_path)
            continue
        worksheet.insert_image(f'{IMAGE_COLUMN}{position + 2}', image_path)
        # worksheet.insert_image(cell, image_path, {'x_scale': 0.5, 'y_scale': 0.5})

if missing_images:
    print(f"Skipped {len(missing_images)} missing image file(s), e.g. {missing_images[0]}")
