In [None]:
import openai
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset

# Load the A-OKVQA dataset; later cells index it by split name (dataset['train']).
dataset = load_dataset("HuggingFaceM4/A-OKVQA")

In [None]:
import base64
import requests

# Read the OpenAI API key from a text file one directory above the working directory.
with open("../OPENAI_key.txt") as f:
    # OpenAI API Key
    # Only the first whitespace-delimited token is kept, so surrounding whitespace and
    # newlines are ignored; an empty file raises IndexError on the [0] lookup.
    api_key = f.read().split()[0]

# Helper: turn an image file on disk into base64 text
def encode_image(image_path):
    """Return the contents of the file at `image_path` as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to a `str`.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# HTTP headers shared by every request made in query(): JSON body plus bearer-token auth
# built from the API key read above.
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

def query(image, prompt, max_tokens=500, model="gpt-4-turbo", timeout=120):
    """Send one chat-completion request and return the model's reply text.

    The original implementation accepted `image` but never put it in the payload,
    so every request was text-only. The image is now sent as an `image_url`
    content part next to the prompt.

    Args:
        image: Base64-encoded image data (e.g. the output of pil_image_to_base64),
            an already-formed ``data:`` / ``http(s)://`` URL, or None for a
            text-only request. Bare base64 is labelled as JPEG in the data URL.
        prompt: Text sent as the user message.
        max_tokens: Upper bound on generated tokens, forwarded as ``max_tokens``.
        model: Model name placed in the payload.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            raises instead of hanging forever.

    Returns:
        The content string of the first choice, or None when the HTTP status is
        not 200 (the error body is printed in that case).
    """
    content = [
        {
            "type": "text",
            "text": prompt,
        },
    ]

    if image is not None:
        if isinstance(image, str) and image.startswith(("data:", "http://", "https://")):
            image_url = image
        else:
            image_url = f"data:image/jpeg;base64,{image}"
        content.append(
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            }
        )

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": content,
            }
        ],
        "max_tokens": max_tokens
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    print(f"Response: {response}")

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. proxy/gateway pages).
        try:
            print(response.json())
        except ValueError:
            print(response.text)
        return None

    return response.json()['choices'][0]['message']['content']

In [None]:
# Keyword overrides forwarded to query() for each supported mode; an empty dict means
# query() runs with its own default token budget.
_MODE_QUERY_KWARGS = {
    "qa": {"max_tokens": 50},
    "rationale": {"max_tokens": 100},
    "qa_rationale": {},
}


def inference(image, question, mode = "qa"):
    """Run one model query with a token budget chosen by `mode`.

    Args:
        image: Image argument passed through to query() unchanged.
        question: Prompt text passed through to query() unchanged.
        mode: One of "qa" (max_tokens=50), "rationale" (max_tokens=100) or
            "qa_rationale" (query()'s default budget).

    Returns:
        Whatever query() returns for the request.

    Raises:
        ValueError: If `mode` is not one of the supported values. Previously this
            surfaced as an UnboundLocalError on the return statement.
    """
    if mode not in _MODE_QUERY_KWARGS:
        supported = ", ".join(sorted(_MODE_QUERY_KWARGS))
        raise ValueError(f"Unknown inference mode {mode!r}; expected one of: {supported}")

    return query(image, question, **_MODE_QUERY_KWARGS[mode])

def make_prompt(x):
    """Format one dataset record as "<question> Choices: <c1>, <c2>, ...".

    Kept as a separate function so question preprocessing can be added later.

    Args:
        x: Mapping with a 'question' entry and a 'choices' sequence of strings.

    Returns:
        The question followed by the comma-separated choices.
    """
    question_text = x['question']
    joined_choices = ', '.join(x['choices'])
    return "{} Choices: {}".format(question_text, joined_choices)

In [None]:
# Load the dataset (same call as in the first cell, so `dataset` is simply rebound here)
dataset = load_dataset("HuggingFaceM4/A-OKVQA")

# Print the loaded object to verify the structure of the dataset
print("Dataset structure: ", dataset)

# Select the first 250 examples from the training set
# (a later cell rebuilds `infer` with the same selection)
infer = dataset['train'].select(range(250))

# Print the first selected example as a sanity check on the record layout
print("First training example: ", infer[0])

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Output locations: everything is written under RES_PATH, images under SAVE_PATH.
RES_PATH = "./results/"
SAVE_PATH = RES_PATH + "img/"

import os

# Create the output tree up front. exist_ok=True makes the cell safe to re-run and
# removes the check-then-create race of the previous os.path.exists() guards.
for out_dir in (RES_PATH, SAVE_PATH, RES_PATH + "result/"):
    os.makedirs(out_dir, exist_ok=True)

# Number of training examples to evaluate (previously defined but never used).
N = 250

infer = dataset["train"].select(range(N))
# # --------- setting set-50 for testing ------------
# set_50 = pd.read_csv("set-50-idx.csv")
# set_50_idx = set_50["idx"].tolist()
# infer = dataset["train"].select(set_50_idx)
# # --------- END setting set-50 for testing ------------

In [None]:
# function to convert PIL image to base64
import base64
import io

def pil_image_to_base64(pil_image, img_format="JPEG"):
    """Encode an image object as base64 text in the given file format.

    Args:
        pil_image: Object exposing ``save(file_obj, format=...)`` (a PIL image).
        img_format: Format name handed to ``save``; defaults to "JPEG".

    Returns:
        The encoded image bytes as a base64 `str`.

    Notes:
        JPEG cannot store alpha or palette data, so saving e.g. an RGBA or P mode
        image as JPEG raises OSError. Such images are converted to RGB first
        (transparency is discarded). Other formats are saved unchanged.
    """
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    mode = getattr(pil_image, "mode", None)
    if img_format.upper() == "JPEG" and mode is not None and mode not in jpeg_writable_modes:
        pil_image = pil_image.convert("RGB")

    img_buffer = io.BytesIO()
    pil_image.save(img_buffer, format=img_format)
    return base64.b64encode(img_buffer.getvalue()).decode('utf-8')

In [None]:
from PIL import Image

results = []

# The previous version called plt.imshow() for every example, but matplotlib is never
# imported in this notebook, so that path raised NameError. Images are always saved to
# SAVE_PATH; enable inline previews only after adding `import matplotlib.pyplot as plt`.
SHOW_IMAGES = False

# Instruction prepended to every question. The "Thus, the answer is" marker it requests
# is what is_correct() searches for in the reply.
one_shot_prompt = "First, generate a rationale for why you select a given answer for the following question. Follow this with the statement 'Thus, the answer is ' and then provide the answer."


def is_correct(qa_rationale_answer, correct_answer):
    """Return 1 if the reply states "Thus, the answer is <correct_answer>", else 0.

    The comparison is case-insensitive on both sides. (Previously only the answer was
    lowercased, leaving a capital "T" in the needle that could never match the
    lowercased reply.) A missing reply (None from a failed request) counts as 0.
    """
    if not isinstance(qa_rationale_answer, str):
        return 0
    marker = f"thus, the answer is {correct_answer.lower()}"
    return 1 if marker in qa_rationale_answer.lower() else 0


for idx, x in enumerate(tqdm(infer, total=len(infer))):
    correct_answer = x["choices"][x["correct_choice_idx"]]

    # NOTE(review): depending on how the dataset is decoded, x["image"] is either a dict
    # holding the encoded bytes (or a path) or an already-decoded PIL image — handle both.
    raw_image = x["image"]
    if isinstance(raw_image, dict):
        image_bytes = raw_image.get("bytes")
        image_source = io.BytesIO(image_bytes) if image_bytes is not None else raw_image["path"]
        image = Image.open(image_source)
    else:
        image = raw_image

    # The file is saved and uploaded as JPEG, which cannot hold alpha/palette data.
    if image.mode != "RGB":
        image = image.convert("RGB")

    if SHOW_IMAGES:
        plt.imshow(image)
        plt.axis('off')
        plt.show()

    # Save the image so the Excel export can embed it later.
    image_path = f"{SAVE_PATH}/{idx}.jpg"
    image.save(image_path)

    question_and_choices = make_prompt(x)
    qa_rationale_question = f"{one_shot_prompt} Question and choices: {question_and_choices}."

    # One-shot run: rationale and answer are requested in a single reply.
    image_b64 = pil_image_to_base64(image)
    qa_rationale_answer = inference(image_b64, qa_rationale_question, mode="qa_rationale")
    answered_correctly = is_correct(qa_rationale_answer, correct_answer)

    print(f"Question: {question_and_choices} \nOne shot question: {qa_rationale_question} \nAnswer: {qa_rationale_answer} \nCorrect Answer: {correct_answer}")
    print(f"Is Correct: {answered_correctly}")
    print('-'*100)

    results.append(
        {
            "idx": idx,
            "question": question_and_choices,
            "correct_answer": correct_answer,
            "predicted_answer": qa_rationale_answer,
            "is_correct": answered_correctly,
            "groundtruth_rationale": x["rationales"],
            "direct_answer": x["direct_answers"],
            "rationale_prompt": qa_rationale_question,
            "generated_rationale": qa_rationale_answer,
            "image_path": image_path,
        }
    )

In [None]:
# Display the raw per-example records collected by the inference loop.
results

In [None]:
from copy import deepcopy

# Work on a deep copy so the post-processing in later cells never mutates `results`.
res_cpy = deepcopy(results)

In [None]:
# Display the copied records before they are post-processed.
res_cpy

In [None]:
# Recover the final answer from each response (parsing an embedded JSON/dict
# literal when one is present) and compare it with the correct answer.
import re
import json


# Matches a flat {"key": "value", ...} or {'key': 'value', ...} literal embedded in text.
_ANSWER_DICT_PATTERN = r'\{\s*(?:"[^"]*"\s*:\s*".*?"|\'[^\']*\'\s*:\s*\'.*?\')(?:\s*,\s*(?:"[^"]*"\s*:\s*".*?"|\'[^\']*\'\s*:\s*\'.*?\'))*\s*\}'

# Matches the closing sentence the one-shot prompt asks for; group 1 is the answer.
_ANSWER_SENTENCE_PATTERN = r"thus,\s*the answer is\s*(.+)"


def extract_answer(text):
    """Pull the final answer out of a model reply.

    Tried in order:
      1. The first value of an embedded ``{"answer": "..."}``-style dict literal.
      2. Whatever follows the last "Thus, the answer is" on its line (any case).
      3. The unmodified input when neither form is present.

    Args:
        text: Model reply. Non-string input (e.g. None from a failed request) is
            returned unchanged.

    Returns:
        The extracted answer string, or `text` itself when nothing could be extracted.
    """
    # Local import keeps this block self-contained; `ast` is not imported at file level.
    import ast

    if not isinstance(text, str):
        return text

    dict_match = re.search(_ANSWER_DICT_PATTERN, text)
    if dict_match is not None:
        try:
            # literal_eval only accepts Python literals; model output is untrusted,
            # so it must never be handed to eval().
            parsed = ast.literal_eval(dict_match.group(0))
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, dict) and parsed:
            return list(parsed.values())[0]

    sentence_matches = re.findall(_ANSWER_SENTENCE_PATTERN, text, flags=re.IGNORECASE)
    if sentence_matches:
        answer = sentence_matches[-1].strip()
        if answer:
            return answer

    return text

# Replace each raw reply with its extracted answer and re-score it in place.
for record in res_cpy:
    predicted = extract_answer(record["predicted_answer"])
    record["predicted_answer"] = predicted

    # A failed request leaves None here; score it as incorrect instead of crashing
    # on .lower().
    if isinstance(predicted, str):
        record["is_correct"] = int(record["correct_answer"].lower() in predicted.lower())
    else:
        record["is_correct"] = 0

In [None]:
RES_NAME = "gpt4o_trial2.xlsx"
df = pd.DataFrame(res_cpy)

# xlsxwriter rejects list/dict cell values, so serialize those cells as JSON text.
df_export = df.copy()
for column in df_export.columns:
    df_export[column] = df_export[column].map(
        lambda value: json.dumps(value, ensure_ascii=False, default=str)
        if isinstance(value, (list, tuple, dict))
        else value
    )

# The context manager closes the writer, which is what actually writes the file to disk;
# the previous version never closed it, so no workbook was produced.
with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    df_export.to_excel(writer, sheet_name='Sheet1')


In [None]:
import xlsxwriter

# NOTE(review): this label only names the output file; query() defaults to
# "gpt-4-turbo" — confirm which model actually produced the results.
MODEL_ID = "gpt-4o"

RES_NAME = f"{MODEL_ID.split('/')[-1]}_inference_one_shot_50_improved_prompt.xlsx"
df = pd.DataFrame(res_cpy)
print(RES_PATH, RES_NAME)

# Spreadsheet column that receives the embedded images (to the right of the data).
IMAGE_COLUMN = "R"

# xlsxwriter rejects list/dict cell values, so serialize those cells as JSON text.
df_export = df.copy()
for column in df_export.columns:
    df_export[column] = df_export[column].map(
        lambda value: json.dumps(value, ensure_ascii=False, default=str)
        if isinstance(value, (list, tuple, dict))
        else value
    )

with pd.ExcelWriter(RES_PATH + RES_NAME, engine='xlsxwriter') as writer:
    # Write the records without the DataFrame index; the 'image_path' column stays in
    # the sheet as plain text.
    df_export.to_excel(writer, sheet_name='Sheet1', index=False)

    worksheet = writer.sheets['Sheet1']

    # Embed each saved image beside its row. Counting by position (not by DataFrame
    # index) keeps rows aligned even if the index is not 0..n-1. Offset of 2: Excel rows
    # are 1-based and row 1 holds the header. An empty result set has no 'image_path'
    # column, hence the .get() default.
    for row_offset, image_path in enumerate(df.get("image_path", [])):
        cell = f'{IMAGE_COLUMN}{row_offset + 2}'
        worksheet.insert_image(cell, image_path)
        # worksheet.insert_image(cell, image_path, {'x_scale': 0.5, 'y_scale': 0.5})
        # worksheet.insert_image(cell, image_path, {'x_scale': 0.5, 'y_scale': 0.5})
