In [None]:
from openai import OpenAI
# The OpenAI API key is read from a local text file rather than written in the notebook.
api_key_path = '../../OPENAI_key.txt'
with open(api_key_path, 'r') as key_file:
    # Surrounding whitespace is removed and only the first line is kept as the key.
    api_key = key_file.read().strip().partition('\n')[0]

# SDK client built from the same key.
client = OpenAI(api_key=api_key)

In [None]:
import pandas as pd
import numpy as np

dataset_type = "validation"  # or "training"

# Spreadsheet holding the model's answers and rationales.
# Alternative input: '../results/Human Annotation of LLaVA+ Rationales.xlsx'
file_path = '../results/llava-1.5-7b-hf_val_500_two_steps.xlsx'

model_name = "LLaVA"

if dataset_type == "training":
    # The training sheet is read with header=1, i.e. the column names are on its second row.
    columns_to_read = [
        'question',
        'correct_answer',
        'predicted_answer',
        'is_correct',
        'groundtruth_rationale',
        'generated_rationale',
        'gen_rationale_distinct_pieces',
    ]
    data = pd.read_excel(file_path, header=1, usecols=columns_to_read)
elif dataset_type == "validation":
    # An earlier variant read the 'val_500_set' sheet with the one-step columns
    # (predicted_qa_answer, qa_is_correct, groundtruth_rationale, direct_answer,
    # one_step_question, one_step_answer, one_step_is_correct, one_step_rationale).
    columns_to_read = [
        'question',
        'correct_answer',
        'predicted_answer',
        'is_correct',
        'generated_rationale',
        'image_path'
    ]
    data = pd.read_excel(file_path, usecols=columns_to_read)
else:
    # Fail with a clear message instead of a NameError on `data` further down.
    raise ValueError(f"Unknown dataset_type: {dataset_type!r} (expected 'training' or 'validation')")

# Drop incomplete rows BEFORE touching image_path: a missing path is read as NaN
# (a float), and calling .split on it would raise AttributeError.
data = data.dropna().reset_index(drop=True)

# Keep the part of the stored path that follows '//' and point it at the local image folder.
# NOTE(review): the "training" branch does not read an 'image_path' column, so this
# line raises KeyError for it — confirm how training images should be located.
data['image_path'] = data['image_path'].apply(
    lambda stored_path: f"../results/img/validation/{stored_path.split('//')[1]}"
)

data

In [None]:
data['generated_rationale'][0]

In [None]:
import os

# The running total of API spend is persisted in this file.
COST_FILE = "total_cost.txt"

def read_total_cost():
    """Return the total stored in COST_FILE, or 0.0 when the file is missing or blank."""
    if not os.path.exists(COST_FILE):
        return 0.0
    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()
    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add `cost` to the total saved in COST_FILE and write the new total back.

    Despite the name, the argument is an increment: the stored value is read
    first and the sum of both replaces the file's content.
    """
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")
        
def calculate_cost(usage, model, verbose=0):
    """Compute the dollar cost of one API call and add it to the persisted total.

    Args:
        usage: Mapping with 'prompt_tokens' and 'completion_tokens' counts, as
            found under the 'usage' key of a chat-completions response.
        model: Model identifier; must be one of the models priced below.
        verbose: When truthy, print the cost of this call.

    Returns:
        The cost of this call in dollars (earlier versions returned None, so
        callers that ignore the result are unaffected).

    Raises:
        ValueError: If `model` has no known price. Nothing is written to the
            cost file in that case.
    """
    # Dollars per 1,000 tokens as (input, output).
    pricing_per_1k = {
        "gpt-4o-2024-05-13": (0.005, 0.015),
        "gpt-4o-2024-08-06": (0.0025, 0.010),
    }
    if model not in pricing_per_1k:
        # Previously this fell through to an UnboundLocalError on the price variables.
        raise ValueError(
            f"No pricing configured for model {model!r}; known models: {sorted(pricing_per_1k)}"
        )

    input_per_1k, output_per_1k = pricing_per_1k[model]
    input_cost_per_token = input_per_1k / 1000
    output_cost_per_token = output_per_1k / 1000

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

In [None]:
import base64
import requests

def _parse_numbered_questions(content):
    """Split a '1. ...' style reply into a list of question strings without the numbering."""
    import re  # local import keeps this block self-contained

    text = content.strip()
    if '\n' in text:
        parts = text.split('\n')
    else:
        # Single-line reply: break just before every "N. " marker.
        parts = re.split(r'\s+(?=\d+\.\s)', text)

    questions = []
    for part in parts:
        part = part.strip()
        if not part:
            # Blank separator lines are not questions.
            continue
        # Remove only the leading list number; any later ". " inside the
        # question (e.g. "e.g. ") is left intact.
        question = re.sub(r'^\d+\.\s+', '', part).strip()
        if question:
            questions.append(question)
    return questions


def gpt_gen_vf_questions(row, rationale_column_name, cost_verbose=0, answer_column_name='predicted_answer'):
    """Ask GPT for yes/no questions that verify the visual details stated in a rationale.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'question',
            the answer column and the rationale column.
        rationale_column_name: Key of the rationale text in `row`.
        cost_verbose: Forwarded to `calculate_cost` as its `verbose` flag.
        answer_column_name: Key of the answer text in `row`.

    Returns:
        A list of question strings, or None when the API does not answer with
        HTTP 200 (the status and response body are printed in that case).
    """
    model_name = "gpt-4o-2024-08-06"
    system_prompt = f"""You will be shown a question about an image, along with an answer, and a rationale that explains the answer based on details from the image. Your task is to generate a list of yes/no questions that verify the details about the image that are **explicitly** mentioned in the rationale. Your questions should be phrased such that the answer to that question being yes means that the detail in the rationale is correct. Focus on creating questions that can be visually verified or refuted based on the details provided in the rationale. Ensure the questions are specific and directly pertain to aspects that are visually relevant and mentioned in the rationale. Avoid generating questions about elements that are not mentioned in the rationale, or the rationale explicitly states are not relevant or present. Also avoid generating multiple questions that check for the same visual detail.

Here is one example:
Input: 
Question: Why is the person wearing a helmet?
Answer: For safety
Rationale: The person is wearing a helmet because they are riding a bicycle on a busy city street. Helmets are commonly used to protect against head injuries in case of accidents, especially in areas with heavy traffic.

Good Questions:
1. Is the person wearing a helmet while riding a bicycle?
Reason: This question is directly answerable by observing whether the person on the bicycle is wearing a helmet in the image. 
2. Is the street in the image busy with traffic?
Reason: This question can be visually verified by looking at the amount of traffic on the street in the image.

Bad Questions:
1. Is the person wearing the helmet because they are concerned about head injuries?
Reason: This question is not good because it assumes the person’s intentions or concerns, which cannot be visually verified from the image.
2. Does wearing a helmet suggest that the person is highly safety-conscious?
Reason: This question relies on inference and external knowledge about the person’s mindset, rather than on observable details from the image.
3. Is there any indication that the person is wearing a helmet for safety reasons?
Reason: This question verifies the answer to the original question, rather than verifying a detail about the image that's mentioned in the rationale.
4. Is the person wearing a safety vest?
Reason: This question is not good because it tries to verify details about the image that are not explicitly mentioned in the rationale.
5. Is the person not wearing sunglasses?
Reason: This question is not good because it asks for verification by absence and can only be answered with a "no," which is not the preferred type of question.

Respond with a list of (good) questions (without the reasons), starting from '1. '"""

    user_input = f"""Question: {row['question']}
Answer: {row[answer_column_name]}
Rationale: {row[rationale_column_name]}"""

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_prompt
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                ]
            }
        ],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        # Print the raw text: an error body is not guaranteed to be valid JSON.
        print(f"OpenAI request failed with status {response.status_code}: {response.text}")
        return None

    # Parse the body once and reuse it for both the usage and the reply.
    body = response.json()
    calculate_cost(body['usage'], model_name, verbose=cost_verbose)

    content = body['choices'][0]['message']['content'].strip()
    return _parse_numbered_questions(content)

In [None]:
# One example
# gpt_gen_vf_questions(data.iloc[0], 'one_step_rationale', cost_verbose=1)
gpt_gen_vf_questions(data.iloc[0], 'generated_rationale', cost_verbose=1)

In [None]:
df = data.copy()

In [None]:
# Initialize the 'vf_questions' column if it doesn't exist
if 'vf_questions' not in df.columns:
    df['vf_questions'] = None

# Rows whose request failed; gpt_gen_vf_questions returns None for those.
failed_rows = []

for idx, row in df.iterrows():
    questions = gpt_gen_vf_questions(row, 'generated_rationale')
    if questions is None:
        # Keep going: aborting here would throw away every row still to come.
        failed_rows.append(idx)
        print(f"Question generation failed for example {idx}; leaving 'vf_questions' empty.\n")
        continue
    # add the questions to the dataframe
    df.at[idx, 'vf_questions'] = questions
    print(f"Questions for example {idx}:")
    for i, question in enumerate(questions):
        print(f"{i+1}. {question}")
    print()

if failed_rows:
    print(f"{len(failed_rows)} example(s) have no questions: {failed_rows}")

df

In [None]:
# Output to a xlsx file
df.to_excel("data_balanced_vf_questions_val_2step.xlsx", index=False)

# Use LLaVA to check vf_questions

In [None]:
!nvidia-smi

In [None]:
import torch
# Load model directly
from transformers import AutoProcessor, AutoModelForPreTraining, LlavaForConditionalGeneration

# One checkpoint name for both the model weights and the matching processor.
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID)

# Move the weights to the chosen GPU; the returned module is this cell's displayed output.
model.to("cuda:9")

In [None]:
from PIL import Image

def inference(image, question, mode = "qa", hyperparams = None, max_new_tokens=40):
    """Generate LLaVA's reply to a prompt about an image.

    Args:
        image: Image handed to the processor.
        question: Full prompt text, passed to the processor unchanged.
        mode: Selects the generation settings: "qa", "rationale" or
            "vf_question". "vf_question" always generates a single token and
            ignores `max_new_tokens`.
        hyperparams: Currently unused; kept so existing calls stay valid.
        max_new_tokens: Token budget for the "qa" and "rationale" modes.

    Returns:
        The generated text only (prompt excluded), stripped of surrounding whitespace.

    Raises:
        ValueError: If `mode` is not one of the supported values.
    """
    generation_settings = {
        "qa": dict(num_beams=5, length_penalty=-1, max_new_tokens=max_new_tokens),
        "rationale": dict(num_beams=5, length_penalty=1.1, max_new_tokens=max_new_tokens),
        "vf_question": dict(num_beams=5, length_penalty=-1, max_new_tokens=1),
    }
    if mode not in generation_settings:
        # Previously an unknown mode surfaced later as an UnboundLocalError on `outputs`.
        raise ValueError(f"Unknown mode {mode!r}; expected one of {sorted(generation_settings)}")

    # Send the inputs to whatever device the model is on instead of a fixed GPU index.
    inputs = processor(text=question, images=image, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, **generation_settings[mode])

    # The returned sequence starts with the prompt tokens. Decoding only what
    # follows them avoids searching the decoded text for the prompt, which fails
    # when the prompt has no "<image>" marker or is not reproduced verbatim.
    prompt_length = inputs["input_ids"].shape[1]
    answer = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()


# Smoke test: put the first verification question of the first example to LLaVA.
first_example = df.iloc[0]
image0 = Image.open(first_example['image_path'])
question0 = f"<image>\nUSER:Question: {first_example['vf_questions'][0]}. Answer with 'yes' or 'no'.\nASSISTANT:"
inference(image0, question0, mode="vf_question")


In [None]:
df

In [None]:
def answer_vf_questions(data, question_column='vf_questions_GPT_r', answer_column='vf_answers_LLaVA_GPT_r'):
    """Answer every verification question of every row with LLaVA.

    NOTE: a later cell defines another function with this same name (the
    GPT-based variant); whichever cell ran last is the one that gets called.

    Args:
        data: DataFrame with an 'image_path' column and a column of question lists.
        question_column: Column holding each row's list of questions.
        answer_column: Column that receives each row's list of answers; it is
            created when missing.

    Returns:
        The same DataFrame, modified in place.
    """
    if answer_column not in data.columns:
        data[answer_column] = None
    for idx, row in data.iterrows():
        questions = row[question_column]
        if not isinstance(questions, (list, tuple)):
            # e.g. None/NaN when question generation failed for this row
            print(f"Image {idx}: no question list in '{question_column}', nothing to answer.")
            questions = []
        answer_list = []
        # The context manager closes the image file once the row is done.
        with Image.open(row['image_path']) as image:
            # Image
            display(image)
            for i, question in enumerate(questions):
                question = f"<image>\nUSER:Question: {question}. Answer with 'yes' or 'no'.\nASSISTANT:"
                answer = inference(image, question, mode="vf_question")
                answer_list.append(answer)
                # Print image, question, and answer
                print(f"Image {idx}, Question {i+1}: {question}\nAnswer: {answer}")
        data.at[idx, answer_column] = answer_list
    return data

df = answer_vf_questions(df)
df

In [None]:
# Output to a xlsx file
df.to_excel("data_balanced_vf_questions.xlsx", index=False)

# Use GPT to check vf_questions

In [None]:
import base64
import requests

def gpt_answer_vf_questions(question, image_path, cost_verbose=0):
    """Ask GPT a yes/no question about an image file.

    Args:
        question: The question text, inserted into the prompt.
        image_path: Path of the image; it is sent inline as a base64 data URL.
        cost_verbose: Forwarded to `calculate_cost` as its `verbose` flag.

    Returns:
        The reply with surrounding whitespace and periods removed, or None when
        the API does not answer with HTTP 200 (the status and body are printed).
    """
    import mimetypes  # local import keeps this block self-contained

    model_name = "gpt-4o-2024-08-06"
    # Read the image and convert it to base64 format
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Label the data URL with the file's real type; fall back to JPEG, which is
    # what was always sent before, when the extension is unknown or not an image.
    mime_type = mimetypes.guess_type(image_path)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    user_input = f"""Question: {question}. Based on the information provided in the image, answer with 'yes' or 'no'. Provide one-word answer only."""

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        # Print the raw text: an error body is not guaranteed to be valid JSON.
        print(f"OpenAI request failed with status {response.status_code}: {response.text}")
        return None

    # Parse the body once and reuse it for both the usage and the reply.
    body = response.json()
    calculate_cost(body['usage'], model_name, verbose=cost_verbose)
    return body['choices'][0]['message']['content'].strip().strip('.')

In [None]:
df

In [None]:
from PIL import Image

def answer_vf_questions(data, question_column='vf_questions', answer_column='vf_answers_GPT'):
    """Answer every verification question of every row with GPT.

    NOTE: this reuses the name of the LLaVA-based function defined in an earlier
    cell, so running this cell replaces that definition.

    Args:
        data: DataFrame with an 'image_path' column and a column of question lists.
        question_column: Column holding each row's list of questions.
        answer_column: Column that receives each row's list of answers; it is
            created when missing. An entry is None where the request failed.

    Returns:
        The same DataFrame, modified in place.
    """
    if answer_column not in data.columns:
        data[answer_column] = None
    for idx, row in data.iterrows():
        questions = row[question_column]
        if not isinstance(questions, (list, tuple)):
            # e.g. None/NaN when question generation failed for this row
            print(f"Image {idx}: no question list in '{question_column}', nothing to answer.")
            questions = []
        # The image is opened only to be shown; GPT receives the file path.
        with Image.open(row['image_path']) as image:
            # Image
            display(image)
        answer_list = []
        for i, question in enumerate(questions):
            answer = gpt_answer_vf_questions(question, row['image_path'], cost_verbose=1)
            answer_list.append(answer)
            # Print image, question, and answer
            print(f"Image {idx}, Question {i+1}: {question}\nAnswer: {answer}")
        data.at[idx, answer_column] = answer_list
    return data

df = answer_vf_questions(df)
df

In [None]:
# import pandas as pd
# df = pd.read_excel('data_balanced_vf_questions_val_set_GPT_r.xlsx')
# df.drop('vf_answers_LLaVA_GPT_r', axis=1, inplace=True)
# df

In [None]:
# Output to a xlsx file
output_path = "data_vf_questions_val_2steps.xlsx"
# df.to_excel("data_balanced_vf_questions.xlsx", index=False)
df.to_excel(output_path, index=False)

In [None]:
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as ExcelImage
import os

# Write the table first (no pictures); openpyxl then reopens that file to attach them.
df.to_excel(output_path, index=False)

wb = load_workbook(output_path)
ws = wb.active

# Anchor each picture in column R, offset by 2 from the DataFrame index so the
# first record lands on sheet row 2. Paths that do not exist on disk are skipped.
for index, img_path in zip(df.index, df['image_path']):
    if not os.path.exists(img_path):
        continue
    ws.add_image(ExcelImage(img_path), f"R{index + 2}")

# The picture-enriched workbook overwrites the file written above.
output_path_with_images = output_path
wb.save(output_path_with_images)

output_path_with_images

In [None]:
# NOTE(review): `skippppppp` is not defined anywhere in the visible notebook, so
# this cell raises NameError when run. Presumably it is a deliberate stop marker
# that keeps "Run All" from reaching the analysis cells below — confirm before removing.
skippppppp

In [None]:
# Analyze the correlation between vf_answers_LLaVA and vf_answers_GPT
from scipy.stats import spearmanr

# Map yes/no answers to 1/0. Keys are lower-case because answers are normalised
# before lookup: "Yes", "yes" and "Yes." must all count as the same label.
mapping = {'yes': 1, 'no': 0}

def _to_label(answer):
    """Return 1 for yes, 0 for no, and None for a missing or unrecognised answer."""
    return mapping.get(str(answer).strip().strip('.').lower())

# Walk both columns row by row so that a length mismatch in one row cannot
# shift the pairing of every answer that follows it.
flat_llava_answers, flat_gpt_answers = [], []
flat_llava_mapped, flat_gpt_mapped = [], []
skipped_pairs = 0

for llava_row, gpt_row in zip(df['vf_answers_LLaVA_GPT_r'], df['vf_answers_GPT_GPT_r']):
    if not isinstance(llava_row, (list, tuple)) or not isinstance(gpt_row, (list, tuple)):
        # No answer list on one side (e.g. None): nothing to pair for this row.
        continue
    if len(llava_row) != len(gpt_row):
        skipped_pairs += max(len(llava_row), len(gpt_row))
        continue
    for llava_answer, gpt_answer in zip(llava_row, gpt_row):
        llava_label = _to_label(llava_answer)
        gpt_label = _to_label(gpt_answer)
        if llava_label is None or gpt_label is None:
            # A failed request (None) or a reply that is neither yes nor no.
            skipped_pairs += 1
            continue
        flat_llava_answers.append(llava_answer)
        flat_gpt_answers.append(gpt_answer)
        flat_llava_mapped.append(llava_label)
        flat_gpt_mapped.append(gpt_label)

if skipped_pairs:
    print(f"Skipped {skipped_pairs} answer pair(s) that could not be mapped to yes/no.")

# Calculate Spearman correlation
spearman_corr, p_value = spearmanr(flat_llava_mapped, flat_gpt_mapped)

spearman_corr, p_value

In [None]:
from sklearn.metrics import cohen_kappa_score

# Calculate Cohen's kappa
cohen_kappa = cohen_kappa_score(flat_llava_mapped, flat_gpt_mapped)

cohen_kappa