In [1]:
from openai import OpenAI
# OpenAI credentials: the key is read from a text file two directories up.
# Only the first line of the whitespace-stripped file content is used.
api_key_path = '../../OPENAI_key.txt'
with open(api_key_path, 'r') as file:
    raw_key_text = file.read().strip()
api_key = raw_key_text.partition('\n')[0]

client = OpenAI(api_key=api_key)

In [7]:
import pandas as pd
import numpy as np

dataset_type = "validation"  # or "training"

# Read data: spreadsheet holding the human annotations of the model rationales.
file_path = '../results/Human Annotation of LLaVA+ Rationales.xlsx'

if file_path == '../results/Human Annotation of LLaVA+ Rationales.xlsx':
    model_name = "LLaVA"

if dataset_type == "training":
    # Columns of interest in the training sheet.
    columns_to_read = [
        'question',
        'correct_answer',
        'predicted_answer',
        'is_correct',
        'groundtruth_rationale',
        'generated_rationale',
        'gen_rationale_distinct_pieces',
    ]
    # header=1: the column names are taken from the second row (0-based row 1).
    data = pd.read_excel(file_path, header=1, usecols=columns_to_read)
elif dataset_type == "validation":
    # Columns of interest in the 'val_500_set' sheet.
    columns_to_read = [
        'question',
        'correct_answer',
        'predicted_qa_answer',
        'qa_is_correct',
        'groundtruth_rationale',
        'direct_answer',
        'one_step_question',
        'one_step_answer',
        'one_step_is_correct',
        'one_step_rationale',
    ]
    data = pd.read_excel(file_path, usecols=columns_to_read, sheet_name='val_500_set')
else:
    # Fail here instead of with a confusing NameError on `data` further down.
    raise ValueError(
        f"Unknown dataset_type {dataset_type!r}; expected 'training' or 'validation'."
    )

# The image file name is derived from the row index. This is done before
# dropna() so the numbering follows the original row positions in the sheet.
data['image_path'] = data.index.to_series().apply(lambda x: f"../results/img/{dataset_type}/{x}.jpg")

# Keep only fully annotated rows (the original index labels are preserved).
data = data.dropna()

data

In [19]:
# Consider only these examples: bike in front of the bus (43), cat growling at the car (25), number 10 example (32),
# dog walking in the rain (35), and firefighters in front of the building (46)
data_small = data[data.index.isin([43, 25, 32, 35, 46])].copy()
# 'gen_rationale_distinct_pieces' is only loaded for the training sheet, so
# errors='ignore' keeps this cell from raising KeyError on the validation sheet.
data_small = data_small.drop(
    columns=['gen_rationale_distinct_pieces', 'groundtruth_rationale'],
    errors='ignore',
)
data_small

In [54]:

# Hand-picked rows (by index label), kept in this exact order:
# index 9, 16, 25, 32, 0, 29, 27, 1
combined_indices = [9, 16, 25, 32, 0, 29, 27, 1]
data_combined = data.loc[combined_indices].copy()
data_combined

In [8]:
import os

# Define the file to store the total cost
COST_FILE = "total_cost.txt"

# USD price per token as (input, output), keyed by lower-cased model name.
# NOTE(review): these rates are hard-coded; confirm them against the pricing of
# the exact model snapshot being called before trusting the running total.
_TOKEN_PRICES = {
    "gpt-4o": (0.005 / 1000, 0.015 / 1000),
}


def read_total_cost():
    """Return the running total stored in COST_FILE.

    Returns:
        float: the stored total, or 0.0 when the file is missing or empty.
    """
    if not os.path.exists(COST_FILE):
        return 0.0
    with open(COST_FILE, "r") as file:
        content = file.read().strip()
    return float(content) if content else 0.0


def write_total_cost(cost):
    """Add `cost` to the running total and write the new total to COST_FILE.

    Args:
        cost (float): amount in USD to add to the stored total.
    """
    new_total_cost = read_total_cost() + cost
    with open(COST_FILE, "w") as file:
        file.write(f"{new_total_cost}")


def calculate_cost(usage, model="GPT-4o", verbose=0):
    """Compute the cost of one API call and add it to the running total.

    Args:
        usage (dict): mapping with 'prompt_tokens' and 'completion_tokens'
            (the 'usage' object of a chat-completions response).
        model (str): model name used to look up the per-token prices
            (matched case-insensitively).
        verbose (int): when truthy, print the cost of this call.

    Raises:
        ValueError: if no prices are known for `model`. Previously an unknown
            model failed with an UnboundLocalError on the price variables.
    """
    price_key = str(model).lower()
    if price_key not in _TOKEN_PRICES:
        known = ", ".join(sorted(_TOKEN_PRICES))
        raise ValueError(f"No pricing known for model {model!r}; known models: {known}")
    input_cost_per_token, output_cost_per_token = _TOKEN_PRICES[price_key]

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)

In [57]:
import base64
import requests

def _parse_question_list(content):
    """Turn a numbered or bulleted model reply into a list of question strings.

    Blank lines are ignored and only the leading list marker ("1.", "2)", "-",
    "*", "•") is removed, so a question that itself contains ". " is kept whole.
    If no line carries a list marker, every non-empty line is returned.
    """
    import re  # local import so this cell does not depend on another cell's imports

    marker = re.compile(r'^(?:\d+[.)]|[-*\u2022])\s+')
    lines = [line.strip() for line in content.splitlines() if line.strip()]
    marked = [marker.sub('', line, count=1).strip() for line in lines if marker.match(line)]
    return marked if marked else lines


def gpt_gen_vf_questions(row, rationale_column_name="generated_rationale", cost_verbose=0,
                         answer_column_name="predicted_answer"):
    """Ask GPT-4o for visually verifiable yes/no questions about a rationale.

    Only text is sent (question, answer, rationale); the image is not provided.

    Args:
        row: mapping / pandas row providing 'question', the answer column and
            the rationale column.
        rationale_column_name (str): column holding the rationale. Defaults to
            'generated_rationale', the rationale column of the training sheet,
            so calls that omit it keep working.
        cost_verbose (int): forwarded to calculate_cost(verbose=...).
        answer_column_name (str): column holding the answer shown to the model.

    Returns:
        list[str] | None: the generated questions, or None when the API does
        not answer with HTTP 200 (the response body is printed in that case).

    Raises:
        requests.exceptions.Timeout: if the API does not respond in time.
    """
    system_prompt = """Given a question, answer, and rationale, generate a list of all possible yes/no questions that can be answered by examining the corresponding image associated with the original question. Focus on creating questions that can be visually verified or refuted based on the details provided in the rationale. Ensure the questions are specific and directly pertain to aspects that are visually relevant and mentioned in the rationale. Avoid generating questions about elements that the rationale explicitly states are not relevant or present. Also avoid generating questions that check for the same visual detail. Questions should be verifiable with a "yes" answer rather than a "no."

Here is one example:
Input: 
Question: Why is the person wearing a helmet?
Answer: For safety
Rationale: The person is wearing a helmet because they are riding a bicycle on a busy city street. Helmets are commonly used to protect against head injuries in case of accidents, especially in areas with heavy traffic.

Good Questions:
1. Is the person wearing a helmet while riding a bicycle?
Reason: This question is directly answerable by observing whether the person on the bicycle is wearing a helmet in the image. 
2. Is the street in the image busy with traffic?
Reason: This question can be visually verified by looking at the amount of traffic on the street in the image.

Bad Questions:
1. Is the person wearing the helmet because they are concerned about head injuries?
Reason: This question is not good because it assumes the person’s intentions or concerns, which cannot be visually verified from the image.
2. Does wearing a helmet suggest that the person is highly safety-conscious?
Reason: This question relies on inference and external knowledge about the person’s mindset, rather than on observable details from the image.
3. Is the person not wearing sunglasses?
Reason: This question is not good because it asks for verification by absence and can only be answered with a "no," which is not the preferred type of question.

Respond with a list of (good) questions (without the reasons)."""

    user_input = f"""Question: {row['question']}
Answer: {row[answer_column_name]}
Rationale: {row[rationale_column_name]}"""

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}]
            },
            {
                "role": "user",
                # Text only: the image is deliberately NOT provided here.
                "content": [{"type": "text", "text": user_input}]
            }
        ],
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )

    if response.status_code != 200:
        # Print the raw body: error responses are not guaranteed to be JSON.
        print(response.text)
        return None

    body = response.json()  # parse once and reuse
    calculate_cost(body['usage'], verbose=cost_verbose)

    # The reply is expected to look like "1. ... \n 2. ... \n 3. ..."
    content = body['choices'][0]['message']['content'].strip()
    return _parse_question_list(content)

In [8]:
# One example: generate verification questions for the first row.
# gpt_gen_vf_questions needs the name of the rationale column; the call used to
# omit it and failed with a TypeError. 'generated_rationale' is the rationale
# column loaded for the training sheet (the function also reads
# 'predicted_answer', which is only loaded for that sheet).
gpt_gen_vf_questions(data.iloc[0], "generated_rationale", cost_verbose=1)

In [56]:
# Select the working frame used by the cells below.
# Note: this binds a second name to the same DataFrame object (no copy), so
# columns added to `df` later also appear on `data_combined`.
# df = data_small
df = data_combined

In [33]:
# Initialize the 'vf_questions' column if it doesn't exist
if 'vf_questions' not in df.columns:
    df['vf_questions'] = None  # object column; each cell will hold a list of questions

for idx, row in df.iterrows():
    # The rationale column must be named explicitly; 'generated_rationale' is
    # the rationale column loaded for the training sheet.
    questions = gpt_gen_vf_questions(row, "generated_rationale", cost_verbose=1)
    if questions is None:
        # The API call failed (the function already printed the response);
        # leave this row untouched instead of crashing on enumerate(None).
        print(f"Question generation failed for example {idx}; skipping.")
        print()
        continue
    # add the questions to the dataframe
    df.at[idx, 'vf_questions'] = questions
    print(f"Questions for example {idx}:")
    for i, question in enumerate(questions):
        print(f"{i+1}. {question}")
    print()

df

In [62]:
# Initialize the 'vf_questions_GPT_r' column if it doesn't exist
if 'vf_questions_GPT_r' not in df.columns:
    df['vf_questions_GPT_r'] = None  # object column; each cell will hold a list of questions

for idx, row in df.iterrows():
    # Questions are generated from the rationale stored in 'gpt-4o_rationale'.
    questions = gpt_gen_vf_questions(row, "gpt-4o_rationale", cost_verbose=1)
    if questions is None:
        # The API call failed (the function already printed the response);
        # leave this row untouched instead of crashing on enumerate(None).
        print(f"Question generation failed for example {idx}; skipping.")
        print()
        continue
    # add the questions to the dataframe
    df.at[idx, 'vf_questions_GPT_r'] = questions
    print(f"Questions for example {idx}:")
    for i, question in enumerate(questions):
        print(f"{i+1}. {question}")
    print()

df

# Use LLaVA to check vf_questions

In [34]:
# Shell escape: run `nvidia-smi` to check the GPU before the model is moved to "cuda" below.
!nvidia-smi

In [35]:
import torch
# Load model directly
from transformers import AutoProcessor, AutoModelForPreTraining, LlavaForConditionalGeneration

# Hugging Face checkpoint used for both the model and its processor.
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# No dtype is passed to from_pretrained above, so the weights are moved to the
# GPU in whatever precision from_pretrained loaded them.
model.to("cuda")

In [36]:
from PIL import Image

def inference(image, question, mode = "qa", hyperparams = None, max_new_tokens=40):
    """Run LLaVA on one image/prompt pair and return only the generated text.

    Args:
        image: image passed to the processor (a PIL image in this notebook).
        question (str): full prompt, including the "<image>" placeholder.
        mode (str): selects the beam-search settings:
            "qa"          - 5 beams, length_penalty=-1, `max_new_tokens` tokens
            "rationale"   - 5 beams, length_penalty=1.1, `max_new_tokens` tokens
            "vf_question" - 5 beams, length_penalty=-1, exactly 1 new token
        hyperparams: currently unused; kept so existing calls keep working.
        max_new_tokens (int): generation budget for "qa" and "rationale"
            (ignored for "vf_question").

    Returns:
        str: the decoded continuation, stripped of surrounding whitespace.

    Raises:
        ValueError: if `mode` is not one of the supported modes. Previously an
            unknown mode failed with an UnboundLocalError on `outputs`.
    """
    generation_settings = {
        "qa": dict(num_beams=5, length_penalty=-1, max_new_tokens=max_new_tokens),
        "rationale": dict(num_beams=5, length_penalty=1.1, max_new_tokens=max_new_tokens),
        "vf_question": dict(num_beams=5, length_penalty=-1, max_new_tokens=1),
    }
    if mode not in generation_settings:
        raise ValueError(
            f"Unknown mode {mode!r}; expected one of {sorted(generation_settings)}"
        )

    inputs = processor(text=question, images=image, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, **generation_settings[mode])

    # generate() returns the prompt tokens followed by the new tokens. Dropping
    # the prompt by length is more robust than splitting the decoded text on
    # the prompt string, which raised IndexError whenever the decoded prompt
    # did not match the original prompt character for character.
    prompt_length = inputs["input_ids"].shape[1]
    answer = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()


# Smoke test: ask LLaVA the first verification question of the first row.
sample_row = df.iloc[0]
image0 = Image.open(sample_row['image_path'])
question0 = f"<image>\nUSER:Question: {sample_row['vf_questions'][0]}. Answer with 'yes' or 'no'.\nASSISTANT:"
inference(image0, question0, mode="vf_question")


In [61]:
# Display the working frame to check which columns it currently holds.
df

In [63]:
def answer_vf_questions(data, question_column='vf_questions_GPT_r',
                        answer_column='vf_answers_LLaVA_GPT_r'):
    """Answer each row's verification questions with LLaVA.

    For every row the image is displayed, each question is wrapped in the
    LLaVA chat prompt and answered via inference(mode="vf_question"), and the
    list of answers is stored in `answer_column`.

    NOTE: a later cell defines another function with this same name (the
    GPT-based variant); whichever definition was executed last is the one used.

    Args:
        data (pd.DataFrame): needs 'image_path' and `question_column`.
            It is modified in place.
        question_column (str): column holding a list of questions per row.
            Pass 'vf_questions' to answer the other question set.
        answer_column (str): column that receives the list of answers
            (created when missing), e.g. 'vf_answers_LLaVA'.

    Returns:
        pd.DataFrame: the same `data` object, for convenience.
    """
    if answer_column not in data.columns:
        data[answer_column] = None
    for idx, row in data.iterrows():
        questions = row[question_column]
        if not isinstance(questions, (list, tuple)):
            # e.g. None because question generation failed for this row
            print(f"Image {idx}: no question list in '{question_column}'; skipping.")
            continue
        answer_list = []
        # The context manager closes the image file once the row is done.
        with Image.open(row['image_path']) as image:
            display(image)
            for i, question in enumerate(questions):
                prompt = f"<image>\nUSER:Question: {question}. Answer with 'yes' or 'no'.\nASSISTANT:"
                answer = inference(image, prompt, mode="vf_question")
                answer_list.append(answer)
                # Print image, question (as sent to the model), and answer
                print(f"Image {idx}, Question {i+1}: {prompt}\nAnswer: {answer}")
        data.at[idx, answer_column] = answer_list
    return data

# Answer the verification questions for every row of `df`. The function adds
# its answer column to the frame it receives and returns that same frame.
df = answer_vf_questions(df)
df

In [38]:
# Output to a xlsx file in the current working directory.
# index=False: the row index is not written to the sheet.
df.to_excel("data_balanced_vf_questions.xlsx", index=False)

# Use GPT to check vf_questions

In [64]:
import base64
import requests

def gpt_answer_vf_questions(question, image_path, cost_verbose=0):
    """Ask GPT-4o a yes/no verification question about an image.

    The image file is sent inline as a base64 data URL together with the
    question and an instruction to answer with a single word.

    Args:
        question (str): the yes/no question to verify against the image.
        image_path (str): path of the image file to send.
        cost_verbose (int): forwarded to calculate_cost(verbose=...).
            NOTE(review): calculate_cost is called with its default model
            while this request targets "gpt-4o-2024-08-06" - confirm the
            per-token rates match that snapshot.

    Returns:
        str | None: the model's reply with surrounding whitespace and dots
        removed (letter case is left as returned by the model), or None when
        the API does not answer with HTTP 200 (the body is printed instead).

    Raises:
        requests.exceptions.Timeout: if the API does not respond in time.
    """
    # Read the image and convert it to base64 format
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    user_input = f"""Question: {question}. Based on the information provided in the image, answer with 'yes' or 'no'. Provide one-word answer only."""

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )

    if response.status_code != 200:
        # Print the raw body: error responses are not guaranteed to be JSON.
        print(response.text)
        return None

    body = response.json()  # parse once and reuse
    calculate_cost(body['usage'], verbose=cost_verbose)
    return body['choices'][0]['message']['content'].strip().strip('.')

In [65]:
# Display the working frame to check which columns it currently holds.
df

In [68]:
from PIL import Image

def answer_vf_questions(data, question_column='vf_questions_GPT_r',
                        answer_column='vf_answers_GPT_GPT_r'):
    """Answer each row's verification questions with GPT-4o.

    For every row the image is displayed, each question is sent with the image
    through gpt_answer_vf_questions, and the list of answers is stored in
    `answer_column`.

    NOTE: an earlier cell defines another function with this same name (the
    LLaVA-based variant); whichever definition was executed last is the one used.

    Args:
        data (pd.DataFrame): needs 'image_path' and `question_column`.
            It is modified in place.
        question_column (str): column holding a list of questions per row.
            Pass 'vf_questions' to answer the other question set.
        answer_column (str): column that receives the list of answers
            (created when missing), e.g. 'vf_answers_GPT'.

    Returns:
        pd.DataFrame: the same `data` object, for convenience.
    """
    if answer_column not in data.columns:
        data[answer_column] = None
    for idx, row in data.iterrows():
        questions = row[question_column]
        if not isinstance(questions, (list, tuple)):
            # e.g. None because question generation failed for this row
            print(f"Image {idx}: no question list in '{question_column}'; skipping.")
            continue
        # The image is opened only to show it; GPT receives the file via its path.
        with Image.open(row['image_path']) as image:
            display(image)
        answer_list = []
        for i, question in enumerate(questions):
            answer = gpt_answer_vf_questions(question, row['image_path'], cost_verbose=1)
            answer_list.append(answer)
            # Print image, question, and answer
            print(f"Image {idx}, Question {i+1}: {question}\nAnswer: {answer}")
        data.at[idx, answer_column] = answer_list
    return data

# Answer the verification questions for every row of `df`. The function adds
# its answer column to the frame it receives and returns that same frame.
df = answer_vf_questions(df)
df

In [69]:
# Output to a xlsx file in the current working directory.
# The commented-out line is the file name used by the earlier export cell;
# this run writes to a separate "_GPT_r" file. index=False: the row index is
# not written to the sheet.
# df.to_excel("data_balanced_vf_questions.xlsx", index=False)
df.to_excel("data_balanced_vf_questions_GPT_r.xlsx", index=False)

In [74]:
# Analyze the agreement between vf_answers_LLaVA_GPT_r and vf_answers_GPT_GPT_r
from scipy.stats import spearmanr

# Flatten the per-row answer lists (rows that hold no list are skipped)
flat_llava_answers = [
    item
    for sublist in df['vf_answers_LLaVA_GPT_r']
    if isinstance(sublist, (list, tuple))
    for item in sublist
]
flat_gpt_answers = [
    item
    for sublist in df['vf_answers_GPT_GPT_r']
    if isinstance(sublist, (list, tuple))
    for item in sublist
]

# Both models answered the same questions, so the flattened lists must line up
# one-to-one; otherwise the pairing below would silently compare different questions.
if len(flat_llava_answers) != len(flat_gpt_answers):
    raise ValueError(
        f"Answer lists are misaligned: {len(flat_llava_answers)} LLaVA answers "
        f"vs {len(flat_gpt_answers)} GPT answers"
    )

# Map 'Yes' to 1 and 'No' to 0 for correlation calculation
mapping = {'Yes': 1, 'No': 0}


def _encode_answer(answer):
    """Return 1/0 for a yes/no answer (any letter case, optional trailing dot), else None."""
    if not isinstance(answer, str):
        return None  # e.g. None from a failed API call
    return mapping.get(answer.strip().strip('.').capitalize())


# Encode pairwise and keep only the questions where BOTH answers are a clean
# yes/no, so the two lists stay aligned (a plain mapping[answer] lookup raised
# KeyError on 'yes', 'no', None or any other reply).
encoded_pairs = [
    (_encode_answer(llava_answer), _encode_answer(gpt_answer))
    for llava_answer, gpt_answer in zip(flat_llava_answers, flat_gpt_answers)
]
valid_pairs = [
    (llava_value, gpt_value)
    for llava_value, gpt_value in encoded_pairs
    if llava_value is not None and gpt_value is not None
]
if len(valid_pairs) < len(encoded_pairs):
    print(f"Ignored {len(encoded_pairs) - len(valid_pairs)} question(s) without a clean yes/no answer from both models")

flat_llava_mapped = [llava_value for llava_value, gpt_value in valid_pairs]
flat_gpt_mapped = [gpt_value for llava_value, gpt_value in valid_pairs]

# Calculate Spearman correlation
spearman_corr, p_value = spearmanr(flat_llava_mapped, flat_gpt_mapped)

spearman_corr, p_value

In [75]:
from sklearn.metrics import cohen_kappa_score

# Calculate Cohen's kappa between the LLaVA and GPT answers, reusing the 0/1
# encoded lists built in the Spearman cell (that cell must be run first).
cohen_kappa = cohen_kappa_score(flat_llava_mapped, flat_gpt_mapped)

cohen_kappa