1. Generate dense image captions, or use GPT to get a detailed description of the image
    DenseCap https://google.github.io/localized-narratives/ 
2. Split the description into individual pieces of information, append each piece to the rationale, and see whether the entailment probability changes
3. Find examples where the rationale is incomplete, and check whether simulatability itself captures the lack of completeness

In [None]:
# TODO: Use Dense Image caption to replace GPT

In [None]:
import os
from openai import OpenAI

# Load the OpenAI API key from a text file located two directories above the
# working directory; only the first line of the stripped content is used.
api_key_path = '../../OPENAI_key.txt'
with open(api_key_path, 'r') as file:
    api_key = file.read().strip().split('\n')[0]
    
# SDK client built from the same key.
# NOTE(review): the request helpers visible in this notebook call the REST
# endpoint through `requests` and never touch `client` — confirm it is needed.
client = OpenAI(api_key=api_key)

# Path (relative to the working directory) of the text file that holds the
# running total of API cost read by read_total_cost / updated by write_total_cost.
COST_FILE = "total_cost.txt"

def read_total_cost():
    """Return the accumulated cost stored in ``COST_FILE``.

    Returns 0.0 when the file does not exist or contains only whitespace;
    otherwise the stripped file content parsed as a float.
    """
    if not os.path.exists(COST_FILE):
        return 0.0

    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()

    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add ``cost`` to the stored total and overwrite ``COST_FILE`` with the sum."""
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")
        
def calculate_cost(usage, model, verbose=0):
    """Compute the cost of one API call and add it to the running total.

    Args:
        usage: Mapping with ``'prompt_tokens'`` and ``'completion_tokens'``
            counts (the ``usage`` entry of a chat-completions response).
        model: Model identifier; must be one of the models priced below.
        verbose: When truthy, print the cost of this call.

    Returns:
        The cost of this call, in the same unit as the price table (dollars).

    Raises:
        ValueError: If ``model`` has no entry in the price table. Raised
            before anything is written, so the stored total is not touched.
    """
    # (input, output) price in dollars per 1,000 tokens.
    prices_per_1k_tokens = {
        "gpt-4o-2024-05-13": (0.005, 0.015),
        "gpt-4o-2024-08-06": (0.0025, 0.010),
        "gpt-4o-mini-2024-07-18": (0.00015, 0.00060),
    }
    if model not in prices_per_1k_tokens:
        known = ", ".join(sorted(prices_per_1k_tokens))
        raise ValueError(f"No pricing known for model {model!r}; known models: {known}")

    input_per_1k, output_per_1k = prices_per_1k_tokens[model]
    input_cost_per_token = input_per_1k / 1000
    output_cost_per_token = output_per_1k / 1000

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

In [None]:
import base64
import requests

def _parse_numbered_descriptions(content):
    """Turn a numbered reply ("1. ...", "2. ...", one item per line) into a list of strings."""
    descriptions = []
    for line in content.split('\n'):
        line = line.strip()
        if not line:
            # Blank separator lines carry no description.
            continue
        number, separator, text = line.partition('. ')
        # Strip the prefix only when it really is an item number, and keep
        # everything after it, so descriptions containing ". " stay whole.
        if separator and number.isdigit() and text.strip():
            line = text.strip()
        descriptions.append(line)
    return descriptions


def gpt_generate_dense_captions(image_path, cost_verbose=0):
    """Ask GPT-4o for a numbered list of detailed descriptions of an image.

    Args:
        image_path: Path of the image file. Its bytes are sent base64-encoded
            inside a ``data:image/jpeg`` URL.
        cost_verbose: Forwarded to ``calculate_cost`` as ``verbose``.

    Returns:
        A list of description strings with the item numbering removed, or
        ``None`` when the API answers with a status code other than 200.
        Lines that carry no "N. " prefix are kept unchanged as list items.
    """
    model_name = "gpt-4o-2024-08-06"
    # Read the image and convert it to base64 format
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    user_input = f"""Generate a list of detailed descriptions that covers every visible detail from the image. Respond in the form of "1. ... \n 2. ... \n 3. ...". """

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                        },
                    },
                ],
            }
        ],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        print(response.json())
        return None

    # Decode the JSON body once and reuse it for both usage and content.
    body = response.json()
    calculate_cost(body['usage'], model_name, verbose=cost_verbose)

    content = body['choices'][0]['message']['content'].strip()
    return _parse_numbered_descriptions(content)

# Load the data

In [None]:
import pandas as pd
import numpy as np

# Read the spreadsheet, drop every row that contains a missing value, and
# renumber the rows. reset_index() is called without drop=True, so the
# previous row labels are kept in a new "index" column.
data = pd.read_excel("data_vf_questions_val_set_GPT_r.xlsx").dropna().reset_index()

# Display the loaded table.
data

In [None]:
from tqdm import tqdm
# Register tqdm with pandas so DataFrames gain progress_apply.
tqdm.pandas()

# Work on a copy so the loaded table in `data` stays untouched.
df = data.copy()
# Request dense captions for the image of every row (one API call per row).
# The helper returns None for a row when the API answers with a non-200 status.
df['dense_captions'] = df.progress_apply(lambda row: gpt_generate_dense_captions(row['image_path']), axis=1)
# Display the result.
df

In [None]:
# Persist the captioned table (without the DataFrame index) so the API calls
# do not have to be repeated; a later cell reloads this file.
df.to_csv('data_vf_q_val_GPT_r_dense_captions.csv', index=False)

# Evaluate Simulatability

In [None]:
import pandas as pd
import numpy as np

# Reload the captioned table from disk; `data` keeps the values exactly as read,
# `df` is the working copy that later cells extend with new columns.
data = pd.read_csv('data_vf_q_val_GPT_r_dense_captions.csv')
df = data.copy()

# Display the loaded table.
data

In [None]:
import re
def generate_mask(generated_rationale, predicted_answer):
    """Replace every occurrence of the predicted answer in a rationale with ``<mask>``.

    Matching is case-insensitive and restricted to whole occurrences: a match
    must not be directly preceded or followed by a word character. Lookarounds
    are used instead of ``\\b`` so that answers beginning or ending with a
    non-word character (for example ``"$5"``) can still be matched.

    Args:
        generated_rationale: Text in which the answer should be hidden.
        predicted_answer: Answer to hide; converted with ``str()`` and stripped
            of surrounding whitespace.

    Returns:
        The rationale with each match replaced by ``<mask>``. The rationale is
        returned unchanged when the answer is empty, because an empty pattern
        would otherwise match at every word boundary.
    """
    answer_text = str(predicted_answer).strip()
    if not answer_text:
        return generated_rationale
    pattern = re.compile(r'(?<!\w)' + re.escape(answer_text) + r'(?!\w)', re.IGNORECASE)
    return pattern.sub("<mask>", generated_rationale)

# Mask each row's answer (one_step_answer) wherever it appears in that row's
# rationale (one_step_rationale).
df['rationale_mask'] = df.apply(lambda row: generate_mask(row['one_step_rationale'], row['one_step_answer']), axis=1)
# Display the result.
df

In [None]:
import base64
import requests

def gpt_gen_hypothesis(question, predicted_ans, cost_verbose=0):
    """Merge a question and its answer into one declarative sentence via GPT.

    Sends a single text-only user message to the chat-completions endpoint,
    records the call's cost through ``calculate_cost`` and returns the
    stripped reply text. Prints the error body and returns ``None`` when the
    status code is not 200.
    """
    model_name = "gpt-4o-mini-2024-07-18"
    user_input = f"""Integrate the question and the answer into one sentence.
For example, given the question "What is the man waiting for?" and the answer "taxi", you should output "The man is waiting for taxi."

Question: {question}
Answer: {predicted_ans}
"""

    text_part = {"type": "text", "text": user_input}
    user_message = {"role": "user", "content": [text_part]}
    request_body = {"model": model_name, "messages": [user_message]}
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    reply = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=request_headers,
        json=request_body,
    )

    # Guard clause: report the failure and give up on this row.
    if reply.status_code != 200:
        print(reply.json())
        return None

    calculate_cost(reply.json()['usage'], model_name, verbose=cost_verbose)
    first_choice = reply.json()['choices'][0]
    return first_choice['message']['content'].strip()

# Build one hypothesis sentence per row from its question and answer
# (one GPT request per row; a request that fails yields None for that row).
df['hypothesis'] = df.apply(lambda row: gpt_gen_hypothesis(row["question"], row["one_step_answer"]), axis=1)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the tokenizer and the NLI model. The `globals()` guards below are
# commented out, so both are loaded again every time this cell runs.
# NOTE(review): the tokenizer comes from google/flan-t5-xxl while the weights
# come from the verifier checkpoint — presumably they share a vocabulary; confirm.
# NOTE(review): device_map is a model-placement argument; confirm it has any
# effect when passed to the tokenizer.
# if 'nli_tokenizer' not in globals():
nli_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl", device_map="cuda:9")
# if 'nli_model' not in globals():
# The model is loaded with 8-bit weights and placed on GPU index 9.
nli_model = AutoModelForSeq2SeqLM.from_pretrained("soumyasanyal/nli-entailment-verifier-xxl", load_in_8bit=True, device_map="cuda:9")

def calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer):
    """Return the probability that the premise supports the hypothesis.

    The model is prompted with a yes/no question and a single decoding step is
    scored: the logits of the first token of "Yes" and of "No" are compared
    with a two-way softmax.

    Args:
        premise: Premise text (here: a rationale, possibly masked).
        hypothesis: Hypothesis sentence to verify against the premise.
        nli_model: Seq2seq model used for scoring; defaults to the module-level one.
        nli_tokenizer: Tokenizer matching ``nli_model``.

    Returns:
        The softmax weight of "Yes" versus "No" as a Python float.
    """
    def get_score(nli_model, nli_tokenizer, input_ids):
        # First token id of each answer word.
        pos_id = nli_tokenizer('Yes').input_ids[0]
        neg_id = nli_tokenizer('No').input_ids[0]

        # The tokenizer returns CPU tensors; move every input to the device
        # that holds the model so the forward pass does not mix devices.
        device = nli_model.device
        input_ids = input_ids.to(device)
        # One decoder step seeded with token id 0
        # (presumably the decoder start token of this model — TODO confirm).
        decoder_input_ids = torch.zeros((input_ids.size(0), 1), dtype=torch.long, device=device)

        with torch.no_grad():
            logits = nli_model(input_ids, decoder_input_ids=decoder_input_ids).logits
            posneg_logits = torch.stack([logits[:, 0, pos_id], logits[:, 0, neg_id]], dim=1)

            # Cast to float before applying softmax
            scores = torch.nn.functional.softmax(posneg_logits.float(), dim=1)
            # .item() requires a batch of exactly one prompt.
            entail_score = scores[:, 0].item()
            no_entail_score = scores[:, 1].item()

        return entail_score, no_entail_score

    prompt = f"Premise: {premise}\nHypothesis: {hypothesis}\nGiven the premise, is the hypothesis correct?\nAnswer:"
    input_ids = nli_tokenizer(prompt, return_tensors='pt').input_ids
    return get_score(nli_model, nli_tokenizer, input_ids)[0]

# def evaluate_support(data, nli_model, nli_tokenizer, use_mask=True, use_pieces=False, hypothesis_col='hypothesis', rationale_col='rationale_mask', threshold=0.5):
#     support_scores = []
#     for idx, row in data.iterrows():
#         if use_pieces:
#             premise = row['concat_rationale_pieces_mask'] if use_mask else row['concat_rationale_pieces']
#         else: 
#             premise = row['gen_rationale_mask'] if use_mask else row['generated_rationale']
        
#         hypothesis = row[hypothesis_col]
#         entail_prob = calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
#         support = entail_prob > threshold 
#         if entail_prob < threshold:
#             print(f"Premise: {premise}")
#             print(f"Hypothesis: {hypothesis}")
#             print(f"Probability: {entail_prob}")
#         support_scores.append({
#             'entail_prob': entail_prob,
#             'support': support
#         })
#     return support_scores

# Baseline score: support probability of each hypothesis given only the masked rationale.
df['support_score_original'] = df.apply(lambda row: calc_support_prob(row['rationale_mask'], row['hypothesis']), axis=1)

In [None]:
import ast

# Reset the caption column to the values held in `data` (as read from the CSV)
# before it is parsed below.
df['dense_captions'] = data['dense_captions']

def safe_literal_eval(sentence):
    """Parse a stringified list of captions, always returning a list.

    Args:
        sentence: Cell value to parse — normally the text of a Python list
            literal, but possibly free text, an existing list, ``None`` or NaN.

    Returns:
        * the parsed list when ``sentence`` is the text of a list literal;
        * ``sentence`` itself when it already is a list;
        * an empty list for missing values (``None`` / NaN), so callers that
          iterate over the result simply see no captions;
        * ``[sentence]`` otherwise (free text, or a literal that is not a list).
    """
    if isinstance(sentence, list):
        return sentence
    # NaN is the only float that is not equal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []
    try:
        # Try to convert the string to a Python literal (e.g., list)
        parsed = ast.literal_eval(sentence)
    except (ValueError, SyntaxError, TypeError):
        # Not a literal at all: treat the whole text as a single caption.
        return [sentence]
    # A literal that is not a list (e.g. the text "42") is still one caption.
    return parsed if isinstance(parsed, list) else [sentence]

# Convert every caption cell from its text form into a Python object
# via safe_literal_eval.
df['dense_captions'] = df['dense_captions'].apply(safe_literal_eval)

# Sanity check: print the captions of the first row, one per line.
for sentence in df['dense_captions'][0]:
    print(sentence)

In [None]:
from tqdm import tqdm
# Append the captions sentence by sentence to rationale_mask, see the new support score

def append_captions_and_calculate(row, separator=" "):
    """Score the hypothesis once per dense caption, with that caption appended to the rationale.

    For every caption in ``row['dense_captions']`` the row's answer is masked
    inside the caption, the masked caption is appended to
    ``row['rationale_mask']`` and the support probability of
    ``row['hypothesis']`` given that combined premise is computed.

    Args:
        row: Mapping / DataFrame row with the keys ``'dense_captions'``,
            ``'one_step_answer'``, ``'rationale_mask'`` and ``'hypothesis'``.
        separator: Text placed between the rationale and the caption so the
            last word of one does not fuse with the first word of the other.

    Returns:
        A list of support probabilities, one per caption, in caption order.
    """
    score_list = []
    for caption in row['dense_captions']:
        # Hide the answer inside the caption as well, so it cannot leak.
        caption_masked = generate_mask(caption, row['one_step_answer'])
        rationale_concatted = row['rationale_mask'] + separator + caption_masked
        score_list.append(calc_support_prob(rationale_concatted, row['hypothesis']))
    return score_list

# Enable progress_apply with a labelled progress bar over the rows.
tqdm.pandas(desc="Processing rows")

# Each cell receives the list returned by append_captions_and_calculate:
# one support score per dense caption of that row.
df['support_score_after_concat'] = df.progress_apply(append_captions_and_calculate, axis=1)

In [None]:
# Display the scored table.
df

In [None]:
# Save the scored table (without the DataFrame index); a later cell reloads
# this file and parses the score lists back from text.
df.to_csv('data_vf_q_val_GPT_r_dense_captions_support_concat.csv', index=False)

In [None]:
# Display the table that was just saved.
df

In [None]:
# Add another column to df which tells me which indices of the list in support_score_after_concat has  > 0.1 simulatability increase
# Function to check if simulatability increase > 0.1
def find_indices_with_increase(row, threshold=0.1):
    """Return the positions of the captions that raised the support score noticeably.

    Args:
        row: Mapping / DataFrame row with ``"support_score_original"`` (a
            number) and ``"support_score_after_concat"`` (an iterable of
            numbers, one per caption).
        threshold: Minimum increase over the original score; the comparison
            is strict (``score - original > threshold``).

    Returns:
        A list of 0-based indices into ``row["support_score_after_concat"]``
        whose score exceeds the original by more than ``threshold``.
    """
    original = row["support_score_original"]
    return [
        i
        for i, score in enumerate(row["support_score_after_concat"])
        if score - original > threshold
    ]

# For each row, store the positions of the captions whose score exceeded the
# original score by more than 0.1.
# NOTE(review): in file order this column is added after the CSV export in an
# earlier cell, and a later cell reloads df from that CSV, so the column is not
# part of the reloaded table — confirm this is intended.
df["simulatability_increase_indices"] = df.apply(find_indices_with_increase, axis=1)

In [None]:
import pandas as pd

# Reload the scored table from disk; list-valued columns come back as text.
df = pd.read_csv('data_vf_q_val_GPT_r_dense_captions_support_concat.csv')
# Keep an untouched copy of the table as read.
data = df.copy()
# Display the reloaded table.
df

In [None]:
import ast
# Parse each score cell from its text form back into a Python literal.
df["support_score_after_concat"] = df["support_score_after_concat"].apply(ast.literal_eval)

In [None]:
# Round every score in each list to three decimals.
df["support_score_after_concat"] = df["support_score_after_concat"].apply(lambda x: [round(num, 3) for num in x])

In [None]:
# Display the table with rounded scores.
df

In [None]:
# Export the table to an Excel workbook (without the DataFrame index).
# `output_path` is reused by the image-embedding cell further down.
output_path = "val_qa_with_images.xlsx"
df.to_excel(output_path, index=False)

In [None]:
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as ExcelImage
import os

# Write the table first (without images) so openpyxl has a workbook to modify.
df.to_excel(output_path, index=False)

# Load the saved excel file
wb = load_workbook(output_path)
ws = wb.active

# Column that receives the pictures.
# NOTE(review): assumed to lie to the right of the data columns — confirm
# against the number of columns in df.
image_column = "V"

# Anchor each row's image on the sheet row that holds its data. Sheet row 1 is
# the header, so the n-th DataFrame row (0-based) sits on sheet row n + 2. The
# position is counted with enumerate instead of being taken from the index
# label, so it stays correct even when df does not have a default 0..n-1 index.
for position, img_path in enumerate(df['image_path']):
    # Skip missing values (non-string cells) and files that are not on disk.
    if isinstance(img_path, str) and os.path.exists(img_path):
        img = ExcelImage(img_path)
        img_cell = f"{image_column}{position + 2}"
        ws.add_image(img, img_cell)

# Save the updated excel file with images (overwrites the file written above)
output_path_with_images = output_path
wb.save(output_path_with_images)

output_path_with_images
