In [None]:
from openai import OpenAI
# Load the OpenAI API key from a local text file.
api_key_path = '../../OPENAI_key.txt'
with open(api_key_path, 'r') as file:
    # Take the first non-blank line. splitlines() + strip() also removes the trailing
    # '\r' that split('\n') would leave on a multi-line key file with CRLF line endings.
    api_key = next((line.strip() for line in file.read().splitlines() if line.strip()), '')

# Fail here with a clear message instead of sending an empty bearer token later.
if not api_key:
    raise ValueError(f"No API key found in {api_key_path}")

client = OpenAI(api_key=api_key)

In [None]:
import pandas as pd
import numpy as np

# Read data
file_path = '../results/Human Annotation of LLaVA+ Rationales.xlsx'

# NOTE: model_name is only assigned for the LLaVA workbook. Add a branch here before
# pointing file_path at another workbook; otherwise the later save cell, which passes
# model_name to save_dataframe_to_csv, fails with a NameError.
if file_path == '../results/Human Annotation of LLaVA+ Rationales.xlsx':
    model_name = "LLaVA"

# Read the specified columns from the sheet
columns_to_read = [
    'question',
    'correct_answer',
    'predicted_answer',
    'is_correct',
    'groundtruth_rationale',
    'generated_rationale',
    'gen_rationale_distinct_pieces',
]

# The LLaVA workbook is read with header=1, i.e. its column names are on the second row.
if file_path == '../results/Human Annotation of LLaVA+ Rationales.xlsx':
    data = pd.read_excel(file_path, header=1, usecols=columns_to_read)
else:
    data = pd.read_excel(file_path, usecols=columns_to_read)

# Keep only fully populated rows. dropna() does not renumber the rows, so `data`
# keeps the row labels it had in the spreadsheet.
data.dropna(inplace=True)
assert len(data) == 50, f"Expected data length of 50, but got {len(data)}"

# Load the CSV file with the new column data (NOTE: absolute, machine-specific path)
new_data = pd.read_csv('/home/<link_hidden>/<hidden>/notebooks/analysis/data_with_inform_sim/data_with_inform_sim_mask_LLaVA.csv')

# Ensure the new column exists in the new data
if 'extracted_rationale_pieces' in new_data.columns:
    extracted_pieces = new_data['extracted_rationale_pieces']
    if len(extracted_pieces) == len(data):
        # The CSV is read with a fresh 0..n-1 index while `data` keeps its spreadsheet
        # labels. Assign by position so pandas does not align on mismatched labels,
        # which would pair the wrong rows and fill the rest with NaN.
        data['gen_rationale_distinct_pieces'] = extracted_pieces.to_numpy()
    else:
        # Different row counts: fall back to label-based alignment.
        data['gen_rationale_distinct_pieces'] = extracted_pieces
    assert data['gen_rationale_distinct_pieces'].notna().all(), (
        "Some rows have no extracted rationale pieces after merging the CSV; "
        "check that the CSV rows correspond to the annotated rows."
    )
else:
    print("The column 'extracted_rationale_pieces' does not exist in the new data.")

# Image files are named after the row label of each example.
data['image_path'] = data.index.to_series().apply(lambda x: f"../results/img/{x}.jpg")

data

In [None]:
import ast

# Convert the gen_rationale_distinct_pieces column from its string form into Python lists.
# Values that are already lists are passed through, so re-running this cell is a no-op
# instead of a ValueError from ast.literal_eval; anything else is still parsed (and
# still fails loudly if it is not a valid literal, e.g. NaN).
data['gen_rationale_distinct_pieces'] = data['gen_rationale_distinct_pieces'].apply(
    lambda pieces: pieces if isinstance(pieces, list) else ast.literal_eval(pieces)
)

In [None]:
# Display the column to confirm each entry is now a Python list rather than a string
data['gen_rationale_distinct_pieces']

In [None]:
# Spot-check a single example. NOTE(review): assuming the integer row labels that
# read_excel assigns by default, 94 is looked up as a row label, not as the 95th position.
data['gen_rationale_distinct_pieces'][94]

In [None]:
import os

# File that stores the running total of API cost, read and rewritten by the helpers below.
# The path is relative, so it is resolved against the current working directory.
COST_FILE = "total_cost.txt"

def read_total_cost():
    """Return the accumulated cost stored in COST_FILE as a float.

    Returns 0.0 when the file does not exist or contains only whitespace.
    """
    if not os.path.exists(COST_FILE):
        return 0.0

    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()

    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add `cost` to the total returned by read_total_cost() and overwrite COST_FILE with the sum."""
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")
        
def calculate_cost(usage, model="GPT-4o", verbose=0):
    """Compute the dollar cost of one API call and add it to the running total.

    Args:
        usage: Mapping with 'prompt_tokens' and 'completion_tokens' counts, as found
            in the 'usage' field of a chat-completions response.
        model: Name of the pricing entry to apply, matched case-insensitively.
            Only "GPT-4o" has pricing defined.
        verbose: When truthy, print the cost of this call.

    Returns:
        The cost of this call in dollars.

    Raises:
        ValueError: If no pricing is defined for `model`. Previously this surfaced
            as an UnboundLocalError on the first use of the per-token price.
    """
    # (input, output) price in dollars per token, keyed by lower-cased model name.
    pricing = {
        "gpt-4o": (0.005 / 1000, 0.015 / 1000),
    }
    try:
        input_cost_per_token, output_cost_per_token = pricing[str(model).lower()]
    except KeyError:
        raise ValueError(
            f"No pricing defined for model {model!r}; known models: {sorted(pricing)}"
        ) from None

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

In [None]:
import base64
import requests

def _parse_fidelity_reply(content):
    """Split a model reply into ``(score, reason)``.

    The usual reply looks like ``"<score>: <reason>"`` or just ``"<score>"``. Replies
    that wrap the leading score in markdown or use another separator (for example
    ``"**1**: ..."`` or ``"-1\\n..."``) are accepted as well.

    Raises:
        ValueError: If the reply does not start with a score.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # Fast path: "<int>: <reason>" or a bare "<int>".
    head, _, tail = content.partition(': ')
    try:
        return int(head), tail
    except ValueError:
        pass

    # Fallback: optional leading punctuation, a score of -1/0/1, optional separators,
    # then the reason. (?!\w) rejects things like "10" or "1st".
    match = re.match(r'\W*?(-1|0|1)(?!\w)[\s:.*)\]-]*(.*)', content, re.DOTALL)
    if match is None:
        raise ValueError(f"Could not parse a fidelity score from the model reply: {content!r}")
    return int(match.group(1)), match.group(2).strip()


def gpt_sentence_fidelity(image_path, sentence, cost_verbose=0):
    """Ask GPT-4o whether `sentence` is visually grounded in the image at `image_path`.

    Args:
        image_path: Path of the image file; its bytes are sent base64-encoded as a
            JPEG data URL.
        sentence: The sentence to evaluate against the image.
        cost_verbose: Passed to calculate_cost as `verbose` (truthy prints the cost).

    Returns:
        A ``(score, reason)`` tuple parsed from the model reply, or None when the
        HTTP status is not 200 (the error body is printed in that case).

    Raises:
        ValueError: If the reply does not start with a score.
    """
    # Read the image and convert it to base64 format
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    # The prompt text is unchanged; it has no placeholders, so no f-string is needed.
    system_prompt = """You are an expert annotator. Given an image and a sentence, you have to evaluate whether whatever is described in the sentence is accurately happening in the image.

Respond with one of the following:

1: The sentence is talking about something that is **directly** present in the image. For example, the image is that of an airplane and the sentence mentions the airplane. 

0: The sentence mentions events or elements that may make common sense but are not directly depicted in the image. Also consists of cases where things "can happen" or "may happen", but are not directly shown in the image. For example, the image is that of an airplane, and the sentence mentions a pilot present. This makes common sense as an airplane usually has a pilot, but in this example, if the pilot is not directly visible in the image, you mark 0.

-1: The sentence describes something that is **incorrect or contrary** to the visual content of the image. There is no reasonable basis to infer the presence of these elements, and they do not align with what is shown in the image. For example, the image is that of an airplane, and the sentence mentions a car, which is clearly incorrect.

Example Evaluations:
- Image Description: You are shown an image that represents the living room of a house. There is a green sofa, a table and a lamp shown in the image.
  - Sentence: "A sofa is present." → 1 (A sofa can be directly seen in the image.)
  - Sentence: "Living room can have a TV." → 0 (General knowledge, a living room may likely have a TV, but it is not directly shown in this image.)
  - Sentence: "A red sofa is visible." → -1 (Contradicts the image content, a green sofa is present instead of a red one.)

Carefully identify the key elements in both the sentence and the image to make an accurate evaluation."""

    user_input = f"Sentence: {sentence}"

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_prompt
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        # An error body is not guaranteed to be JSON; fall back to the raw text.
        try:
            print(response.json())
        except ValueError:
            print(response.text)
        return None

    # Decode the body once and reuse it for both the usage and the reply.
    body = response.json()
    calculate_cost(body['usage'], verbose=cost_verbose)

    content = body['choices'][0]['message']['content'].strip()
    return _parse_fidelity_reply(content)

## Three examples: sentences with different visual fidelity scores

In [None]:
# Example on image 0. cost_verbose=1 prints the cost of this single call; the cell
# output is the function's return value.
image0_path = "../results/img/0.jpg"
sentence_0_1 = "He is standing on the side of the road."
gpt_sentence_fidelity(image0_path, sentence_0_1, cost_verbose=1)

In [None]:
# Example on image 3. cost_verbose=1 prints the cost of this single call; the cell
# output is the function's return value.
image3_path = "../results/img/3.jpg"
sentence_3_1 = "Airport workers have access to designated parking areas within the airport premises, which allows them to park their vehicles close to their workstations."
gpt_sentence_fidelity(image3_path, sentence_3_1, cost_verbose=1)

In [None]:
# Example on image 16. cost_verbose=1 prints the cost of this single call; the cell
# output is the function's return value.
image16_path = "../results/img/16.jpg"
sentence_16_0 = "The computer near the woman in blue is an Acer computer."
gpt_sentence_fidelity(image16_path, sentence_16_0, cost_verbose=1)

In [None]:
def evaluate_fidelity(data, image_path_column, rationale_column):
    """Score every rationale sentence of every row against that row's image.

    Args:
        data: DataFrame with one row per example; it is not modified.
        image_path_column: Name of the column holding the image file path.
        rationale_column: Name of the column holding an iterable of sentences.

    Returns:
        A copy of `data` with two added columns, 'sentence_fidelity_scores' and
        'sentence_fidelity_reasons'. Each holds one list per row with one entry per
        sentence. A sentence whose request failed (gpt_sentence_fidelity returned
        None) is recorded as score None with an empty reason.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    data_copy = data.copy()
    score_results = []
    reason_results = []
    for index, row in data_copy.iterrows():
        image_path = row[image_path_column]
        fidelity_scores = []
        fidelity_reasons = []
        for sentence in row[rationale_column]:
            result = gpt_sentence_fidelity(image_path, sentence)
            if result is None:
                # Unpacking None would raise a TypeError and throw away every result
                # collected so far; record a placeholder and keep going instead.
                print(f"Fidelity request failed for row {index}: {sentence!r}")
                score, reason = None, ''
            else:
                score, reason = result
            fidelity_scores.append(score)
            fidelity_reasons.append(reason)
        score_results.append(fidelity_scores)
        reason_results.append(fidelity_reasons)

    # Add the results as new columns to the DataFrame using .loc
    data_copy.loc[:, 'sentence_fidelity_scores'] = score_results
    data_copy.loc[:, 'sentence_fidelity_reasons'] = reason_results
    return data_copy

In [None]:
# Run the automated scoring. evaluate_fidelity calls gpt_sentence_fidelity, and therefore
# issues one API request, for every sentence in 'gen_rationale_distinct_pieces'.
data_with_sentence_VF = evaluate_fidelity(data, 'image_path', 'gen_rationale_distinct_pieces')
data_with_sentence_VF

In [None]:
import os
def save_dataframe_to_csv(dataframe, model_name):
    """Write `dataframe` to data_with_sentence_VF/data_with_sentence_VF_<model_name>.csv.

    Args:
        dataframe: DataFrame to save; the index is not written.
        model_name: Suffix used in the file name.

    The target directory is relative to the current working directory and is
    created when missing. An existing file with the same name is overwritten.
    """
    # Define the directory and filename
    directory = "data_with_sentence_VF"
    filename = f"data_with_sentence_VF_{model_name}.csv"
    # exist_ok=True replaces the separate exists() check, so there is no window
    # between checking for the directory and creating it.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, filename)
    dataframe.to_csv(file_path, index=False)
    print(f"Dataframe successfully saved to {file_path}.")

# Persist the automated scores; model_name is set in the data-loading cell.
save_dataframe_to_csv(data_with_sentence_VF, model_name)

# Analysis

In [None]:
import pandas as pd

# Reload the saved automated scores from data_with_sentence_VF/data_with_sentence_VF_LLaVA.csv.
# This rebinds `data`. The list-valued columns come back from the CSV as strings and are
# parsed with ast.literal_eval in the cells below.
data = pd.read_csv('data_with_sentence_VF/data_with_sentence_VF_LLaVA.csv')
data

In [None]:
# # Select a specific column
# column_to_export = data[['sentence_fidelity_scores', 'sentence_fidelity_reasons']]

# # Export the selected column to an Excel file
# column_to_export.to_excel('temp.xlsx', index=False)

In [None]:
# Data merge with the human annotations
human_annotated_s_VF = pd.read_excel('../results/Human Annotation of LLaVA+ Rationales.xlsx', header=1, usecols=['Sentence-wise Visual Fidelity']).dropna()
# Rows are paired by position, so both frames must have the same number of rows.
# NOTE(review): this dropna() looks at the human-score column only, whereas the saved
# data was filtered on other columns - confirm both select the same rows.
assert len(human_annotated_s_VF) == len(data), (
    f"Expected {len(data)} human-annotated rows, but got {len(human_annotated_s_VF)}"
)
# Reindex the human_annotated_s_VF DataFrame to match the data DataFrame
human_annotated_s_VF.index = data.index
# Write straight to the final column name and rename only the automated column, so
# re-running this cell (or reloading a CSV that already contains the renamed columns)
# does not create a second "(Human)" column.
data['Sentence-wise Visual Fidelity (Human)'] = human_annotated_s_VF['Sentence-wise Visual Fidelity']
data.rename(columns={'sentence_fidelity_scores': 'Sentence-wise Visual Fidelity (Automated)'}, inplace=True)
data

In [None]:
import numpy as np
import ast

# Turn the textual form of a list into an actual Python list
def parse_list_string(list_str):
    """Evaluate `list_str` as a Python literal with ast.literal_eval and return the result."""
    parsed = ast.literal_eval(list_str)
    return parsed

# Per-example VF score: the mean of the sentence-level scores
def calculate_vf_score(row, col_name):
    """Return the mean of the score list stored as a string in row[col_name].

    An empty list gives 0.
    """
    scores = parse_list_string(row[col_name])
    if len(scores) > 0:
        return sum(scores) / len(scores)
    return 0

# Add one averaged VF score column per annotation source (row-wise apply)
data["VF Score (Automated)"] = data.apply(
    lambda row: calculate_vf_score(row, "Sentence-wise Visual Fidelity (Automated)"),
    axis=1,
)
data["VF Score (Human)"] = data.apply(
    lambda row: calculate_vf_score(row, "Sentence-wise Visual Fidelity (Human)"),
    axis=1,
)

data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import FuncFormatter

# Calculate the correlation between the automated and human per-example VF scores
# (Series.corr computes the Pearson correlation unless another method is passed)
correlation = data["VF Score (Automated)"].corr(data["VF Score (Human)"])
print(f"Correlation between Automated and Human VF Scores: {correlation:.4f}")

# Helper that perturbs values slightly so overlapping points can be told apart
def add_jitter(arr, jitter_amount=0):
    """Return `arr` plus uniform noise from [-jitter_amount, jitter_amount), one draw per element.

    With the default jitter_amount=0 the noise is all zeros, so the values are unchanged.
    """
    noise = np.random.uniform(low=-jitter_amount, high=jitter_amount, size=arr.shape)
    return arr + noise

# Jitter both score series (no visible effect while add_jitter's default jitter_amount is 0)
x_jittered = add_jitter(data["VF Score (Automated)"])
y_jittered = add_jitter(data["VF Score (Human)"])

# Scatter the per-example scores on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_jittered, y_jittered, color='blue', alpha=0.3)

# Overlay the best-fit line, computed from the un-jittered scores
sns.regplot(
    x=data["VF Score (Automated)"],
    y=data["VF Score (Human)"],
    scatter=False,
    color='red',
    ax=ax,
)

# Show x-ticks with two decimals
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{x:.2f}'))

ax.set_title("Automated vs. Human Visual Fidelity Scores")
ax.set_xlabel("VF Score (Automated)")
ax.set_ylabel("VF Score (Human)")
ax.grid(True)
plt.show()

In [None]:
import ast

# Parse the per-example score lists (stored as strings) for both annotation sources
tmp_df1 = pd.DataFrame()
tmp_df1['Sentence-wise Visual Fidelity (Human)'] = [ast.literal_eval(x) for x in data['Sentence-wise Visual Fidelity (Human)']]
tmp_df1['Sentence-wise Visual Fidelity (Automated)'] = [ast.literal_eval(x) for x in data['Sentence-wise Visual Fidelity (Automated)']]

# After flattening, the i-th human score is paired with the i-th automated score, so
# every example must have the same number of scores from both sources.
length_mismatch = (
    tmp_df1['Sentence-wise Visual Fidelity (Human)'].map(len)
    != tmp_df1['Sentence-wise Visual Fidelity (Automated)'].map(len)
)
if length_mismatch.any():
    raise ValueError(
        "Human and automated score lists differ in length for rows "
        f"{list(tmp_df1.index[length_mismatch])}"
    )

# Flatten the lists. This previously read from an undefined `tmp_df`, which raised
# a NameError on a fresh kernel.
flattened_sVF_scores = pd.DataFrame({
    'Sentence-wise Visual Fidelity (Human)': [
        item for sublist in tmp_df1['Sentence-wise Visual Fidelity (Human)'] for item in sublist
    ],
    'Sentence-wise Visual Fidelity (Automated)': [
        item for sublist in tmp_df1['Sentence-wise Visual Fidelity (Automated)'] for item in sublist
    ],
})

# Calculate the correlation between the two flattened columns
correlation = flattened_sVF_scores.corr().iloc[0, 1]
correlation

In [None]:
# Inspect the flattened sentence-level scores (one row per sentence)
flattened_sVF_scores

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix over sentence-level labels: human scores as rows, automated scores as columns
conf_matrix = confusion_matrix(
    flattened_sVF_scores['Sentence-wise Visual Fidelity (Human)'],
    flattened_sVF_scores['Sentence-wise Visual Fidelity (Automated)'],
    labels=[-1, 0, 1],
)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=['-1', '0', '1'],
    yticklabels=['-1', '0', '1'],
    ax=ax,
)
ax.set_xlabel('Automated Scores')
ax.set_ylabel('Human Scores')
ax.set_title('Confusion Matrix: Sentence-wise Visual Fidelity (Human vs Automated)')
plt.show()

In [None]:
# Flag examples in which at least one sentence received a VF subscore of -1
def calculate_vf_mismatch(row, col_name):
    """Return True when the score list stored as a string in row[col_name] contains a -1."""
    return any(score == -1 for score in parse_list_string(row[col_name]))

# Share of examples with at least one -1 subscore, for the automated and the human scores
automated_vf_mismatch = data.apply(
    lambda row: calculate_vf_mismatch(row, "Sentence-wise Visual Fidelity (Automated)"),
    axis=1,
)
human_vf_mismatch = data.apply(
    lambda row: calculate_vf_mismatch(row, "Sentence-wise Visual Fidelity (Human)"),
    axis=1,
)
print(f"Automated VF Mismatch: {automated_vf_mismatch.mean() * 100}%")
print(f"Human VF Mismatch: {human_vf_mismatch.mean() * 100}%")

In [None]:
# Average the per-example VF scores over all examples, separately for each source
automated_vf_avg = data["VF Score (Automated)"].mean()
human_vf_avg = data["VF Score (Human)"].mean()
print(f"Automated VF Average: {automated_vf_avg:.3f}")
print(f"Human VF Average: {human_vf_avg:.3f}")

In [None]:
import os
def save_dataframe_to_csv(dataframe, model_name):
    """Write `dataframe` to data_with_sentence_VF/data_with_sentence_VF_<model_name>.csv.

    Args:
        dataframe: DataFrame to save; the index is not written.
        model_name: Suffix used in the file name.

    The target directory is relative to the current working directory and is
    created when missing. An existing file with the same name is overwritten.
    """
    # Define the directory and filename
    directory = "data_with_sentence_VF"
    filename = f"data_with_sentence_VF_{model_name}.csv"
    # exist_ok=True replaces the separate exists() check, so there is no window
    # between checking for the directory and creating it.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, filename)
    dataframe.to_csv(file_path, index=False)
    print(f"Dataframe successfully saved to {file_path}.")

# NOTE: this writes to data_with_sentence_VF/data_with_sentence_VF_LLaVA.csv, the same file
# the Analysis section loads at its start, so the saved automated scores are replaced by
# this frame with its renamed and added analysis columns.
save_dataframe_to_csv(data, "LLaVA")