In this notebook, we analyze the generated rationales using text-only metrics and our proposed text/vision metrics.

In [None]:
%pip install openpyxl

In [None]:
# Load the xlsx sheet
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

file_path = '../results/rationales.xlsx'

# Load all sheets into a dictionary ({sheet name: DataFrame}); sheet_name=None means "every sheet"
sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Refactor image paths and ensure column types
for sheet_name, df in sheets_dict.items():
    # Update image paths.
    # regex=False makes this a literal substring replacement on every pandas version
    # (with regex matching, the leading '.' would match any character).
    # NOTE(review): the scoring cell further down rewrites to '../results/img/validation/'
    # (two dots) — confirm which prefix is the intended one.
    df['image_path'] = df['image_path'].str.replace('./images/', './results/img/validation/', regex=False)

    # Ensure specific columns are strings (missing cells become the literal text 'nan')
    for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
        if col in df.columns:
            df[col] = df[col].astype(str)

    # Print sample rows to verify
    print(f"Sheet Name: {sheet_name}")
    print(df.head())
    print()

# Extract a small sheet for testing
mini_sheet = sheets_dict['LLaVA-1.5 with image'].head()
mini_sheet

# 1. Informativeness, Simulatability, Contrastivity

## 1.1 Hypothesis, Alternative Hypotheses

In [None]:
import ast
import base64
import requests
import openai
from tqdm import tqdm

# Load the OpenAI API key, read from ../OPENAI_key.txt first line
# (only the first line is read; surrounding whitespace is stripped)
with open('../OPENAI_key.txt', 'r') as file:
    api_key = file.readline().strip()

# Initialize the OpenAI API client
# (sets the key on the `openai` module; the `requests`-based helpers in this
# notebook send `api_key` in their own Authorization header instead)
openai.api_key = api_key

# Define the file to store the total cost
# (plain-text file holding a single running total, maintained by
# read_total_cost / write_total_cost)
COST_FILE = "../total_cost.txt"

def read_total_cost():
    """Return the running total stored in COST_FILE as a float.

    Yields 0.0 when the file does not exist or contains only whitespace.
    """
    if not os.path.exists(COST_FILE):
        return 0.0
    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()
    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add `cost` to the running total and rewrite COST_FILE with the new sum.

    Despite the name, this accumulates: the value written is the previous
    total (from read_total_cost) plus `cost`.
    """
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")

def calculate_cost(usage, model, verbose=0):
    """Price one chat-completion call and add it to the running total.

    Parameters:
    - usage: mapping providing usage['prompt_tokens'] and usage['completion_tokens'].
    - model (str): model snapshot name; must be one of the keys of the price table below.
    - verbose: when truthy, print the cost of this call.

    Returns:
    - cost (float): cost of this call in dollars (also accumulated via write_total_cost).

    Raises:
    - ValueError: if `model` has no price entry. Previously an unknown model fell
      through the if/elif chain and failed later with an UnboundLocalError.
    """
    # (input price, output price) in dollars per 1000 tokens
    price_per_1k_tokens = {
        "gpt-4o-2024-05-13": (0.005, 0.015),
        "gpt-4o-2024-08-06": (0.0025, 0.010),
        "gpt-4o-mini-2024-07-18": (0.00015, 0.00060),
    }
    # Validate before touching the cost file so a bad model name has no side effects
    if model not in price_per_1k_tokens:
        raise ValueError(
            f"No pricing configured for model {model!r}; "
            f"known models: {sorted(price_per_1k_tokens)}"
        )
    input_price, output_price = price_per_1k_tokens[model]
    input_cost_per_token = input_price / 1000
    output_cost_per_token = output_price / 1000

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

def gpt_gen_hypothesis(question, predicted_ans, cost_verbose=0):
    """Ask the chat-completions endpoint to merge a question and an answer into one sentence.

    Parameters:
    - question: question text inserted into the prompt.
    - predicted_ans: answer text inserted into the prompt.
    - cost_verbose: forwarded to calculate_cost as `verbose`.

    Returns:
    - The stripped text of the first choice, or None (after printing the
      response body) when the HTTP status is not 200.
    """
    model_name = "gpt-4o-mini-2024-07-18"
    user_input = f"""Integrate the question and the answer into one sentence.
For example, given the question "What is the man waiting for?" and the answer "taxi", you should output "The man is waiting for taxi."

Question: {question}
Answer: {predicted_ans}
"""

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    text_part = {"type": "text", "text": user_input}
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": [text_part]}],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    # Guard clause: report the failure and give up on this item
    if response.status_code != 200:
        print(response.json())
        return None

    body = response.json()
    calculate_cost(body['usage'], model_name, verbose=cost_verbose)
    return body['choices'][0]['message']['content'].strip()
    
def generate_alternative_hypotheses(question, other_answers):
    """Return one hypothesis per alternative answer, in the order given.

    Each entry is whatever gpt_gen_hypothesis returns for (question, answer).
    """
    alt_hypotheses = []
    for candidate in other_answers:
        alt_hypotheses.append(gpt_gen_hypothesis(question, candidate))
    return alt_hypotheses

def extract_distinct_rationale_pieces(hypothesis, rationale):
    """Split a rationale into pieces that are not equivalent to the hypothesis.

    Sends one prompt to the chat model and parses the Python list literal found
    in the reply.

    Parameters:
    - hypothesis: hypothesis sentence inserted into the prompt.
    - rationale: rationale text inserted into the prompt.

    Returns:
    - The object obtained by `ast.literal_eval` on the bracketed part of the reply
      (expected to be a list of strings).

    Raises:
    - ValueError: if the reply contains no '[' ... ']' span.
    - SyntaxError / ValueError: if the bracketed span is not a valid literal.

    This now uses the client-style API (`openai.Client(...).chat.completions.create`),
    matching the later cell of this notebook; the module-level
    `openai.ChatCompletion.create` used before is not available in openai>=1.0.
    """
    prompt=f"""Please break the following rationale into distinct pieces, and keep only the ones that are not semantically equivalent to the hypothesis. Output the final answer in a Python list format.

Example:
Hypothesis: The man by the bags is waiting for a delivery.
Rationale: The man by the bags is waiting for a delivery, as indicated by the presence of the suitcases and the fact that he is standing on the side of the road. The other options, such as a skateboarder, train, or cab, do not seem to be relevant to the situation depicted in the image.
Output: ["Suitcases are present in the image.", "The man is standing on the side of the road.", "The other options, such as a skateboarder, train, or cab, do not seem to be relevant to the situation depicted in the image."]

Task:
Hypothesis: {hypothesis}
Rationale: {rationale}"""

    model_name = "gpt-4o-2024-08-06"
    chat_client = openai.Client(api_key=api_key)
    response = chat_client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # calculate_cost indexes `usage` like a dict, so copy the two counters it needs
    token_usage = response.usage
    calculate_cost(
        {
            "prompt_tokens": token_usage.prompt_tokens,
            "completion_tokens": token_usage.completion_tokens,
        },
        model_name,
    )
    message = response.choices[0].message.content
    start_index = message.find('[')
    end_index = message.rfind(']')
    if start_index == -1 or end_index < start_index:
        raise ValueError(f"No Python list found in model reply: {message!r}")
    R_list = ast.literal_eval(message[start_index:end_index+1])
    return R_list

def process_sheet(sheet_df):
    """Add 'Hypothesis' and 'Alternative Hypotheses' columns to a sheet, in place.

    For every row, a hypothesis is generated from the question and the predicted
    answer. The answer options are taken from the text after the last
    "Choices: " in the question (split on ", "); every option that differs from
    the predicted answer (case-insensitively) gets its own alternative hypothesis.

    Returns the same DataFrame object that was passed in.
    """
    hypotheses, alternative_hypotheses = [], []

    row_count = sheet_df.shape[0]
    for _, row in tqdm(sheet_df.iterrows(), total=row_count, desc="Processing Rows"):
        question = row['question']

        # Hypothesis for the model's own answer
        hypotheses.append(gpt_gen_hypothesis(question, row['predicted_answer']))

        # Answer options listed in the question, if any
        if "Choices: " in question:
            choices = question.split("Choices: ")[-1].split(", ")
        else:
            choices = []

        # One alternative hypothesis per option that is not the predicted answer
        distractors = [
            option for option in choices
            if option.lower() != row['predicted_answer'].lower()
        ]
        alternative_hypotheses.append(
            [gpt_gen_hypothesis(question, option) for option in distractors]
        )

    # Attach the results as new columns
    sheet_df.loc[:, 'Hypothesis'] = hypotheses
    sheet_df.loc[:, 'Alternative Hypotheses'] = alternative_hypotheses

    return sheet_df


# Run hypothesis generation on every sheet; process_sheet adds its columns to `df` in place,
# so the frames held in sheets_dict are updated without reassignment.
for sheet_name, df in sheets_dict.items():
    print(f"Processing sheet: {sheet_name}")
    process_sheet(df)
    print(df.head(), end="\n\n")

In [None]:
# Checkpoint: write every sheet to a temporary workbook so the work is not lost on errors
temp_file_path = 'temp_analysis.xlsx'
with pd.ExcelWriter(temp_file_path) as writer:
    for sheet_name, df in sheets_dict.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

# Read the workbook back and show the first rows of each sheet as a sanity check
sheets_dict_temp = pd.read_excel(temp_file_path, sheet_name=None)
for sheet_name, df in sheets_dict_temp.items():
    print(f"Sheet Name: {sheet_name}", df.head(), "", sep="\n")

## 1.2 Simulatability and Contrastivity (done in Qual_eval_2.ipynb)

## 1.3 Informativeness

In [None]:
# Load the xlsx sheet
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Load temp_analysis_v3.xlsx
# NOTE(review): this workbook is written by the visual-fidelity answering cell further
# down in this notebook — confirm the intended execution order.
file_path = 'temp_analysis_v3.xlsx'

# All sheets, as {sheet name: DataFrame}
sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Ensure column types
for sheet_name, df in sheets_dict.items():
    # Force the text columns to str (missing cells become the literal text 'nan')
    for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
        if col not in df.columns:
            continue
        df[col] = df[col].astype(str)

    # Sample rows for a quick visual check
    print(f"Sheet Name: {sheet_name}", df.head(), "", sep="\n")

# Small slice of one sheet, handy for trial runs
mini_sheet = sheets_dict['LLaVA-1.5 with image'].head()
demo_sheets_dict = {'mini_sheet': mini_sheet}
mini_sheet

In [None]:
import openai
from tqdm import tqdm

# Load the OpenAI API key, read from ../OPENAI_key.txt first line
# (only the first line is read; surrounding whitespace is stripped)
with open('../OPENAI_key.txt', 'r') as file:
    api_key = file.readline().strip()

# Initialize the OpenAI API client
# (sets the key on the `openai` module)
openai.api_key = api_key

# Define the file to store the total cost
# (plain-text file holding a single running total, maintained by
# read_total_cost / write_total_cost)
COST_FILE = "../total_cost.txt"

def read_total_cost():
    """Return the running total stored in COST_FILE as a float.

    Yields 0.0 when the file does not exist or contains only whitespace.
    """
    if not os.path.exists(COST_FILE):
        return 0.0
    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()
    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add `cost` to the running total and rewrite COST_FILE with the new sum.

    Despite the name, this accumulates: the value written is the previous
    total (from read_total_cost) plus `cost`.
    """
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")

def calculate_cost(usage, model, verbose=0):
    """Price one chat-completion call and add it to the running total.

    Parameters:
    - usage: mapping providing usage['prompt_tokens'] and usage['completion_tokens'].
    - model (str): model snapshot name; must be one of the keys of the price table below.
    - verbose: when truthy, print the cost of this call.

    Returns:
    - cost (float): cost of this call in dollars (also accumulated via write_total_cost).

    Raises:
    - ValueError: if `model` has no price entry. Previously an unknown model fell
      through the if/elif chain and failed later with an UnboundLocalError.
    """
    # (input price, output price) in dollars per 1000 tokens
    price_per_1k_tokens = {
        "gpt-4o-2024-05-13": (0.005, 0.015),
        "gpt-4o-2024-08-06": (0.0025, 0.010),
        "gpt-4o-mini-2024-07-18": (0.00015, 0.00060),
    }
    # Validate before touching the cost file so a bad model name has no side effects
    if model not in price_per_1k_tokens:
        raise ValueError(
            f"No pricing configured for model {model!r}; "
            f"known models: {sorted(price_per_1k_tokens)}"
        )
    input_price, output_price = price_per_1k_tokens[model]
    input_cost_per_token = input_price / 1000
    output_cost_per_token = output_price / 1000

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

In [None]:
import openai
import ast

# Client object used by extract_distinct_rationale_pieces below
# (it calls client.chat.completions.create); authenticated with the key loaded earlier.
client = openai.Client(api_key=api_key)

# Generate set R which contains rationale pieces for every instance

def extract_distinct_rationale_pieces(hypothesis, rationale, max_retries=5):
    """Split a rationale into pieces that are not equivalent to the hypothesis.

    The chat model is asked for a Python list; the text between the first '['
    and the last ']' of its reply is parsed with `ast.literal_eval`. If parsing
    raises SyntaxError or ValueError, the request is repeated, up to
    `max_retries` attempts in total. Any other exception propagates.

    Returns the parsed list, or [] when every attempt failed to parse.

    NOTE: this call is not recorded via calculate_cost.
    """
    prompt=f"""Please break the following rationale into distinct pieces, and keep only the ones that are not semantically equivalent to the hypothesis. Output the final answer in a Python list format.

Example:
Hypothesis: The man by the bags is waiting for a delivery.
Rationale: The man by the bags is waiting for a delivery, as indicated by the presence of the suitcases and the fact that he is standing on the side of the road. The other options, such as a skateboarder, train, or cab, do not seem to be relevant to the situation depicted in the image.
Output: ["Suitcases are present in the image.", "The man is standing on the side of the road.", "The other options, such as a skateboarder, train, or cab, do not seem to be relevant to the situation depicted in the image."]

Task:
Hypothesis: {hypothesis}
Rationale: {rationale}"""

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-2024-08-06",
                messages=[{"role": "user", "content": prompt}],
            )
            reply = response.choices[0].message.content
            # Keep only the outermost bracketed span of the reply
            list_literal = reply[reply.find('['):reply.rfind(']') + 1]
            return ast.literal_eval(list_literal)
        except (SyntaxError, ValueError) as e:
            print(f"Attempt {attempt + 1} failed with error: {e}")

    # Every attempt produced an unparsable reply
    print("All attempts failed. Returning an empty list.")
    return []

In [None]:
import warnings
# Suppress the specific SettingWithCopyWarning
# NOTE(review): `pd.errors.SettingWithCopyWarning` may not exist in every pandas
# release — confirm against the installed version.
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# One extract_distinct_rationale_pieces call (i.e. at least one model request) per row of every sheet.
for sheet_name, df in tqdm(sheets_dict.items(), desc="Processing Sheets"):
    # Apply the function to each row and store the result in a new column
    # (reads the 'Hypothesis' and 'generated_rationale' columns; both must already exist)
    df.loc[:,'extracted_rationale_pieces'] = df.apply(
        lambda row: extract_distinct_rationale_pieces(row['Hypothesis'], row['generated_rationale']),
        axis=1
    )

In [None]:
# Checkpoint: write every sheet to a temporary workbook so the work is not lost on errors
temp_file_path = 'temp_analysis_v4.xlsx'
with pd.ExcelWriter(temp_file_path) as writer:
    for sheet_name, df in sheets_dict.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

# Read the workbook back and show the first rows of each sheet as a sanity check
sheets_dict_temp = pd.read_excel(temp_file_path, sheet_name=None)
for sheet_name, df in sheets_dict_temp.items():
    print(f"Sheet Name: {sheet_name}", df.head(), "", sep="\n")

# 2. Commonsense Plausibility

In [None]:
import transformers
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm


# Fix the PyTorch seed and switch cuDNN to deterministic mode with autotuning (benchmark) off
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Global variables for model components
# (all start as None and are filled in by get_vera_score on its first call)
_vera_model = None
_vera_tokenizer = None
_vera_linear = None
_vera_temperature = None

def get_vera_score(statements, batch_size=32):
    """
    Get plausibility scores for statements using the VERA model. Loads the model only once.

    Parameters:
    - statements (str or list of str): A single statement or a list of statements to evaluate.
    - batch_size (int): Number of statements encoded and scored per forward pass.

    Returns:
    - scores (list of float): Calibrated plausibility scores for each input statement,
      in input order. An empty input yields an empty list.

    Raises:
    - ValueError: if batch_size is smaller than 1.

    Notes:
    - Statements are padded to the longest one in their batch. The score is read from
      the hidden state of each statement's last *real* token (located through the
      attention mask), and the mask is passed to the encoder. Reading position -1
      for the whole batch would pick up a padding position for every statement that
      is shorter than the longest one, making scores depend on batch composition.
    """
    global _vera_model, _vera_tokenizer, _vera_linear, _vera_temperature

    # Ensure input is a list
    if isinstance(statements, str):
        statements = [statements]
    statements = list(statements)
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not statements:
        return []

    # Lazy loading of the model
    if _vera_model is None:
        print("Loading VERA model...")
        model_name = 'liujch1998/vera'
        _vera_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        _vera_model = transformers.T5EncoderModel.from_pretrained(model_name).to(device)
        _vera_model.D = _vera_model.shared.embedding_dim

        # Define the linear layer. Its weight, bias and the temperature below are read
        # from fixed rows of the shared embedding matrix.
        # NOTE(review): presumably the checkpoint stores its scoring head in rows
        # 32097-32099 — confirm against the VERA model card.
        _vera_linear = torch.nn.Linear(_vera_model.D, 1, dtype=_vera_model.dtype)
        _vera_linear.weight = torch.nn.Parameter(_vera_model.shared.weight[32099, :].unsqueeze(0))
        _vera_linear.bias = torch.nn.Parameter(_vera_model.shared.weight[32098, 0].unsqueeze(0))
        _vera_model.eval()

        # Get temperature for calibration
        _vera_temperature = _vera_model.shared.weight[32097, 0].item()

    scores = []
    for start in range(0, len(statements), batch_size):
        batch = statements[start:start + batch_size]

        # Tokenize the batch and move to device
        inputs = _vera_tokenizer.batch_encode_plus(
            batch,
            return_tensors='pt',
            padding='longest',
            truncation='longest_first',
            max_length=128
        )
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            output = _vera_model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = output.last_hidden_state
            # Index of the last non-padding token of every statement
            last_token_idx = attention_mask.sum(dim=1) - 1
            batch_rows = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
            hidden = last_hidden_state[batch_rows, last_token_idx, :]
            logits = _vera_linear(hidden).squeeze(-1)  # Calculate logits
            logits_calibrated = logits / _vera_temperature  # Apply temperature calibration
            scores_calibrated = logits_calibrated.sigmoid()  # Convert to probabilities

        scores.extend(scores_calibrated.tolist())

    # Return scores as a list
    return scores


# Hand-written statements (some true, some false) to eyeball the scorer's output
statements = [
    "Water freezes at 0 degrees Celsius under normal atmospheric pressure.",
    "The sun rises in the west.",
    "The sun rises in the east.",
    "Since the density of a marble is much less than the density of mercury, the marble would sink to the bottom of the bowl if placed in it.",
    "Since the density of a marble is much more than the density of water, the marble would sink to the bottom of the bowl if placed in it.",
    "Since the density of water is much less than the density of a marble, the marble would sink to the bottom of the bowl if placed in it."
]

# Score them in one call
scores = get_vera_score(statements)

# Show each statement next to its score
for statement, score in zip(statements, scores):
    print(f"Statement: {statement}", f"Plausibility score: {score}", sep="\n")


file_path = 'results.xlsx'

# All sheets, as {sheet name: DataFrame}
sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Score the generated rationales of every sheet
for sheet_name, df in tqdm(sheets_dict.items(), desc='Processing sheets'):
    # Point image paths at the validation image folder (literal replacement)
    df['image_path'] = df['image_path'].str.replace('./images/', '../results/img/validation/', regex=False)

    # Force the text columns to str (missing cells become the literal text 'nan')
    for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
        if col not in df.columns:
            continue
        df[col] = df[col].astype(str)

    if 'generated_rationale' not in df.columns:
        # No rationales on this sheet: fill the score column with NaN
        df['commonsense_plausibility_score'] = np.nan
    else:
        # All rationales of the sheet are scored in a single get_vera_score call
        statements = df['generated_rationale'].to_list()
        scores = get_vera_score(statements)
        df['commonsense_plausibility_score'] = scores

    # Store the frame back under its sheet name
    sheets_dict[sheet_name] = df

# # Save the updated sheets to a new Excel file
# output_file_path = 'results_cp.xlsx'
# with pd.ExcelWriter(output_file_path) as writer:
#     for sheet_name, df in sheets_dict.items():
#         df.to_excel(writer, sheet_name=sheet_name, index=False)

# print(f"Updated Excel file saved to {output_file_path}")


# 3. Visual Fidelity

In [None]:
import base64
import requests

def _parse_numbered_questions(content):
    """Turn a numbered-list reply into a list of question strings without the numbering."""
    import re  # local import: only this helper needs it

    numbered_line = re.compile(r'^\s*\d+[.)]\s*(.+?)\s*$')
    lines = [line.strip() for line in content.split('\n') if line.strip()]
    questions = [match.group(1) for match in map(numbered_line.match, lines) if match]
    if questions:
        # At least one numbered item: ignore preamble / blank lines around the list
        return questions
    # No numbering found: fall back to the non-empty lines (or the raw reply)
    return lines if lines else [content]


def gpt_gen_vf_questions(row, rationale_column_name, cost_verbose=0):
    """Generate yes/no questions that verify the visual details stated in a rationale.

    Only text is sent to the model (question, predicted answer and rationale);
    the image itself is not attached.

    Parameters:
    - row: mapping / DataFrame row providing 'question', 'predicted_answer' and
      the column named by `rationale_column_name`.
    - rationale_column_name (str): key of the rationale text in `row`.
    - cost_verbose: forwarded to calculate_cost as `verbose`.

    Returns:
    - list of str: the questions without their "N. " numbering, or None (after
      printing the response body) when the HTTP status is not 200.

    The numbering is stripped only at the start of each line, so a question that
    itself contains ". " (e.g. an abbreviation) is kept whole; blank lines and
    text around the numbered list are not returned as questions.
    """
    model_name = "gpt-4o-2024-08-06"
    system_prompt = f"""You will be shown a question about an image, along with an answer, and a rationale that explains the answer based on details from the image. Your task is to generate a list of yes/no questions that verify the details about the image that are **explicitly** mentioned in the rationale. Your questions should be phrased such that the answer to that question being yes means that the detail in the rationale is correct. Focus on creating questions that can be visually verified or refuted based on the details provided in the rationale. Ensure the questions are specific and directly pertain to aspects that are visually relevant and mentioned in the rationale. Avoid generating questions about elements that are not mentioned in the rationale, or the rationale explicitly states are not relevant or present. Also avoid generating multiple questions that check for the same visual detail.

Here is one example:
Input: 
Question: Why is the person wearing a helmet?
Answer: For safety
Rationale: The person is wearing a helmet because they are riding a bicycle on a busy city street. Helmets are commonly used to protect against head injuries in case of accidents, especially in areas with heavy traffic.

Good Questions:
1. Is the person wearing a helmet while riding a bicycle?
Reason: This question is directly answerable by observing whether the person on the bicycle is wearing a helmet in the image. 
2. Is the street in the image busy with traffic?
Reason: This question can be visually verified by looking at the amount of traffic on the street in the image.

Bad Questions:
1. Is the person wearing the helmet because they are concerned about head injuries?
Reason: This question is not good because it assumes the person’s intentions or concerns, which cannot be visually verified from the image.
2. Does wearing a helmet suggest that the person is highly safety-conscious?
Reason: This question relies on inference and external knowledge about the person’s mindset, rather than on observable details from the image.
3. Is there any indication that the person is wearing a helmet for safety reasons?
Reason: This question verifies the answer to the original question, rather than verifying a detail about the image that's mentioned in the rationale.
4. Is the person wearing a safety vest?
Reason: This question is not good because it tries to verify details about the image that are not explicitly mentioned in the rationale.
5. Is the person not wearing sunglasses?
Reason: This question is not good because it asks for verification by absence and can only be answered with a "no," which is not the preferred type of question.

Respond with a list of (good) questions (without the reasons), starting from '1. '"""

    user_input = f"""Question: {row['question']}
Answer: {row['predicted_answer']}
Rationale: {row[rationale_column_name]}"""

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_prompt
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                ]
            }
        ],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        print(response.json())
        return None

    body = response.json()
    calculate_cost(body['usage'], model_name, verbose=cost_verbose)

    # content is a list of questions, separated by a newline character
    # 1. ... \n 2. ... \n 3. ...
    content = body['choices'][0]['message']['content'].strip()
    return _parse_numbered_questions(content)
    

# Generate verification questions for every row of every sheet
for sheet_name, df in tqdm(sheets_dict.items(), desc='Processing sheets'):
    vf_questions = []

    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Rows"):
        # The list of questions is stored as its string representation so it
        # survives being written to Excel (a failed request is stored as 'None')
        questions = str(gpt_gen_vf_questions(row, 'generated_rationale'))
        vf_questions.append(questions)

    # Attach the stringified question lists as a new column
    df.loc[:,'vf_questions'] = vf_questions

    

In [None]:
# Checkpoint: write every sheet to a temporary workbook so the work is not lost on errors
temp_file_path = 'temp_analysis_v2.xlsx'
with pd.ExcelWriter(temp_file_path) as writer:
    for sheet_name, df in sheets_dict.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

# Read the workbook back and show the first rows of each sheet as a sanity check
sheets_dict_temp = pd.read_excel(temp_file_path, sheet_name=None)
for sheet_name, df in sheets_dict_temp.items():
    print(f"Sheet Name: {sheet_name}", df.head(), "", sep="\n")

In [None]:
# Read temp_analysis_v2.xlsx

# Load the xlsx sheet
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import ast

file_path = 'temp_analysis_v2.xlsx'

# Load all sheets into a dictionary
sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Ensure column types
for sheet_name, df in sheets_dict.items():
    # Ensure specific columns are strings (missing cells become the literal text 'nan')
    for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
        if col in df.columns:
            df[col] = df[col].astype(str)

    # Ensure vf_questions is a list: the workbook stores each list as its string
    # representation, so parse it back with ast.literal_eval
    if 'vf_questions' in df.columns:
        df['vf_questions'] = df['vf_questions'].apply(ast.literal_eval)

        # Type check of the first parsed cell. Kept inside the guard (the column may
        # be absent) and positional, so it does not depend on an index label 0.
        if len(df) > 0:
            print(type(df['vf_questions'].iloc[0]))

    # Print sample rows to verify
    print(f"Sheet Name: {sheet_name}")
    print(df.head())
    print()

# Extract a small sheet for testing
mini_sheet = sheets_dict['LLaVA-1.5 with image'].head()
demo_sheets_dict = {
    "mini_sheet": mini_sheet
}
mini_sheet

In [None]:
import ast
import base64
import requests
import openai
from tqdm import tqdm

# Load the OpenAI API key, read from ../OPENAI_key.txt first line
# (only the first line is read; surrounding whitespace is stripped)
with open('../OPENAI_key.txt', 'r') as file:
    api_key = file.readline().strip()

# Initialize the OpenAI API client
# (sets the key on the `openai` module; the `requests`-based helpers in this
# notebook send `api_key` in their own Authorization header instead)
openai.api_key = api_key

# Define the file to store the total cost
# (plain-text file holding a single running total, maintained by
# read_total_cost / write_total_cost)
COST_FILE = "../total_cost.txt"

def read_total_cost():
    """Return the running total stored in COST_FILE as a float.

    Yields 0.0 when the file does not exist or contains only whitespace.
    """
    if not os.path.exists(COST_FILE):
        return 0.0
    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()
    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add `cost` to the running total and rewrite COST_FILE with the new sum.

    Despite the name, this accumulates: the value written is the previous
    total (from read_total_cost) plus `cost`.
    """
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")

def calculate_cost(usage, model, verbose=0):
    """Price one chat-completion call and add it to the running total.

    Parameters:
    - usage: mapping providing usage['prompt_tokens'] and usage['completion_tokens'].
    - model (str): model snapshot name; must be one of the keys of the price table below.
    - verbose: when truthy, print the cost of this call.

    Returns:
    - cost (float): cost of this call in dollars (also accumulated via write_total_cost).

    Raises:
    - ValueError: if `model` has no price entry. Previously an unknown model fell
      through the if/elif chain and failed later with an UnboundLocalError.
    """
    # (input price, output price) in dollars per 1000 tokens
    price_per_1k_tokens = {
        "gpt-4o-2024-05-13": (0.005, 0.015),
        "gpt-4o-2024-08-06": (0.0025, 0.010),
        "gpt-4o-mini-2024-07-18": (0.00015, 0.00060),
    }
    # Validate before touching the cost file so a bad model name has no side effects
    if model not in price_per_1k_tokens:
        raise ValueError(
            f"No pricing configured for model {model!r}; "
            f"known models: {sorted(price_per_1k_tokens)}"
        )
    input_price, output_price = price_per_1k_tokens[model]
    input_cost_per_token = input_price / 1000
    output_cost_per_token = output_price / 1000

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

In [None]:
import base64
import requests

def gpt_answer_vf_questions(question, image_path, cost_verbose=0):
    """Ask the vision model a yes/no question about one image.

    The image file is read, base64-encoded and sent inline as a data URL together
    with the question.

    Parameters:
    - question (str): the yes/no question to ask.
    - image_path: path of the image file to attach.
    - cost_verbose: forwarded to calculate_cost as `verbose`.

    Returns:
    - str: the model's reply with surrounding whitespace and periods removed.
      A reply that is "yes" or "no" in any letter case is returned as exactly
      "Yes" / "No"; the prompt asks for lowercase 'yes'/'no', while the later
      processing in this notebook only recognises the capitalised spellings.
      Any other reply is returned as received.
    - None (after printing the response body) when the HTTP status is not 200.

    NOTE(review): the data URL always declares image/jpeg regardless of the
    actual file type — confirm all images are JPEGs.
    """
    model_name = "gpt-4o-2024-08-06"
    # Read the image and convert it to base64 format
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    user_input = f"""Question: {question}. Based on the information provided in the image, answer with 'yes' or 'no'. Provide one-word answer only."""

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        print(response.json())
        return None

    body = response.json()
    calculate_cost(body['usage'], model_name, verbose=cost_verbose)
    content = body['choices'][0]['message']['content'].strip().strip('.').strip()
    # Canonical casing for the two expected answers
    if content.lower() in ("yes", "no"):
        content = content.capitalize()
    return content


from PIL import Image

def answer_vf_questions(data):
    """Answer every verification question of every row and store the answers.

    For each row, each entry of row['vf_questions'] is sent to
    gpt_answer_vf_questions together with row['image_path']; the list of answers
    is written to the 'vf_answers_GPT' column (created if missing).

    Parameters:
    - data (DataFrame): needs the columns 'image_path' and 'vf_questions'
      (each 'vf_questions' cell being a list of question strings).

    Returns:
    - The same DataFrame object, modified in place.

    Notes:
    - The image is no longer opened here: the handle was never used or closed,
      and gpt_answer_vf_questions reads the file itself.
    - A 'vf_questions' cell that is not a list/tuple (e.g. a missing value) is
      reported and treated as "no questions" instead of raising or iterating
      over the characters of a string.
    """
    if 'vf_answers_GPT' not in data.columns:
        data['vf_answers_GPT'] = None
    # object dtype so that a whole list can be stored in a single cell
    data['vf_answers_GPT'] = data['vf_answers_GPT'].astype(object)

    for idx, row in data.iterrows():
        questions = row['vf_questions']
        if not isinstance(questions, (list, tuple)):
            print(f"Image {idx}: 'vf_questions' is not a list ({questions!r}); no questions asked.")
            questions = []

        answer_list = []
        for i, question in enumerate(questions):
            answer = gpt_answer_vf_questions(question, row['image_path'], cost_verbose=1)
            answer_list.append(answer)
            # Print image, question, and answer
            print(f"Image {idx}, Question {i+1}: {question}\nAnswer: {answer}")
        data.at[idx, 'vf_answers_GPT'] = answer_list
    return data

# Answer the verification questions of every sheet; answer_vf_questions modifies
# the frame in place, so the frames held in sheets_dict are updated as well.
for sheet_name, df in sheets_dict.items():
    print(f"Processing sheet: {sheet_name}")
    df = answer_vf_questions(df)
    print(df.head(), end="\n\n")

# Checkpoint: write every sheet to a temporary workbook so the work is not lost on errors
temp_file_path = 'temp_analysis_v3.xlsx'
with pd.ExcelWriter(temp_file_path) as writer:
    for sheet_name, df in sheets_dict.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

# Read the workbook back and show the first rows of each sheet as a sanity check
sheets_dict_temp = pd.read_excel(temp_file_path, sheet_name=None)
for sheet_name, df in sheets_dict_temp.items():
    print(f"Sheet Name: {sheet_name}", df.head(), "", sep="\n")

# End

In [None]:
import pandas as pd

# Three result workbooks, each read as {sheet name: DataFrame}
file_path1 = 'temp_analysis_v4.xlsx'
file_path2 = 'temp_analysis_forked_v2.xlsx'
file_path3 = 'temp_analysis_forked2_cp.xlsx'

sheets_dict1 = pd.read_excel(file_path1, sheet_name=None)
sheets_dict2 = pd.read_excel(file_path2, sheet_name=None)
sheets_dict3 = pd.read_excel(file_path3, sheet_name=None)

# For every sheet of the first workbook, append the columns that only exist in the
# second and then in the third workbook. Columns are placed side by side
# (concat on axis=1), so rows are matched by index.
# NOTE(review): this presumes all three workbooks list the same rows in the same
# order — confirm.
combined_sheets = {}

for sheet_name, df1 in sheets_dict1.items():
    df_combined = df1
    for extra_sheets in (sheets_dict2, sheets_dict3):
        if sheet_name not in extra_sheets:
            # Sheet absent from this workbook: keep what we have so far
            continue
        extra_df = extra_sheets[sheet_name]
        # Columns of the extra sheet that are not in the combined frame yet
        unique_cols = extra_df.columns.difference(df_combined.columns)
        df_combined = pd.concat([df_combined, extra_df[unique_cols]], axis=1)

    combined_sheets[sheet_name] = df_combined

# Save the combined sheets to a new Excel file
output_file_path = 'temp_analysis_combined.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    for sheet_name, df in combined_sheets.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

output_file_path


In [None]:
# Read temp_analysis_combined.xlsx
import openpyxl
import pandas as pd
import ast
import os

file_path = 'temp_analysis_combined.xlsx'

sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Derive the binary per-row scores on every sheet. Each step is skipped when its
# source column is missing from the sheet.
#   vf_answers_GPT_converted : vf_answers_GPT with every answer other than 'Yes'/'No' mapped to 'No'
#   visual_fidelity          : 0 if the converted answers contain 'No', otherwise 1
#   informativeness          : 0 if extracted_rationale_pieces is exactly "[]", otherwise 1
#   strict_sim / support     : True -> 1, anything else -> 0
#   commonsense_plausibility_score is renamed to commonsense_plausibility


def _normalize_vf_answers(cell):
    """Return the stringified answer list with non-'Yes'/'No' answers replaced by 'No'.

    Non-string cells (e.g. missing values) are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    answers = ast.literal_eval(cell)
    return str([ans if ans in ("Yes", "No") else "No" for ans in answers])


def _visual_fidelity(cell):
    """Return 0 when the stringified answer list contains 'No', otherwise 1.

    Non-string cells fall through to 1.
    """
    if isinstance(cell, str) and "No" in ast.literal_eval(cell):
        return 0
    return 1


for sheet_name, df in sheets_dict.items():
    if "vf_answers_GPT" in df.columns:
        df["vf_answers_GPT_converted"] = df["vf_answers_GPT"].apply(_normalize_vf_answers)
        df["visual_fidelity"] = df["vf_answers_GPT_converted"].apply(_visual_fidelity)

    if "extracted_rationale_pieces" in df.columns:
        # Only the literal string "[]" counts as uninformative
        df["informativeness"] = df["extracted_rationale_pieces"].apply(
            lambda pieces: 0 if pieces == "[]" else 1
        )

    # Identity check against True: only a real boolean True maps to 1
    for flag_col in ("strict_sim", "support"):
        if flag_col in df.columns:
            df[flag_col] = df[flag_col].apply(lambda flag: int(flag is True))

    if "commonsense_plausibility_score" in df.columns:
        df.rename(columns={"commonsense_plausibility_score": "commonsense_plausibility"}, inplace=True)


from openpyxl import load_workbook
from openpyxl.drawing.image import Image

def add_images_to_excel(file_path, sheets_dict, column='V'):
    """Embed each row's image into an existing workbook and save it in place.

    Args:
        file_path: Path of the workbook to open and overwrite.
        sheets_dict: Mapping of sheet name -> DataFrame. Sheets without an
            "image_path" column are left untouched.
        column: Excel column letter where the images are anchored.

    Rows whose image path is not a string, or does not exist on disk, are
    skipped without any message.
    """
    workbook = load_workbook(file_path)

    for sheet_name, df in sheets_dict.items():
        if "image_path" not in df.columns:
            continue
        worksheet = workbook[sheet_name]

        for index, image_path in df["image_path"].items():
            if not (isinstance(image_path, str) and os.path.exists(image_path)):
                continue
            # +2 = one for the header row, one for Excel's 1-based rows
            # NOTE(review): assumes the DataFrame keeps a default 0..n-1 index - confirm
            excel_row = index + 2
            worksheet.add_image(Image(image_path), f"{column}{excel_row}")

    # Overwrite the workbook now that the images are anchored
    workbook.save(file_path)
    print(f"Images added and saved to {file_path}")

def _write_sheets(path, sheets):
    """Write every DataFrame in `sheets` to its own sheet of the workbook at `path`."""
    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        for sheet_name, df in sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    print(f"Processed data saved to {path}")


# 1) Workbook with every column, images anchored in column V
output_file_path1 = '../results/rationales_analysis_full_details.xlsx'
_write_sheets(output_file_path1, sheets_dict)
add_images_to_excel(output_file_path1, sheets_dict, column='V')

# 2) Workbook restricted to the question/answer fields and the five scores,
#    images anchored in column M. Selecting raises KeyError if a sheet lacks a column.
output_file_path2 = '../results/rationales_analysis_scores.xlsx'
score_columns = [
    'question', 'predicted_answer', 'correct_answer', 'is_correct',
    'generated_rationale', 'image_path', 'visual_fidelity',
    'informativeness', 'strict_sim', 'support', 'commonsense_plausibility',
]
sheets_score_only = {}
for sheet_name, df in sheets_dict.items():
    sheets_score_only[sheet_name] = df[score_columns]
_write_sheets(output_file_path2, sheets_score_only)
add_images_to_excel(output_file_path2, sheets_score_only, column='M')


# # Save the processed data to a new Excel file
# # Add the images back, using the column image_path
# output_file_path1 = '../results/rationales_analysis_full_details.xlsx'
# with pd.ExcelWriter(output_file_path1, engine='openpyxl') as writer:
#     for sheet_name, df in sheets_dict.items():
#         df.to_excel(writer, sheet_name=sheet_name, index=False)
# print(f"Processed data saved to {output_file_path1}")

# output_file_path2 = '../results/rationales_analysis_scores.xlsx'
# sheets_score_only = {sheet_name: df[['question', 'predicted_answer', 'correct_answer', 'is_correct', \
#     'generated_rationale', 'image_path', 'visual_fidelity', 'informativeness', 'strict_sim', 'support', \
#     'commonsense_plausibility']] for sheet_name, df in sheets_dict.items()}
# with pd.ExcelWriter(output_file_path2, engine='openpyxl') as writer:
#     for sheet_name, df in sheets_score_only.items():
#         df.to_excel(writer, sheet_name=sheet_name, index=False)
# print(f"Processed data saved to {output_file_path2}")

# Calculate non-binary scores for visual fidelity and contrastiveness

In [None]:
# Read ../results/rationales_analysis_full_details.xlsx
import openpyxl
import pandas as pd
import os
import ast

file_path = '../results/rationales_analysis_full_details.xlsx'

sheets_dict = pd.read_excel(file_path, sheet_name=None)


for sheet_name, df in sheets_dict.items():
    # Ensure specific columns are strings
    for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
        if col in df.columns:
            df[col] = df[col].astype(str)
    # These columns hold the text form of Python lists; parse them back into lists
    for col in ['vf_questions', 'vf_answers_GPT_converted', 'alt_ent_prob']:
        df[col] = df[col].apply(ast.literal_eval)
    # Visual fidelity: fraction of 'Yes' among the converted answers.
    # An empty answer list has no defined fraction, so it yields NaN instead of
    # raising ZeroDivisionError.
    df['visual_fidelity_converted_score'] = df['vf_answers_GPT_converted'].apply(
        lambda answers: answers.count('Yes') / len(answers) if len(answers) > 0 else float('nan')
    )
    # Contrastiveness: entail_prob / (entail_prob + sum(alt_ent_prob)).
    # Rows whose denominator is zero are set to NaN rather than failing or producing inf.
    denominator = df['entail_prob'] + df['alt_ent_prob'].apply(sum)
    df['contrastiveness_converted_score'] = (df['entail_prob'] / denominator).where(denominator != 0)
    # question_id = image file name up to its first dot
    df['question_id'] = df['image_path'].apply(lambda x: x.split('/')[-1].split('.')[0])

sheets_dict

In [None]:
# # Store the sheets back
# output_file_path = '../results/rationales_analysis_full_details_v2.xlsx'
# with pd.ExcelWriter(output_file_path) as writer:
#     for sheet_name, df in sheets_dict.items():
#         df.to_excel(writer, sheet_name=sheet_name, index=False)
        
# from openpyxl import load_workbook
# from openpyxl.drawing.image import Image

# def add_images_to_excel(file_path, sheets_dict, column='V'):
#     workbook = load_workbook(file_path)
    
#     for sheet_name, df in sheets_dict.items():
#         if "image_path" in df.columns:
#             worksheet = workbook[sheet_name]
            
#             # Iterate over the rows in the DataFrame
#             for index, row in df.iterrows():
#                 image_path = row.get("image_path")
                
#                 # Check if the image_path exists and is valid
#                 if isinstance(image_path, str) and os.path.exists(image_path):
#                     # Add the image to the corresponding row
#                     img = Image(image_path)
#                     cell_address = f"{column}{index + 2}"  # Adjust row index to match Excel's 1-based indexing
#                     worksheet.add_image(img, cell_address)
    
#     # Save the workbook with images added
#     workbook.save(file_path)
#     print(f"Images added and saved to {file_path}")
    
# # Add images
# add_images_to_excel(output_file_path, sheets_dict, column='X')

In [None]:
import ast

def safe_literal_eval(val):
    """Parse `val` with ast.literal_eval when it is a string; otherwise return it as is.

    If parsing fails, the error is printed and the original string is returned,
    so the caller never sees an exception from this function.
    """
    if not isinstance(val, str):
        return val
    try:
        parsed = ast.literal_eval(val)
    except Exception as err:
        print(f"Error evaluating {val}: {err}")
        return val
    return parsed

def format_vf_sentence(question, answer):
    """Render one visual-fidelity question as an HTML bullet line.

    Returns "<br> - " followed by the question with surrounding whitespace
    removed. The answer is normalised but does not influence the result; the
    answer-specific wording below is disabled.
    """
    bullet_text = question.strip()
    # Kept for the disabled wording below; also means both arguments must be strings
    answer_clean = answer.strip().lower()
    # if answer_clean == 'no':
    #     return f"The visual presentation did not clearly address '{question}'."
    # elif answer_clean == 'yes':
    #     return f"The visual presentation clearly addressed '{question}'."
    # else:
    #     print(f"Invalid answer: {answer}")
    #     return f"For '{question}', the response was '{answer}'."
    return "<br> - " + bullet_text

# Add two human-readable explanation columns to every sheet:
#   reason_vf    - every visual-fidelity question of the row, as "<br> - ..." bullets
#   reason_contr - alternative answers whose score is at least half of entail_prob
for sheet_name, df in sheets_dict.items():
    print(f"Sheet Name: {sheet_name}")
    print(df.head())
    print()

    reason_vf_list = []
    reason_contr_list = []
    misaligned_rows = 0  # rows where alternatives and scores differ in length
    for idx, row in df.iterrows():
        # Visual fidelity reasoning -----------------------------------------
        visual_questions = safe_literal_eval(row['vf_questions'])
        visual_answers = safe_literal_eval(row['vf_answers_GPT_converted'])

        # Every question is listed. An earlier draft selected at most two pairs
        # (incorrect answers first), but that selection was never used for the
        # output, so the dead code is removed and the emitted text is unchanged.
        sentences = [format_vf_sentence(q, a) for q, a in zip(visual_questions, visual_answers)]
        reason_vf_list.append(" ".join(sentences))

        # Contrastiveness reasoning -----------------------------------------
        contr_scores = safe_literal_eval(row['alt_ent_prob'])
        predicted_answer = row['predicted_answer']

        # The answer choices follow the 'Choices: ' marker in the question text.
        # A question without the marker gets no alternatives instead of an IndexError.
        question_parts = row['question'].split('Choices: ')
        alt_answers = question_parts[1].split(', ') if len(question_parts) > 1 else []
        # Drop the predicted answer so only the alternatives remain
        if predicted_answer in alt_answers:
            alt_answers.remove(predicted_answer)

        # The pairing below is positional.
        # NOTE(review): assumes alt_ent_prob is ordered like the remaining choices - confirm.
        # If the predicted answer was not found among the choices (e.g. different
        # casing or formatting), the lengths differ and labels may be paired with
        # the wrong score.
        if len(alt_answers) != len(contr_scores):
            misaligned_rows += 1

        predicted_score = row['entail_prob']
        threshold = predicted_score / 2

        # Keep alternatives scoring at least half of the predicted answer's score
        selected_options = [
            f"{ans.strip()} (score: {score:.2f})"
            for ans, score in zip(alt_answers, contr_scores)
            if score >= threshold
        ]
        # Joining an empty list gives "", i.e. no explanation for this row
        reason_contr_list.append(", ".join(selected_options))

    if misaligned_rows:
        print(
            f"Warning: {misaligned_rows} row(s) in sheet '{sheet_name}' have a different number of "
            "alternative answers and alternative scores; their 'reason_contr' labels may be misaligned."
        )

    df['reason_vf'] = reason_vf_list
    df['reason_contr'] = reason_contr_list
        

In [None]:
# Lift pandas' column display limit so wide sheets render without truncation
pd.options.display.max_columns = None
sheets_dict['LLaVA-1.5 with image']

In [None]:
# Spot check: visual-fidelity questions of the row labelled 9
sheets_dict['LLaVA-1.5 with image'].at[9, 'vf_questions']


In [None]:
# Export every sheet in sheets_dict to its own JSON file, one record per row.
# Each record carries the question, the prediction, the rationale, the two
# converted scores (renamed to visual_fidelity / contrastiveness) and the
# reason_vf / reason_contr explanations.
import json

# Initialize the sample_data dictionary
sample_data = {}

# DataFrame column -> key used in the exported JSON records
selected_columns = {
    'question_id': 'question_id',
    'question': 'question',
    'predicted_answer': 'predicted_answer',
    'is_correct': 'prediction_is_correct',
    'generated_rationale': 'generated_rationale',
    'visual_fidelity_converted_score': 'visual_fidelity',
    'contrastiveness_converted_score': 'contrastiveness',
    'reason_vf': 'reason_vf',
    'reason_contr': 'reason_contr'
}

for sheet_name, df in sheets_dict.items():
    # Fail early, naming the sheet, when a required column is absent
    missing_columns = [col for col in selected_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"The following required columns are missing in the DataFrame of sheet '{sheet_name}': {missing_columns}")

    # Keep only the mapped columns, renamed to their JSON keys, as a list of row dicts
    renamed_df = df[list(selected_columns)].rename(columns=selected_columns)
    sheet_data = renamed_df.to_dict(orient='records')

    # File name: sheet name lower-cased, spaces replaced by underscores
    file_name = sheet_name.replace(' ', '_').lower() + ".json"

    with open(file_name, 'w') as file:
        json.dump(sheet_data, file, indent=4)

    print(f"Data from sheet '{sheet_name}' has been successfully saved to '{file_name}'.")


In [None]:
import openpyxl
import pandas as pd
import os
import ast

# file_path = '../results/rationales_analysis_full_details_v2_selected.xlsx'

# sheets_dict = pd.read_excel(file_path, sheet_name=None)

# for sheet_name, df in sheets_dict.items():
#     # Ensure specific columns are strings
#     for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
#         if col in df.columns:
#             df[col] = df[col].astype(str)
#     for col in ['vf_questions', 'vf_answers_GPT_converted', 'alt_ent_prob']:
#         df[col] = df[col].apply(ast.literal_eval)
#     # Check the percentage of 'Yes' in the column vf_answers_GPT_converted
#     df['visual_fidelity_converted_score'] = df['vf_answers_GPT_converted'].apply(lambda x: x.count('Yes') / len(x))
#     # Check p(entail_prob) / (p(entail_prob) + sum(p(alt_ent_prob)))
#     df['contrastiveness_converted_score'] = df.apply(lambda row: row['entail_prob'] / (row['entail_prob'] + sum(row['alt_ent_prob'])), axis=1)
#     df['question_id'] = df['image_path'].apply(lambda x: x.split('/')[-1].split('.')[0])

# sheets_dict

def save_selected_rows_to_json(sheet_name, df):
    """Write the study-relevant columns of `df` to '<sheet_name>.json'.

    All rows of `df` are exported as a JSON list of records. The file name is
    the sheet name lower-cased with spaces replaced by underscores, and the
    file is created in the current working directory.

    Args:
        sheet_name: Name of the sheet; used for the file name and in messages.
        df: DataFrame containing at least the columns listed in the mapping below.

    Raises:
        KeyError: If any required column is missing from `df`.
    """
    # DataFrame column -> key used in the JSON records
    column_to_json_key = {
        'question_id': 'question_id',
        'question': 'question',
        'predicted_answer': 'predicted_answer',
        'is_correct': 'prediction_is_correct',
        'generated_rationale': 'generated_rationale',
        'visual_fidelity_converted_score': 'visual_fidelity',
        'contrastiveness_converted_score': 'contrastiveness',
        'reason_vf': 'reason_vf',
        'reason_contr': 'reason_contr'
    }

    absent = [name for name in column_to_json_key if name not in df.columns]
    if absent:
        raise KeyError(f"The following required columns are missing in the DataFrame of sheet '{sheet_name}': {absent}")

    # Select, rename and convert to a list of per-row dictionaries
    records = (
        df[list(column_to_json_key)]
        .rename(columns=column_to_json_key)
        .to_dict(orient='records')
    )

    file_name = sheet_name.replace(' ', '_').lower() + ".json"
    with open(file_name, 'w') as out:
        json.dump(records, out, indent=4)

    print(f"Data from sheet '{sheet_name}' has been successfully saved to '{file_name}'.")

# Build class-balanced samples for the user study and export them to JSON:
#   - sheets whose name starts with "llava": up to 100 correct and 100 incorrect rows
#   - sheets whose name starts with "gpt": refusal rationales are dropped first, then
#     as many rows per class as the smaller class contains
# Other sheets are skipped.
import random
import json

ROWS_PER_CLASS_LLAVA = 100
SAMPLING_SEED = 42


def _balanced_sample(df, n_per_class=None, seed=SAMPLING_SEED):
    """Return n correct rows followed by n incorrect rows, sampled reproducibly.

    Args:
        df: DataFrame with an 'is_correct' column holding 1 (correct) / 0 (incorrect).
        n_per_class: Rows wanted per class; None means the size of the smaller class.
        seed: random_state passed to DataFrame.sample for both classes.

    The request is capped at the size of the smaller class, so a class with
    too few rows shrinks the sample (with a printed notice) instead of making
    DataFrame.sample raise ValueError.
    """
    correct = df[df['is_correct'] == 1]
    incorrect = df[df['is_correct'] == 0]
    available = min(len(correct), len(incorrect))
    n = available if n_per_class is None else min(n_per_class, available)
    if n_per_class is not None and n < n_per_class:
        print(f"Only {available} rows available in the smaller class; sampling {n} per class instead of {n_per_class}.")
    return pd.concat([
        correct.sample(n=n, random_state=seed),
        incorrect.sample(n=n, random_state=seed),
    ])


for sheet_name, df in sheets_dict.items():
    sheet_key = sheet_name.lower()
    if sheet_key.startswith('llava'):
        selected_rows = _balanced_sample(df, n_per_class=ROWS_PER_CLASS_LLAVA)
        save_selected_rows_to_json(sheet_name, selected_rows)
    elif sheet_key.startswith('gpt'):
        # Phrases that mark a refusal instead of a rationale (matched case-insensitively)
        refused_expls = ['I\'m sorry', 'I am sorry', 'I can\'t', 'I cannot', 'I\'m unable', 'I am unable', 'I apologize']
        # Normalise the predicted answer (lower-case, strip '*', newlines and spaces),
        # then also count a row as correct when the normalised prediction equals the
        # lower-cased correct answer.
        # Note: these two assignments modify the DataFrame stored in sheets_dict.
        df['predicted_answer'] = df['predicted_answer'].str.lower().str.strip('*\n ')
        df['is_correct'] = (df['is_correct'] | (df['predicted_answer'] == df['correct_answer'].str.lower())).astype(int)

        # Drop refusals; from here on `df` is a filtered copy
        df = df[~df['generated_rationale'].str.contains('|'.join(refused_expls), case=False)]
        selected_rows = _balanced_sample(df)
        # The per-class count is the size of the smaller class, which is not
        # necessarily the incorrect one
        print(f"Rows sampled per class (size of the smaller class): {len(selected_rows) // 2}")
        save_selected_rows_to_json(sheet_name, selected_rows)


# Extra: Improve the contrastiveness metric
## Can we convert the scores back to logits and compute the confidence from there?

In [None]:
# Read ../results/rationales_analysis_full_details_v2.xlsx
import openpyxl
import pandas as pd
import os
import ast

file_path = '../results/rationales_analysis_full_details_v2.xlsx'

sheets_dict = pd.read_excel(file_path, sheet_name=None)


for sheet_name, df in sheets_dict.items():
    # Ensure specific columns are strings
    for col in ['question', 'predicted_answer', 'correct_answer', 'generated_rationale']:
        if col in df.columns:
            df[col] = df[col].astype(str)
    # These columns hold the text form of Python lists; parse them back into lists
    for col in ['vf_questions', 'vf_answers_GPT_converted', 'alt_ent_prob']:
        df[col] = df[col].apply(ast.literal_eval)
    # Visual fidelity: fraction of 'Yes' among the converted answers.
    # An empty answer list has no defined fraction, so it yields NaN instead of
    # raising ZeroDivisionError.
    df['visual_fidelity_converted_score'] = df['vf_answers_GPT_converted'].apply(
        lambda answers: answers.count('Yes') / len(answers) if len(answers) > 0 else float('nan')
    )
    # Contrastiveness: entail_prob / (entail_prob + sum(alt_ent_prob)).
    # Rows whose denominator is zero are set to NaN rather than failing or producing inf.
    denominator = df['entail_prob'] + df['alt_ent_prob'].apply(sum)
    df['contrastiveness_converted_score'] = (df['entail_prob'] / denominator).where(denominator != 0)
    # question_id = image file name up to its first dot
    df['question_id'] = df['image_path'].apply(lambda x: x.split('/')[-1].split('.')[0])

sheet1 = sheets_dict['LLaVA-1.5 with image']
sheet1

In [None]:
# Histogram of the ratio-based contrastiveness score for the LLaVA sheet
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(sheet1['contrastiveness_converted_score'], bins=25, color='skyblue', edgecolor='black')
ax.set_xlabel('Contrastiveness Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Contrastiveness Scores')
ax.grid(axis='y', alpha=0.75)
plt.show()


In [None]:
import numpy as np

def convert_prob_back_to_logits(prob):
    """Map a two-class softmax probability to a logit (its log-odds).

    In general a probability cannot be turned back into logits, because the
    temperature and the other candidates are unknown. The author's premise
    here is that the NLI entailment model emits only two labels at
    temperature 1, so

        p = e^x / (e^x + e^y)  =>  x = log(p / (1 - p)) + y

    Every pair (x, y) = (log(p / (1 - p)) + c, c) yields the same p; fixing
    c = 0 selects one representative, which is what is returned.

    Args:
        prob: Probability (scalar or NumPy array).

    Returns:
        log(prob / (1 - prob)).
    """
    odds = prob / (1 - prob)
    return np.log(odds)

# Logit-based contrastiveness: softmax of the recovered logits over all answers
def compute_cont_score_new(pred_ent_prob, alt_ent_prob):
    """Return e^x / (e^x + sum(e^y)) over logits recovered from the probabilities.

    Args:
        pred_ent_prob: Entailment probability of the predicted answer.
        alt_ent_prob: Iterable of entailment probabilities of the alternatives.

    x is the logit of the predicted answer and each y the logit of one
    alternative, both obtained with convert_prob_back_to_logits.
    """
    pred_weight = np.exp(convert_prob_back_to_logits(pred_ent_prob))
    alt_weights = np.exp([convert_prob_back_to_logits(p) for p in alt_ent_prob])
    return pred_weight / (pred_weight + sum(alt_weights))

# Score every row of the LLaVA sheet with the logit-based metric and store it as a new column
sheet1['contrastiveness_converted_score_new'] = [
    compute_cont_score_new(row['entail_prob'], row['alt_ent_prob'])
    for _, row in sheet1.iterrows()
]

# Old and new score side by side with their inputs
sheet1[['entail_prob', 'alt_ent_prob', 'contrastiveness_converted_score', 'contrastiveness_converted_score_new']]
    

In [None]:
# Draw the two contrastiveness distributions (ratio-based, then logit-based) one after the other
import matplotlib.pyplot as plt

histograms = [
    ('contrastiveness_converted_score',
     'Distribution of Contrastiveness Scores'),
    ('contrastiveness_converted_score_new',
     'Distribution of Contrastiveness Scores (New, converted back to logits and calculated softmax)'),
]

for score_column, title in histograms:
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.hist(sheet1[score_column], bins=25, color='skyblue', edgecolor='black')
    ax.set_xlabel('Contrastiveness Score')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.grid(axis='y', alpha=0.75)
    plt.show()

In [None]:
import pandas as pd
import json

# Collect the question ids of the earlier LLaVA-1.5 export (llava-1.5_with_image_old.json)
# so the user study can be run on the same questions, and store them as a JSON list.
llava_15_old = pd.read_json('llava-1.5_with_image_old.json')
llava_15_old_question_ids = llava_15_old['question_id'].tolist()

with open('llava-1.5_with_image_question_ids.json', 'w') as ids_file:
    json.dump(llava_15_old_question_ids, ids_file, indent=4)