# Section 1: Explanation Entailment (Support)

In [None]:
!nvidia-smi

In [None]:
%%capture
%pip install nltk rouge-score sacrebleu openpyxl openai torch transformers datasets matplotlib bitsandbytes

In [None]:
# OPENAI api key
api_key_path = '../../OPENAI_key.txt'
# Only the first line of the (whitespace-stripped) file content is used as the key.
with open(api_key_path, 'r') as key_file:
    key_file_text = key_file.read()
api_key = key_file_text.strip().split('\n')[0]

In [None]:
import pandas as pd
import numpy as np

# Load the spreadsheet.
# NOTE: the second assignment wins; the first is a manual toggle for switching
# back to the human-annotation workbook.
file_path = '../results/Human Annotation of LLaVA+ Rationales.xlsx'

file_path = "../results/gpt-4o_inference_two_steps_50.xlsx"
spreadsheet = pd.ExcelFile(file_path)

# # Display sheet names
# spreadsheet.sheet_names

# Columns pulled from the sheet
columns_to_read = [
    'question',
    'correct_answer',
    'predicted_answer',
    'is_correct',
    'groundtruth_rationale',
    'generated_rationale',
]

# The human-annotation workbook keeps its column names on the second row
# (header=1); every other results file uses the first row (header=0, the default).
data = pd.read_excel(
    file_path,
    header=1 if file_path == '../results/Human Annotation of LLaVA+ Rationales.xlsx' else 0,
    usecols=columns_to_read,
)

data

In [None]:
def get_longest_rationale(rationale_list):
    """Return the longest rationale from a stringified list of rationales.

    Parameters
    ----------
    rationale_list : str or list
        Python-literal text of a list (e.g. "['short', 'a longer one']"),
        or an already-parsed list.

    Returns
    -------
    The longest element (by ``len``), or '' when the value is not a list,
    is an empty list, or is a non-string placeholder such as NaN.

    Raises
    ------
    ValueError, SyntaxError
        If a string is passed that is not a valid Python literal.
    """
    import ast  # local import so the function works without touching the import cell

    if isinstance(rationale_list, str):
        # literal_eval only accepts literals, so spreadsheet text can never
        # execute code (the previous eval() could).
        rationales = ast.literal_eval(rationale_list)
    else:
        rationales = rationale_list

    if not isinstance(rationales, list):
        return ''
    # default='' keeps an empty list from raising ValueError inside max().
    return max(rationales, key=len, default='')

# Apply the function to create the new column
data['longest_groundtruth_rationale'] = data['groundtruth_rationale'].apply(get_longest_rationale)

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

nltk.download('wordnet')

def calculate_bleu(reference, candidate):
    """Sentence-level BLEU of `candidate` against a single `reference`.

    Both strings are tokenized on whitespace and scored with NLTK's
    `sentence_bleu` using `SmoothingFunction().method1`.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_rouge(reference, candidate):
    """Score `candidate` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L (stemming on).

    Returns whatever `RougeScorer.score(reference, candidate)` returns.
    """
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(reference, candidate)

def calculate_meteor(reference, candidate):
    """METEOR of `candidate` against a single `reference`, both tokenized on whitespace."""
    return meteor_score([reference.split()], candidate.split())

def display_scores(reference, candidate):
    """Print BLEU, ROUGE and METEOR for one reference/candidate pair."""
    bleu = calculate_bleu(reference, candidate)
    rouge = calculate_rouge(reference, candidate)
    # Named `meteor` rather than `meteor_score` so the imported function is not shadowed.
    meteor = calculate_meteor(reference, candidate)

    print(f"BLEU Score: {bleu:.4f}")
    print("ROUGE Scores:")
    for variant in rouge:
        print(f"  {variant}: {rouge[variant]}")
    print(f"METEOR Score: {meteor:.4f}")

In [None]:
# Calculate scores for each row and store in the dataframe.
# Every metric compares the generated rationale (candidate) against the longest
# ground-truth rationale (reference). ROUGE_scores stores, per row, whatever
# calculate_rouge returns (one entry per ROUGE variant), not a single number.
data['BLEU_score'] = data.apply(lambda row: calculate_bleu(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)
data['ROUGE_scores'] = data.apply(lambda row: calculate_rouge(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)
data['METEOR_score'] = data.apply(lambda row: calculate_meteor(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)

In [None]:
data

We can see that BLEU scores are generally very low. 

BLEU scores are generally more suited to tasks like machine translation where there is a high degree of overlap between the reference and generated text. 
For tasks like rationale generation, where the generated text might be quite different in wording from the reference but still correct, BLEU scores might be low.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the NLI model and tokenizer
# (Earlier roberta-large-mnli variant kept for reference; note that
# AutoModelForSequenceClassification is not imported in this cell.)
# nli_tokenizer = AutoTokenizer.from_pretrained('roberta-large-mnli')
# nli_model = AutoModelForSequenceClassification.from_pretrained('roberta-large-mnli')

# The tokenizer comes from the flan-t5-xxl checkpoint while the weights come from the
# entailment-verifier checkpoint. The model is loaded in 8-bit with automatic device
# placement (bitsandbytes is installed by the %pip cell above).
nli_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
nli_model = AutoModelForSeq2SeqLM.from_pretrained("soumyasanyal/nli-entailment-verifier-xxl", load_in_8bit=True, device_map="auto")

In [None]:
def calc_low_support_score(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer):
    """Return the verifier's probability of answering "No" for a premise/hypothesis pair.

    The model is asked "Given the premise, is the hypothesis correct?" and only the
    logits of the first 'Yes' and 'No' token ids at the first decoding position are
    compared; the result is the softmax weight on 'No' (between 0 and 1).

    Parameters
    ----------
    premise, hypothesis : str
        Texts interpolated into the prompt.
    nli_model, nli_tokenizer :
        Seq2seq model and tokenizer; default to the notebook-level objects that
        exist when this cell is executed.
    """
    prompt = f"Premise: {premise}\nHypothesis: {hypothesis}\nGiven the premise, is the hypothesis correct?\nAnswer:"
    input_ids = nli_tokenizer(prompt, return_tensors='pt').input_ids

    # First token id produced by the tokenizer for each answer word.
    yes_token_id = nli_tokenizer('Yes').input_ids[0]
    no_token_id = nli_tokenizer('No').input_ids[0]

    # A single decoder step, fed token id 0 as the only decoder input.
    decoder_start = torch.zeros((input_ids.size(0), 1), dtype=torch.long)
    with torch.no_grad():
        first_step_logits = nli_model(input_ids, decoder_input_ids=decoder_start).logits[:, 0, :]
        # Cast to float before applying softmax
        yes_no_logits = torch.stack(
            [first_step_logits[:, yes_token_id], first_step_logits[:, no_token_id]], dim=1
        ).float()
        yes_no_probs = torch.nn.functional.softmax(yes_no_logits, dim=1)

    # Column 1 is the "No" probability; .item() assumes a single prompt (batch of 1).
    return yes_no_probs[:, 1].item()

# Sanity check on a pair where the premise clearly supports the hypothesis.
premise = "A fossil fuel is a kind of natural resource. Coal is a kind of fossil fuel."
hypothesis = "Coal is a kind of natural resource."

low_support_score = calc_low_support_score(premise, hypothesis)
# The score is P("No") for "given the premise, is the hypothesis correct?", so the
# direction is premise -> hypothesis (the old message had it reversed).
print(f'Premise does not entail the hypothesis: {bool(low_support_score >= 0.5)}')

In [None]:
# import openai

# client = openai.OpenAI(api_key=api_key)

# def create_hypothesis(question, answer):
#     question = question.split(' Choices:')[0]
    
#     QA_concat_str = f"Q: {question} A: {answer}"
#     prompt = f"Rephrase the following question and answer into a descriptive sentence:\n{QA_concat_str}"

#     # Get the response from GPT-3.5-turbo
#     response = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=[{"role": "user", "content": prompt}],
#         max_tokens=50,
#     )

#     # Extract the rephrased sentence
#     rephrased_sentence = response.choices[0].message.content.strip()

#     return rephrased_sentence

# # Apply the function to each row in the DataFrame
# data['hypothesis'] = data.apply(lambda row: create_hypothesis(row['question'], row['predicted_answer']), axis=1)

In [None]:
# # Read hypothesis from hypothesis column from data_with_LAS.csv
# # Load the new data from CSV file
# data_with_LAS = pd.read_csv('data_with_LAS.csv')

# # Combine the 'hypothesis' column from the new data with the current DataFrame
# data['hypothesis'] = data_with_LAS['hypothesis']

In [None]:
# Generate hypothesis using run_question_converter.sh from https://github.com/HanjieChen/REV
import pandas as pd
import json
import os

# Build the converter input: one {"question_text", "answer_text"} JSON object per row.
input_data = data[['question', 'predicted_answer']].copy()

# Keep only the part of the question before 'Choices:'
input_data['question'] = input_data['question'].apply(lambda x: x.split(' Choices:')[0])

input_data.rename(columns={'question': 'question_text', 'predicted_answer': 'answer_text'}, inplace=True)

input_jsonl = 'input_data.jsonl'

# Write to JSONL file
with open(input_jsonl, 'w') as f:
    for index, row in input_data.iterrows():
        # json.dump would write a missing answer as a bare NaN token, which is not
        # valid JSON. Use the placeholder string 'None', matching what main() in the
        # one-shot cell writes for the same converter.
        row_dict = {k: (v if pd.notna(v) else 'None') for k, v in row.to_dict().items()}
        json.dump(row_dict, f)
        f.write('\n')

(Run the conversion script)

In [None]:
# Define the full path to the script
script_path = '<link_hidden>/REV/run_question_converter.sh'

# Set PYTHONPATH and run the script
exit_status = os.system(f'export PYTHONPATH=/home/<link_hidden>/REV/:$PYTHONPATH && bash {script_path} cqa {input_jsonl} cuda:0')

# Stop here if the converter failed; otherwise the next cell could silently read a
# stale predictions file left over from an earlier run.
if exit_status != 0:
    raise RuntimeError(f'run_question_converter.sh failed with exit status {exit_status}')

In [None]:
import json

output_jsonl = 'input_data.jsonl.predictions'

# Read the predictions (one JSON object per line)
with open(output_jsonl, 'r') as f:
    predictions = [json.loads(line) for line in f]

# Convert to DataFrame
predictions_df = pd.DataFrame(predictions)
predictions_df.rename(columns={'question_text': 'question', 'question_statement_text': 'hypothesis'}, inplace=True)

data['question_no_choice'] = data.apply(lambda row: row['question'].split(' Choices:')[0], axis=1)

# Merge datasets based on the 'question' column.
# The join key is the question text only, so identical prediction rows are dropped
# first; otherwise a question that occurs k times would be expanded to k*k rows.
hypothesis_lookup = predictions_df[['question', 'hypothesis']].drop_duplicates()
rows_before_merge = len(data)
data = pd.merge(data, hypothesis_lookup, left_on='question_no_choice', right_on='question', how='left')
if len(data) != rows_before_merge:
    # Same question text with different hypotheses: rows were duplicated and some
    # hypotheses may not belong to the row's predicted answer.
    print(f'Warning: merge changed the row count from {rows_before_merge} to {len(data)}; '
          'some questions map to more than one hypothesis.')

data.drop(columns=['question_y'], inplace=True)
data.rename(columns={'question_x': 'question'}, inplace=True)

# Display the updated DataFrame
data

In [None]:
import re
def generate_mask(generated_rationale, predicted_answer):
    """Replace every whole-word, case-insensitive occurrence of the answer with "<mask>".

    Parameters
    ----------
    generated_rationale : str
        Rationale text to mask.
    predicted_answer :
        Answer to hide; coerced to ``str`` so numeric answers read from the
        spreadsheet do not crash ``re.escape``.

    Returns
    -------
    str
        The rationale with matches replaced; unchanged when the answer is empty.
    """
    predicted_answer = str(predicted_answer)
    if not predicted_answer:
        # An empty answer would compile to r'\b\b', which matches at every word
        # boundary and would scatter "<mask>" through the whole text.
        return generated_rationale
    # Create a regex pattern to match the predicted answer case-insensitively and as a whole word
    pattern = re.compile(r'\b' + re.escape(predicted_answer) + r'\b', re.IGNORECASE)
    return pattern.sub("<mask>", generated_rationale)

data['gen_rationale_mask'] = data.apply(lambda row: generate_mask(row['generated_rationale'], row['predicted_answer']), axis=1)
data

In [None]:
data.rename(columns={'hypothesis_y': 'hypothesis'}, inplace=True)

In [None]:
# Adjust pandas display options
pd.set_option('display.max_colwidth', None)

# Display the full content of the 'gen_rationale_mask' column
print(data['gen_rationale_mask'])

# Reset the display option back
pd.reset_option('display.max_colwidth')

In [None]:
# def predict_nli(model, tokenizer, premise, hypothesis):
#     inputs = tokenizer(premise, hypothesis, return_tensors='pt', truncation=True, padding=True)
#     with torch.no_grad():
#         logits = model(**inputs).logits
#         probabilities = torch.softmax(logits, dim=-1)
#     return probabilities

# def evaluate_support(data, nli_model, nli_tokenizer):
#     support_scores = []

#     for idx, row in data.iterrows():
#         premise = row['gen_rationale_mask']
#         hypothesis = row['hypothesis']
#         probabilities = predict_nli(nli_model, nli_tokenizer, premise, hypothesis)
        
#         # {0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'}
#         contradiction_prob = probabilities[0][0].item()
#         if contradiction_prob > 0.5:
#             print(f"Premise: {premise}")
#             print(f"Hypothesis: {hypothesis}")
#             print(f"Probabilities: {probabilities[0]}")
# #         print(probabilities[0])
#         support = contradiction_prob < 0.5 

#         support_scores.append({
#             'contradiction_prob': contradiction_prob,
#             'support': support
#         })

#     return support_scores

threshold = 0.5

def evaluate_support(data, nli_model, nli_tokenizer, use_mask=True, hypothesis_col='hypothesis', threshold=threshold):
    """Score how well each row's rationale supports its hypothesis.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs `hypothesis_col` plus 'gen_rationale_mask' (when `use_mask`) or
        'generated_rationale' (otherwise).
    nli_model, nli_tokenizer :
        Forwarded to `calc_low_support_score`.
    use_mask : bool
        Use the answer-masked rationale as the premise instead of the raw one.
    hypothesis_col : str
        Column holding the hypothesis sentence.
    threshold : float
        A row counts as supported when its "No" probability is below this value.

    Returns
    -------
    list of dict
        One {'no_entail_prob', 'support'} dict per row, in row order.
    """
    # The unmasked premise is the raw rationale; the old code read a nonexistent
    # 'generated_mask' column here and raised KeyError.
    premise_col = 'gen_rationale_mask' if use_mask else 'generated_rationale'
    support_scores = []

    for idx, row in data.iterrows():
        premise = row[premise_col]
        hypothesis = row[hypothesis_col]

        no_entail_prob = calc_low_support_score(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)

        # Echo the rows whose "No" probability exceeds the threshold for manual inspection.
        if no_entail_prob > threshold:
            print(f"Premise: {premise}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Probability: {no_entail_prob}")
        support = no_entail_prob < threshold

        support_scores.append({
            'no_entail_prob': no_entail_prob,
            'support': support
        })

    return support_scores

In [None]:
# Evaluate support: score every row with evaluate_support and copy each result
# column ('no_entail_prob', 'support') onto `data`.
support_results = evaluate_support(data, nli_model, nli_tokenizer)
support_df = pd.DataFrame(support_results)
for column in support_df.columns:
    data[column] = support_df[column]
data

In [None]:
import matplotlib.pyplot as plt

# Plot the distribution of no_entail_prob (50-bin histogram)
plt.figure(figsize=(8, 3))
plt.hist(data['no_entail_prob'], bins=50, edgecolor='black')
plt.title('Distribution of no_entail_prob')
plt.xlabel('no_entail_prob')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Filter rows where 'support' is False
filtered_data = data[data['no_entail_prob'] >= 0.5]
print(len(filtered_data))
filtered_data

In [None]:
# Filter rows with a very low "No" probability (no_entail_prob < 0.1)
filtered_data2 = data[data['no_entail_prob'] < 0.1]
print(len(filtered_data2))
filtered_data2

In [None]:
data.to_csv('data_with_support.csv', index=False)

In [None]:
filtered_data[['question','correct_answer','predicted_answer', 'hypothesis', 'gen_rationale_mask', 'no_entail_prob']]\
.to_csv('filtered_data.csv', index=False)
filtered_data2[['question','correct_answer','predicted_answer', 'hypothesis', 'gen_rationale_mask', 'no_entail_prob']]\
.to_csv('filtered_data2.csv', index=False)

# Run the whole Section 1 pipeline in one shot

In [None]:
!nvidia-smi

In [None]:
import os
import re
import nltk
import torch
import json
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# load the tokenizer and model only if they are not already defined
# (they may already exist in the kernel from the Section 1 loading cell; skipping
# the reload avoids loading the 8-bit xxl checkpoint a second time)
if 'nli_tokenizer' not in globals():
    nli_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
if 'nli_model' not in globals():
    nli_model = AutoModelForSeq2SeqLM.from_pretrained("soumyasanyal/nli-entailment-verifier-xxl", load_in_8bit=True, device_map="auto")

def calculate_bleu(reference, candidate):
    """Sentence-level BLEU of `candidate` against a single `reference`.

    Both strings are tokenized on whitespace and scored with NLTK's
    `sentence_bleu` using `SmoothingFunction().method1`.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_rouge(reference, candidate):
    """Score `candidate` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L (stemming on).

    Returns whatever `RougeScorer.score(reference, candidate)` returns.
    """
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(reference, candidate)

def calculate_meteor(reference, candidate):
    """METEOR of `candidate` against a single `reference`, both tokenized on whitespace."""
    return meteor_score([reference.split()], candidate.split())

def display_scores(reference, candidate):
    """Print BLEU, ROUGE and METEOR for one reference/candidate pair."""
    bleu = calculate_bleu(reference, candidate)
    rouge = calculate_rouge(reference, candidate)
    # Named `meteor` rather than `meteor_score` so the imported function is not shadowed.
    meteor = calculate_meteor(reference, candidate)
    print(f"BLEU Score: {bleu:.4f}")
    print("ROUGE Scores:")
    for variant in rouge:
        print(f"  {variant}: {rouge[variant]}")
    print(f"METEOR Score: {meteor:.4f}")
    
def get_longest_rationale(rationale_list):
    """Return the longest rationale from a stringified list of rationales.

    Parameters
    ----------
    rationale_list : str or list
        Python-literal text of a list (e.g. "['short', 'a longer one']"),
        or an already-parsed list.

    Returns
    -------
    The longest element (by ``len``), or '' when the value is not a list,
    is an empty list, or is a non-string placeholder such as NaN.

    Raises
    ------
    ValueError, SyntaxError
        If a string is passed that is not a valid Python literal.
    """
    import ast  # local import so the function works without touching the import block

    if isinstance(rationale_list, str):
        # literal_eval only accepts literals, so spreadsheet text can never
        # execute code (the previous eval() could).
        rationales = ast.literal_eval(rationale_list)
    else:
        rationales = rationale_list

    if not isinstance(rationales, list):
        return ''
    # default='' keeps an empty list from raising ValueError inside max().
    return max(rationales, key=len, default='')

def calc_low_support_score(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer):
    """Return the verifier's probability of answering "No" for a premise/hypothesis pair.

    The model is asked "Given the premise, is the hypothesis correct?" and only the
    logits of the first 'Yes' and 'No' token ids at the first decoding position are
    compared; the result is the softmax weight on 'No' (between 0 and 1).

    Parameters
    ----------
    premise, hypothesis : str
        Texts interpolated into the prompt.
    nli_model, nli_tokenizer :
        Seq2seq model and tokenizer; default to the notebook-level objects that
        exist when this cell is executed.
    """
    prompt = f"Premise: {premise}\nHypothesis: {hypothesis}\nGiven the premise, is the hypothesis correct?\nAnswer:"
    input_ids = nli_tokenizer(prompt, return_tensors='pt').input_ids

    # First token id produced by the tokenizer for each answer word.
    yes_token_id = nli_tokenizer('Yes').input_ids[0]
    no_token_id = nli_tokenizer('No').input_ids[0]

    # A single decoder step, fed token id 0 as the only decoder input.
    decoder_start = torch.zeros((input_ids.size(0), 1), dtype=torch.long)
    with torch.no_grad():
        first_step_logits = nli_model(input_ids, decoder_input_ids=decoder_start).logits[:, 0, :]
        # Cast to float before applying softmax
        yes_no_logits = torch.stack(
            [first_step_logits[:, yes_token_id], first_step_logits[:, no_token_id]], dim=1
        ).float()
        yes_no_probs = torch.nn.functional.softmax(yes_no_logits, dim=1)

    # Column 1 is the "No" probability; .item() assumes a single prompt (batch of 1).
    return yes_no_probs[:, 1].item()

def generate_mask(generated_rationale, predicted_answer):
    """Swap each whole-word, case-insensitive occurrence of the answer for "<mask>".

    The answer is converted to ``str`` first, so non-string answers are accepted.
    """
    answer_pattern = r'\b' + re.escape(str(predicted_answer)) + r'\b'
    return re.sub(answer_pattern, "<mask>", generated_rationale, flags=re.IGNORECASE)

def evaluate_support(data, nli_model, nli_tokenizer, use_mask=True, hypothesis_col='hypothesis', threshold=0.5):
    """Score how well each row's rationale supports its hypothesis.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs `hypothesis_col` plus 'gen_rationale_mask' (when `use_mask`) or
        'generated_rationale' (otherwise).
    nli_model, nli_tokenizer :
        Forwarded to `calc_low_support_score`.
    use_mask : bool
        Use the answer-masked rationale as the premise instead of the raw one.
    hypothesis_col : str
        Column holding the hypothesis sentence.
    threshold : float
        A row counts as supported when its "No" probability is below this value.

    Returns
    -------
    list of dict
        One {'no_entail_prob', 'support'} dict per row, in row order.
    """
    premise_col = 'gen_rationale_mask' if use_mask else 'generated_rationale'
    support_scores = []
    for idx, row in data.iterrows():
        premise = row[premise_col]
        hypothesis = row[hypothesis_col]
        # calc_low_support_score has no `use_mask` parameter; forwarding it (as the
        # old code did) raised TypeError on the first row.
        no_entail_prob = calc_low_support_score(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
        support = no_entail_prob < threshold
        # Echo the rows whose "No" probability exceeds the threshold for manual inspection.
        if no_entail_prob > threshold:
            print(f"Premise: {premise}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Probability: {no_entail_prob}")
        support_scores.append({
            'no_entail_prob': no_entail_prob,
            'support': support
        })
    return support_scores

def compute_file_hash(file_path):
    """Return the hex MD5 digest of the file's full contents (read in one go, binary mode)."""
    with open(file_path, 'rb') as source:
        return hashlib.md5(source.read()).hexdigest()

# Main
def main(file_path):
    """Run the whole support-evaluation pipeline for one results spreadsheet.

    Steps: load the spreadsheet, compute BLEU/ROUGE/METEOR against the longest
    ground-truth rationale, write the question/answer pairs to a JSONL file, turn
    them into hypothesis sentences with the REV question converter (skipped when
    the input is unchanged and predictions already exist), mask the answer in each
    rationale, score support with the NLI verifier, plot the score distribution
    and save everything to ``data_with_support_<model_name>.csv``.

    Parameters
    ----------
    file_path : str
        Path to a results ``.xlsx`` file; must contain ``results/`` unless it is
        the human-annotation workbook.

    Raises
    ------
    RuntimeError
        If the converter script exits with a non-zero status.
    """
    print(f"Processing {file_path}...")

    is_human_annotation = file_path == '../results/Human Annotation of LLaVA+ Rationales.xlsx'
    if is_human_annotation:
        model_name = "LLaVA"
    else:
        model_name = file_path.split('results/')[1].split('.xlsx')[0]

    # Read the specified columns from the sheet
    columns_to_read = [
        'question',
        'correct_answer',
        'predicted_answer',
        'is_correct',
        'groundtruth_rationale',
        'generated_rationale'
    ]

    # The human-annotation workbook keeps its column names on the second row.
    # (The old unused `pd.ExcelFile(file_path)` handle was never closed and is gone.)
    data = pd.read_excel(file_path, header=1 if is_human_annotation else 0, usecols=columns_to_read)
    data['question_no_choice'] = data.apply(lambda row: row['question'].split(' Choices:')[0], axis=1)
    data['longest_groundtruth_rationale'] = data['groundtruth_rationale'].apply(get_longest_rationale)

    data['BLEU_score'] = data.apply(lambda row: calculate_bleu(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)
    data['ROUGE_scores'] = data.apply(lambda row: calculate_rouge(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)
    data['METEOR_score'] = data.apply(lambda row: calculate_meteor(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)

    # Converter input: question text without the choices, plus the predicted answer.
    input_data = data[['question_no_choice', 'predicted_answer']].rename(
        columns={'question_no_choice': 'question_text', 'predicted_answer': 'answer_text'}
    )
    input_jsonl = f'input_data_{model_name}.jsonl'
    output_jsonl = f'{input_jsonl}.predictions'
    with open(input_jsonl, 'w') as f:
        for index, row in input_data.iterrows():
            # Missing values are written as the placeholder string 'None'
            row_dict = {k: (v if pd.notna(v) else 'None') for k, v in row.to_dict().items()}
            json.dump(row_dict, f)
            f.write('\n')

    # Skip the converter only when predictions exist AND were produced from this exact input.
    current_input_hash = compute_file_hash(input_jsonl)
    hash_file = f'{input_jsonl}.hash'
    saved_input_hash = None
    if os.path.exists(hash_file):
        with open(hash_file, 'r') as f:
            saved_input_hash = f.read().strip()
    run_bash_command = not (os.path.exists(output_jsonl) and saved_input_hash == current_input_hash)

    # The conversion step
    # Define the full path to the script
    script_path = '/home/<link_hidden>/REV/run_question_converter.sh'
    if run_bash_command:
        # Set PYTHONPATH and run the script
        exit_status = os.system(f'export PYTHONPATH=/home/<link_hidden>/REV/:$PYTHONPATH && bash {script_path} cqa {input_jsonl} cuda:0')
        if exit_status != 0:
            raise RuntimeError(f'run_question_converter.sh failed with exit status {exit_status}')
    else:
        print(f'{output_jsonl} already exists and {input_jsonl} has not changed. Skipping the bash command.')

    # Record the hash only after the conversion succeeded. Writing it earlier let a
    # failed run mark stale predictions as up to date for every later run.
    with open(hash_file, 'w') as f:
        f.write(current_input_hash)

    with open(output_jsonl, 'r') as f:
        predictions = [json.loads(line) for line in f]
    predictions_df = pd.DataFrame(predictions)
    predictions_df.rename(columns={'question_statement_text': 'hypothesis'}, inplace=True)

    # Merge datasets based on the question text.
    # The key is the question text only, so identical prediction rows are dropped
    # first; otherwise a question that occurs k times would be expanded to k*k rows.
    hypothesis_lookup = predictions_df[['question_text', 'hypothesis']].drop_duplicates()
    rows_before_merge = len(data)
    data = pd.merge(data, hypothesis_lookup, left_on='question_no_choice', right_on='question_text', how='left')
    if len(data) != rows_before_merge:
        print(f'Warning: merge changed the row count from {rows_before_merge} to {len(data)}; '
              'some questions map to more than one hypothesis.')

    data['gen_rationale_mask'] = data.apply(lambda row: generate_mask(row['generated_rationale'], row['predicted_answer']), axis=1)

    # Evaluate support
    support_results = evaluate_support(data, nli_model, nli_tokenizer)
    support_df = pd.DataFrame(support_results)
    for column in support_df.columns:
        data[column] = support_df[column]

    # Plot the distribution of no_entail_prob
    plt.figure(figsize=(8, 3))
    plt.hist(data['no_entail_prob'], bins=50, edgecolor='black')
    plt.title('Distribution of no_entail_prob')
    plt.xlabel('no_entail_prob')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

    # Fraction of rows whose "No" probability is below the threshold.
    support_score = data['support'].mean()

    print(f"Support score: {support_score}")

    data.to_csv(f'data_with_support_{model_name}.csv', index=False)
    
# Run the pipeline once per results file. Uncomment entries to process more models;
# each run writes its own data_with_support_<model_name>.csv.
if __name__ == '__main__':
    file_paths = [
                  '../results/Human Annotation of LLaVA+ Rationales.xlsx',
#                   "../results/gpt-4o_inference_one_shot_50_improved_prompt.xlsx",
#                   "../results/gpt-4o_inference_two_steps_50.xlsx",
#                   "../results/instructblip-flan-t5-xxl_inference_one_shot_50_improved_prompt.xlsx",
#                   "../results/llava-1.5-7b-hf_inference_no_vision.xlsx",
#                   "../results/gpt-4o_text_only_inference_one_shot.xlsx",
#                   "../results/gpt-4o_inference_one_shot.xlsx",
                 ]
    for file_path in file_paths:
        main(file_path)

## 1.2 Pilot tests of support on sentence level

In [None]:
# Pilot: score each sentence of one rationale separately against the hypothesis.
# NOTE: the second `premise` assignment overwrites the first; the first is kept
# as an alternative example to switch to by hand.
premise = "The people in the image are located in their home. This can be inferred from the presence of a couch, which is a common piece of furniture found in homes. Additionally, the people are sitting on the couch, which further supports the idea that they are in their home. The image does not show any indications of a workplace, hospital, library, or any other location."
premise = "The image shows a casual and comfortable environment with two dogs lounging on a couch, along with personal items such as books, electronics, and various other belongings scattered around. This setting is typically indicative of a residence rather than a work, hospital, or library environment. The presence of dogs and the relaxed atmosphere further suggests a private living space. Thus, the answer is home."
hypothesis = "These people are located home."

# NOTE(review): sent_tokenize needs NLTK sentence-tokenizer data; this notebook only
# downloads 'wordnet', so the data is presumably already installed - confirm.
sentences = nltk.tokenize.sent_tokenize(premise)
sentence_scores = []
for sentence in sentences:
    no_entail_prob = calc_low_support_score(sentence, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
    # This pilot uses a stricter cut-off (0.2) than the 0.5 default used by evaluate_support.
    support = no_entail_prob < 0.2
    sentence_scores.append({
        'sentence': sentence,
        'no_entail_prob': no_entail_prob,
        'support': support
    })
sentence_scores

In [None]:
from IPython.display import display, Markdown

# Convert the list to a pandas DataFrame
df = pd.DataFrame(sentence_scores)
pd.set_option('display.max_colwidth', None) # Show full column content

# `copiable_table` was never defined, so the display call below raised NameError on
# a fresh kernel. Build the Markdown table by hand (no extra package needed);
# literal '|' characters in a cell are escaped so they cannot break the columns.
header_line = '| ' + ' | '.join(str(column) for column in df.columns) + ' |'
divider_line = '| ' + ' | '.join('---' for _ in df.columns) + ' |'
body_lines = [
    '| ' + ' | '.join(str(value).replace('|', '\\|') for value in record) + ' |'
    for record in df.itertuples(index=False, name=None)
]
copiable_table = '\n'.join([header_line, divider_line] + body_lines)

# Display the DataFrame as a Markdown table
display(Markdown(copiable_table))

In [None]:
def evaluate_sentence_level_support(data, nli_model, nli_tokenizer, use_mask=False, hypothesis_col='hypothesis', threshold=0.5):
    """Score every sentence of each row's rationale against the row's hypothesis.

    The premise column is 'gen_rationale_mask' when `use_mask` is true, otherwise
    'generated_rationale'; it is split with `nltk.tokenize.sent_tokenize`.

    Returns
    -------
    list of list of dict
        One inner list per row (in row order), holding one
        {'sentence', 'no_entail_prob', 'support'} dict per sentence, where
        `support` is ``no_entail_prob < threshold``.
    """
    premise_col = 'gen_rationale_mask' if use_mask else 'generated_rationale'

    def score_sentence(sentence, hypothesis):
        # One verifier call per sentence.
        no_entail_prob = calc_low_support_score(sentence, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
        return {
            'sentence': sentence,
            'no_entail_prob': no_entail_prob,
            'support': no_entail_prob < threshold,
        }

    per_row_scores = []
    for _, row in data.iterrows():
        premise = row[premise_col]
        hypothesis = row[hypothesis_col]
        per_row_scores.append(
            [score_sentence(sentence, hypothesis) for sentence in nltk.tokenize.sent_tokenize(premise)]
        )
    return per_row_scores

In [None]:
def process_files(file_paths, output_dir='data_with_sentence_level_support'):
    """Add sentence-level support scores to each ``data_with_support_<model>.csv`` file.

    Parameters
    ----------
    file_paths : iterable of str
        CSV files whose name contains ``data_with_support_<model>.csv``.
    output_dir : str
        Directory that receives the results; created when missing. The old code
        created this directory but then wrote the CSV next to the notebook instead.

    Each output is saved as
    ``<output_dir>/data_with_sentence_level_support_<model>.csv``.
    """
    for file_path in file_paths:
        # Derive the model name from the file name only, so directory names cannot interfere.
        model_name = os.path.basename(file_path).split('data_with_support_')[1].split('.csv')[0]
        data = pd.read_csv(file_path)

        # Perform sentence-level support evaluation
        sentence_level_results = evaluate_sentence_level_support(data, nli_model, nli_tokenizer)

        # Add the sentence-level results as a new column
        data['sentence_level_support'] = sentence_level_results

        os.makedirs(output_dir, exist_ok=True)

        # Save the updated data to a new CSV file inside the output directory
        output_file_path = os.path.join(output_dir, f'data_with_sentence_level_support_{model_name}.csv')
        data.to_csv(output_file_path, index=False)
        print(f"Processed {file_path} and saved results to {output_file_path}")

file_paths = [
    'data_with_support_LLaVA.csv',
    # Add other file paths
]
process_files(file_paths)

## 1.3 (exploratory) Try question + choices + answers as input to the REV question converter

In [None]:
os.system(f'export PYTHONPATH=/home/<link_hidden>/REV/:$PYTHONPATH && bash /home/<link_hidden>/REV/run_question_converter.sh cqa /home/<link_hidden>/REV/sample_input.jsonl cuda:0')

# Section 2: Visual Fidelity

In [None]:
# Attach the hand-entered visual-fidelity labels (one per row of set-50-idx.csv).
# The list mixes ints with the string 'null', so the column is not purely numeric.
set_50_index_df = pd.read_csv("../set-50-idx.csv")
visual_fidelity = [1,1,0,1,0,1,0,'null',1,1,1,1,-1,1,1,1,1,1,1,-1,1,0,-1,1,-1,0,0,1,1,0,1,1,0,1,1,-1,1,1,-1,1,0,0,-1,-1,1,1,-1,1,1,-1]
set_50_index_df["visual_fidelity"] = visual_fidelity

# Select subsets with visual_fidelity == 1 and visual_fidelity == -1 (first 10 rows of each).
# NOTE: despite its name, vf_0_subset holds the rows labelled -1, not 0.
vf_1_subset = set_50_index_df[set_50_index_df['visual_fidelity'] == 1].head(10)
vf_0_subset = set_50_index_df[set_50_index_df['visual_fidelity'] == -1].head(10)

# Combine the subsets to create the "data" subset
set_50_index_df_vf_filtered = pd.concat([vf_1_subset, vf_0_subset]).reset_index(drop=True)
set_50_index_df_vf_filtered

In [None]:
# Pick the rows of `data` at the positions listed in the 'idx' column and attach
# their visual-fidelity labels.
# NOTE(review): this uses the notebook-level `data` (main() keeps its own local copy),
# and 'idx' is treated as a 0-based row position - confirm both match the intended file.
data_vf_filtered = data.iloc[set_50_index_df_vf_filtered['idx']].copy()
data_vf_filtered['visual_fidelity'] = set_50_index_df_vf_filtered['visual_fidelity'].values
data_vf_filtered

In [None]:
# def construct_vacuous_rationale(question, answer):
#     return f"{question} The answer is {answer}."


# def calculate_rev_score(data, nli_model, nli_tokenizer):
#     rev_scores = []

#     for idx, row in data.iterrows():
#         question = row['question'].split(' Choices:')[0]
#         correct_answer = row['predicted_answer']
#         hypothesis = row['hypothesis']
        
#         generated_rationale = row['generated_rationale']
#         vacuous_rationale = construct_vacuous_rationale(question, correct_answer)
        
#         # Predict using NLI model
#         probs_generated = predict_nli(nli_model, nli_tokenizer, generated_rationale, hypothesis)
#         probs_vacuous = predict_nli(nli_model, nli_tokenizer, vacuous_rationale, hypothesis)
        
#         # Calculate log-probabilities for the entailment class (class 2)
#         log_prob_generated = torch.log(probs_generated[0][2])
#         log_prob_vacuous = torch.log(probs_vacuous[0][2])
        
#         # Compute REV score
#         rev_score = log_prob_generated - log_prob_vacuous
#         rev_scores.append(rev_score.item())
    
#     data['REV_score'] = rev_scores
#     return data

# # Calculate REV scores
# data_with_rev = calculate_rev_score(data, nli_model, nli_tokenizer)
# data_with_rev.to_csv('data_with_REV.csv', index=False)

In [None]:
# data_with_rev

In [None]:
# def tokenize_function(texts):
#     return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# def predict(model, inputs):
#     with torch.no_grad():
#         outputs = model(**inputs)
#         probabilities = torch.softmax(outputs.logits, dim=-1)
#     return probabilities

# def compute_sim(to_use, labels_to_use):
#     xe_correct = (to_use['predicted_labels_xe'] == to_use[labels_to_use]).astype(int)
#     x_correct = (to_use['predicted_labels_x'] == to_use[labels_to_use]).astype(int)
#     e_correct = (to_use['predicted_labels_e'] == to_use[labels_to_use]).astype(int)

#     baseline_correct = x_correct
#     leaking = e_correct
#     leaked = np.where(leaking == 1)[0]
#     nonleaked = np.where(leaking == 0)[0]

#     xe_correct_leaked = xe_correct[leaked]
#     baseline_correct_leaked = baseline_correct[leaked]
#     xe_correct_nonleaked = xe_correct[nonleaked]
#     baseline_correct_nonleaked = baseline_correct_nonleaked[nonleaked]

#     unweighted_mean = np.mean([
#         np.mean(xe_correct_leaked) - np.mean(baseline_correct_leaked),
#         np.mean(xe_correct_nonleaked) - np.mean(baseline_correct_nonleaked)
#     ])

#     nonleaking_diff = np.mean(xe_correct_nonleaked) - np.mean(baseline_correct_nonleaked)
#     leaking_diff = np.mean(xe_correct_leaked) - np.mean(baseline_correct_leaked)

#     return unweighted_mean, leaking_diff, nonleaking_diff

# def run_las_analysis(data, model, tokenizer):
#     input_texts = data['question'].tolist()
#     explanations = data['generated_rationale'].tolist()

#     # Tokenize input texts
#     input_encodings = tokenize_function(input_texts)
#     explanation_encodings = tokenize_function(explanations)

#     # Predict using the model
#     input_probs = predict(model, input_encodings)
#     explanation_probs = predict(model, explanation_encodings)

#     data['predicted_labels_x'] = input_probs.argmax(dim=1).numpy()
#     data['predicted_labels_e'] = explanation_probs.argmax(dim=1).numpy()

#     # Use both input and explanations for predictions (xe)
#     combined_texts = [f"{text} {exp}" for text, exp in zip(input_texts, explanations)]
#     combined_encodings = tokenize_function(combined_texts)
#     combined_probs = predict(model, combined_encodings)

#     data['predicted_labels_xe'] = combined_probs.argmax(dim=1).numpy()

#     # Compute LAS
#     unweighted_mean, leaking_diff, nonleaking_diff = compute_sim(data, 'correct_answer')

#     data['LAS_unweighted_mean'] = unweighted_mean
#     data['LAS_leaking_diff'] = leaking_diff
#     data['LAS_nonleaking_diff'] = nonleaking_diff

#     return data

In [None]:
# # Run the analysis
# updated_data = run_las_analysis(data, model, tokenizer)
# updated_data