In [None]:
!nvidia-smi

In [None]:
%%capture
%pip install nltk rouge-score sacrebleu openpyxl openai torch transformers datasets matplotlib bitsandbytes

In [None]:
from openai import OpenAI
# OpenAI API key: read from a local text file rather than being written into the notebook.
api_key_path = '../../OPENAI_key.txt'
with open(api_key_path, 'r') as file:
    # Only the first line of the whitespace-stripped file content is used as the key.
    api_key = file.read().strip().split('\n')[0]
    
# Module-level client; extract_distinct_rationale_pieces() uses it for chat completions.
client = OpenAI(api_key=api_key)

In [None]:
import os
import re
import ast
import nltk
import torch
import json
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 0: BLEU, ROUGE, METEOR

# 1: Support / Simulatability

In [None]:
# Load the tokenizer and model only if they are not already defined in the
# kernel's globals, so re-running this cell does not load them a second time.
if 'nli_tokenizer' not in globals():
    # The tokenizer is taken from the "google/flan-t5-xxl" checkpoint.
    nli_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
if 'nli_model' not in globals():
    # Entailment-verifier checkpoint, loaded in 8-bit with automatic device placement.
    nli_model = AutoModelForSeq2SeqLM.from_pretrained("soumyasanyal/nli-entailment-verifier-xxl", load_in_8bit=True, device_map="auto")

def calculate_bleu(reference, candidate):
    """Return the smoothed sentence-level BLEU of `candidate` against one `reference`.

    Both arguments are strings and are tokenized by whitespace splitting.
    `SmoothingFunction().method1` is passed as the smoothing function.
    """
    reference_token_lists = [reference.split()]
    hypothesis_tokens = candidate.split()
    return sentence_bleu(
        reference_token_lists,
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_rouge(reference, candidate):
    """Score `candidate` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A fresh `RougeScorer` (stemming enabled) is built on every call and the
    result of its `score(reference, candidate)` is returned unchanged.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, candidate)

def calculate_meteor(reference, candidate):
    """Return the METEOR score of `candidate` against a single `reference`.

    Both strings are split on whitespace; the reference tokens are wrapped in a
    one-element list before being passed to `meteor_score`.
    """
    references = [reference.split()]
    hypothesis_tokens = candidate.split()
    return meteor_score(references, hypothesis_tokens)

def display_scores(reference, candidate):
    """Print BLEU, ROUGE and METEOR for one reference/candidate pair of strings."""
    bleu = calculate_bleu(reference, candidate)
    rouge = calculate_rouge(reference, candidate)
    # Named `meteor` so the imported `meteor_score` function is not shadowed locally.
    meteor = calculate_meteor(reference, candidate)

    print(f"BLEU Score: {bleu:.4f}")
    print("ROUGE Scores:")
    for rouge_type in rouge:
        print(f"  {rouge_type}: {rouge[rouge_type]}")
    print(f"METEOR Score: {meteor:.4f}")
    
def get_longest_rationale(rationale_list):
    """Return the longest rationale from the text of a Python list of rationales.

    Args:
        rationale_list: Text of a Python list literal, e.g. '["a", "bcd"]'.

    Returns:
        The element with the greatest `len`, or '' when the input is not a
        string (e.g. a missing/NaN cell), does not evaluate to a list, or
        evaluates to an empty list.

    Raises:
        ValueError, SyntaxError: if the text is a string but not a valid
            Python literal.
    """
    if not isinstance(rationale_list, str):
        return ''
    # literal_eval accepts literals only, so cell content can never execute code (unlike eval).
    rationales = ast.literal_eval(rationale_list)
    if not isinstance(rationales, list) or not rationales:
        # max() on an empty list would raise ValueError.
        return ''
    return max(rationales, key=len)

def calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer):
    """Return the probability that `premise` entails `hypothesis`.

    A single prompt asking whether the hypothesis is correct given the premise
    is tokenized and fed to the model. The logits of the first decoder step
    are read at the first token id of 'Yes' and of 'No', and a softmax over
    just those two values gives the scores. The 'Yes' share is returned.

    The default model/tokenizer are the module-level objects as they were when
    this function was defined.
    """
    def get_score(nli_model, nli_tokenizer, input_ids):
        # First token id of each answer word.
        yes_id = nli_tokenizer('Yes').input_ids[0]
        no_id = nli_tokenizer('No').input_ids[0]

        # One all-zero decoder token per input row.
        decoder_start = torch.zeros((input_ids.size(0), 1), dtype=torch.long)

        with torch.no_grad():
            first_step_logits = nli_model(input_ids, decoder_input_ids=decoder_start).logits[:, 0, :]
            # Cast to float before the softmax.
            yes_no_logits = torch.stack(
                [first_step_logits[:, yes_id], first_step_logits[:, no_id]], dim=1
            ).float()
            yes_no_probs = torch.nn.functional.softmax(yes_no_logits, dim=1)
            # .item() requires exactly one row, i.e. a single prompt.
            entail_score = yes_no_probs[:, 0].item()
            no_entail_score = yes_no_probs[:, 1].item()

        return entail_score, no_entail_score

    prompt = f"Premise: {premise}\nHypothesis: {hypothesis}\nGiven the premise, is the hypothesis correct?\nAnswer:"
    encoded_prompt = nli_tokenizer(prompt, return_tensors='pt').input_ids
    entail_score, _ = get_score(nli_model, nli_tokenizer, encoded_prompt)
    return entail_score

def generate_mask(generated_rationale, predicted_answer):
    """Replace every standalone occurrence of the predicted answer with "<mask>".

    Matching is case-insensitive and whole-token: the answer must not be
    directly preceded or followed by a word character. Lookarounds are used
    instead of `\\b` so that answers starting or ending with punctuation
    (e.g. "c++") can still match; for answers that start and end with a word
    character the two forms are equivalent.

    Args:
        generated_rationale: Rationale text to mask.
        predicted_answer: Answer to hide; converted with `str()` and stripped
            of surrounding whitespace.

    Returns:
        The masked rationale. The rationale is returned unchanged when the
        answer is None, NaN, or blank, because an empty pattern would insert
        "<mask>" at every word boundary.
    """
    # A float NaN is the only float that is not equal to itself.
    is_nan = isinstance(predicted_answer, float) and predicted_answer != predicted_answer
    if predicted_answer is None or is_nan:
        return generated_rationale

    answer_text = str(predicted_answer).strip()
    if not answer_text:
        return generated_rationale

    pattern = re.compile(r'(?<!\w)' + re.escape(answer_text) + r'(?!\w)', re.IGNORECASE)
    return pattern.sub("<mask>", generated_rationale)

def evaluate_support(data, nli_model, nli_tokenizer, use_mask=True, use_pieces=False, hypothesis_col='hypothesis', threshold=0.5):
    """Score, row by row, how strongly each rationale supports its hypothesis.

    The premise column is chosen by the two flags:
      * use_pieces=True  -> 'concat_rationale_pieces_mask' / 'concat_rationale_pieces'
      * use_pieces=False -> 'gen_rationale_mask' / 'generated_rationale'
    (masked variant when `use_mask` is true).

    Returns a list with one dict per row: {'entail_prob': float, 'support': bool},
    where `support` is `entail_prob > threshold`. Rows whose probability is
    strictly below the threshold are printed for inspection.
    """
    # The premise column depends only on the flags, so resolve it once.
    if use_pieces:
        premise_col = 'concat_rationale_pieces_mask' if use_mask else 'concat_rationale_pieces'
    else:
        premise_col = 'gen_rationale_mask' if use_mask else 'generated_rationale'

    results = []
    for _, record in data.iterrows():
        premise = record[premise_col]
        hypothesis = record[hypothesis_col]
        entail_prob = calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
        if entail_prob < threshold:
            print(f"Premise: {premise}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Probability: {entail_prob}")
        results.append({'entail_prob': entail_prob, 'support': entail_prob > threshold})
    return results

def compute_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file's complete contents.

    The file is opened in binary mode and read in a single call.
    """
    with open(file_path, 'rb') as stream:
        digest = hashlib.md5(stream.read())
    return digest.hexdigest()

In [None]:
# Quick sanity check of calc_support_prob on a hand-written premise/hypothesis
# pair, using the default model and tokenizer. The cell displays the returned
# entailment probability.
premise = "A fossil fuel is a kind of natural resource. Coal is a kind of fossil fuel."
hypothesis = "Coal is a kind of natural resource."
calc_support_prob(premise, hypothesis)

In [None]:
# Read data
file_path = '../results/Human Annotation of LLaVA+ Rationales.xlsx'

# The human-annotation workbook is special-cased in two ways: its model name is
# fixed to "LLaVA" (instead of being derived from the file name) and its column
# names are on the second row (header=1). Decide once and reuse the flag below.
is_human_annotation_file = file_path == '../results/Human Annotation of LLaVA+ Rationales.xlsx'

if is_human_annotation_file:
    model_name = "LLaVA"
else:
    model_name = file_path.split('results/')[1].split('.xlsx')[0]

# Read the specified columns from the sheet
columns_to_read = [
    'question',
    'correct_answer',
    'predicted_answer',
    'is_correct',
    'groundtruth_rationale',
    'generated_rationale'
]

# Open the workbook through a context manager so its file handle is closed
# deterministically, and parse from that single handle.
with pd.ExcelFile(file_path) as spreadsheet:
    data = pd.read_excel(
        spreadsheet,
        header=1 if is_human_annotation_file else 0,
        usecols=columns_to_read,
    )

# Question text with everything from ' Choices:' onwards removed.
data['question_no_choice'] = data['question'].apply(lambda question: question.split(' Choices:')[0])
data['longest_groundtruth_rationale'] = data['groundtruth_rationale'].apply(get_longest_rationale)

# Text-overlap metrics between the longest ground-truth rationale and the generated one.
data['BLEU_score'] = data.apply(lambda row: calculate_bleu(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)
data['ROUGE_scores'] = data.apply(lambda row: calculate_rouge(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)
data['METEOR_score'] = data.apply(lambda row: calculate_meteor(row['longest_groundtruth_rationale'], row['generated_rationale']), axis=1)

# Build the JSONL input for the question converter: one object per row with
# the keys 'question_text' (question without choices) and 'answer_text'.
input_data = data[['question', 'predicted_answer']].copy()
input_data['question'] = input_data['question'].apply(lambda x: x.split(' Choices:')[0])
input_data = input_data.rename(columns={'question': 'question_text', 'predicted_answer': 'answer_text'})
input_jsonl = f'input_data_{model_name}.jsonl'
output_jsonl = f'{input_jsonl}.predictions'
with open(input_jsonl, 'w') as f:
    for index, row in input_data.iterrows():
        # Missing values are written as the string 'None'.
        row_dict = {k: (v if pd.notna(v) else 'None') for k, v in row.to_dict().items()}
        json.dump(row_dict, f)
        f.write('\n')

# Compute hash of input_jsonl
current_input_hash = compute_file_hash(input_jsonl)
hash_file = f'{input_jsonl}.hash'

# The conversion is skipped only when predictions exist AND the stored hash
# matches the input that was just written.
run_bash_command = True
if os.path.exists(output_jsonl) and os.path.exists(hash_file):
    with open(hash_file, 'r') as f:
        saved_input_hash = f.read().strip()
    run_bash_command = current_input_hash != saved_input_hash

# The conversion step
# Define the full path to the script
script_path = '/home/<link_hidden>/REV/run_question_converter.sh'
if run_bash_command:
    # Set PYTHONPATH and run the script
    exit_status = os.system(f'export PYTHONPATH=/home/<link_hidden>/REV/:$PYTHONPATH && bash {script_path} cqa {input_jsonl} cuda:0')
    if exit_status != 0:
        # Stop here instead of silently reading stale or missing predictions below.
        raise RuntimeError(f'Question converter failed (exit status {exit_status}); {output_jsonl} was not refreshed.')
    # Record the hash only after a successful conversion; saving it earlier
    # would make a later run skip the conversion even though it never succeeded.
    with open(hash_file, 'w') as f:
        f.write(current_input_hash)
else:
    print(f'{output_jsonl} already exists and {input_jsonl} has not changed. Skipping the bash command.')

with open(output_jsonl, 'r') as f:
    predictions = [json.loads(line) for line in f]
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df.rename(columns={'question_statement_text': 'hypothesis'})

# Merge datasets based on the question text (without choices).
# NOTE(review): a left merge on question text multiplies rows if the same
# question text occurs more than once in the predictions — confirm uniqueness.
data = pd.merge(data, predictions_df[['question_text', 'hypothesis']], left_on='question_no_choice', right_on='question_text', how='left')

data['gen_rationale_mask'] = data.apply(lambda row: generate_mask(row['generated_rationale'], row['predicted_answer']), axis=1)

# 2: Informativeness

In [None]:
# Generate set R which contains rationale pieces for every instance

def extract_distinct_rationale_pieces(hypothesis, rationale):
    """Ask GPT-4o to split a rationale into pieces that add something beyond the hypothesis.

    A one-shot prompt is sent through the module-level `client`. The reply is
    expected to contain a Python list literal; the text from the first '[' to
    the last ']' is parsed with `ast.literal_eval`.

    Args:
        hypothesis: Statement the rationale is meant to support.
        rationale: Rationale text to break into pieces.

    Returns:
        The parsed list of rationale pieces (possibly empty).

    Raises:
        ValueError: if the reply contains no bracketed list, the bracketed
            text is not a valid Python literal, or it does not evaluate to a list.
    """
    prompt = f"""Please break the following rationale into distinct pieces, and keep only the ones that are not semantically equivalent to the hypothesis. Output the final answer in a Python list format.

Example:
Hypothesis: The man by the bags is waiting for a delivery.
Rationale: The man by the bags is waiting for a delivery, as indicated by the presence of the suitcases and the fact that he is standing on the side of the road. The other options, such as a skateboarder, train, or cab, do not seem to be relevant to the situation depicted in the image.
Output: ["Suitcases are present in the image.", "The man is standing on the side of the road.", "The other options, such as a skateboarder, train, or cab, do not seem to be relevant to the situation depicted in the image."]

Task:
Hypothesis: {hypothesis}
Rationale: {rationale}"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # Guard against a missing message body so the searches below work on a string.
    message = response.choices[0].message.content or ''

    start_index = message.find('[')
    end_index = message.rfind(']')
    # find/rfind return -1 when the bracket is absent; slicing with -1 would
    # silently select the wrong text instead of failing clearly.
    if start_index == -1 or end_index < start_index:
        raise ValueError(f"No Python list found in model reply: {message!r}")

    try:
        R_list = ast.literal_eval(message[start_index:end_index + 1])
    except (ValueError, SyntaxError) as exc:
        raise ValueError(f"Could not parse model reply as a Python list: {message!r}") from exc

    # Text such as "[1], [2]" starts with '[' and ends with ']' but parses to a tuple.
    if not isinstance(R_list, list):
        raise ValueError(f"Model reply did not evaluate to a list: {message!r}")
    return R_list

In [None]:
# Create the output directory before extraction starts, so that a problem with
# the save location surfaces before one GPT-4o request per row has been made.
os.makedirs('data_with_rationale_pieces', exist_ok=True)

# Apply the function to each row and store the result in a new column
data['extracted_rationale_pieces'] = data.apply(
    lambda row: extract_distinct_rationale_pieces(row['hypothesis'], row['generated_rationale']),
    axis=1
)

# Save the updated data to a new CSV file
output_file_path = f'data_with_rationale_pieces/data_with_rationale_pieces_{model_name}.csv'
data.to_csv(output_file_path, index=False)
print(f"Saved results to {output_file_path}")

In [None]:
# Input CSVs for calc_inform_and_sim, one per model/setting. The part of each
# file name between 'support_' and '.csv' is used there as the model name.
file_paths = ["data_with_support_gpt-4o_inference_one_shot.csv",
             "data_with_support_gpt-4o_text_only_inference_one_shot.csv",
             "data_with_support_llava-1.5-7b-hf_inference_no_vision.csv"]

def calc_inform_and_sim(file_path):
    """Add informativeness and simulatability columns to one support CSV.

    Steps:
      1. Read `file_path`; derive 'entail_prob' as 1 - 'no_entail_prob' when
         only the latter is present.
      2. Extract rationale pieces for every row (one GPT-4o request per row)
         and save the frame under 'data_with_rationale_pieces/'.
      3. Set 'informative' to True when at least one piece was extracted and
         rename the 'support' column to 'simulatable'.
      4. Print the frame and save it under 'data_with_inform_sim/'.

    Args:
        file_path: CSV path containing 'support_'; the text between
            'support_' and '.csv' becomes the model name in output file names.

    Returns:
        None. Results are written to disk.
    """
    model_name = file_path.split('support_')[1].split('.csv')[0]
    data = pd.read_csv(file_path)
    if 'entail_prob' not in data.columns and 'no_entail_prob' in data.columns:
        data['entail_prob'] = 1 - data['no_entail_prob']

    # Create both output directories up front so a save-location problem
    # shows up before the per-row API requests are made.
    os.makedirs('data_with_rationale_pieces', exist_ok=True)
    os.makedirs('data_with_inform_sim', exist_ok=True)

    data['extracted_rationale_pieces'] = data.apply(
        lambda row: extract_distinct_rationale_pieces(row['hypothesis'], row['generated_rationale']),
        axis=1
    )

    # Save the updated data to a new CSV file
    output_file_path = f'data_with_rationale_pieces/data_with_rationale_pieces_{model_name}.csv'
    data.to_csv(output_file_path, index=False)
    print(f"Saved results to {output_file_path}")

    def _has_pieces(pieces):
        # Works for an in-memory list and for its CSV text form such as '[]',
        # so the result does not depend on which of the two the cell holds.
        if isinstance(pieces, str):
            pieces = ast.literal_eval(pieces)
        return len(pieces) > 0

    data['informative'] = data['extracted_rationale_pieces'].apply(_has_pieces)

    data = data.rename(columns={'support': "simulatable"})

    print(data)

    # Save the updated data to a new CSV file
    output_file_path = f'data_with_inform_sim/data_with_inform_sim_{model_name}.csv'
    data.to_csv(output_file_path, index=False)
    print(f"Saved results to {output_file_path}")

# Process every listed support CSV. Each call makes one GPT-4o request per row
# and writes its results to disk.
for file_path in file_paths:
    calc_inform_and_sim(file_path)

In [None]:
dataset_paths = ["/home/<link_hidden>/<hidden>/notebooks/analysis/data_with_inform_sim/data_with_inform_sim_mask_LLaVA.csv",
                "/home/<link_hidden>/<hidden>/notebooks/analysis/data_with_inform_sim/data_with_inform_sim_llava-1.5-7b-hf_inference_no_vision.csv",
                "/home/<link_hidden>/<hidden>/notebooks/analysis/data_with_inform_sim/data_with_inform_sim_gpt-4o_inference_one_shot.csv",
                "/home/<link_hidden>/<hidden>/notebooks/analysis/data_with_inform_sim/data_with_inform_sim_gpt-4o_text_only_inference_one_shot.csv",
]

# Only this file is read through the 'simulatable_mask' column; all others use 'simulatable'.
masked_llava_path = "/home/<link_hidden>/<hidden>/notebooks/analysis/data_with_inform_sim/data_with_inform_sim_mask_LLaVA.csv"

# Print mean accuracy, support and informativeness for each result file.
for dataset_path in dataset_paths:
    data = pd.read_csv(dataset_path)
#     data['informative'] = data['extracted_rationale_pieces'].apply(lambda x: False if len(x) == 2 else True)
#     data.to_csv(dataset_path, index=False)
    support_column = 'simulatable_mask' if dataset_path == masked_llava_path else 'simulatable'
    avg_acc = np.mean(data['is_correct'])
    avg_support = np.mean(data[support_column])
    avg_informativeness = np.mean(data['informative'])
    print(f"Avg. accuracy: {avg_acc}")
    print(f"Avg. support: {avg_support}")
    print(f"Avg. informativeness: {avg_informativeness}")
    print("---------------------------------------------")

In [None]:
model_name = "LLaVA"
# Only one frame can be bound to `data`. The masked-LLaVA read below used to run
# and was overwritten on the very next line, so it is kept as a disabled
# alternative rather than executed.
# data = pd.read_csv(f'data_with_inform_sim/data_with_inform_sim_mask_{model_name}.csv')
data = pd.read_csv('data_with_inform_sim/data_with_inform_sim_gpt-4o_inference_one_shot.csv')
# NOTE(review): `model_name` is "LLaVA" while `data` holds the GPT-4o one-shot
# results; any later save path built from `model_name` will carry the LLaVA
# name — confirm which of the two is intended.
data

In [None]:
def _has_rationale_pieces(pieces):
    """Return True when the extracted-pieces cell holds at least one piece.

    Accepts either a Python list or its text form as read back from CSV
    (e.g. '[]'), so the result does not depend on which of the two the column
    holds. A plain `len(x) == 2` test is only correct for the exact string '[]'
    and misclassifies an in-memory list that has two pieces.
    """
    if isinstance(pieces, str):
        pieces = ast.literal_eval(pieces)
    return len(pieces) > 0

data['informative'] = data['extracted_rationale_pieces'].apply(_has_rationale_pieces)
# data['simulatable'] = data.apply(lambda row: calc_low_support_score(row['generated_rationale'], row['hypothesis']) <= 0.5, axis=1)

In [None]:
# Rows whose 'informative' value compares equal to False.
false_informative_rows = data[data['informative'].eq(False)]
false_informative_rows

In [None]:
# Rows whose 'simulatable' value compares equal to False.
false_simulatable_rows = data[data['simulatable'].eq(False)]
false_simulatable_rows

In [None]:
# Make sure the output directory exists before writing.
if not os.path.exists('data_with_inform_sim'):
    os.makedirs('data_with_inform_sim')

# Save the current `data` frame to a CSV named after `model_name`.
# NOTE(review): `model_name` is whatever an earlier cell last assigned — verify
# that it matches the frame currently held in `data` before running this cell.
output_file_path = f'data_with_inform_sim/data_with_inform_sim_{model_name}.csv'
data.to_csv(output_file_path, index=False)
print(f"Saved results to {output_file_path}")

# Pilot test: use extracted rationale pieces as the premise when calculating simulatability

In [None]:
def concat_str_to_paragraph(string_list):
    """Join the strings of a stringified Python list into one space-separated paragraph.

    `string_list` is the text of a list literal (e.g. '["a.", "b."]'); it is
    parsed with `ast.literal_eval` and its items are joined with single spaces.
    """
    return ' '.join(ast.literal_eval(string_list))

# Load the masked-LLaVA results and build the premise text from the extracted pieces.
data = pd.read_csv("data_with_inform_sim/data_with_inform_sim_mask_LLaVA.csv")
# 'extracted_rationale_pieces' is parsed from its list-literal text and joined with spaces.
data['concat_rationale_pieces'] = data['extracted_rationale_pieces'].apply(concat_str_to_paragraph)
# Same paragraph with occurrences of the predicted answer replaced by "<mask>".
data['concat_rationale_pieces_mask'] = data.apply(lambda row: generate_mask(row['concat_rationale_pieces'], row['predicted_answer']), axis=1)
data

In [None]:
# Support computed with the concatenated rationale pieces as premise (unmasked).
# The flag column is assigned before the probability column to keep the column order.
pieces_results = evaluate_support(data, nli_model, nli_tokenizer, use_mask=False, use_pieces=True)
data["support_use_pieces"] = [result['support'] for result in pieces_results]
data["prob_support_use_pieces"] = [result['entail_prob'] for result in pieces_results]

# Same evaluation on the masked variant of the concatenated pieces.
pieces_mask_results = evaluate_support(data, nli_model, nli_tokenizer, use_mask=True, use_pieces=True)
data["support_use_pieces_mask"] = [result['support'] for result in pieces_mask_results]
data["prob_support_use_pieces_mask"] = [result['entail_prob'] for result in pieces_mask_results]

In [None]:
# Share of rows flagged as supported: the boolean flags are cast to int and averaged.
support_score_use_pieces = data['support_use_pieces'].astype(int).mean()
support_score_use_pieces_mask = data['support_use_pieces_mask'].astype(int).mean()
print(f"support_score_use_pieces: {support_score_use_pieces}")
print(f"support_score_use_pieces_mask: {support_score_use_pieces_mask}")

In [None]:
# Persist the frame, including the piece-based support columns, as the "v2" LLaVA CSV.
output_file_path = 'data_with_inform_sim/data_with_inform_sim_LLaVA_v2.csv'
data.to_csv(output_file_path, index=False)

# All sections below are not used

## Pilot tests of support on sentence level

In [None]:
# Two candidate premises; the second assignment overrides the first, so only
# the second one is actually scored. Swap the order to score the other.
premise = "The people in the image are located in their home. This can be inferred from the presence of a couch, which is a common piece of furniture found in homes. Additionally, the people are sitting on the couch, which further supports the idea that they are in their home. The image does not show any indications of a workplace, hospital, library, or any other location."
premise = "The image shows a casual and comfortable environment with two dogs lounging on a couch, along with personal items such as books, electronics, and various other belongings scattered around. This setting is typically indicative of a residence rather than a work, hospital, or library environment. The presence of dogs and the relaxed atmosphere further suggests a private living space. Thus, the answer is home."
hypothesis = "These people are located home."

# Score each sentence of the premise on its own against the hypothesis.
sentences = nltk.tokenize.sent_tokenize(premise)
sentence_scores = []
for sentence in sentences:
    # calc_support_prob returns the 'Yes' share of a two-way Yes/No softmax,
    # so the non-entailment probability is its complement. (This cell used to
    # call `calc_low_support_score`, which is not defined in this notebook.)
    no_entail_prob = 1 - calc_support_prob(sentence, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
    support = no_entail_prob < 0.2
    sentence_scores.append({
        'sentence': sentence,
        'no_entail_prob': no_entail_prob,
        'support': support
    })
sentence_scores

In [None]:
from IPython.display import display, Markdown

# Convert the list to a pandas DataFrame
df = pd.DataFrame(sentence_scores)
pd.set_option('display.max_colwidth', None)  # Show full column content


def _markdown_cell(value):
    """Render one value as table-cell text: escape pipes and flatten newlines."""
    return str(value).replace('|', '\\|').replace('\n', ' ')


# Build the Markdown table text by hand (`copiable_table` was previously used
# without ever being defined); no extra package is needed for this.
header_line = '| ' + ' | '.join(_markdown_cell(column) for column in df.columns) + ' |'
divider_line = '| ' + ' | '.join('---' for _ in df.columns) + ' |'
row_lines = [
    '| ' + ' | '.join(_markdown_cell(value) for value in row) + ' |'
    for row in df.itertuples(index=False)
]
copiable_table = '\n'.join([header_line, divider_line, *row_lines])

# Display the DataFrame as a Markdown table
display(Markdown(copiable_table))

In [None]:
def evaluate_sentence_level_support(data, nli_model, nli_tokenizer, use_mask=False, hypothesis_col='hypothesis', threshold=0.5):
    """Score every sentence of each row's rationale against the row's hypothesis.

    Args:
        data: DataFrame with the hypothesis column and 'generated_rationale'
            (or 'gen_rationale_mask' when `use_mask` is true).
        nli_model, nli_tokenizer: Passed through to `calc_support_prob`.
        use_mask: Use the masked rationale column as premise.
        hypothesis_col: Name of the hypothesis column.
        threshold: A sentence counts as supporting when its non-entailment
            probability is strictly below this value.

    Returns:
        A list with one entry per row; each entry is a list of dicts
        {'sentence', 'no_entail_prob', 'support'}, one per sentence.
    """
    premise_col = 'gen_rationale_mask' if use_mask else 'generated_rationale'
    sentence_support_scores = []
    for idx, row in data.iterrows():
        premise = row[premise_col]
        hypothesis = row[hypothesis_col]
        sentence_scores = []
        for sentence in nltk.tokenize.sent_tokenize(premise):
            # calc_support_prob returns the 'Yes' share of a two-way softmax, so
            # its complement is the non-entailment probability. (The previously
            # called `calc_low_support_score` is not defined in this notebook.)
            no_entail_prob = 1 - calc_support_prob(sentence, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
            sentence_scores.append({
                'sentence': sentence,
                'no_entail_prob': no_entail_prob,
                'support': no_entail_prob < threshold
            })
        sentence_support_scores.append(sentence_scores)
    return sentence_support_scores

In [None]:
def process_files(file_paths):
    """Run sentence-level support evaluation on each CSV and save the results.

    For every path, the model name is taken from the text between
    'data_with_support_' and '.csv', the per-sentence results are stored in a
    new 'sentence_level_support' column, and the frame is written to
    'data_with_sentence_level_support/data_with_sentence_level_support_<model_name>.csv'.

    Args:
        file_paths: Iterable of CSV paths containing 'data_with_support_'.

    Returns:
        None. Results are written to disk.
    """
    output_dir = 'data_with_sentence_level_support'
    os.makedirs(output_dir, exist_ok=True)

    for file_path in file_paths:
        model_name = file_path.split('data_with_support_')[1].split('.csv')[0]
        data = pd.read_csv(file_path)

        # Perform sentence-level support evaluation
        sentence_level_results = evaluate_sentence_level_support(data, nli_model, nli_tokenizer)

        # Add the sentence-level results as a new column
        data['sentence_level_support'] = sentence_level_results

        # Write inside the output directory; the directory used to be created
        # but the file was saved next to the notebook instead.
        output_file_path = os.path.join(output_dir, f'data_with_sentence_level_support_{model_name}.csv')
        data.to_csv(output_file_path, index=False)
        print(f"Processed {file_path} and saved results to {output_file_path}")

# Files to run through the sentence-level evaluation.
# Note: this rebinds the module-level `file_paths` name that an earlier cell
# also assigns.
file_paths = [
    'data_with_support_LLaVA.csv',
    # Add other file paths
]
process_files(file_paths)

## Try question + choices + answers in the REV tool

In [None]:
# Run the question-converter script on its sample input, with the REV
# directory prepended to PYTHONPATH. The exit status returned by os.system
# becomes the cell output and is not checked.
os.system(f'export PYTHONPATH=/home/<link_hidden>/REV/:$PYTHONPATH && bash /home/<link_hidden>/REV/run_question_converter.sh cqa /home/<link_hidden>/REV/sample_input.jsonl cuda:0')

# Visual Fidelity

In [None]:
set_50_index_df = pd.read_csv("../set-50-idx.csv")
# Hand-entered visual-fidelity labels, assigned to the rows by position. The
# values used are 1, 0, -1 and one 'null' string, so the column mixes ints and a string.
# The list has 50 entries and must match the number of rows in the CSV for the assignment below to work.
visual_fidelity = [1,1,0,1,0,1,0,'null',1,1,1,1,-1,1,1,1,1,1,1,-1,1,0,-1,1,-1,0,0,1,1,0,1,1,0,1,1,-1,1,1,-1,1,0,0,-1,-1,1,1,-1,1,1,-1]
set_50_index_df["visual_fidelity"] = visual_fidelity

# Select subsets with visual_fidelity == 1 and visual_fidelity == -1
# (first 10 rows of each).
vf_1_subset = set_50_index_df[set_50_index_df['visual_fidelity'] == 1].head(10)
# NOTE(review): despite its name, `vf_0_subset` holds the rows labelled -1, not 0.
vf_0_subset = set_50_index_df[set_50_index_df['visual_fidelity'] == -1].head(10)

# Combine the subsets to create the "data" subset
set_50_index_df_vf_filtered = pd.concat([vf_1_subset, vf_0_subset]).reset_index(drop=True)
set_50_index_df_vf_filtered

In [None]:
# Pick the rows of `data` at the integer positions listed in the 'idx' column
# (iloc is positional, not label-based), then attach the matching labels by position.
# NOTE(review): `data` is whatever frame an earlier cell last loaded — confirm
# it is the one the 'idx' values refer to.
data_vf_filtered = data.iloc[set_50_index_df_vf_filtered['idx']].copy()
data_vf_filtered['visual_fidelity'] = set_50_index_df_vf_filtered['visual_fidelity'].values
data_vf_filtered

In [None]:
# def construct_vacuous_rationale(question, answer):
#     return f"{question} The answer is {answer}."


# def calculate_rev_score(data, nli_model, nli_tokenizer):
#     rev_scores = []

#     for idx, row in data.iterrows():
#         question = row['question'].split(' Choices:')[0]
#         correct_answer = row['predicted_answer']
#         hypothesis = row['hypothesis']
        
#         generated_rationale = row['generated_rationale']
#         vacuous_rationale = construct_vacuous_rationale(question, correct_answer)
        
#         # Predict using NLI model
#         probs_generated = predict_nli(nli_model, nli_tokenizer, generated_rationale, hypothesis)
#         probs_vacuous = predict_nli(nli_model, nli_tokenizer, vacuous_rationale, hypothesis)
        
#         # Calculate log-probabilities for the entailment class (class 2)
#         log_prob_generated = torch.log(probs_generated[0][2])
#         log_prob_vacuous = torch.log(probs_vacuous[0][2])
        
#         # Compute REV score
#         rev_score = log_prob_generated - log_prob_vacuous
#         rev_scores.append(rev_score.item())
    
#     data['REV_score'] = rev_scores
#     return data

# # Calculate REV scores
# data_with_rev = calculate_rev_score(data, nli_model, nli_tokenizer)
# data_with_rev.to_csv('data_with_REV.csv', index=False)

In [None]:
# data_with_rev

In [None]:
# def tokenize_function(texts):
#     return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# def predict(model, inputs):
#     with torch.no_grad():
#         outputs = model(**inputs)
#         probabilities = torch.softmax(outputs.logits, dim=-1)
#     return probabilities

# def compute_sim(to_use, labels_to_use):
#     xe_correct = (to_use['predicted_labels_xe'] == to_use[labels_to_use]).astype(int)
#     x_correct = (to_use['predicted_labels_x'] == to_use[labels_to_use]).astype(int)
#     e_correct = (to_use['predicted_labels_e'] == to_use[labels_to_use]).astype(int)

#     baseline_correct = x_correct
#     leaking = e_correct
#     leaked = np.where(leaking == 1)[0]
#     nonleaked = np.where(leaking == 0)[0]

#     xe_correct_leaked = xe_correct[leaked]
#     baseline_correct_leaked = baseline_correct[leaked]
#     xe_correct_nonleaked = xe_correct[nonleaked]
#     baseline_correct_nonleaked = baseline_correct_nonleaked[nonleaked]

#     unweighted_mean = np.mean([
#         np.mean(xe_correct_leaked) - np.mean(baseline_correct_leaked),
#         np.mean(xe_correct_nonleaked) - np.mean(baseline_correct_nonleaked)
#     ])

#     nonleaking_diff = np.mean(xe_correct_nonleaked) - np.mean(baseline_correct_nonleaked)
#     leaking_diff = np.mean(xe_correct_leaked) - np.mean(baseline_correct_leaked)

#     return unweighted_mean, leaking_diff, nonleaking_diff

# def run_las_analysis(data, model, tokenizer):
#     input_texts = data['question'].tolist()
#     explanations = data['generated_rationale'].tolist()

#     # Tokenize input texts
#     input_encodings = tokenize_function(input_texts)
#     explanation_encodings = tokenize_function(explanations)

#     # Predict using the model
#     input_probs = predict(model, input_encodings)
#     explanation_probs = predict(model, explanation_encodings)

#     data['predicted_labels_x'] = input_probs.argmax(dim=1).numpy()
#     data['predicted_labels_e'] = explanation_probs.argmax(dim=1).numpy()

#     # Use both input and explanations for predictions (xe)
#     combined_texts = [f"{text} {exp}" for text, exp in zip(input_texts, explanations)]
#     combined_encodings = tokenize_function(combined_texts)
#     combined_probs = predict(model, combined_encodings)

#     data['predicted_labels_xe'] = combined_probs.argmax(dim=1).numpy()

#     # Compute LAS
#     unweighted_mean, leaking_diff, nonleaking_diff = compute_sim(data, 'correct_answer')

#     data['LAS_unweighted_mean'] = unweighted_mean
#     data['LAS_leaking_diff'] = leaking_diff
#     data['LAS_nonleaking_diff'] = nonleaking_diff

#     return data

In [None]:
# # Run the analysis
# updated_data = run_las_analysis(data, model, tokenizer)
# updated_data