In [None]:
!nvidia-smi

In [None]:
import os
import re
import ast
import nltk
import torch
import json
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Load the tokenizer and the entailment-verifier model only when they are not
# already defined in the notebook namespace, so re-running this cell does not
# reload them.
if 'nli_tokenizer' not in globals():
    nli_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")

if 'nli_model' not in globals():
    nli_model = AutoModelForSeq2SeqLM.from_pretrained(
        "soumyasanyal/nli-entailment-verifier-xxl",
        load_in_8bit=True,
        device_map="auto",
    )

In [None]:
import os
from openai import OpenAI

# The OpenAI API key is read from a text file two directories above the notebook;
# only the first line of the stripped file contents is used.
api_key_path = '../../OPENAI_key.txt'
with open(api_key_path, 'r') as file:
    api_key = file.read().strip().partition('\n')[0]

client = OpenAI(api_key=api_key)

# File in which the running total of API cost is persisted
COST_FILE = "total_cost.txt"

def read_total_cost():
    """Return the running total stored in COST_FILE.

    Returns 0.0 when the file does not exist or contains only whitespace;
    otherwise the stripped file contents parsed as a float.
    """
    if not os.path.exists(COST_FILE):
        return 0.0

    with open(COST_FILE, "r") as cost_file:
        stored = cost_file.read().strip()

    if stored == "":
        return 0.0
    return float(stored)

def write_total_cost(cost):
    """Add `cost` to the total persisted in COST_FILE and overwrite the file with the sum."""
    updated_total = read_total_cost() + cost
    with open(COST_FILE, "w") as cost_file:
        cost_file.write(f"{updated_total}")
        
# Dollar price per 1,000 tokens as (input, output), keyed by model name.
_PRICE_PER_1K_TOKENS = {
    "gpt-4o-2024-05-13": (0.005, 0.015),
    "gpt-4o-2024-08-06": (0.0025, 0.010),
    "gpt-4o-mini-2024-07-18": (0.00015, 0.00060),
}


def calculate_cost(usage, model, verbose=0):
    """Compute the cost of one API call, add it to the persisted total, and return it.

    Args:
        usage: Mapping with 'prompt_tokens' and 'completion_tokens' counts.
        model: Model name; must be a key of `_PRICE_PER_1K_TOKENS`.
        verbose: When truthy, print the cost of this call.

    Returns:
        The cost of this call in dollars.

    Raises:
        ValueError: If `model` has no pricing entry. Raised before anything is
            written, so an unknown model never changes the persisted total.
    """
    if model not in _PRICE_PER_1K_TOKENS:
        known_models = ", ".join(sorted(_PRICE_PER_1K_TOKENS))
        raise ValueError(f"No pricing configured for model {model!r}; known models: {known_models}")

    input_price_per_1k, output_price_per_1k = _PRICE_PER_1K_TOKENS[model]
    input_cost_per_token = input_price_per_1k / 1000
    output_cost_per_token = output_price_per_1k / 1000

    input_tokens = usage['prompt_tokens']
    output_tokens = usage['completion_tokens']
    cost = (input_tokens * input_cost_per_token) + (output_tokens * output_cost_per_token)
    if verbose:
        print(f"The cost incurred is ${cost:.3f}")
    write_total_cost(cost)
    return cost

In [None]:
def get_longest_rationale(rationale_list):
    """Return the longest rationale from a (possibly stringified) list of rationales.

    Args:
        rationale_list: Either a list, or a string holding a Python literal
            (for example the repr of a list as stored in a CSV cell).

    Returns:
        The longest element of the list, or '' when the value is not a list
        or the list is empty.

    Raises:
        ValueError / SyntaxError: If a string input is not a valid Python literal.
    """
    if isinstance(rationale_list, str):
        # literal_eval only accepts Python literals, so text read from a data file
        # cannot execute arbitrary code the way eval() would.
        rationales = ast.literal_eval(rationale_list)
    else:
        # Already-parsed values (a list, or a missing-value marker such as NaN)
        rationales = rationale_list

    if not isinstance(rationales, list) or not rationales:
        return ''
    return max(rationales, key=len)

def calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer):
    """Return the model's 'Yes' probability that `premise` supports `hypothesis`.

    The pair is formatted as a yes/no question. The logits of the first decoder
    step are read at the first token id of 'Yes' and of 'No', and a softmax over
    just those two values gives the returned probability.

    The default `nli_model` / `nli_tokenizer` are bound when this function is
    defined, so both names must already exist in the notebook namespace.
    """
    prompt = f"Premise: {premise}\nHypothesis: {hypothesis}\nGiven the premise, is the hypothesis correct?\nAnswer:"
    input_ids = nli_tokenizer(prompt, return_tensors='pt').input_ids

    yes_id = nli_tokenizer('Yes').input_ids[0]
    no_id = nli_tokenizer('No').input_ids[0]

    # One all-zero token id per example is fed to the decoder as its only input
    decoder_start = torch.zeros((input_ids.size(0), 1), dtype=torch.long)

    with torch.no_grad():
        first_step_logits = nli_model(input_ids, decoder_input_ids=decoder_start).logits[:, 0]
        yes_no_logits = torch.stack(
            [first_step_logits[:, yes_id], first_step_logits[:, no_id]],
            dim=1,
        )
        # Cast to float before applying softmax
        probabilities = torch.softmax(yes_no_logits.float(), dim=1)
        # .item() requires a single element, i.e. one premise/hypothesis pair per call
        return probabilities[:, 0].item()

def generate_mask(generated_rationale, predicted_answer):
    """Replace whole-token, case-insensitive occurrences of the answer with "<mask>".

    Args:
        generated_rationale: Text in which the answer should be hidden.
        predicted_answer: Answer to mask; converted to `str` first.

    Returns:
        The rationale with every standalone occurrence of the answer replaced.
        An empty answer leaves the rationale unchanged.
    """
    predicted_answer = str(predicted_answer)
    if not predicted_answer:
        # An empty pattern would match at every boundary and scatter masks everywhere
        return generated_rationale

    # Lookarounds instead of \b: identical for answers that start and end with a
    # word character, but they also work when the answer begins or ends with
    # punctuation (e.g. "c++", "$5"), where \b cannot match next to whitespace.
    pattern = re.compile(r'(?<!\w)' + re.escape(predicted_answer) + r'(?!\w)', re.IGNORECASE)
    return pattern.sub("<mask>", generated_rationale)

def evaluate_support(data, nli_model, nli_tokenizer, use_mask=True, use_pieces=False, hypothesis_col='hypothesis', threshold=0.5):
    """Score, for every row, how strongly the rationale supports the hypothesis.

    Args:
        data: DataFrame holding the rationale column(s) and `hypothesis_col`.
        nli_model, nli_tokenizer: Passed through to `calc_support_prob`.
        use_mask: Read the masked variant of the rationale column.
        use_pieces: Read the concatenated-pieces column instead of the generated rationale.
        hypothesis_col: Name of the column holding the hypothesis.
        threshold: A row counts as supported when its probability is strictly above this.

    Returns:
        A list with one dict per row: {'entail_prob': float, 'support': bool}.
        Rows whose probability is strictly below the threshold are printed.
    """
    # The premise column depends only on the two flags, so pick it once
    if use_pieces:
        premise_col = 'concat_rationale_pieces_mask' if use_mask else 'concat_rationale_pieces'
    else:
        premise_col = 'gen_rationale_mask' if use_mask else 'generated_rationale'

    results = []
    for _, record in data.iterrows():
        premise = record[premise_col]
        hypothesis = record[hypothesis_col]
        entail_prob = calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)

        if entail_prob < threshold:
            print(f"Premise: {premise}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Probability: {entail_prob}")

        results.append({
            'entail_prob': entail_prob,
            'support': entail_prob > threshold,
        })
    return results

def compute_file_hash(file_path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in chunks of `chunk_size` bytes so that memory use stays
    bounded regardless of file size; the digest is the same as hashing the
    whole file at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The MD5 digest as a hex string.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object returned at EOF
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

In [None]:
# data

In [None]:
import pandas as pd
# `df` and `data` are two names for the same DataFrame object, so columns added
# through either name are visible through both.
df = data = pd.read_csv('<link_hidden>/notebooks/analysis/data_vf_q_val_GPT_r_dense_captions_support_concat.csv')
data

In [None]:
import base64
import requests

def gpt_gen_hypothesis(question, predicted_ans, cost_verbose=0, timeout=60):
    """Ask the chat-completions endpoint to merge a question and an answer into one sentence.

    Args:
        question: The question text.
        predicted_ans: The answer to integrate into the sentence.
        cost_verbose: Forwarded to `calculate_cost` as its `verbose` flag.
        timeout: Seconds to wait for the HTTP request before `requests` raises a
            timeout error; without it a stalled connection would block forever.

    Returns:
        The stripped sentence returned by the model, or None when the API answers
        with a non-200 status (the error body is printed in that case).
    """
    model_name = "gpt-4o-mini-2024-07-18"
    user_input = f"""Integrate the question and the answer into one sentence.
For example, given the question "What is the man waiting for?" and the answer "taxi", you should output "The man is waiting for taxi."

Question: {question}
Answer: {predicted_ans}
"""

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_input,
                    },
                ]
            }
        ],
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    # Parse the body once and reuse it for the error, usage and content paths
    body = response.json()
    if response.status_code != 200:
        print(body)
        return None

    calculate_cost(body['usage'], model_name, verbose=cost_verbose)
    return body['choices'][0]['message']['content'].strip()

In [None]:
def generate_alternative_hypotheses(row):
    """Build one hypothesis sentence for every answer choice except the predicted one.

    Args:
        row: Mapping/Series with 'question' (question text, the marker "Choices: ",
            then a ", "-separated list of choices) and 'one_step_answer'
            (the predicted answer).

    Returns:
        A list with the result of `gpt_gen_hypothesis` for each choice that differs
        (case-insensitively) from the predicted answer. Empty when the question
        has no "Choices: " marker.
    """
    # partition splits at the first marker only, so a missing or repeated marker
    # does not break the unpacking
    question_text, marker, choices_text = row['question'].partition("Choices: ")
    if not marker:
        return []

    # The answer may have been parsed as a number; compare it as text
    predicted_answer = str(row['one_step_answer']).lower()

    # Extract the possible answers, dropping surrounding periods and spaces
    choices = [choice.strip('. ') for choice in choices_text.split(', ')]

    return [
        gpt_gen_hypothesis(question_text, choice)
        for choice in choices
        if choice and choice.lower() != predicted_answer
    ]

# Adds the column to `data` in place; generate_alternative_hypotheses calls
# gpt_gen_hypothesis once for every non-predicted choice of every row.
data['alternative_hypotheses'] = data.apply(generate_alternative_hypotheses, axis=1)

data

In [None]:
def evaluate_strict_support(data, nli_model, nli_tokenizer, use_mask=True, use_pieces=False, hypothesis_col='hypothesis', alt_hypotheses_col='alternative_hypotheses', threshold=0.5, verbose=True):
    """Check that each rationale supports the predicted hypothesis and none of the alternatives.

    A row is strictly supported when the entailment probability of its hypothesis is
    above `threshold` and the probability of every alternative hypothesis is below it.

    Args:
        data: DataFrame with 'rationale_mask', `hypothesis_col` and `alt_hypotheses_col`
            (an iterable of alternative hypothesis strings per row). Two columns are
            written to it: 'alt_ent_prob' (string form of the list of alternative
            probabilities) and 'strict_sim' (the strict-support flag).
        nli_model, nli_tokenizer: Passed through to `calc_support_prob`.
        use_mask, use_pieces: Accepted for signature parity with `evaluate_support`.
            NOTE(review): neither flag is read; the premise always comes from
            'rationale_mask' - confirm this is intended.
        hypothesis_col: Column holding the predicted hypothesis.
        alt_hypotheses_col: Column holding the alternative hypotheses.
        threshold: Decision threshold on the entailment probability.
        verbose: When true (the default), print the premise, hypotheses and
            probabilities of every row.

    Returns:
        The list of strict-support flags, one per row, in row order.
    """
    strict_supports = []
    alt_prob_strings = []
    for _, row in data.iterrows():
        premise = row['rationale_mask']
        hypothesis = row[hypothesis_col]
        alt_hypotheses = row[alt_hypotheses_col]

        entail_prob = calc_support_prob(premise, hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
        alt_entail_probs = [
            calc_support_prob(premise, alt_hypothesis, nli_model=nli_model, nli_tokenizer=nli_tokenizer)
            for alt_hypothesis in alt_hypotheses
        ]

        strict_support = entail_prob > threshold and all(alt_entail_prob < threshold for alt_entail_prob in alt_entail_probs)
        if verbose:
            print(f"Premise: {premise}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Probability: {entail_prob}")
            print(f"Alternative hypotheses: {alt_hypotheses}")
            print(f"Probability: {alt_entail_probs}")
            print("--------------------------------------------")

        strict_supports.append(strict_support)
        alt_prob_strings.append(str(alt_entail_probs))

    # Assign whole columns by position after the loop: this avoids growing the frame
    # cell by cell and stays correct when index labels are not unique.
    data['alt_ent_prob'] = alt_prob_strings
    data['strict_sim'] = strict_supports
    return strict_supports
    
# Run the strict-support check on every row; the function writes the
# 'alt_ent_prob' and 'strict_sim' columns onto `data`, which is displayed below.
evaluate_strict_support(data, nli_model, nli_tokenizer)
data

In [None]:
# Save to Excel
# `df` is the same object as `data` (see `df = data`), so the columns added to
# `data` are part of this export. index=False leaves the row index out of the sheet.
output_path = "val_strict_sim.xlsx"
df.to_excel(output_path, index=False)

In [None]:
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as ExcelImage
import os

# First, save the dataframe to an excel file (without images) to manipulate it with openpyxl
df.to_excel(output_path, index=False)

# Load the saved excel file
wb = load_workbook(output_path)
ws = wb.active

# Sheet column whose cells anchor the embedded images
image_column = "V"

# Anchor each row's image on that row of the sheet. Sheet row 1 holds the header,
# so the dataframe row at 0-based position n sits on sheet row n + 2. The position
# is used (not the index label) so the mapping holds for any dataframe index.
for position, (_, row) in enumerate(df.iterrows()):
    img_path = row['image_path']
    # Skip missing values (e.g. NaN, which os.path.exists rejects with TypeError)
    # and paths that do not exist on disk
    if isinstance(img_path, (str, os.PathLike)) and os.path.exists(img_path):
        img = ExcelImage(img_path)
        img_cell = f"{image_column}{position + 2}"
        ws.add_image(img, img_cell)

# Save the updated excel file with images (overwrites the image-free file)
output_path_with_images = output_path
wb.save(output_path_with_images)

output_path_with_images
