In [2]:
!pip install openai

None




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
import pandas as pd

# Common QA evaluation code

In [4]:
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys

def remove_punc(text):
    """Return ``text`` with every character of ``string.punctuation`` dropped."""
    punctuation_chars = set(string.punctuation)
    kept_chars = [char for char in text if char not in punctuation_chars]
    return ''.join(kept_chars)
        
def white_space_fix(text):
    """Collapse every run of whitespace to one space and strip both ends."""
    words = text.split()
    return ' '.join(words)

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    ``s`` is converted with ``str()`` first, so non-string inputs are accepted.
    The steps run in this order: lowercase, strip punctuation, replace the
    whole words "a", "an" and "the" with a space, then collapse whitespace.
    """
    lowered = str(s).lower()

    punctuation_chars = set(string.punctuation)
    no_punctuation = ''.join(ch for ch in lowered if ch not in punctuation_chars)

    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punctuation)

    return white_space_fix(no_articles)


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and one reference answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace; overlap is counted as a multiset intersection. Returns ``0``
    when no token is shared, otherwise the harmonic mean of precision and
    recall as a float in (0, 1].
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()

    shared = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when prediction and reference are equal after normalization."""
    normalized_prediction = normalize_answer(prediction)
    normalized_reference = normalize_answer(ground_truth)
    return normalized_prediction == normalized_reference


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every reference and return the best score.

    ``metric_fn`` is called as ``metric_fn(prediction, ground_truth)`` once per
    element of ``ground_truths``. Raises ``ValueError`` (from ``max``) when
    ``ground_truths`` is empty.
    """
    per_reference_scores = [
        metric_fn(prediction, reference) for reference in ground_truths
    ]
    return max(per_reference_scores)

def include_first(prediction, ground_truth):
    """Return True when the first normalized reference word occurs in the prediction.

    The check is a substring test against the whole normalized prediction,
    not a token-level comparison.
    """
    first_reference_word = normalize_answer(ground_truth).split(' ')[0]
    normalized_prediction = normalize_answer(prediction)
    return first_reference_word in normalized_prediction

def evaluate(gold_answers, predictions):
    """Aggregate exact-match, F1 and include-first scores over a dataset.

    ``gold_answers`` and ``predictions`` are iterated in parallel with ``zip``;
    each element of ``gold_answers`` is a collection of reference answers for
    the prediction at the same position. Every metric takes the best score
    over the references of an example.

    Returns a dict with ``exact_match``, ``f1`` and ``include_first_score`` as
    percentages (0-100) and ``is_correct`` as the per-example exact-match
    flags. Raises ``ZeroDivisionError`` when there are no examples.
    """
    em_sum = f1_sum = first_word_sum = 0
    n_examples = 0
    per_example_match = []

    for references, predicted in zip(gold_answers, predictions):
        n_examples += 1

        matched = metric_max_over_ground_truths(
            exact_match_score, predicted, references)
        first_word_hit = metric_max_over_ground_truths(
            include_first, predicted, references)

        first_word_sum += first_word_hit
        em_sum += matched
        per_example_match.append(matched)
        f1_sum += metric_max_over_ground_truths(
            f1_score, predicted, references)

    exact_match_pct = 100.0 * em_sum / n_examples
    f1_pct = 100.0 * f1_sum / n_examples
    first_word_pct = 100.0 * first_word_sum / n_examples

    return {
        'exact_match': exact_match_pct,
        'f1': f1_pct,
        'is_correct': per_example_match,
        'include_first_score': first_word_pct,
    }

In [5]:
import tqdm.notebook as tq

import numpy as np
from itertools import compress
from matplotlib import pyplot as plt
import ast


# LLM API callouts (PaLM 2 and OpenAI GPT)

In [6]:
def geometric_aggregate(numbers):
    """Weighted sum of ``numbers`` with weights 1, 1/2, 1/4, ...

    The element at position ``i`` is multiplied by ``0.5 ** i``. Returns ``0``
    for an empty input.
    """
    total = 0
    weight = 1.0  # equals 0.5 ** position; halving a power of two is exact
    for value in numbers:
        total += value * weight
        weight *= 0.5
    return total


In [7]:
import pprint
import google.generativeai as palm
import backoff

# Configure the PaLM client. 'YOUR_API_KEY' is a placeholder to be replaced
# before running; NOTE(review): prefer loading the real key from an
# environment variable so it is never saved inside the notebook.
palm.configure(api_key='YOUR_API_KEY')



def palm2(text):
    """Send ``text`` to PaLM 2 via ``palm.generate_text`` and return the result.

    The request uses temperature 0.0, at most 100 output tokens, and sets every
    listed safety category to ``BLOCK_NONE``. Any exception is printed and the
    request is retried, up to 20 attempts, sleeping ``i * 10`` seconds after
    failed attempt ``i`` (no sleep after the final attempt).

    Args:
        text: Prompt string passed as ``prompt``.

    Returns:
        ``response.result`` of the first request that does not raise.
        NOTE(review): this may be ``None`` when the service returns no
        candidate; confirm against the google.generativeai documentation.

    Raises:
        Exception: 'PALM2 is not available' once every attempt has failed;
            the last underlying error is attached as ``__cause__``.
    """
    # Imported locally because the notebook-level ``import time`` sits in a
    # later cell; on a fresh kernel the handler below would otherwise fail
    # with NameError and hide the real API error.
    import time

    safety_settings = [
        {"category": "HARM_CATEGORY_DEROGATORY", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_TOXICITY", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_VIOLENCE", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUAL", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_MEDICAL", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
    ]
    max_attempts = 20
    last_error = None

    for i in range(max_attempts):
        try:
            response = palm.generate_text(
                prompt=text,
                temperature=0.0,
                safety_settings=safety_settings,
                max_output_tokens=100,
            )
            return response.result
        except Exception as e:
            print(e)
            last_error = e
            # Linear back-off; pointless after the last attempt, so skip it.
            if i < max_attempts - 1:
                time.sleep(i*10)

    raise Exception('PALM2 is not available') from last_error


In [9]:
import os
import openai
import time
import requests

# "YOUR_API_KEY" is a placeholder to be replaced before running.
# NOTE(review): prefer reading the key from an environment variable so it is
# never committed together with the notebook.
openai.api_key = "YOUR_API_KEY"


import backoff

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def gpt_turbo(prompt):
    """Ask gpt-3.5-turbo-0301 (temperature 0.0) and return the reply text.

    ``prompt`` is sent as a single user message. Any exception is printed and
    the request is retried after 5 seconds, up to 10 attempts in total.

    Raises:
        Exception: 'GPT Turbo is not available' when all attempts failed.
    """
    # NOTE(review): every exception is handled inside the loop, so the
    # RateLimitError back-off decorator appears never to be triggered.
    request_kwargs = {
        "model": "gpt-3.5-turbo-0301",
        "temperature": 0.0,
        "messages": [{"role": "user", "content": prompt}],
    }
    for _attempt in range(10):
        try:
            reply = openai.ChatCompletion.create(**request_kwargs)
            return reply['choices'][0]['message']['content']
        except Exception as err:
            print(err)
            time.sleep(5)
    raise Exception('GPT Turbo is not available')

def gpt3(prompt,temp=0.0):
    """Complete ``prompt`` with text-davinci-003 and return text plus log-probs.

    The request asks for ``logprobs=1`` and ``top_p=0.0``. Any exception is
    printed and the request is retried, up to 10 attempts, sleeping
    ``attempt * 10`` seconds after each failure.

    Returns:
        Tuple ``(text, token_logprobs)`` taken from the first choice.

    Raises:
        Exception: 'GPT3 is not available' when all attempts failed.
    """
    for attempt in range(10):
        try:
            completion = openai.Completion.create(
                model="text-davinci-003",
                temperature=temp,
                logprobs=1,
                top_p=0.0,
                prompt=prompt,
            )
            first_choice = completion['choices'][0]
            return first_choice['text'], first_choice['logprobs']['token_logprobs']
        except Exception as err:
            print(err)
            time.sleep(attempt*10)
    raise Exception('GPT3 is not available')

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def gpt4(prompt,system_msg='You are a highly intelligent text editor.',temp=0.0):
    """Ask gpt-4-0613 and return the reply text.

    The user message (``prompt``) is sent first, followed by the system
    message (``system_msg``). ``top_p`` is 1.0 when ``temp`` is positive and
    0.0 otherwise. Any exception is printed and the request is retried after
    3 seconds, up to 10 attempts in total.

    Raises:
        Exception: 'GPT4 is not available' when all attempts failed.
    """
    top_p_custom = 1.0 if temp > 0 else 0.0

    for _attempt in range(10):
        try:
            conversation = [
                {"role": "user", "content": prompt},
                {"role": "system", 'content': system_msg},
            ]
            completion = openai.ChatCompletion.create(
                model="gpt-4-0613",
                temperature=temp,
                top_p=top_p_custom,
                messages=conversation,
            )
            return completion['choices'][0]['message']['content']
        except Exception as err:
            print(err)
            time.sleep(3)

    raise Exception('GPT4 is not available')
  
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def gpt4_no_system_msg(prompt,temp=0.0):
    """Ask gpt-4-0613 with a single user message and return the reply text.

    The request is tried once; if it raises, the error is printed, the
    function sleeps 10 seconds and tries exactly once more. An error from the
    second try propagates to the caller (and, for ``RateLimitError``, to the
    ``backoff`` decorator).

    Args:
        prompt: Text sent as the only (user) message.
        temp: Sampling temperature.

    Returns:
        The content of the first choice's message.
    """
    messages = [{"role": "user", "content": prompt}]

    def _request():
        # Single place that defines the request so both tries stay identical.
        return openai.ChatCompletion.create(
            model="gpt-4-0613",
            temperature=temp,
            messages=messages)

    try:
        response = _request()
    except Exception as e:
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
        # SystemExit still stop the cell; print the error like the sibling
        # helpers do instead of discarding it.
        print(e)
        time.sleep(10)
        response = _request()
    return response['choices'][0]['message']['content']

def llm_qa(context,question, gpt_func='gpt3'):
    """Run extractive QA with the selected LLM backend.

    Builds one span-extraction prompt from ``question`` and ``context`` and
    sends it to the backend named by ``gpt_func`` ('gpt3', 'gpt4',
    'gpt_turbo' or 'palm2').

    Returns:
        ``{'answer': ..., 'score': ...}``. For 'gpt3' the score is the
        geometric aggregate of the probabilities of the first three generated
        tokens; for every other backend it is the constant ``1``.

    Raises:
        Exception: 'model not found' for an unknown ``gpt_func``.
    """
    instruction = "Answer the question by copying only the answer word to word from the context. Extract the minimal span that answers the question."
    prompt = instruction + "\nQuestion:" + question + "\nContext:" + context + '\nExtracted span:'

    if gpt_func == 'gpt3':
        answer, token_logprobs = gpt3(prompt)
        # Log-probs -> probabilities for the first three tokens only.
        leading_probs = np.e**np.array(token_logprobs[:3])
        return {'answer': answer, 'score': geometric_aggregate(leading_probs)}

    if gpt_func == 'gpt4':
        answer = gpt4(prompt, "You are a question answering bot.")
        return {'answer': answer, 'score': 1}

    if gpt_func == 'gpt_turbo':
        answer = gpt_turbo(prompt)
        return {'answer': answer, 'score': 1}

    if gpt_func == 'palm2':
        answer = palm2(prompt)
        return {'answer': answer, 'score': 1}

    raise Exception('model not found')



# Miscellaneous helpers

In [10]:
def correct_confidence_w_negative(anses,context,question,short_ans,model=llm_qa):
    """Query ``model`` and return its confidence, negated when the answer is wrong.

    ``model`` is called with ``context=`` and ``question=`` keyword arguments
    and must return a dict with 'answer' and 'score'. The answer counts as
    correct when its F1 against ``anses`` exceeds 30 (percent) or when the
    normalized ``short_ans`` is a substring of the normalized answer.

    Returns:
        Tuple ``(signed_score, answer)``: ``score`` if correct, else ``-score``.
    """
    pred = model(context=context, question=question)
    normalized_short = normalize_answer(short_ans)

    f1_is_high = evaluate([anses], [pred['answer']])['f1'] > 30
    answer_is_correct = f1_is_high or normalized_short in normalize_answer(pred['answer'])

    if not answer_is_correct:
        return -1*pred['score'], pred['answer']
    return pred['score'], pred['answer']

# Models Evaluation

In [19]:
import ast
import pandas as pd

def parse_list_string(list_string):
    """Parse the string representation of a Python list into a real list.

    Args:
        list_string: Text such as ``"['a', 'b']"``; it is evaluated with
            ``ast.literal_eval``, so only Python literals are accepted.

    Returns:
        The parsed ``list``.

    Raises:
        ValueError: "Input string is not a valid list representation." when
            the text cannot be parsed as a literal, or
            "Input string does not represent a list." when it parses to
            something that is not a list.
    """
    try:
        parsed_list = ast.literal_eval(list_string)
    except (ValueError, SyntaxError, TypeError) as err:
        raise ValueError("Input string is not a valid list representation.") from err

    # Checked outside the try block: previously this ValueError was caught by
    # the handler above and replaced with the generic parse-failure message.
    if not isinstance(parsed_list, list):
        raise ValueError("Input string does not represent a list.")
    return parsed_list


# Load the ShortcutQA CSV from the notebook's working directory.
dataset_adv_df_subset_processed = pd.read_csv('./ShortcutQA1_1.csv')

# gold_labels is stored in the CSV as the text of a Python list; turn each
# cell back into a real list of reference answers.
dataset_adv_df_subset_processed['gold_labels'] = dataset_adv_df_subset_processed.gold_labels.apply(parse_list_string)

# Subset switches. NOTE(review): no active code in the visible cells reads
# these flags; the only consumer is the commented-out block below.
only_squad = False
only_newsQA = False

# Disabled subset selection, kept for reference: per its inline comments the
# first 247 rows are SQuAD samples and the remaining rows are NewsQA samples.
# if only_squad:
#     dataset_adv_df_subset_processed = dataset_adv_df_subset_processed.iloc[:247] #Only Squad samples
# elif only_newsQA:
#     dataset_adv_df_subset_processed = dataset_adv_df_subset_processed.iloc[247:] #Only NewsQA samples


In [None]:
# Sanity check: display the 'context natural' field of the first row.
sample = dataset_adv_df_subset_processed.iloc[0]
sample['context natural']

In [None]:
# Number of rows loaded from the CSV.
len(dataset_adv_df_subset_processed)

In [37]:
# NOTE(review): these two lists are not used by any visible cell.
anses_ori, anses_adv = [], []


def gpt4_qa(context, question):
    """Answer ``question`` from ``context`` with the 'gpt4' backend of llm_qa."""
    return llm_qa(context, question, 'gpt4')


def gpt_turbo_qa(context, question):
    """Answer ``question`` from ``context`` with the 'gpt_turbo' backend of llm_qa."""
    return llm_qa(context, question, 'gpt_turbo')


def gpt3_qa(context, question):
    """Answer ``question`` from ``context`` with the 'gpt3' backend of llm_qa."""
    return llm_qa(context, question, 'gpt3')


def palm2_qa(context, question):
    """Answer ``question`` from ``context`` with the 'palm2' backend of llm_qa."""
    return llm_qa(context, question, 'palm2')


#config the LLM used in the following line
LLM_to_use = palm2_qa
# llms = [gpt4_qa,gpt_turbo_qa,gpt3_qa,palm2_qa]
# llms_names = ['gpt4_qa','gpt_turbo_qa','gpt3_qa','palm2_qa']
# Models to run, with the matching suffixes used for the answer columns.
llms = [gpt_turbo_qa]
llms_names = ['gpt_turbo_qa']


In [38]:
# Row offset at which the "Run" loop starts; that loop advances it by one
# after every processed row.
b=0

In [None]:
b

# Run

In [None]:
# Query every configured LLM on the natural and the edited context of each
# remaining row and store the answers in per-model columns.
# The loop is driven by the module-level counter ``b`` rather than by ``j``:
# ``b`` is advanced only after a row is fully processed, so re-running this
# cell after a failure presumably resumes at the first unfinished row.
for j in tq.tqdm(range(len(dataset_adv_df_subset_processed)-b)):
  
  row = dataset_adv_df_subset_processed.iloc[b]
  anses = row.gold_labels
  # Pick the reference answer with the fewest characters; it is passed on as
  # ``short_ans`` for the substring-based correctness check.
  anses_len= [len(ans) for ans in anses]
  short_ans_i = np.argmin(anses_len)
  short_ans = anses[short_ans_i]
  # NOTE: this rebinds the module-level ``LLM_to_use`` set in the config cell.
  for LLM_to_use,llm_name in zip(llms,llms_names):
    # Only the answer text is kept; the signed confidence is discarded.
    _,ans_ori =  correct_confidence_w_negative(row.gold_labels, row['context natural'], row.question,short_ans,LLM_to_use)
    _,ans_adv = correct_confidence_w_negative(row.gold_labels, row['context edited'],row.question,short_ans,LLM_to_use) 
    # ``.at[b, ...]`` is label-based while ``.iloc[b]`` above is positional;
    # the two agree only while the frame keeps its default 0..n-1 index.
    dataset_adv_df_subset_processed.at[b, "answer_natural_"+llm_name] = ans_ori
    dataset_adv_df_subset_processed.at[b, "answer_edited_"+llm_name] = ans_adv

  b+=1



In [None]:
b

In [None]:
import pandas as pd

# Keep only the reference answers and the per-model answer columns.
df = dataset_adv_df_subset_processed[['gold_labels','answer_natural_gpt4_qa', 'answer_edited_gpt4_qa',
       'answer_natural_gpt_turbo_qa', 'answer_edited_gpt_turbo_qa',
       'answer_natural_gpt3_qa', 'answer_edited_gpt3_qa',
       'answer_natural_palm2_qa', 'answer_edited_palm2_qa']]

#Define the models and their variants to be evaluated
models = [
    'gpt4_qa',
    'gpt_turbo_qa',
    'gpt3_qa',
    'palm2_qa'
]

# Loop through each model and evaluate its performance
for model in models:
    for variant in ['natural', 'edited']:
        column_name = f"answer_{variant}_{model}"

        # Extract answers for the current model and variant
        answers_model = df[column_name].tolist()

        # Evaluate the performance.
        # Each prediction is passed as the raw answer, not wrapped in a list:
        # normalize_answer() calls str() on its input, and the repr of a
        # one-element list escapes characters such as newlines ("\n" becomes a
        # backslash plus "n"), which leaked a stray letter into the normalized
        # text and lowered EM/F1.
        metrics = evaluate(df['gold_labels'], answers_model)
        # Print the performance metrics
        print(f"Evaluating model: {model} (Dataset: {variant})")
        print(f"  - Exact Match: {metrics['exact_match']}")
        print(f"  - F1 Score: {metrics['f1']}")


In [None]:
# check for none answers

for model in models:
    for variant in ['natural', 'edited']:
        column_name = f"answer_{variant}_{model}"
        # Extract answers for the current model and variant
        answers_model = df[column_name].tolist()

        # Collect positions with no usable answer. pd.isna() covers both None
        # and NaN; NaN is what pandas stores for cells that were never filled
        # or were empty in the CSV, and the previous ``x is None`` test never
        # matched it.
        none_indices = [i for i, x in enumerate(answers_model) if pd.isna(x) or x == '']
        # Print the number of missing answers
        print(f"Evaluating model: {model} (Dataset: {variant})")
        print(f"  - Nones answers: {len(none_indices)}")


In [None]:
#ShortcutQA1_1 only

# Store, for each of the first 300 rows, the distractor selected by
# 'best_distractor_i_lst' from the row's 'distractors_lst'.
# NOTE(review): eval() executes whatever text the CSV cell holds;
# ast.literal_eval would be safer if the column contains only literals - confirm.
# The loop index is deliberately not named ``b`` so this cell no longer
# overwrites the resume counter used by the "Run" cell.
for row_i in tq.tqdm(range(300)):
    row = dataset_adv_df_subset_processed.iloc[row_i]
    dataset_adv_df_subset_processed.at[row_i, "additional text"] = eval(row['distractors_lst'])[row['best_distractor_i_lst']]

na = normalize_answer
# Flags accumulated over ALL models; the summary cell at the end of the
# notebook reads these lists.
IM_ori = []
IM_adv = []
IM_adv_targeted = []
for model in models:
    column_name_natural = f"answer_natural_{model}"
    column_name_edited = f"answer_edited_{model}"

    # Per-model flags. Previously the averages printed below were taken over
    # the shared lists, so every model after the first reported a figure mixed
    # with the results of the models before it.
    model_IM_ori = []
    model_IM_adv = []
    model_IM_adv_targeted = []

    for k, sample in dataset_adv_df_subset_processed.iterrows():
        natural_answer = na(sample[column_name_natural])
        edited_answer = na(sample[column_name_edited])

        # Targeted: the answer on the edited context is contained in the
        # inserted distractor text.
        fooled_targeted = edited_answer in na(sample['additional text'])
        model_IM_adv_targeted.append(fooled_targeted)

        # Inclusion match: some reference answer is a substring of the
        # model's answer.
        is_ori_ans_found = any(na(answer) in natural_answer for answer in sample.gold_labels)
        model_IM_ori.append(is_ori_ans_found)

        is_adv_ans_found = any(na(answer) in edited_answer for answer in sample.gold_labels)
        model_IM_adv.append(is_adv_ans_found)

    IM_ori.extend(model_IM_ori)
    IM_adv.extend(model_IM_adv)
    IM_adv_targeted.extend(model_IM_adv_targeted)

    print(f"Evaluating model: {model} ")
    print(f"  - IM natural: {np.average(model_IM_ori)}")
    print(f"  - IM edited: {np.average(model_IM_adv)}")
    print(f"  - IM targeted: {np.average(model_IM_adv_targeted)}")



In [None]:
# Summary of the aggregate metrics as percentages.
# NOTE(review): ``eval_ori`` and ``eval_adv`` are not defined in any visible
# cell, so the first print raises NameError on a fresh kernel unless they are
# created elsewhere - confirm or remove.
# IM_ori / IM_adv / IM_adv_targeted hold the flags of every model evaluated in
# the previous cell, so these three figures are pooled over all models.
print('EM natural',eval_ori['exact_match'],'f1 natural', eval_ori['f1'],'EM edited',eval_adv['exact_match'],'f1 edited', eval_adv['f1'])
print('IM natural', (np.sum(IM_ori)/len(IM_ori))*100)
print('IM edited', (np.sum(IM_adv)/len(IM_adv))*100)
print('Answer in distractor', (np.sum(IM_adv_targeted)/len(IM_adv_targeted))*100)
