In [None]:
from datasets import load_dataset
import openai
import os
import itertools
import logging
import sys

import dspy
from dsp.utils import deduplicate
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
from dspy.evaluate import answer_exact_match, answer_passage_match
from dspy.evaluate.evaluate import Evaluate
from dsp.utils import deduplicate
from dspy.primitives import module

from copy import copy
import random
import json
import tqdm
import pickle
import pandas as pd
from pandas import json_normalize
import numpy as np

# Seed Python's `random` module so the `random.sample` draws in this notebook are repeatable.
random.seed(1)

## LM and RM

In [None]:
# Configure the language model (LM) and retrieval model (RM) that DSPy uses.
# The OpenAI key is read from the OPENAI_API_KEY environment variable so the
# credential never has to be saved inside the notebook; the original
# placeholder is kept as the fallback value.
openai_key = os.environ.get('OPENAI_API_KEY', '<api key>')
colbert_server = 'http://index.contextual.ai:8893/api/search'

lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key=openai_key)
rm = dspy.ColBERTv2(url=colbert_server)
# Register both as the defaults for every DSPy module created afterwards.
dspy.settings.configure(lm=lm, rm=rm)

## Dataset

In [None]:
# dataset download and split
def get_squad_split(squad, split="validation"):
    """Convert one split of the SQuAD dataset into a list of `dspy.Example`s.

    Args:
        squad: Mapping from split name to a dataset whose columns can be
            fetched by name (``squad[split]["question"]``).
        split: Name of the split to convert.

    Returns:
        One example per row, carrying ``question`` and ``answer`` (the first
        entry of the row's ``answers['text']`` list), with ``question``
        marked as the only input field.
    """
    rows = squad[split]
    # Fetch the two needed columns by name instead of unpacking every feature
    # positionally, so the result does not depend on column order or count.
    questions = rows["question"]
    answers = rows["answers"]
    return [
        dspy.Example(question=q, answer=a['text'][0]).with_inputs("question")
        for q, a in zip(questions, answers)
    ]

# Load SQuAD and convert both splits to DSPy examples.
squad = load_dataset("squad")
squad_train = get_squad_split(squad, split="train")
squad_dev = get_squad_split(squad)
# Draw 20 validation examples; the draw consumes the seeded `random` state.
dev_exs = random.sample(squad_dev, k=20)

In [None]:
def get_squad_df(squad, split="validation", sample = False, sample_size = 50):
    """Build a flat DataFrame of (context, question, answer) from a SQuAD split.

    Args:
        squad: Mapping from split name to an iterable of row dicts.
        split: Name of the split to read.
        sample: When truthy, keep only a random subset of rows.
        sample_size: Number of rows to keep when sampling; capped at the
            number of available rows.

    Returns:
        DataFrame with columns ``context``, ``question`` and ``answer``, where
        ``answer`` is the first entry of each row's ``answers.text`` list.
    """
    df = pd.json_normalize(squad[split])
    if sample:
        # `DataFrame.sample` raises when asked for more rows than exist, so
        # clamp the request to the frame's length.
        n_rows = min(sample_size, len(df))
        # Fixed random_state keeps the subset identical across calls.
        df = df.sample(n=n_rows, random_state=1).reset_index(drop=True)
    df['answer'] = df['answers.text'].apply(lambda texts: texts[0])
    return df[['context', 'question', 'answer']]

In [None]:
# Two sampled copies of the SQuAD training split: one per QA setting.
# Both calls use the same fixed random_state, so they hold the same rows.
squad_open = get_squad_df(squad, split='train', sample=True)
squad_close = get_squad_df(squad, split='train', sample=True)
squad_open.head()

Unnamed: 0,context,question,answer
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam


In [None]:
def get_halueval_split(dataset):
    """Convert the HaluEval QA ``data`` split into a list of `dspy.Example`s.

    Args:
        dataset: Mapping whose ``'data'`` entry exposes the columns
            ``question``, ``right_answer`` and ``hallucinated_answer`` by name.

    Returns:
        One example per row with fields ``question``, ``answer`` (the right
        answer) and ``halu`` (the hallucinated answer); ``question`` is the
        only input field.
    """
    rows = dataset['data']
    # Select columns by name rather than by position so that the order or
    # number of dataset features cannot shift values into the wrong field.
    columns = zip(rows['question'], rows['right_answer'], rows['hallucinated_answer'])
    return [
        dspy.Example(question=question, answer=right_ans, halu=halu_ans).with_inputs("question")
        for question, right_ans, halu_ans in columns
    ]

# Load the QA configuration of HaluEval and convert it to DSPy examples.
halueval_qa = load_dataset('pminervini/HaluEval', 'qa')
halu_ds = get_halueval_split(halueval_qa)
# Draw 20 examples; the draw consumes the seeded `random` state.
halu_dev = random.sample(halu_ds, k=20)

In [None]:
def get_he_df(squad, split="data", sample = False, sample_size = 50):
    """Build a DataFrame from a HaluEval QA split.

    Args:
        squad: Mapping from split name to an iterable of row dicts. The name
            is kept for backward compatibility; callers in this notebook pass
            the HaluEval dataset.
        split: Name of the split to read.
        sample: When truthy, keep only a random subset of rows.
        sample_size: Number of rows to keep when sampling; capped at the
            number of available rows.

    Returns:
        DataFrame with columns ``knowledge``, ``question``, ``right_answer``
        and ``hallucinated_answer``.
    """
    df = pd.json_normalize(squad[split])
    if sample:
        # `DataFrame.sample` raises when asked for more rows than exist, so
        # clamp the request to the frame's length.
        n_rows = min(sample_size, len(df))
        # Fixed random_state keeps the subset identical across calls.
        df = df.sample(n=n_rows, random_state=1).reset_index(drop=True)
    return df[['knowledge', 'question', 'right_answer', 'hallucinated_answer']]

In [None]:
# Two sampled copies of the HaluEval QA data: one per QA setting.
# Both calls use the same fixed random_state, so they hold the same rows.
he_close = get_he_df(halueval_qa, split='data', sample=True)
he_open = get_he_df(halueval_qa, split='data', sample=True)
he_open

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on..."
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...
5,"Jules Sitruk (born April 16, 1990 in Lilas, ne...","What comedy film, written and directed by Gart...",Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...
6,Non-Stop is a 2014 American mystery action thr...,Non-Stop starred the English actress best know...,Lady Mary Crawley,Michelle Dockery played Lady Mary.
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue..."
8,"Homage to the Queen, Op. 42, by Malcolm Arnold...",What was the nationality of the costume design...,English,The costume designer for Homage to the Queen w...
9,Shortia is a small genus of subshrubs or peren...,"Which genus, Shortia or Schizophragma, has mor...",Shortia,Schizophragma has fewer species.


## Segmentation

In [None]:
from nltk.tokenize import sent_tokenize
import nltk

# Fetch the tokenizer data that `sent_tokenize` needs. Recent NLTK releases
# load the "punkt_tab" resource instead of "punkt", so request both.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.download(_resource, quiet=True)
    except FileExistsError:  # multiprocessing race condition
        pass

In [None]:
class SentenceSplitter:
    """Small wrapper around sentence segmentation (NLTK-backed for now)."""

    def split_into_sentences(self, text):
        """Return the list of sentences that `sent_tokenize` finds in `text`."""
        sentences = sent_tokenize(text)
        return sentences

In [None]:
# Shared sentence splitter instance.
splitter = SentenceSplitter()

## Initial Response

## Close QA

In [None]:
class ContextQASignature(dspy.Signature):
    # Signature for answering a question from a supplied context:
    # inputs `context` and `question`, output `answer`.
    # NOTE(review): DSPy presumably turns `__doc__` and the `desc` strings into
    # prompt text, so rewording them would change model behavior — confirm.
    __doc__ = """Answer questions with short factoid answers."""
    context = dspy.InputField(desc="contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc = 'Do not repeat the question, only show final answer.')


class CloseQaRAG(dspy.Module):
    """Answer a question from a caller-supplied context passage."""

    def __init__(self):
        super().__init__()
        # Single prediction step driven by the context + question signature.
        self.generate_answer = dspy.Predict(ContextQASignature)

    def forward(self, context, question):
        """Return the `answer` field predicted for `question` given `context`."""
        return self.generate_answer(context=context, question=question).answer

In [None]:
# Model that answers from a provided context (no retrieval step).
closeqa_model = CloseQaRAG()

In [None]:
# Answer every sampled SQuAD question using the row's own context passage.
squad_close['initial_close_response'] = squad_close.apply(
    lambda row: closeqa_model(context=row['context'], question=row['question']),
    axis=1,
)
squad_close.head()

Unnamed: 0,context,question,answer,initial_close_response
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,"3,000 people."
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Roman Catholicism
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,The war resumed in March 1969.
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen Buddhist teachings
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,Encyclopaedia of Islam


In [None]:
# Answer every sampled HaluEval question using the row's `knowledge` text as context.
he_close['initial_close_response'] = he_close.apply(
    lambda row: closeqa_model(context=row['knowledge'], question=row['question']),
    axis=1,
)
he_close.head()


Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_close_response
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,United States Navy during World War II
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1961
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016"


In [None]:
# context = he_close['knowledge'][0]
# question = he_close['question'][0]
# closeqa_model(context=context, question=question)

In [None]:
# squad_close['close_claims'] = squad_close['initial_close_response'].apply(len)
# squad_close[(squad_close['close_claims']>1)]

In [None]:
# he_close['close_claims'] = he_close['initial_close_response'].apply(len)
# he_close[(he_close['close_claims']>1)]

## Open QA

In [None]:
class BasicQASignature(dspy.Signature):
    # Signature for answering a question with no context:
    # input `question`, output `answer`.
    # NOTE(review): DSPy presumably turns `__doc__` and the `desc` strings into
    # prompt text, so rewording them would change model behavior — confirm.
    __doc__ = """Answer questions with short factoid answers."""
    question = dspy.InputField()
    answer = dspy.OutputField(desc = 'do not repeat the question, only show final answer')

class OpenQaRAG(dspy.Module):
    """Answer a question from the question text alone (no context, no retrieval)."""

    def __init__(self):
        super().__init__()
        # Single prediction step driven by the question-only signature.
        self.generate_answer = dspy.Predict(BasicQASignature)

    def forward(self, question):
        """Return the `answer` field predicted for `question`."""
        return self.generate_answer(question=question).answer



# with retrieval

# same signature as before in closeQA
# class ContextQASignature(dspy.Signature):
#     __doc__ = """Answer questions with short factoid answers."""
#     context = dspy.InputField(desc="contain relevant facts")
#     question = dspy.InputField()
#     answer = dspy.OutputField(desc = 'Do not repeat the question, only show final answer.')

class ContextOpenQaRAG(dspy.Module):
    """Answer a question after retrieving passages for it."""

    def __init__(self, num_passages=1):
        super().__init__()
        # Retriever returning `num_passages` passages per query.
        self.retrieve = dspy.Retrieve(k=num_passages)
        # Same context + question signature as the close-QA model.
        self.generate_answer = dspy.Predict(ContextQASignature)

    def forward(self, question):
        """Retrieve passages for `question` and return the predicted `answer`."""
        passages = self.retrieve(question).passages
        return self.generate_answer(context=passages, question=question).answer


In [None]:
# Two open-domain variants: the first answers from the question alone,
# the second retrieves a passage before answering.
openqa_model = OpenQaRAG()
openqa_retrival_model = ContextOpenQaRAG()

## No retrieval

In [None]:
# Answer every sampled SQuAD question without any context.
squad_open['initial_open_response'] = squad_open.apply(
    lambda row: openqa_model(question=row['question']),
    axis=1,
)
squad_open.head()

Unnamed: 0,context,question,answer,initial_open_response
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,Approximately 800 people.
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Catholicism
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,2001
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen teachings.
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,The Quran


In [None]:
# Answer every sampled SQuAD question using retrieved passages as context.
squad_open['initial_open_context_response'] = squad_open.apply(
    lambda row: openqa_retrival_model(question=row['question']),
    axis=1,
)
squad_open.head()

Unnamed: 0,context,question,answer,initial_open_response,initial_open_context_response
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,Approximately 800 people.,"3,000 people"
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Catholicism,Catholicism
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,2001,1931
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen teachings.,No-mind experience and enlightenment.
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,The Quran,Encyclopaedia of Islam


In [None]:
# Answer every sampled HaluEval question without any context.
he_open['initial_open_response'] = he_open.apply(
    lambda row: openqa_model(question=row['question']),
    axis=1,
)
he_open.head()


Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_open_response
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,The actor served in the U.S. Navy.
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1967
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016"


In [None]:
# Answer every sampled HaluEval question using retrieved passages as context.
he_open['initial_open_context_response'] = he_open.apply(
    lambda row: openqa_retrival_model(question=row['question']),
    axis=1,
)
he_open.head()

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_open_response,initial_open_context_response
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,The actor served in the U.S. Navy.,Jack Elam served in the United States Navy bef...
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German,German
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,Detroit
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1967,"The song ""Your Mother Should Know"" is based on..."
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016"


## Hallucination check

In [None]:
class VerifyQASignature(dspy.Signature):
    # Signature for checking a claim against a context:
    # inputs `context`, `question` and `claim`; output `answer`.
    # NOTE(review): DSPy presumably turns `__doc__` and the `desc` strings into
    # prompt text, so rewording them would change model behavior — confirm.
    __doc__ = """Verify if the claim is mentioned in the context."""
    # Alternative instruction that was tried:
    # """You are a hallucination checking agent. Verify if there is relevant information found in context to back up this claim."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField(desc="original question that needs to be answer")
    claim = dspy.InputField()
    # `verify` inspects the start of this output to decide whether the claim was rejected.
    answer = dspy.OutputField(desc="First return a binary response, yes or no. Then provide evidence")


In [None]:
class HaluCheckRAG(dspy.Module):
    """Check a single claim against retrieved or caller-supplied context."""

    def __init__(self, num_passages=1):
        super().__init__()
        # Retriever used in 'open' mode; returns `num_passages` passages per query.
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(VerifyQASignature)

    def forward(self, question, claim, qa, knowledge=""):
        """Verify `claim` for `question`.

        Args:
            question: The original question.
            claim: The statement to verify.
            qa: ``'open'`` to retrieve context for the question, ``'close'``
                to use `knowledge` as the context.
            knowledge: Context text used in ``'close'`` mode; ignored in
                ``'open'`` mode.

        Returns:
            `dspy.Prediction` with the `context` that was used and the
            verifier's `answer`.

        Raises:
            ValueError: If `qa` is neither ``'open'`` nor ``'close'``.
        """
        if qa == 'open':
            context = self.retrieve(question).passages
        elif qa == 'close':
            context = knowledge
        else:
            # Without this branch `context` would be unbound and the failure
            # would surface as an UnboundLocalError with no hint of the cause.
            raise ValueError(f"qa must be 'open' or 'close', got {qa!r}")
        prediction = self.generate_answer(context=context, question=question, claim=claim)
        return dspy.Prediction(context=context, answer=prediction.answer)


In [None]:
# Verifier module that `halu_check` passes to `verify`, plus a sentence splitter instance.
halu_check_model = HaluCheckRAG()
splitter = SentenceSplitter()

In [None]:
def verify(question, claims, model, splitter, qa, context = ""):
    """Run the verifier on every claim and report whether any was rejected.

    Args:
        question: The original question the claims answer.
        claims: Iterable of claim strings to check.
        model: Callable invoked as
            ``model(question=..., claim=..., qa=..., knowledge=...)`` that
            returns an object with an ``answer`` attribute.
        splitter: Unused; kept so existing callers that pass a sentence
            splitter keep working.
        qa: Mode flag forwarded to `model`.
        context: Context text forwarded to `model` as ``knowledge``.

    Returns:
        ``(halu, support)``: `halu` is True when at least one verifier answer
        opens with a negative verdict; `support` lists the raw verifier
        answers in claim order.
    """
    # Leading words treated as a rejection, compared case-insensitively as
    # whole words so that answers such as "Northwestern ..." are not flagged.
    negative_verdicts = ("no", "not", "none")
    # Punctuation / markup that may be attached to the verdict word.
    trim_chars = ".,;:!?\"'*()[]"

    halu = False
    support = []
    for claim in claims:
        prediction = model(question = question, claim = claim, qa = qa, knowledge = context)
        # An empty or missing answer has no verdict word; treat it as "not rejected"
        # instead of failing on an out-of-range index.
        words = (prediction.answer or "").split()
        verdict = words[0].strip(trim_chars).lower() if words else ""
        if verdict in negative_verdicts:
            halu = True
        support.append(prediction.answer)
    return halu, support

In [None]:
def halu_check(df, response_col, col_label, qa, context_col="", model=None):
    """Split each response into claims, verify them, and store the results on `df`.

    Adds three columns to `df` (in place): ``claims_<col_label>``,
    ``hallucination_<col_label>`` and ``support_<col_label>``.

    Args:
        df: Frame with a ``question`` column and the response column.
        response_col: Column holding the model responses. If its first value
            is a list, every value is replaced by its first element.
        col_label: Suffix for the new columns, e.g. "close", "open" or "context".
        qa: ``'close'`` to verify against `context_col`; any other value is
            forwarded to `verify` with an empty context.
        context_col: Column holding the context text; used only when
            ``qa == 'close'``.
        model: Verifier passed to `verify`; defaults to the notebook-level
            `halu_check_model`.

    Returns:
        The same `df` object, with the new columns added.
    """
    checker = halu_check_model if model is None else model
    splitter = SentenceSplitter()

    # Positional access (`iloc`) so a frame whose index does not contain the
    # label 0 still works; an empty frame simply skips the unwrapping.
    if len(df) and isinstance(df[response_col].iloc[0], list):
        df[response_col] = df[response_col].apply(lambda x: x[0])

    claims_col = f'claims_{col_label}'
    df[claims_col] = df[response_col].apply(splitter.split_into_sentences)

    contexts = df[context_col] if qa == 'close' else [""] * len(df)
    results = [
        verify(question, claims, checker, splitter, qa, context)
        for question, claims, context in zip(df['question'], df[claims_col], contexts)
    ]

    # Build the result columns on `df`'s own index: assigning a frame with a
    # fresh 0..n-1 index would be aligned by label and yield NaN for any
    # non-default index.
    df[f'hallucination_{col_label}'] = pd.Series(
        [halu for halu, _ in results], index=df.index, dtype=bool
    )
    df[f'support_{col_label}'] = pd.Series(
        [support for _, support in results], index=df.index, dtype=object
    )
    return df

### close

In [None]:
# Verify the HaluEval close-QA answers against each row's `knowledge` text.
halu_check(
    he_close,
    response_col='initial_close_response',
    col_label='close',
    qa='close',
    context_col='knowledge',
)
he_close

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_close_response,claims_close,hallucination_close,support_close
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,United States Navy during World War II,[United States Navy during World War II],False,[Yes\n\nEvidence: The context explicitly state...
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German,[German],False,[Yes\n\nEvidence: The context mentions that Ma...
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,[Detroit],True,[No\n\nEvidence: The claim is incorrect. The i...
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1961,[1961],False,"[Yes, the claim that the song ""Your Mother Sho..."
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","[October 10, 2016]",False,"[Yes, the claim is mentioned in the context. T..."
5,"Jules Sitruk (born April 16, 1990 in Lilas, ne...","What comedy film, written and directed by Gart...",Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...,Son of Rambow,[Son of Rambow],False,[Yes\n\nEvidence: The context explicitly state...
6,Non-Stop is a 2014 American mystery action thr...,Non-Stop starred the English actress best know...,Lady Mary Crawley,Michelle Dockery played Lady Mary.,Lady Mary Crawley,[Lady Mary Crawley],False,"[Yes, the claim that Lady Mary Crawley is the ..."
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue...",Baby Blue,[Baby Blue],False,[Yes\n\nEvidence: The context mentions that An...
8,"Homage to the Queen, Op. 42, by Malcolm Arnold...",What was the nationality of the costume design...,English,The costume designer for Homage to the Queen w...,Oliver Messel was English.,[Oliver Messel was English.],False,[Yes\n\nEvidence: The context explicitly state...
9,Shortia is a small genus of subshrubs or peren...,"Which genus, Shortia or Schizophragma, has mor...",Shortia,Schizophragma has fewer species.,"Shortia has five species, while Schizophragma ...","[Shortia has five species, while Schizophragma...",False,"[Yes, the claim is mentioned in the context.]"


In [None]:
# HaluEval rows whose close-QA answer was flagged.
he_close.loc[he_close['hallucination_close'].eq(True)]

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_close_response,claims_close,hallucination_close,support_close
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,[Detroit],True,[No\n\nEvidence: The claim is incorrect. The i...
14,It is directly across from the Allegany Balli...,How many people are employed by the laboratory...,"some 1,000","The laboratory across from McKenzie, Maryland ...","1,000 people.","[1,000 people.]",True,[No\n\nEvidence: The context only mentions the...
40,"The Black Mafia, also known as the Muslim Mafi...","The Black Mafia, also known as the Muslim Mafi...",PBM,"The Black Mafia, also known as the Muslim Mafi...",PBM,[PBM],True,[No\n\nEvidence: The context mentions various ...


In [None]:
# Verify the SQuAD close-QA answers against each row's `context` passage.
halu_check(
    squad_close,
    response_col='initial_close_response',
    col_label='close',
    qa='close',
    context_col='context',
)
squad_close

Unnamed: 0,context,question,answer,initial_close_response,claims_close,hallucination_close,support_close
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,"3,000 people.","[3,000 people.]",True,"[No\n\nEvidence: The claim that 3,000 people a..."
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Roman Catholicism,[Roman Catholicism],False,[Yes\n\nEvidence: The context explicitly state...
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,The war resumed in March 1969.,[The war resumed in March 1969.],False,"[Yes, the claim is mentioned in the context.]"
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen Buddhist teachings,[Zen Buddhist teachings],False,[Yes\n\nEvidence: The context explicitly state...
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,Encyclopaedia of Islam,[Encyclopaedia of Islam],False,[Yes\n\nEvidence: The claim that Welch express...
5,The foundation of Northwestern University is t...,What type of church is Northwestern University...,Methodist Episcopal Church,Methodist Episcopal Church,[Methodist Episcopal Church],False,[Yes\n\nEvidence: The context explicitly state...
6,Brasília is known as a departing point for the...,What air sports event did Brasilia host in 2003?,the 14th Hang Gliding World Championship,Hang Gliding World Championship,[Hang Gliding World Championship],False,"[Yes, the claim that Brasília hosted the Hang ..."
7,"As of 2013, the American Idol alumni in their ...",How many single have been sold by American Ido...,120 million,Over 120 million singles.,[Over 120 million singles.],False,[Yes\n\nEvidence: The claim is directly suppor...
8,Professional wrestling shows can be considered...,How do the wrestlers treat the audience?,The audience is recognized and acknowledged by...,The wrestlers acknowledge and interact with th...,[The wrestlers acknowledge and interact with t...,False,"[Yes, the claim is mentioned in the context. T..."
9,"In the 19th century, a house near Ephesus in T...",In which country is Ephesus?,Turkey,Turkey,[Turkey],False,[Yes\n\nEvidence: Ephesus is indeed located in...


In [None]:
# SQuAD rows whose close-QA answer was flagged.
squad_close.loc[squad_close['hallucination_close'].eq(True)]

Unnamed: 0,context,question,answer,initial_close_response,claims_close,hallucination_close,support_close
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,"3,000 people.","[3,000 people.]",True,"[No\n\nEvidence: The claim that 3,000 people a..."
41,"Television personality Piers Morgan, a former ...",Who shared the first homosexual kiss on EastEn...,Colin Russell and Guido Smith,Colin Russell and Guido Smith.,[Colin Russell and Guido Smith.],True,"[No, the claim is not mentioned in the context..."


### open/ context

In [None]:
# Verify the SQuAD no-context answers in 'open' mode.
halu_check(
    squad_open,
    response_col='initial_open_response',
    col_label='open',
    qa='open',
)
squad_open

Unnamed: 0,context,question,answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,Approximately 800 people.,"3,000 people",[Approximately 800 people.],True,[No\n\nEvidence: The context clearly states th...
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Catholicism,Catholicism,[Catholicism],False,[Yes\n\nEvidence: The context explicitly state...
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,2001,1931,[2001],True,[No\n\nThere is no mention in the context of t...
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen teachings.,No-mind experience and enlightenment.,[Zen teachings.],False,"[Yes, Zen teachings are often full of paradox,..."
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,The Quran,Encyclopaedia of Islam,[The Quran],True,[No\n\nEvidence: The claim is not mentioned in...
5,The foundation of Northwestern University is t...,What type of church is Northwestern University...,Methodist Episcopal Church,Northwestern University is associated with the...,Reformed Church in America,[Northwestern University is associated with th...,True,[No\n\nEvidence: The context explicitly states...
6,Brasília is known as a departing point for the...,What air sports event did Brasilia host in 2003?,the 14th Hang Gliding World Championship,World Air Games,Hang Gliding World Championship,[World Air Games],True,[No\n\nEvidence: The context specifically ment...
7,"As of 2013, the American Idol alumni in their ...",How many single have been sold by American Ido...,120 million,Over 33 million singles.,Over 5 million copies worldwide.,[Over 33 million singles.],False,"[Yes, the claim is supported by the context as..."
8,Professional wrestling shows can be considered...,How do the wrestlers treat the audience?,The audience is recognized and acknowledged by...,With respect and appreciation.,As a combat sport.,[With respect and appreciation.],False,[Yes\n\nEvidence: The context mentions that pu...
9,"In the 19th century, a house near Ephesus in T...",In which country is Ephesus?,Turkey,Turkey,Turkey,[Turkey],False,"[Yes, the claim that Ephesus is in Turkey is m..."


In [None]:
# Shape of the flagged subset (rows, columns) for the no-context answers.
squad_open.loc[squad_open['hallucination_open'].eq(True)].shape

(24, 11)

In [None]:
# Verify the SQuAD retrieval-backed answers in 'open' mode.
halu_check(
    squad_open,
    response_col='initial_open_context_response',
    col_label='context',
    qa='open',
)
squad_open

Unnamed: 0,context,question,answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open,claims_context,hallucination_context,support_context
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,Approximately 800 people.,"3,000 people",[Approximately 800 people.],True,[No\n\nEvidence: The context clearly states th...,"[3,000 people]",False,[Yes\n\nEvidence: The claim is directly suppor...
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Catholicism,Catholicism,[Catholicism],False,[Yes\n\nEvidence: The context explicitly state...,[Catholicism],False,[Yes\n\nEvidence: The context explicitly state...
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,2001,1931,[2001],True,[No\n\nThere is no mention in the context of t...,[1931],False,[Yes\n\nEvidence: The context explicitly state...
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen teachings.,No-mind experience and enlightenment.,[Zen teachings.],False,"[Yes, Zen teachings are often full of paradox,...",[No-mind experience and enlightenment.],False,[Yes\n\nEvidence: The context specifically men...
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,The Quran,Encyclopaedia of Islam,[The Quran],True,[No\n\nEvidence: The claim is not mentioned in...,[Encyclopaedia of Islam],False,[Yes\n\nEvidence: The context specifically men...
5,The foundation of Northwestern University is t...,What type of church is Northwestern University...,Methodist Episcopal Church,Northwestern University is associated with the...,Reformed Church in America,[Northwestern University is associated with th...,True,[No\n\nEvidence: The context explicitly states...,[Reformed Church in America],False,[Yes\n\nEvidence: The context mentions that No...
6,Brasília is known as a departing point for the...,What air sports event did Brasilia host in 2003?,the 14th Hang Gliding World Championship,World Air Games,Hang Gliding World Championship,[World Air Games],True,[No\n\nEvidence: The context specifically ment...,[Hang Gliding World Championship],False,[Yes\n\nEvidence: The context explicitly state...
7,"As of 2013, the American Idol alumni in their ...",How many single have been sold by American Ido...,120 million,Over 33 million singles.,Over 5 million copies worldwide.,[Over 33 million singles.],False,"[Yes, the claim is supported by the context as...",[Over 5 million copies worldwide.],False,"[Yes, the claim is mentioned in the context as..."
8,Professional wrestling shows can be considered...,How do the wrestlers treat the audience?,The audience is recognized and acknowledged by...,With respect and appreciation.,As a combat sport.,[With respect and appreciation.],False,[Yes\n\nEvidence: The context mentions that pu...,[As a combat sport.],False,[Yes\n\nEvidence: The context mentions that pu...
9,"In the 19th century, a house near Ephesus in T...",In which country is Ephesus?,Turkey,Turkey,Turkey,[Turkey],False,"[Yes, the claim that Ephesus is in Turkey is m...",[Turkey],False,"[Yes, the claim that Ephesus is in Turkey is m..."


In [None]:
# Shape of the flagged subset (rows, columns) for the retrieval-backed answers.
squad_open.loc[squad_open['hallucination_context'].eq(True)].shape

(1, 11)

In [None]:
# Verify the HaluEval no-context answers in 'open' mode.
halu_check(
    he_open,
    response_col='initial_open_response',
    col_label='open',
    qa='open',
)
he_open

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,The actor served in the U.S. Navy.,Jack Elam served in the United States Navy bef...,[The actor served in the U.S. Navy.],False,[Yes\n\nEvidence: The context does not explici...
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German,German,[German],False,[Yes\n\nEvidence: The context mentions that Ha...
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,Detroit,[Detroit],True,[No\n\nEvidence: The claim is incorrect. The m...
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1967,"The song ""Your Mother Should Know"" is based on...",[1967],False,[Yes\n\nEvidence: The context explicitly state...
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016","[October 10, 2016]",True,[No\n\nEvidence: Tyler Hoechlin first appeared...
5,"Jules Sitruk (born April 16, 1990 in Lilas, ne...","What comedy film, written and directed by Gart...",Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...,Son of Rambow,"""Son of Rambow""",[Son of Rambow],True,[No\n\nEvidence: The context does not mention ...
6,Non-Stop is a 2014 American mystery action thr...,Non-Stop starred the English actress best know...,Lady Mary Crawley,Michelle Dockery played Lady Mary.,Lady Mary Crawley,Beryl Patmore,[Lady Mary Crawley],True,[No\n\nEvidence: The context clearly states th...
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue...",Gloria Trevi.,Inesperado,[Gloria Trevi.],True,[No\n\nEvidence: The context clearly states th...
8,"Homage to the Queen, Op. 42, by Malcolm Arnold...",What was the nationality of the costume design...,English,The costume designer for Homage to the Queen w...,Irish,Oliver Messel was British.,[Irish],True,[No\n\nEvidence: The context does not provide ...
9,Shortia is a small genus of subshrubs or peren...,"Which genus, Shortia or Schizophragma, has mor...",Shortia,Schizophragma has fewer species.,Shortia,Shortia,[Shortia],False,"[Yes, the claim that Shortia has more species ..."


In [None]:
# (rows, columns) of the he_open rows whose hallucination_open flag equals True,
# restricted to the answer / response / verdict columns.
he_open.loc[
    he_open['hallucination_open'] == True,
    ['right_answer', 'hallucinated_answer', 'initial_open_response', 'hallucination_open', 'support_open'],
].shape

(33, 5)

In [None]:
# Same check as before, now run on the initial_open_context_response column.
# NOTE(review): presumably this adds claims_context / hallucination_context / support_context to
# he_open in place (those columns appear in the frame displayed below) — confirm against halu_check.
halu_check(he_open, 'initial_open_context_response', 'context','open')
he_open

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open,claims_context,hallucination_context,support_context
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,The actor served in the U.S. Navy.,Jack Elam served in the United States Navy bef...,[The actor served in the U.S. Navy.],False,[Yes\n\nEvidence: The context does not explici...,[Jack Elam served in the United States Navy be...,False,[Yes\n\nEvidence: According to various sources...
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German,German,[German],False,[Yes\n\nEvidence: The context mentions that Ha...,[German],False,[Yes\n\nEvidence: The context mentions that Ha...
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,Detroit,[Detroit],True,[No\n\nEvidence: The claim is incorrect. The m...,[Detroit],True,[No\n\nEvidence: The claim is incorrect. The m...
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1967,"The song ""Your Mother Should Know"" is based on...",[1967],False,[Yes\n\nEvidence: The context explicitly state...,"[The song ""Your Mother Should Know"" is based o...",False,"[Yes, the claim is mentioned in the context.]"
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016","[October 10, 2016]",True,[No\n\nEvidence: Tyler Hoechlin first appeared...,"[October 10, 2016]",True,[No\n\nEvidence: Tyler Hoechlin first appeared...
5,"Jules Sitruk (born April 16, 1990 in Lilas, ne...","What comedy film, written and directed by Gart...",Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...,Son of Rambow,"""Son of Rambow""",[Son of Rambow],True,[No\n\nEvidence: The context does not mention ...,"[""Son of Rambow""]",False,[Yes\n\nEvidence: The claim is supported by th...
6,Non-Stop is a 2014 American mystery action thr...,Non-Stop starred the English actress best know...,Lady Mary Crawley,Michelle Dockery played Lady Mary.,Lady Mary Crawley,Beryl Patmore,[Lady Mary Crawley],True,[No\n\nEvidence: The context clearly states th...,[Beryl Patmore],False,"[Yes, the claim that Beryl Patmore is the role..."
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue...",Gloria Trevi.,Inesperado,[Gloria Trevi.],True,[No\n\nEvidence: The context clearly states th...,[Inesperado],True,[No\n\nEvidence: The claim is not mentioned in...
8,"Homage to the Queen, Op. 42, by Malcolm Arnold...",What was the nationality of the costume design...,English,The costume designer for Homage to the Queen w...,Irish,Oliver Messel was British.,[Irish],True,[No\n\nEvidence: The context does not provide ...,[Oliver Messel was British.],False,[Yes\n\nEvidence: The context states that the ...
9,Shortia is a small genus of subshrubs or peren...,"Which genus, Shortia or Schizophragma, has mor...",Shortia,Schizophragma has fewer species.,Shortia,Shortia,[Shortia],False,"[Yes, the claim that Shortia has more species ...",[Shortia],False,"[Yes, the claim that Shortia has more species ..."


In [None]:
# (rows, columns) of the he_open rows whose hallucination_context flag equals True.
he_open.loc[
    he_open['hallucination_context'] == True,
    ['right_answer', 'hallucinated_answer', 'initial_open_context_response', 'hallucination_context', 'support_context'],
].shape

(21, 5)

In [None]:
# Rows whose hallucination_context flag equals False, for a side-by-side look at the answers.
he_open.loc[
    he_open['hallucination_context'] == False,
    ['right_answer', 'hallucinated_answer', 'initial_open_context_response', 'hallucination_context', 'support_context'],
]


Unnamed: 0,right_answer,hallucinated_answer,initial_open_context_response,hallucination_context,support_context
0,the United States Navy,Jack Elam served in the Air Force.,Jack Elam served in the United States Navy bef...,False,[Yes\n\nEvidence: According to various sources...
1,German,Hans Loewald studied philosophy with a French ...,German,False,[Yes\n\nEvidence: The context mentions that Ha...
3,1961,"The song ""Your Mother Should Know"" is based on...","The song ""Your Mother Should Know"" is based on...",False,"[Yes, the claim is mentioned in the context.]"
5,Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...,"""Son of Rambow""",False,[Yes\n\nEvidence: The claim is supported by th...
6,Lady Mary Crawley,Michelle Dockery played Lady Mary.,Beryl Patmore,False,"[Yes, the claim that Beryl Patmore is the role..."
8,English,The costume designer for Homage to the Queen w...,Oliver Messel was British.,False,[Yes\n\nEvidence: The context states that the ...
9,Shortia,Schizophragma has fewer species.,Shortia,False,"[Yes, the claim that Shortia has more species ..."
10,United States,Mount Cardigan is a prominent bare-rock summit...,USA,False,"[Yes, the claim that Mount Cardigan is in the ..."
11,Texas Rangers,The Arizona Cardinals baseball team.,Boston Red Sox,False,[Yes\n\nEvidence: The context explicitly state...
12,"""Casper"" (1995)",Deanna Oliver and Sherri Stoner did some work ...,Casper,False,[Yes\n\nEvidence: The context mentions that Sh...


## Modification

In [None]:
class ReQueryQASignature(dspy.Signature):
    # Signature used to re-ask a question given an earlier response and feedback on it.
    # The task instruction text is supplied through __doc__.
    __doc__ = """Answer questions with short factoid answers."""
    # Inputs, in declaration order: supporting context, the question, the earlier response,
    # and the feedback on that response.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    response = dspy.InputField()
    feedback = dspy.InputField()
    # Single output field: the revised short answer.
    # NOTE(review): the desc below contains the typo "wtih"; it is prompt text, so it is left unchanged here.
    answer = dspy.OutputField(desc = 'Based on the context and taking the feedback on the response to the question into consideration, answer the questions wtih short answers (limited to less than 6 words)')

class ModifyOpenQaRAG(dspy.Module):
    """Re-answer a question from an earlier response plus feedback, optionally with retrieved passages.

    Args:
        num_passages: value passed as ``k`` to ``dspy.Retrieve``.
    """

    def __init__(self, num_passages=1):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.Predict(ReQueryQASignature)

    def forward(self, question, response, feedback, retrieval=True):
        """Produce a revised answer.

        Args:
            question: the question text.
            response: the earlier response to the question.
            feedback: feedback on that response.
            retrieval: when equal to True, passages retrieved for ``question`` are
                used as context; otherwise the context is an empty string.

        Returns:
            The ``answer`` field of the prediction.
        """
        context = self.retrieve(question).passages if retrieval == True else ""
        return self.generate_answer(
            context=context,
            question=question,
            response=response,
            feedback=feedback,
        ).answer


In [None]:
# Instantiate the re-query module with its default num_passages=1.
modify_model = ModifyOpenQaRAG()

In [None]:
# Flatten each per-claim list of support texts into one space-separated string so it can be
# passed to the model as a single feedback value.
# Strings are left untouched so that re-running this cell is harmless: joining an
# already-joined string would otherwise interleave a space between every character.
for support_col in ('support_open', 'support_context'):
    he_open[support_col] = he_open[support_col].apply(
        lambda parts: parts if isinstance(parts, str) else ' '.join(parts)
    )

In [None]:
# Re-ask every question with the earlier response and its feedback text.
# Open setting: no retrieval (empty context). Context setting: passages are retrieved.
he_open['modified_open_response'] = he_open.apply(
    lambda row: modify_model(
        question=row['question'],
        response=row['initial_open_response'],
        feedback=row['support_open'],
        retrieval=False,
    ),
    axis=1,
)
he_open['modified_context_response'] = he_open.apply(
    lambda row: modify_model(
        question=row['question'],
        response=row['initial_open_context_response'],
        feedback=row['support_context'],
        retrieval=True,
    ),
    axis=1,
)


In [None]:
# Inspect which columns he_open currently carries.
he_open.columns

Index(['knowledge', 'question', 'right_answer', 'hallucinated_answer',
       'initial_open_response', 'initial_open_context_response', 'claims_open',
       'hallucination_open', 'support_open', 'claims_context',
       'hallucination_context', 'support_context', 'support_context_len',
       'modified_open_response', 'modified_context_response',
       'EM_right_answer', 'PM_right_answer', 'BertScore_f1_right_answer',
       'BertScore_p_right_answer', 'BertScore_r_right_answer',
       'meteor_right_answer', 'bleu_right_answer', 'rouge1_right_answer',
       'rouge2_right_answer', 'rougeL_right_answer', 'EM_hallucinated_answer',
       'PM_hallucinated_answer', 'BertScore_f1_hallucinated_answer',
       'BertScore_p_hallucinated_answer', 'BertScore_r_hallucinated_answer',
       'meteor_hallucinated_answer', 'bleu_hallucinated_answer',
       'rouge1_hallucinated_answer', 'rouge2_hallucinated_answer',
       'rougeL_hallucinated_answer'],
      dtype='object')

In [None]:
# For rows flagged in hallucination_open: initial vs. modified response next to the reference answers.
he_open.loc[
    he_open['hallucination_open'] == True,
    ['hallucination_open', 'hallucinated_answer', 'initial_open_response', 'modified_open_response', 'right_answer'],
]

Unnamed: 0,hallucination_open,hallucinated_answer,initial_open_response,modified_open_response,right_answer
2,True,The Knicks-Nuggets brawl was more infamous tha...,Detroit,"Auburn Hills, Michigan","Auburn Hills, Michigan"
4,True,Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016","October 10, 2016"
5,True,Jules Sitruk did not act in any of Garth Jenni...,Son of Rambow,Son of Rambow,Son of Rambow
6,True,Michelle Dockery played Lady Mary.,Lady Mary Crawley,Michelle Dockery,Lady Mary Crawley
7,True,"Anahí is married to Governor of Chiapas, Manue...",Gloria Trevi.,Anahí.,Baby Blue
8,True,The costume designer for Homage to the Queen w...,Irish,British,English
11,True,The Arizona Cardinals baseball team.,Texas Rangers,Boston Red Sox,Texas Rangers
13,True,The tracks were premiered at Federation Square...,St Kilda Road,Federation Square,Flinders Street railway station
14,True,"The laboratory across from McKenzie, Maryland ...","Approximately 1,500 people.","Approximately 6,000 people.","some 1,000"
15,True,Comedian is a 2002 American documentary film f...,1957,1954,"July 21, 1952"


In [None]:
# For rows flagged in hallucination_context: initial vs. modified response next to the reference answers.
he_open.loc[
    he_open['hallucination_context'] == True,
    ['hallucination_context', 'hallucinated_answer', 'initial_open_context_response', 'modified_context_response', 'right_answer'],
]

Unnamed: 0,hallucination_context,hallucinated_answer,initial_open_context_response,modified_context_response,right_answer
2,True,The Knicks-Nuggets brawl was more infamous tha...,Detroit,Detroit,"Auburn Hills, Michigan"
4,True,Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016","October 10, 2016"
7,True,"Anahí is married to Governor of Chiapas, Manue...",Inesperado,Inesperado,Baby Blue
15,True,Comedian is a 2002 American documentary film f...,Born in 1970.,Born in 1954.,"July 21, 1952"
19,True,The television movie Shadow of the Cobra is ba...,"Danish photographer and filmmaker, Bjørn Stigson.",Shadow of the Cobra is a 1989 television movie...,Charles Sobhraj
20,True,"""The Mighty Atom"" was a 1917 British silent dr...",The Mighty Atom,The Mighty Atom,A Pit Boy's Romance
25,True,Kim Clijsters is older by months.,Kim Clijsters,Mary Pierce,Mary Pierce
26,True,Necropolis is a fantasy novel by Anthony Horow...,"Anthony Horowitz was born on April 5, 1955.","Anthony Horowitz was born on April 5, 1955.",5 April 1955
28,True,Adolf Hitler was commanding the SM UC-25 minel...,Karl Dönitz,Wilhelm II,Karl Dönitz
29,True,The man who won the gold medal in the competit...,Nigeria,We cannot determine the birthplace of the male...,Windhoek


## Evaluation

In [None]:
# !pip install bert-score
# !pip install evaluate
# !pip install rouge_score
# !pip install sentencepiece

In [None]:
import evaluate
from evaluate import load
import tensorflow as tf
import bleurt
from bleurt import score
import evaluate

# Load every scorer once; the evaluation cells reuse these module-level objects.
bertscore = evaluate.load("bertscore")
checkpoint = "bleurt/test_checkpoint"  # path is relative to the working directory
scorer = score.BleurtScorer(checkpoint)
meteor, bleu, rouge = (evaluate.load(metric_name) for metric_name in ('meteor', 'bleu', 'rouge'))

[nltk_data] Downloading package wordnet to /home/jenny/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jenny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jenny/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def eval(input_df, pred, ref): # pred = 'initial_response'
    """Score one prediction column against one reference column, row by row.

    NOTE: this name shadows Python's builtin ``eval``; it is kept because other
    cells call the function by this name.

    Args:
        input_df: DataFrame containing both columns. It is not modified.
        pred: name of the column holding the predicted text (one string per row).
        ref: name of the column holding the reference text (one string per row).

    Returns:
        A new DataFrame with the ``pred`` and ``ref`` columns followed by
        ``EM_{ref}``, ``PM_{ref}``, ``BertScore_f1_{ref}``, ``BertScore_p_{ref}``,
        ``BertScore_r_{ref}``, ``meteor_{ref}``, ``bleu_{ref}``, ``rouge1_{ref}``,
        ``rouge2_{ref}`` and ``rougeL_{ref}``. Metric column names depend only on
        ``ref``, so results for different ``pred`` columns share the same names.
    """
    # Work on an explicit copy: adding columns to a bare slice raises SettingWithCopyWarning.
    df = input_df[[pred, ref]].copy()

    metric_names = [
        f"EM_{ref}", f"PM_{ref}",
        f'BertScore_f1_{ref}', f'BertScore_p_{ref}', f'BertScore_r_{ref}',
        f'meteor_{ref}', f'bleu_{ref}',
        f'rouge1_{ref}', f'rouge2_{ref}', f'rougeL_{ref}',
    ]
    scores = {name: [] for name in metric_names}

    for prediction, reference in zip(df[pred], df[ref]):
        example = dspy.Example(answer=[reference])
        # Compare the whole prediction string. Indexing it with [0] would pass only its
        # first character (and fail on an empty string).
        scores[f"EM_{ref}"].append(
            answer_exact_match(example, dspy.Prediction(answer=prediction)))
        # The passage-match metric iterates over a list of passages, so wrap the prediction in a list.
        scores[f"PM_{ref}"].append(
            answer_passage_match(example, dspy.Prediction(context=[prediction])))

        # One BERTScore call yields f1, precision and recall together.
        bert = bertscore.compute(predictions=[prediction], references=[reference], model_type='bert-base-uncased')
        scores[f'BertScore_f1_{ref}'].append(bert['f1'][0])
        scores[f'BertScore_p_{ref}'].append(bert['precision'][0])
        scores[f'BertScore_r_{ref}'].append(bert['recall'][0])
        # BLEURT is currently disabled:
        # scorer.score(references=[reference], candidates=[prediction])[0]

        scores[f'meteor_{ref}'].append(
            meteor.compute(predictions=[prediction], references=[reference])['meteor'])
        scores[f'bleu_{ref}'].append(
            bleu.compute(predictions=[prediction], references=[reference])['bleu'])

        # One ROUGE call yields all three variants.
        rouge_scores = rouge.compute(predictions=[prediction], references=[reference])
        scores[f'rouge1_{ref}'].append(rouge_scores['rouge1'])
        scores[f'rouge2_{ref}'].append(rouge_scores['rouge2'])
        scores[f'rougeL_{ref}'].append(rouge_scores['rougeL'])

    for name in metric_names:
        df[name] = scores[name]
    return df

In [None]:
# eval() returns a new frame and leaves its input untouched, so its metric columns are copied
# back onto squad_open here; otherwise the frame displayed below would not contain them.
# Metric columns are named after the reference only (e.g. "EM_answer"), so for each reference
# the later call (initial_open_context_response) overwrites the scores of the earlier one.
for pred_col, ref_col in [
    ('initial_open_response', 'answer'),
    ('initial_open_response', 'context'),
    ('initial_open_context_response', 'answer'),
    ('initial_open_context_response', 'context'),
]:
    scored = eval(squad_open, pred_col, ref_col).drop(columns=[pred_col, ref_col])
    for metric_col in scored.columns:
        squad_open[metric_col] = scored[metric_col]
squad_open

Unnamed: 0,context,question,answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open,claims_context,hallucination_context,...,EM_context,PM_context,BertScore_f1_context,BertScore_p_context,BertScore_r_context,meteor_context,bleu_context,rouge1_context,rouge2_context,rougeL_context
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000,Approximately 800 people.,"3,000 people",[Approximately 800 people.],True,[No\n\nEvidence: The context clearly states th...,"[3,000 people]",False,...,False,False,0.348897,0.604015,0.245293,0.005187,0.0,0.03,0.020202,0.03
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism,Catholicism,Catholicism,[Catholicism],False,[Yes\n\nEvidence: The context explicitly state...,[Catholicism],False,...,False,False,0.34186,0.557015,0.246605,0.004785,0.0,0.018868,0.0,0.018868
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969,2001,1931,[2001],True,[No\n\nThere is no mention in the context of t...,[1931],False,...,False,False,0.280465,0.407808,0.213726,0.0,0.0,0.0,0.0,0.0
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen,Zen teachings.,No-mind experience and enlightenment.,[Zen teachings.],False,"[Yes, Zen teachings are often full of paradox,...",[No-mind experience and enlightenment.],False,...,False,False,0.41399,0.530935,0.339263,0.008019,0.0,0.015748,0.0,0.015748
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam,The Quran,Encyclopaedia of Islam,[The Quran],True,[No\n\nEvidence: The claim is not mentioned in...,[Encyclopaedia of Islam],False,...,False,False,0.371661,0.788287,0.243151,0.006739,0.0,0.027397,0.018433,0.027397
5,The foundation of Northwestern University is t...,What type of church is Northwestern University...,Methodist Episcopal Church,Northwestern University is associated with the...,Reformed Church in America,[Northwestern University is associated with th...,True,[No\n\nEvidence: The context explicitly states...,[Reformed Church in America],False,...,False,False,0.325724,0.466795,0.250132,0.008084,0.0,0.03125,0.0,0.03125
6,Brasília is known as a departing point for the...,What air sports event did Brasilia host in 2003?,the 14th Hang Gliding World Championship,World Air Games,Hang Gliding World Championship,[World Air Games],True,[No\n\nEvidence: The context specifically ment...,[Hang Gliding World Championship],False,...,False,False,0.428669,0.738198,0.302027,0.033577,1.783247e-11,0.081633,0.0625,0.081633
7,"As of 2013, the American Idol alumni in their ...",How many single have been sold by American Ido...,120 million,Over 33 million singles.,Over 5 million copies worldwide.,[Over 33 million singles.],False,"[Yes, the claim is supported by the context as...",[Over 5 million copies worldwide.],False,...,False,False,0.486992,0.599206,0.410177,0.05102,0.0,0.111111,0.0,0.111111
8,Professional wrestling shows can be considered...,How do the wrestlers treat the audience?,The audience is recognized and acknowledged by...,With respect and appreciation.,As a combat sport.,[With respect and appreciation.],False,[Yes\n\nEvidence: The context mentions that pu...,[As a combat sport.],False,...,False,False,0.355968,0.473854,0.285052,0.013263,0.0,0.02963,0.0,0.02963
9,"In the 19th century, a house near Ephesus in T...",In which country is Ephesus?,Turkey,Turkey,Turkey,[Turkey],False,"[Yes, the claim that Ephesus is in Turkey is m...",[Turkey],False,...,False,False,0.289664,0.577141,0.193353,0.00455,0.0,0.018018,0.0,0.018018


In [None]:
# eval() returns a new frame and leaves its input untouched, so its metric columns are copied
# back onto he_open here; the groupby cells further down read them from he_open.
# Metric columns are named after the reference only (e.g. "EM_right_answer"), so for each
# reference the later call (initial_open_context_response) overwrites the scores of the earlier one.
for pred_col, ref_col in [
    ('initial_open_response', 'right_answer'),
    ('initial_open_response', 'hallucinated_answer'),
    ('initial_open_context_response', 'right_answer'),
    ('initial_open_context_response', 'hallucinated_answer'),
]:
    scored = eval(he_open, pred_col, ref_col).drop(columns=[pred_col, ref_col])
    for metric_col in scored.columns:
        he_open[metric_col] = scored[metric_col]
he_open

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open,claims_context,...,EM_hallucinated_answer,PM_hallucinated_answer,BertScore_f1_hallucinated_answer,BertScore_p_hallucinated_answer,BertScore_r_hallucinated_answer,meteor_hallucinated_answer,bleu_hallucinated_answer,rouge1_hallucinated_answer,rouge2_hallucinated_answer,rougeL_hallucinated_answer
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.,The actor served in the U.S. Navy.,Jack Elam served in the United States Navy bef...,[The actor served in the U.S. Navy.],False,Yes\n\nEvidence: The context does not explicit...,[Jack Elam served in the United States Navy be...,...,False,False,0.719747,0.651335,0.804217,0.69281,0.302664,0.526316,0.470588,0.526316
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...,German,German,[German],False,Yes\n\nEvidence: The context mentions that Han...,[German],...,False,False,0.338611,0.436686,0.27651,0.0,0.0,0.0,0.0,0.0
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,Detroit,[Detroit],True,No\n\nEvidence: The claim is incorrect. The mo...,[Detroit],...,False,False,0.262563,0.262151,0.262977,0.0,0.0,0.0,0.0,0.0
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on...",1967,"The song ""Your Mother Should Know"" is based on...",[1967],False,Yes\n\nEvidence: The context explicitly states...,"[The song ""Your Mother Should Know"" is based o...",...,False,False,0.996379,0.996379,0.996379,0.949446,0.903602,0.941176,0.9375,0.941176
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016","[October 10, 2016]",True,No\n\nEvidence: Tyler Hoechlin first appeared ...,"[October 10, 2016]",...,False,False,0.465039,0.753055,0.336384,0.183853,0.0,0.25,0.0,0.25
5,"Jules Sitruk (born April 16, 1990 in Lilas, ne...","What comedy film, written and directed by Gart...",Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...,Son of Rambow,"""Son of Rambow""",[Son of Rambow],True,No\n\nEvidence: The context does not mention J...,"[""Son of Rambow""]",...,False,False,0.31758,0.331002,0.305204,0.040984,0.0,0.142857,0.0,0.142857
6,Non-Stop is a 2014 American mystery action thr...,Non-Stop starred the English actress best know...,Lady Mary Crawley,Michelle Dockery played Lady Mary.,Lady Mary Crawley,Beryl Patmore,[Lady Mary Crawley],True,No\n\nEvidence: The context clearly states tha...,[Beryl Patmore],...,False,False,0.39922,0.399456,0.398984,0.0,0.0,0.0,0.0,0.0
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue...",Gloria Trevi.,Inesperado,[Gloria Trevi.],True,No\n\nEvidence: The context clearly states tha...,[Inesperado],...,False,False,0.321514,0.36823,0.285317,0.0,0.0,0.0,0.0,0.0
8,"Homage to the Queen, Op. 42, by Malcolm Arnold...",What was the nationality of the costume design...,English,The costume designer for Homage to the Queen w...,Irish,Oliver Messel was British.,[Irish],True,No\n\nEvidence: The context does not provide a...,[Oliver Messel was British.],...,False,False,0.530588,0.59627,0.477941,0.28312,0.0,0.285714,0.166667,0.285714
9,Shortia is a small genus of subshrubs or peren...,"Which genus, Shortia or Schizophragma, has mor...",Shortia,Schizophragma has fewer species.,Shortia,Shortia,[Shortia],False,"Yes, the claim that Shortia has more species i...",[Shortia],...,False,False,0.381335,0.431628,0.34154,0.0,0.0,0.0,0.0,0.0


In [None]:
# Mean BERTScore F1 against the right and the hallucinated answer, split by the hallucination_context flag.
(
    he_open
    .groupby(by=['hallucination_context'])[['BertScore_f1_right_answer', 'BertScore_f1_hallucinated_answer']]
    .mean()
)

Unnamed: 0_level_0,BertScore_f1_right_answer,BertScore_f1_hallucinated_answer
hallucination_context,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.645178,0.465062
True,0.566132,0.424398


In [None]:
# Mean BERTScore F1 against the right and the hallucinated answer, split by the hallucination_open flag.
(
    he_open
    .groupby(by=['hallucination_open'])[['BertScore_f1_right_answer', 'BertScore_f1_hallucinated_answer']]
    .mean()
)

Unnamed: 0_level_0,BertScore_f1_right_answer,BertScore_f1_hallucinated_answer
hallucination_open,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.654175,0.493348
True,0.590241,0.424613


In [None]:
# Mean of every right_answer similarity metric, split by the hallucination_open flag.
(
    he_open
    .groupby(by=['hallucination_open'])[[
        'BertScore_f1_right_answer', 'BertScore_p_right_answer', 'BertScore_r_right_answer',
        'meteor_right_answer', 'bleu_right_answer',
        'rouge1_right_answer', 'rouge2_right_answer', 'rougeL_right_answer',
    ]]
    .mean()
)

Unnamed: 0_level_0,BertScore_f1_right_answer,BertScore_p_right_answer,BertScore_r_right_answer,meteor_right_answer,bleu_right_answer,rouge1_right_answer,rouge2_right_answer,rougeL_right_answer
hallucination_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,0.654175,0.650072,0.681386,0.389437,0.070199,0.491453,0.231092,0.491453
True,0.590241,0.578547,0.616806,0.185828,0.034144,0.247742,0.110606,0.242233


In [None]:
# Mean of every hallucinated_answer similarity metric, split by the hallucination_open flag.
(
    he_open
    .groupby(by=['hallucination_open'])[[
        'BertScore_f1_hallucinated_answer', 'BertScore_p_hallucinated_answer', 'BertScore_r_hallucinated_answer',
        'meteor_hallucinated_answer', 'bleu_hallucinated_answer',
        'rouge1_hallucinated_answer', 'rouge2_hallucinated_answer', 'rougeL_hallucinated_answer',
    ]]
    .mean()
)

Unnamed: 0_level_0,BertScore_f1_hallucinated_answer,BertScore_p_hallucinated_answer,BertScore_r_hallucinated_answer,meteor_hallucinated_answer,bleu_hallucinated_answer,rouge1_hallucinated_answer,rouge2_hallucinated_answer,rougeL_hallucinated_answer
hallucination_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,0.493348,0.557231,0.461158,0.184409,0.078358,0.215381,0.120201,0.198978
True,0.424613,0.508128,0.3749,0.102595,0.02166,0.136474,0.079267,0.136474


In [None]:
# All columns for the rows whose hallucination_context flag equals True.
he_open.loc[he_open['hallucination_context'] == True]

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer,initial_open_response,initial_open_context_response,claims_open,hallucination_open,support_open,claims_context,...,EM_hallucinated_answer,PM_hallucinated_answer,BertScore_f1_hallucinated_answer,BertScore_p_hallucinated_answer,BertScore_r_hallucinated_answer,meteor_hallucinated_answer,bleu_hallucinated_answer,rouge1_hallucinated_answer,rouge2_hallucinated_answer,rougeL_hallucinated_answer
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...,Detroit,Detroit,[Detroit],True,[No\n\nEvidence: The claim is incorrect. The m...,[Detroit],...,False,False,0.262563,0.262151,0.262977,0.0,0.0,0.0,0.0,0.0
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...,"October 10, 2016","October 10, 2016","[October 10, 2016]",True,[No\n\nEvidence: Tyler Hoechlin first appeared...,"[October 10, 2016]",...,False,False,0.465039,0.753055,0.336384,0.183853,0.0,0.25,0.0,0.25
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue...",Gloria Trevi.,Inesperado,[Gloria Trevi.],True,[No\n\nEvidence: The context clearly states th...,[Inesperado],...,False,False,0.321514,0.36823,0.285317,0.0,0.0,0.0,0.0,0.0
15,Comedian is a 2002 American documentary film f...,"2002 American documentary film, Comedian, is a...","July 21, 1952",Comedian is a 2002 American documentary film f...,1957,Born in 1970.,[1957],True,[No\n\nEvidence: Jerry Seinfeld was born on Ap...,[Born in 1970.],...,False,False,0.345794,0.475727,0.27161,0.111596,0.0,0.148148,0.08,0.148148
19,Shadow of the Cobra is a 1989 television movie...,Shadow of the Cobra is a 1989 television movie...,Charles Sobhraj,The television movie Shadow of the Cobra is ba...,Ted Bundy,"Danish photographer and filmmaker, Bjørn Stigson.",[Ted Bundy],True,[No\n\nEvidence: The context provided does not...,"[Danish photographer and filmmaker, Bjørn Stig...",...,False,False,0.362268,0.371591,0.353402,0.053191,0.0,0.0,0.0,0.0
20,A Pit Boy's Romance is a 1917 British silent d...,Which 1917 British silent drama film stars the...,A Pit Boy's Romance,"""The Mighty Atom"" was a 1917 British silent dr...",The Ring,The Mighty Atom,[The Ring],True,[No.],[The Mighty Atom],...,False,False,0.376303,0.646944,0.265313,0.14649,0.0,0.272727,0.2,0.272727
25,Kim Antonie Lode Clijsters (] ; born 8 June 19...,Between two tennis players Kim Clijsters and M...,Mary Pierce,Kim Clijsters is older by months.,Kim Clijsters,Kim Clijsters,[Kim Clijsters],True,[No],[Kim Clijsters],...,False,False,0.662561,0.848099,0.543631,0.288462,0.0,0.5,0.333333,0.5
26,Necropolis is a fantasy novel by British write...,"Necropolis is a fantasy novel by English, nove...",5 April 1955,Necropolis is a fantasy novel by Anthony Horow...,"April 5, 1955","Anthony Horowitz was born on April 5, 1955.","[April 5, 1955]",True,"[No, the claim that Anthony Horowitz was born ...","[Anthony Horowitz was born on April 5, 1955.]",...,False,False,0.654172,0.722422,0.597704,0.35249,0.0,0.380952,0.210526,0.380952
28,"SM ""UC-25"" was a German Type UC II minelaying ...",Who is the head of state of Germany who comman...,Karl Dönitz,Adolf Hitler was commanding the SM UC-25 minel...,Kaiser Wilhelm II,Karl Dönitz,[Kaiser Wilhelm II],True,[No\n\nEvidence: The context clearly states th...,[Karl Dönitz],...,False,False,0.347896,0.43231,0.291063,0.0,0.0,0.0,0.0,0.0
29,He was born in Windhoek. He won a gold medal ...,Where was the man who won the gold medal in th...,Windhoek,The man who won the gold medal in the competit...,"Born in Havana, Cuba.",Nigeria,"[Born in Havana, Cuba.]",True,[No. There is no mention of the birthplace of ...,[Nigeria],...,False,False,0.253208,0.355504,0.196628,0.0,0.0,0.0,0.0,0.0


In [None]:
# Column means of the hallucinated_answer metrics over rows flagged in BOTH settings,
# reshaped into a two-column frame (metric name, mean).
df1 = pd.DataFrame(
    he_open.loc[
        (he_open['hallucination_context'] == True) & (he_open['hallucination_open'] == True),
        [
            'hallucination_open',
            'BertScore_f1_hallucinated_answer', 'BertScore_p_hallucinated_answer', 'BertScore_r_hallucinated_answer',
            'meteor_hallucinated_answer', 'bleu_hallucinated_answer',
            'rouge1_hallucinated_answer', 'rouge2_hallucinated_answer', 'rougeL_hallucinated_answer',
        ],
    ].mean()
).reset_index()

In [None]:
# Column means of the right_answer metrics over rows flagged in BOTH settings,
# reshaped into a two-column frame (metric name, mean).
df2 = pd.DataFrame(
    he_open.loc[
        (he_open['hallucination_context'] == True) & (he_open['hallucination_open'] == True),
        [
            'hallucination_open',
            'BertScore_f1_right_answer', 'BertScore_p_right_answer', 'BertScore_r_right_answer',
            'meteor_right_answer', 'bleu_right_answer',
            'rouge1_right_answer', 'rouge2_right_answer', 'rougeL_right_answer',
        ],
    ].mean()
).reset_index()

In [None]:
# Place the two summaries side by side: hallucinated_answer means on the left, right_answer means on the right.
pd.concat((df1, df2), axis="columns")

Unnamed: 0,index,0,index.1,0.1
0,hallucination_open,1.0,hallucination_open,1.0
1,BertScore_f1_hallucinated_answer,0.406451,BertScore_f1_right_answer,0.589181
2,BertScore_p_hallucinated_answer,0.516156,BertScore_p_right_answer,0.587726
3,BertScore_r_hallucinated_answer,0.342934,BertScore_r_right_answer,0.611653
4,meteor_hallucinated_answer,0.106461,meteor_right_answer,0.234506
5,bleu_hallucinated_answer,0.016381,bleu_right_answer,0.059302
6,rouge1_hallucinated_answer,0.156582,rouge1_right_answer,0.286853
7,rouge2_hallucinated_answer,0.094489,rouge2_right_answer,0.139474
8,rougeL_hallucinated_answer,0.156582,rougeL_right_answer,0.277284


In [None]:
# Inspect which columns he_open currently carries.
he_open.columns

Index(['knowledge', 'question', 'right_answer', 'hallucinated_answer',
       'initial_open_response', 'initial_open_context_response', 'claims_open',
       'hallucination_open', 'support_open', 'claims_context',
       'hallucination_context', 'support_context', 'support_context_len',
       'modified_open_response', 'modified_context_response',
       'EM_right_answer', 'PM_right_answer', 'BertScore_f1_right_answer',
       'BertScore_p_right_answer', 'BertScore_r_right_answer',
       'meteor_right_answer', 'bleu_right_answer', 'rouge1_right_answer',
       'rouge2_right_answer', 'rougeL_right_answer', 'EM_hallucinated_answer',
       'PM_hallucinated_answer', 'BertScore_f1_hallucinated_answer',
       'BertScore_p_hallucinated_answer', 'BertScore_r_hallucinated_answer',
       'meteor_hallucinated_answer', 'bleu_hallucinated_answer',
       'rouge1_hallucinated_answer', 'rouge2_hallucinated_answer',
       'rougeL_hallucinated_answer'],
      dtype='object')

In [None]:
# Compare the initial and the modified open responses against the reference
# answer using BERTScore F1 (bert-base-uncased), one row at a time.
#
# .copy() makes he_open_eval an independent frame rather than a slice of
# he_open, so the column assignments below are unambiguous and do not raise
# pandas' SettingWithCopyWarning.
he_open_eval = he_open[['right_answer', 'modified_open_response', 'initial_open_response']].copy()

he_open_eval['BertScore_f1_initial'] = he_open_eval.apply(
    lambda row: bertscore.compute(
        predictions=[row['initial_open_response']],
        references=[row['right_answer']],
        model_type='bert-base-uncased',
    )['f1'][0],
    axis=1,
)
he_open_eval['BertScore_f1_modified'] = he_open_eval.apply(
    lambda row: bertscore.compute(
        predictions=[row['modified_open_response']],
        references=[row['right_answer']],
        model_type='bert-base-uncased',
    )['f1'][0],
    axis=1,
)

# Positive values mean the modified response scores higher against the
# reference than the initial response did.
he_open_eval['BertScore_f1_improvement'] = (
    he_open_eval['BertScore_f1_modified'] - he_open_eval['BertScore_f1_initial']
)
he_open_eval




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  he_open_eval['BertScore_f1_initial'] = he_open_eval.apply(lambda x: bertscore.compute(predictions=[x['initial_open_response']], references=[x['right_answer']], model_type='bert-base-uncased')['f1'][0] ,axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  he_open_eval['BertScore_f1_modified'] = he_open_eval.apply(lambda x: bertscore.compute(predictions=[x['modified_open_response']], references=[x['right_answer']], model_type='bert-base-uncased')['f1'][0] ,axis =1)
A value is trying to be set on a copy of a sli

Unnamed: 0,right_answer,modified_open_response,initial_open_response,BertScore_f1_initial,BertScore_f1_modified,BertScore_f1_improvement
0,the United States Navy,U.S. Navy.,The actor served in the U.S. Navy.,0.57913,0.654006,0.074876
1,German,German,German,1.0,1.0,0.0
2,"Auburn Hills, Michigan","Auburn Hills, Michigan",Detroit,0.491526,1.0,0.508474
3,1961,1967,1967,0.881259,0.881259,0.0
4,"October 10, 2016","October 10, 2016","October 10, 2016",1.0,1.0,0.0
5,Son of Rambow,Son of Rambow,Son of Rambow,1.0,1.0,0.0
6,Lady Mary Crawley,Michelle Dockery,Lady Mary Crawley,1.0,0.407682,-0.592318
7,Baby Blue,Anahí.,Gloria Trevi.,0.455289,0.499462,0.044174
8,English,Irish,Irish,0.702996,0.702996,0.0
9,Shortia,Shortia,Shortia,1.0,1.0,0.0


In [None]:
# Mean of the BertScore_f1_improvement column, scaled by 100.
he_open_eval['BertScore_f1_improvement'].mean()*100

1.948958873748779

In [None]:
# Compare the initial and the modified context responses against the reference
# answer using BERTScore F1 (bert-base-uncased), one row at a time.
#
# .copy() makes he_context_eval an independent frame rather than a slice of
# he_open, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
he_context_eval = he_open[['right_answer', 'modified_context_response', 'initial_open_context_response']].copy()

he_context_eval['BertScore_f1_initial'] = he_context_eval.apply(
    lambda row: bertscore.compute(
        predictions=[row['initial_open_context_response']],
        references=[row['right_answer']],
        model_type='bert-base-uncased',
    )['f1'][0],
    axis=1,
)
he_context_eval['BertScore_f1_modified'] = he_context_eval.apply(
    lambda row: bertscore.compute(
        predictions=[row['modified_context_response']],
        references=[row['right_answer']],
        model_type='bert-base-uncased',
    )['f1'][0],
    axis=1,
)

# The baseline must be this frame's own initial score (the initial *context*
# response), not he_open_eval's initial score for the open response.
he_context_eval['BertScore_f1_improvement'] = (
    he_context_eval['BertScore_f1_modified'] - he_context_eval['BertScore_f1_initial']
)
he_context_eval



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  he_context_eval['BertScore_f1_initial'] = he_context_eval.apply(lambda x: bertscore.compute(predictions=[x['initial_open_context_response']], references=[x['right_answer']], model_type='bert-base-uncased')['f1'][0] ,axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  he_context_eval['BertScore_f1_modified'] = he_context_eval.apply(lambda x: bertscore.compute(predictions=[x['modified_context_response']], references=[x['right_answer']], model_type='bert-base-uncased')['f1'][0] ,axis =1)
A value is trying to be

Unnamed: 0,right_answer,modified_context_response,initial_open_context_response,BertScore_f1_initial,BertScore_f1_modified,BertScore_f1_improvement
0,the United States Navy,United States Navy.,Jack Elam served in the United States Navy bef...,0.476122,0.750588,0.171458
1,German,German,German,1.0,1.0,0.0
2,"Auburn Hills, Michigan","Auburn Hills, Michigan",Detroit,0.491526,1.0,0.508474
3,1961,1961,"The song ""Your Mother Should Know"" is based on...",0.353518,1.0,0.118742
4,"October 10, 2016","October 10, 2016","October 10, 2016",1.0,1.0,0.0
5,Son of Rambow,"""Son of Rambow""","""Son of Rambow""",0.791778,0.791778,-0.208222
6,Lady Mary Crawley,Beryl Patmore,Beryl Patmore,0.481616,0.481616,-0.518384
7,Baby Blue,Inesperado,Inesperado,0.395143,0.395143,-0.060146
8,English,British.,Oliver Messel was British.,0.345054,0.407046,-0.29595
9,Shortia,Shortia,Shortia,1.0,1.0,0.0


In [None]:
# Mean of the BertScore_f1_improvement column, scaled by 100.
he_context_eval['BertScore_f1_improvement'].mean()*100

1.6687226891517641

In [None]:
# List every column currently present in squad_open.
squad_open.columns

Index(['context', 'question', 'answer', 'initial_open_response',
       'initial_open_context_response', 'claims_open', 'hallucination_open',
       'support_open', 'claims_context', 'hallucination_context',
       'support_context', 'EM_answer', 'PM_answer', 'BertScore_f1_answer',
       'BertScore_p_answer', 'BertScore_r_answer', 'meteor_answer',
       'bleu_answer', 'rouge1_answer', 'rouge2_answer', 'rougeL_answer',
       'EM_context', 'PM_context', 'BertScore_f1_context',
       'BertScore_p_context', 'BertScore_r_context', 'meteor_context',
       'bleu_context', 'rouge1_context', 'rouge2_context', 'rougeL_context'],
      dtype='object')

In [None]:
# Average BERTScore F1 (vs. answer and vs. context) for each value of the
# hallucination_open flag.
(
    squad_open
    .groupby('hallucination_open')[['BertScore_f1_answer', 'BertScore_f1_context']]
    .mean()
)

Unnamed: 0_level_0,BertScore_f1_answer,BertScore_f1_context
hallucination_open,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.578381,0.392958
True,0.631238,0.383228


In [None]:
# Average BERTScore F1 (vs. answer and vs. context) for each value of the
# hallucination_context flag.
(
    squad_open
    .groupby('hallucination_context')[['BertScore_f1_answer', 'BertScore_f1_context']]
    .mean()
)

Unnamed: 0_level_0,BertScore_f1_answer,BertScore_f1_context
hallucination_context,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.607214,0.388407
True,0.434139,0.382436


### testing

In [None]:
# Smoke test: score identical predictions and references with each metric.
# The sentences are only two words long, so they contain no 3-grams or
# 4-grams; BLEU's higher-order precisions are therefore 0.
predictions = ["hello there", "general kenobi"]
references = ["hello there", "general kenobi"]
results1, results2, results3 = (
    metric.compute(predictions=predictions, references=references)
    for metric in (meteor, bleu, rouge)
)
print(results1, results2, results3)

{'meteor': 0.9375} {'bleu': 0.0, 'precisions': [1.0, 1.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 4, 'reference_length': 4} {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}


In [None]:
# Score every generated response (squad_df['initial_response']) against the
# gold answer (squad_df['answer']) and store one column per metric.
# Both columns are indexed with [0] / passed as prediction lists below, i.e.
# each cell is treated as a list of strings.
response_answer_pairs = list(zip(squad_df['initial_response'], squad_df['answer']))

squad_df['context_EM'] = squad_df.apply(
    lambda row: answer_exact_match(
        dspy.Example(answer=row['answer']),
        dspy.Prediction(answer=row['initial_response'][0]),
    ),
    axis=1,
)
# answer_passage_match iterates over pred.context as a list of passages, so the
# whole response list is passed. Passing a bare string would be iterated
# character by character and never match.
squad_df['context_PM'] = squad_df.apply(
    lambda row: answer_passage_match(
        dspy.Example(answer=row['answer']),
        dspy.Prediction(context=row['initial_response']),
    ),
    axis=1,
)

# Run BERTScore once per row and read all three statistics from that result,
# instead of recomputing the same scores for f1, precision and recall.
bert_results = [
    bertscore.compute(predictions=response, references=answer, model_type='bert-base-uncased')
    for response, answer in response_answer_pairs
]
squad_df['context_BertScore_f1'] = [result['f1'][0] for result in bert_results]
squad_df['context_BertScore_p'] = [result['precision'][0] for result in bert_results]
squad_df['context_BertScore_r'] = [result['recall'][0] for result in bert_results]

squad_df['context_bleurt'] = squad_df.apply(
    lambda row: scorer.score(references=row['answer'], candidates=row['initial_response'])[0],
    axis=1,
)

squad_df['context_meteor'] = squad_df.apply(
    lambda row: meteor.compute(predictions=row['initial_response'], references=row['answer'])['meteor'],
    axis=1,
)
squad_df['context_bleu'] = squad_df.apply(
    lambda row: bleu.compute(predictions=row['initial_response'], references=row['answer'])['bleu'],
    axis=1,
)

# Same idea for ROUGE: one call per row yields rouge1, rouge2 and rougeL.
rouge_results = [
    rouge.compute(predictions=response, references=answer)
    for response, answer in response_answer_pairs
]
squad_df['context_rouge1'] = [result['rouge1'] for result in rouge_results]
squad_df['context_rouge2'] = [result['rouge2'] for result in rouge_results]
squad_df['context_rougeL'] = [result['rougeL'] for result in rouge_results]

squad_df


Unnamed: 0,context,question,answer,initial_response,EM,PM,BertScore_f1,BertScore_p,BertScore_r,bleurt,meteor,bleu,rouge1,rouge2,rougeL
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,"[3,000]","[3,000 people.]",False,False,0.733794,0.678739,0.798569,0.49532,0.416667,0.0,0.8,0.666667,0.8
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",[Roman Catholicism],[Roman Catholicism],True,False,1.0,1.0,1.0,0.916482,0.9375,0.0,1.0,1.0,1.0
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,[March 1969],[The war resumed in March 1969.],False,False,0.539062,0.426093,0.733545,-0.240882,0.75,0.0,0.5,0.333333,0.5
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,[Zen],[Zen Buddhist teachings],False,False,0.435897,0.416563,0.457113,-0.231144,0.416667,0.0,0.5,0.0,0.5
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,[Encyclopaedia of Islam],[Encyclopaedia of Islam],True,False,1.0,1.0,1.0,0.913636,0.981481,0.0,1.0,1.0,1.0
5,The foundation of Northwestern University is t...,What type of church is Northwestern University...,[Methodist Episcopal Church],[Methodist Episcopal Church],True,False,1.0,1.0,1.0,0.891541,0.981481,0.0,1.0,1.0,1.0
6,Brasília is known as a departing point for the...,What air sports event did Brasilia host in 2003?,[the 14th Hang Gliding World Championship],[Hang Gliding World Championship],False,False,0.80239,0.912431,0.716035,-0.076505,0.684267,0.606531,0.8,0.75,0.8
7,"As of 2013, the American Idol alumni in their ...",How many single have been sold by American Ido...,[120 million],[Over 120 million singles.],False,False,0.608368,0.541435,0.694183,0.215278,0.815217,0.0,0.666667,0.5,0.666667
8,Professional wrestling shows can be considered...,How do the wrestlers treat the audience?,[The audience is recognized and acknowledged b...,[The wrestlers acknowledge and interact with t...,False,False,0.665603,0.630804,0.704465,0.327087,0.502148,0.0,0.48,0.173913,0.4
9,"In the 19th century, a house near Ephesus in T...",In which country is Ephesus?,[Turkey],[Turkey],True,False,1.0,1.0,1.0,0.870956,0.5,0.0,1.0,0.0,1.0


In [None]:
# METEOR of each generated response measured against the full context passage
# (wrapped in a one-element reference list). This overwrites any existing
# 'context_meteor' column.
squad_df['context_meteor'] = squad_df.apply(
    lambda row: meteor.compute(
        predictions=row['initial_response'],
        references=[row['context']],
    )['meteor'],
    axis=1,
)
squad_df

# [squad_df['context'][0]]

Unnamed: 0,context,question,answer,initial_response,EM,PM,BertScore_f1,BertScore_p,BertScore_r,bleurt,meteor,bleu,rouge1,rouge2,rougeL,context_meteor
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,"[3,000]","[3,000 people.]",False,False,0.733794,0.678739,0.798569,0.49532,0.416667,0.0,0.8,0.666667,0.8,0.007776
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",[Roman Catholicism],[Roman Catholicism],True,False,1.0,1.0,1.0,0.916482,0.9375,0.0,1.0,1.0,1.0,0.00956
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,[March 1969],[The war resumed in March 1969.],False,False,0.539062,0.426093,0.733545,-0.240882,0.75,0.0,0.5,0.333333,0.5,0.066409
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,[Zen],[Zen Buddhist teachings],False,False,0.435897,0.416563,0.457113,-0.231144,0.416667,0.0,0.5,0.0,0.5,0.020527
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,[Encyclopaedia of Islam],[Encyclopaedia of Islam],True,False,1.0,1.0,1.0,0.913636,0.981481,0.0,1.0,1.0,1.0,0.006739
5,The foundation of Northwestern University is t...,What type of church is Northwestern University...,[Methodist Episcopal Church],[Methodist Episcopal Church],True,False,1.0,1.0,1.0,0.891541,0.981481,0.0,1.0,1.0,1.0,0.023822
6,Brasília is known as a departing point for the...,What air sports event did Brasilia host in 2003?,[the 14th Hang Gliding World Championship],[Hang Gliding World Championship],False,False,0.80239,0.912431,0.716035,-0.076505,0.684267,0.606531,0.8,0.75,0.8,0.033577
7,"As of 2013, the American Idol alumni in their ...",How many single have been sold by American Ido...,[120 million],[Over 120 million singles.],False,False,0.608368,0.541435,0.694183,0.215278,0.815217,0.0,0.666667,0.5,0.666667,0.152218
8,Professional wrestling shows can be considered...,How do the wrestlers treat the audience?,[The audience is recognized and acknowledged b...,[The wrestlers acknowledge and interact with t...,False,False,0.665603,0.630804,0.704465,0.327087,0.502148,0.0,0.48,0.173913,0.4,0.055306
9,"In the 19th century, a house near Ephesus in T...",In which country is Ephesus?,[Turkey],[Turkey],True,False,1.0,1.0,1.0,0.870956,0.5,0.0,1.0,0.0,1.0,0.00455


In [None]:
# Module-level default: exact match of the first generated response against
# the gold "answer" column.
correctness_map = {
    "EM": lambda x: answer_exact_match(dspy.Example(answer=x["answer"]), dspy.Prediction(answer=x['initial_response'][0])),
}


def correctness(answer_column):
    """Add one correctness column per metric to the global ``squad_df``.

    For every metric in the local metric map, the metric is applied row-wise
    and the result is stored in ``squad_df`` under the column name
    ``f"{answer_column}_{metric_name}"``.

    Args:
        answer_column: Name of the ``squad_df`` column holding the reference
            value that the first element of ``initial_response`` is compared to.

    Returns:
        None. ``squad_df`` is modified in place.
    """
    # NOTE(review): answer_exact_match is expected to accept either a string or
    # a list as the reference, so the value is passed through unchanged - confirm.
    metric_map = {
        "EM": lambda x: answer_exact_match(
            dspy.Example(answer=x[answer_column]),
            dspy.Prediction(answer=x['initial_response'][0]),
        ),
    }
    for column_name, metric_fn in metric_map.items():
        squad_df[f"{answer_column}_{column_name}"] = squad_df.apply(metric_fn, axis=1)
