In [39]:
from datasets import load_dataset
import openai
import os
import itertools
import logging
import sys

import dspy
from dsp.utils import deduplicate
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
from dspy.evaluate import answer_exact_match, answer_passage_match
from dspy.evaluate.evaluate import Evaluate
from dsp.utils import deduplicate
from dspy.primitives import module

from copy import copy
import random
import json
import tqdm
import pickle
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

random.seed(1)

In [None]:
# !pip uninstall dspy-ai -y
# !pip install git+https://github.com/stanfordnlp/dspy.git
# !pip install dspy-ai

# LM and RM

In [3]:
# setting lm and rm in dspy
# Read the key from the OPENAI_API_KEY environment variable so a real secret never
# has to be pasted into (and saved with) the notebook. The placeholder is kept as
# the fallback, so behaviour is unchanged when the variable is not set.
openai_key = os.environ.get('OPENAI_API_KEY', '<api key>')
colbert_server = 'http://index.contextual.ai:8893/api/search'

# Language model (generation) and retrieval model (ColBERTv2 server) used by every
# dspy module defined below.
lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key=openai_key)
rm = dspy.ColBERTv2(url=colbert_server)
dspy.settings.configure(lm=lm, rm=rm)

# llama = dspy.OllamaLocal(    
#     model="llama2:latest", 
#     stop=['---','Explanation:','<|im_start|>','<|im_end|>'],
#     model_type = "chat")
# rm = dspy.ColBERTv2(url=colbert_server)
# dspy.settings.configure(lm=llama, rm=rm)
 

# Dataset

## SQuAD

In [7]:
# dataset download and split
def get_squad_split(squad, split="validation"):
    """Convert one split of the SQuAD dataset into a list of dspy examples.

    Args:
        squad: mapping from split name to a column-addressable table
            (``squad[split]["question"]`` must yield the questions and
            ``squad[split]["answers"]`` the answer records).
        split: name of the split to convert.

    Returns:
        list of ``dspy.Example`` with ``question`` and ``answer`` fields, where
        ``answer`` is the first reference answer text and ``question`` is marked
        as the only input.
    """
    subset = squad[split]
    # Select the two needed columns by name instead of unpacking every column
    # positionally: this does not depend on the schema's column order or count,
    # and avoids materialising the unused id/title/context columns.
    questions = subset["question"]
    answers = subset["answers"]
    return [
        dspy.Example(question=q, answer=a['text'][0]).with_inputs("question")
        for q, a in zip(questions, answers)
    ]
    
# Download SQuAD once, then convert both splits into dspy examples.
squad = load_dataset(path="squad")
squad_train, squad_dev = (
    get_squad_split(squad, split=split_name) for split_name in ("train", "validation")
)
# Small random dev subset (20 examples) used for quick evaluation runs.
dev_exs = random.sample(squad_dev, k=20)
dev_exs

[Example({'question': 'Where was the Muslim Brotherhood founded?', 'answer': 'Ismailiyah, Egypt'}) (input_keys={'question'}),
 Example({'question': 'How many types of science fiction have been impacted by Tesla?', 'answer': 'several'}) (input_keys={'question'}),
 Example({'question': "How man of Grainger Town's 450 buildings are listed?", 'answer': '244'}) (input_keys={'question'}),
 Example({'question': 'Where was the new media day event for Super Bowl 50 held?', 'answer': 'SAP Center in San Jose.'}) (input_keys={'question'}),
 Example({'question': 'How old was Newton during Super Bowl 50?', 'answer': '26'}) (input_keys={'question'}),
 Example({'question': "At what university's facility did the Panthers practice?", 'answer': 'San Jose State'}) (input_keys={'question'}),
 Example({'question': 'What distinguishes stromal thylakoids?', 'answer': 'are in contact with the stroma'}) (input_keys={'question'}),
 Example({'question': 'What date were the top two stadium choices for Super Bowl 5

In [92]:
# Build the flat train frame here so the cell works on a fresh kernel (the
# definition used to be commented out, leaving `squad_train_df` undefined).
# json_normalize flattens the nested `answers` record into `answers.text`.
squad_train_df = pd.json_normalize(list(squad['train']))[['question', 'answers.text']]
# NOTE: despite its name, this 50-row frame is sampled from the *train* split.
# Chained, non-inplace operations keep the cell idempotent on re-run.
squad_dev_df = (
    squad_train_df
    .sample(n=50, random_state=1)
    .reset_index(drop=True)
    .rename({'answers.text': 'answer'}, axis=1)
)
squad_dev_df

Unnamed: 0,question,answer
0,How many people were estimated by authorities ...,"[3,000]"
1,"Until the Reformation, what was the establishe...",[Roman Catholicism]
2,When did the war start up again?,[March 1969]
3,What Buddhist teachings are often full of para...,[Zen]
4,In which work did Welch express his belief tha...,[Encyclopaedia of Islam]
5,What type of church is Northwestern University...,[Methodist Episcopal Church]
6,What air sports event did Brasilia host in 2003?,[the 14th Hang Gliding World Championship]
7,How many single have been sold by American Ido...,[120 million]
8,How do the wrestlers treat the audience?,[The audience is recognized and acknowledged b...
9,In which country is Ephesus?,[Turkey]


## HaluEval

In [31]:
def get_halueval_split(dataset):
    """Turn the ``'data'`` table of a HaluEval dataset into dspy examples.

    Each example carries ``question``, ``answer`` (the right answer) and ``halu``
    (the hallucinated answer); only ``question`` is marked as an input.
    """
    table = dataset['data']
    columns = [table[field] for field in table.features]
    examples = []
    # Rows are unpacked positionally, so the table must have exactly four columns
    # in this order; the first one (knowledge) is not used.
    for _knowledge, question, right_ans, halu_ans in zip(*columns):
        example = dspy.Example(question=question, answer=right_ans, halu=halu_ans)
        examples.append(example.with_inputs("question"))
    return examples

# Load the 'qa' configuration of HaluEval and convert it to dspy examples.
halueval_qa = load_dataset(path='pminervini/HaluEval', name='qa')
halu_ds = get_halueval_split(dataset=halueval_qa)
# 20 random examples for quick experiments.
halu_dev = random.sample(halu_ds, k=20)

In [32]:
halu_dev[1]

Example({'question': 'Gary Groth is editor in chief of an American magazine of news and criticism pertaining to comic books, comic strips and graphic novels, as well as co-founder of what?', 'answer': 'Fantagraphics Books', 'halu': 'Gary Groth is also a co-founder of Marvel Comics.'}) (input_keys={'question'})

# Evaluation

In [15]:
from dspy.evaluate import answer_exact_match
from dspy.evaluate.evaluate import Evaluate

In [16]:
answer_exact_match(dspy.Example(answer="STAGE 2!"), dspy.Prediction(answer="stage 2"))

True

In [17]:
# Evaluator over the sampled SQuAD dev examples.
dev_evaluater = Evaluate(
    devset=dev_exs, # dev_exs is the 20-example random sample drawn above (not 200)
    num_threads=1,
    display_progress=True,
    display_table=5)

In [18]:
# Smaller evaluator for smoke tests: only the first 15 examples of dev_exs.
tiny_evaluater = Evaluate(
    devset=dev_exs[: 15],
    num_threads=1,
    display_progress=True,
    display_table=5)

In [75]:
# tiny_evaluater(basic_openqa_model, metric=answer_exact_match)

In [None]:
def validate_context_and_answer(example, pred, trace=None):
    """Metric: the prediction must pass exact match AND passage match.

    Both dspy checks are always evaluated; the result is ``exact and in_passage``.
    ``trace`` is accepted for the metric call signature but not used.
    """
    checks = (dspy.evaluate.answer_exact_match, dspy.evaluate.answer_passage_match)
    exact, in_passage = [check(example, pred) for check in checks]
    return exact and in_passage

## Segmentation 

In [10]:
from nltk.tokenize import sent_tokenize
import nltk

# Fetch the "punkt" NLTK resource quietly (presumably the data sent_tokenize
# relies on — TODO confirm for the installed NLTK version).
try:
    nltk.download("punkt", quiet=True)
except FileExistsError:  # multiprocessing race condition
    # Deliberate best-effort: this error is ignored so the cell keeps running.
    pass

In [11]:
class SentenceSplitter:
    """Stateless helper that breaks a text string into sentences."""

    def __init__(self):
        """Nothing to configure."""

    def split_into_sentences(self, text):
        """Return the sentences of ``text`` as produced by NLTK's ``sent_tokenize``."""
        # use NLTK to split a text string into sentences for now
        sentences = sent_tokenize(text)
        return sentences

In [12]:
splitter = SentenceSplitter() 

In [13]:
text = "Gary Zukav | Gary Zukav Gary Zukav (born October 17, 1942) is an American spiritual teacher and the author of four consecutive New York Times Best Sellers. Gary Zukav was born in Port Arthur, Texas, and spent his early childhood in San Antonio and Houston. His family moved to Pittsburg, Kansas, while he was in fourth grade."

In [14]:
sentences = splitter.split_into_sentences(text)
sentences

['Gary Zukav | Gary Zukav Gary Zukav (born October 17, 1942) is an American spiritual teacher and the author of four consecutive New York Times Best Sellers.',
 'Gary Zukav was born in Port Arthur, Texas, and spent his early childhood in San Antonio and Houston.',
 'His family moved to Pittsburg, Kansas, while he was in fourth grade.']

In [15]:
len(sentences)

3

### Signatures testing

In [55]:
# Signatures
class BasicQASignature(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Input: the question to answer.
    question = dspy.InputField()
    # Output: the answer; a length hint (desc="often between 1 and 5 words") is
    # deliberately left disabled here.
    answer = dspy.OutputField()

# class QASignature(dspy.Signature):
#     __doc__ = """Answer questions with short factoid answers."""
#     question = dspy.InputField()
#     answer = dspy.OutputField(
#         desc="return mostly in english, short answers (limited to less than 8 words), no conversaztional response, no complete sentence")

# class ContextQASignature(dspy.Signature):
#     __doc__ = """Answer questions with short factoid answers."""
#     context = dspy.InputField(desc="may contain relevant facts")
#     question = dspy.InputField()
#     answer = dspy.OutputField(
#         desc="return mostly in english, short answers (limited to less than 8 words), no conversaztional response, no complete sentence")

# MultiHop
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    # Passages gathered so far.
    context = dspy.InputField(desc="may contain relevant facts")
    # The claim the search query is written for.
    claim = dspy.InputField()
    # The generated search query.
    query = dspy.OutputField(desc="A short question uniquely answered by the context.")

class Summarizer(dspy.Signature):
    """Summarize the main points of the text."""

    # Text to summarize.
    context = dspy.InputField()
    # The desc is sent to the LM as an instruction; a stray "2" before 'Summary:'
    # was a typo and has been removed.
    summary = dspy.OutputField(desc="do not start with 'Context:' or 'Summary:'")
    

In [25]:
sig_predictor = dspy.Predict(BasicQASignature)
ans = sig_predictor(question="Which U.S. states border no U.S. states?")
ans.answer

'Hawaii and Alaska'

In [26]:
# cot_predictor = dspy.ChainOfThought("question -> answer")
cot_predictor = dspy.ChainOfThought(BasicQASignature)


In [27]:
# response = basic_predictor(question="Which award did Gary Zukav's first book receive?")
response = cot_predictor(question="Which award did Gary Zukav's first book receive?")
response

Prediction(
    rationale='Answer: The Seat of the Soul received the American Book Award in 1989.',
    answer='The Seat of the Soul received the American Book Award in 1989.'
)

In [28]:
response.rationale

'Answer: The Seat of the Soul received the American Book Award in 1989.'

In [29]:

# basic_predictor = dspy.ChainOfThought(BasicQASignature)
# response = basic_predictor(question="Which award did Gary Zukav's first book receive?")
# response.answer

# RAG pipelines

## Pipeline for initial query 

### Signatures

In [110]:
class BasicQASignature(dspy.Signature):
    """Answer questions with short factoid answers."""

    # NOTE: this re-declares BasicQASignature from the "Signatures testing" section;
    # the only difference is the output-field description below.
    question = dspy.InputField()
    answer = dspy.OutputField(desc='do not repeat the question, only show final answer')



### RAG

In [111]:
class BasicOpenQaRAG(dspy.Module):
    """Answer a question with the LM alone and return the answer split into sentences.

    Unlike the other modules in this notebook, ``forward`` returns a plain list of
    sentence strings ("claims"), not a ``dspy.Prediction`` with an ``answer`` field.
    """

    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(BasicQASignature)
        self.split = SentenceSplitter()

    def forward(self, question):
        """Generate an answer for ``question`` and return its sentences as a list."""
        answer_text = self.generate_answer(question=question).answer
        return self.split.split_into_sentences(answer_text)

# with retrieval
class ContextOpenQaRAG(dspy.Module):
    """Retrieve ``num_passages`` passages for the question, then answer from them.

    NOTE(review): ``forward`` passes ``question=`` to a predictor built from
    ``ContextQASignature``; the definition in the hallucination-verification
    section declares ``claim`` instead of ``question``. Confirm which definition
    is live when this class is instantiated.
    """

    def __init__(self, num_passages=1):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.Predict(ContextQASignature)

    def forward(self, question):
        """Return a ``dspy.Prediction`` holding the retrieved context and the answer."""
        passages = self.retrieve(question).passages
        answer = self.generate_answer(context=passages, question=question).answer
        return dspy.Prediction(context=passages, answer=answer)



In [112]:
basic_openqa_model = BasicOpenQaRAG()     

In [113]:
claims = basic_openqa_model(question="Which award did Gary Zukav's first book receive?")
claims

['The American Book Award']

In [114]:
len(claims)

1

## Pipelines for hallucination verification 

### Signatures

In [119]:
class ContextQASignature(dspy.Signature):
    """Answer if the claim is supported by the context."""

    # Retrieved passages to check the claim against.
    context = dspy.InputField(desc="may contain relevant facts")
    # The statement being verified.
    claim = dspy.InputField()
    # Verdict from the LM.
    answer = dspy.OutputField(desc="return binary response, yes or no.")

class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    # NOTE: same fields as the GenerateSearchQuery declared in the
    # "Signatures testing" section; this declaration replaces it when run.
    context = dspy.InputField(desc="may contain relevant facts")
    claim = dspy.InputField()
    query = dspy.OutputField(desc="A short question uniquely answered by the context.")


### RAG

In [120]:
class HaluCheckRAG(dspy.Module):
    """Single-hop claim checker: retrieve passages for the claim, then ask the LM
    (with chain of thought) whether the claim is supported by them."""

    def __init__(self, num_passages=1):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(ContextQASignature)

    def forward(self, claim):
        """Return a ``dspy.Prediction`` with the retrieved ``context`` and the verdict ``answer``."""
        passages = self.retrieve(claim).passages
        verdict = self.generate_answer(context=passages, claim=claim).answer
        # The context is returned alongside the answer.
        return dspy.Prediction(context=passages, answer=verdict)



In [121]:
class MultiHopRAG(dspy.Module):
    """Multi-hop claim checker.

    For each hop a dedicated chain-of-thought module writes a search query from the
    context gathered so far, the retrieved passages are merged (deduplicated) into
    the context, and finally the LM judges the claim against the full context.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()
        # One query generator per hop.
        self.generate_question = [
            dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)
        ]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(ContextQASignature)
        self.max_hops = max_hops

    def forward(self, claim):
        """Return a ``dspy.Prediction`` with the accumulated ``context`` and the verdict ``answer``."""
        context = []
        for hop_index in range(self.max_hops):
            query_writer = self.generate_question[hop_index]
            search_query = query_writer(context=context, claim=claim).query
            new_passages = self.retrieve(search_query).passages
            context = deduplicate(context + new_passages)

        verdict = self.generate_answer(context=context, claim=claim).answer
        return dspy.Prediction(context=context, answer=verdict)




In [122]:
halu_check_model = HaluCheckRAG()
multihop_halu_model = MultiHopRAG()

In [123]:
result = halu_check_model(claim=claims[0])
result

Prediction(
    context=['American Book Awards | for 1984, and were renamed "National" in 1987. The American Book Award is also unrelated to the American Booksellers Association (ABA), although that organization maintains a complete list of award winners that is readily available. Since the 1970s that trade group is also unrelated to the National Book Awards, which it established in 1936 and jointly re-established them as book industry awards in 1950. 2016 2017 American Book Awards The 38th Annual American Book Awards were presented October 22, 2017 at the San Francisco Jazz Center. The winners were as follows: For seven years 1980 to 1986, there were two'],
    answer='No'
)

In [68]:
result = multihop_halu_model(claim=claims[0])
result

Prediction(
    context=['National Book Award for Young People\'s Literature | the US from December 1 to November 30. The National Book Foundation accepts nominations from publishers until June 15, requires mailing nominated books to the panelists by August 1, and announces five finalists in October. The winner is announced on the day of the final ceremony in November. The award is $10,000 and a bronze sculpture; other finalists get $1000, a medal, and a citation written by the panel. There were 230 books nominated for the 2010 award. Books for "children" were first recognized by the National Book Awards in 1969 (publication year 1968). Through 1979 there was a single', 'National Book Award for Poetry | National Book Foundation accepts nominations from publishers until June 15, requires mailing nominated books to the panelists by August 1, and announces five finalists in October. The winner is announced on the day of the final ceremony in November. The award is $10,000 and a bronze scu

In [125]:
def verify(claims, model):
    """Check claims one at a time and stop at the first one the model rejects.

    Args:
        claims: iterable of claim strings.
        model: callable invoked as ``model(claim=...)``; its result must expose
            ``answer`` and, optionally, ``context`` (a list of passages).

    Returns:
        tuple ``(halu, passage)``. ``halu`` is True when some claim got a "no"
        verdict. ``passage`` is the first context passage of the last claim that
        was checked (the rejected one when ``halu`` is True), or None when no
        claim was checked or the context was empty.

    The return value is always a tuple. Previously a rejection on any claim but
    the last returned a bare ``True`` (breaking ``verify(...)[0]``), and an empty
    ``claims`` raised UnboundLocalError.
    """
    def _says_no(answer):
        # Accept 'No' as well as variants such as 'no', 'No.' or 'No, because ...'.
        words = str(answer).strip().lower().split()
        return bool(words) and words[0].strip('.,;:!?"\'') == 'no'

    halu = False
    passage = None
    for c in claims:
        prediction = model(claim=c)
        context = getattr(prediction, 'context', None)
        passage = context[0] if context else None
        if _says_no(prediction.answer):
            halu = True
            break  # no need to query the model for the remaining claims
    return halu, passage

In [74]:
halu_check_model = HaluCheckRAG()
multihop_halu_model = MultiHopRAG()

verify(claims, halu_check_model)

(False,
 'National Book Award | National Book Award The National Book Awards are a set of annual U.S. literary awards. At the final National Book Awards Ceremony every November, the National Book Foundation presents the National Book Awards and two lifetime achievement awards to authors. The National Book Awards were established in 1936 by the American Booksellers Association, abandoned during World War II, and re-established by three book industry organizations in 1950. Non-U.S. authors and publishers were eligible for the pre-war awards. Now they are presented to U.S. authors for books published in the United States roughly during the award year. The nonprofit National Book')

In [94]:
squad_dev_df.head()

Unnamed: 0,question,answer
0,How many people were estimated by authorities ...,"[3,000]"
1,"Until the Reformation, what was the establishe...",[Roman Catholicism]
2,When did the war start up again?,[March 1969]
3,What Buddhist teachings are often full of para...,[Zen]
4,In which work did Welch express his belief tha...,[Encyclopaedia of Islam]


In [128]:
def run(df, qa_model=None, checker=None):
    """Answer every question in ``df`` and flag answers that fail verification.

    Adds three columns to ``df`` in place and returns the same frame:
      * ``initial_response``: the list of sentences returned by the QA model,
      * ``claims``: a copy of that list (one claim per sentence),
      * ``hallucination``: True when ``verify`` rejects at least one claim.

    Args:
        df: DataFrame with a ``question`` column.
        qa_model: callable invoked as ``qa_model(question=...)`` that returns a
            list of sentences; defaults to the notebook-level ``basic_openqa_model``.
        checker: model handed to ``verify``; defaults to ``halu_check_model``.

    The QA model already returns the response split into sentences. Earlier code
    re-split only the first sentence, which silently dropped all later claims and
    raised IndexError on an empty response.
    """
    if qa_model is None:
        qa_model = basic_openqa_model
    if checker is None:
        checker = halu_check_model

    def _is_hallucinated(claims):
        if not claims:
            return False  # nothing to verify
        outcome = verify(claims, checker)
        # verify may hand back either a (flag, passage) tuple or a bare flag.
        return outcome[0] if isinstance(outcome, tuple) else bool(outcome)

    df['initial_response'] = df['question'].apply(lambda q: qa_model(question=q))
    df['claims'] = df['initial_response'].apply(lambda sentences: list(sentences))
    df['hallucination'] = df['claims'].apply(_is_hallucinated)
    return df

In [129]:
run(squad_dev_df)

Unnamed: 0,question,answer,initial_response,claims,hallucination
0,How many people were estimated by authorities ...,"[3,000]",[Approximately 800 people.],[Approximately 800 people.],False
1,"Until the Reformation, what was the establishe...",[Roman Catholicism],[Catholicism],[Catholicism],False
2,When did the war start up again?,[March 1969],[2001],[2001],False
3,What Buddhist teachings are often full of para...,[Zen],[Zen teachings.],[Zen teachings.],False
4,In which work did Welch express his belief tha...,[Encyclopaedia of Islam],[The Quran],[The Quran],False
5,What type of church is Northwestern University...,[Methodist Episcopal Church],[Northwestern University is associated with th...,[Northwestern University is associated with th...,False
6,What air sports event did Brasilia host in 2003?,[the 14th Hang Gliding World Championship],[World Air Games],[World Air Games],False
7,How many single have been sold by American Ido...,[120 million],[Over 33 million singles.],[Over 33 million singles.],False
8,How do the wrestlers treat the audience?,[The audience is recognized and acknowledged b...,[With respect and appreciation.],[With respect and appreciation.],False
9,In which country is Ephesus?,[Turkey],[Turkey],[Turkey],False


# Optimizers

In [23]:
# Optimizer
def validate_context_and_answer(example, pred, trace=None):
    """Optimizer metric combining exact match and passage match.

    NOTE: same function as the one defined in the Evaluation section; running
    this cell simply re-binds the name.
    """
    exact = dspy.evaluate.answer_exact_match(example, pred)
    in_passage = dspy.evaluate.answer_passage_match(example, pred)
    if not exact:
        return exact
    return in_passage

def bootstrap_optimize(model, trainset=None):
    """Compile ``model`` with ``BootstrapFewShot`` using the combined EM + passage metric.

    Args:
        model: the dspy module to optimize.
        trainset: examples to bootstrap from. Defaults to the notebook-level
            ``dev_exs`` (the previous hard-wired behaviour). Pass a separate
            training set to avoid optimizing on the examples used for evaluation.

    Returns:
        The compiled program returned by the teleprompter.
    """
    if trainset is None:
        trainset = dev_exs
    teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
    return teleprompter.compile(model, trainset=trainset)

def bootstrap_rm_optimize(model, trainset=None, num_candidate_programs=2,
                          max_bootstrapped_demos=2, max_labeled_demos=2):
    """Compile ``model`` with ``BootstrapFewShotWithRandomSearch``.

    Args:
        model: the dspy module to optimize.
        trainset: examples to bootstrap from. Defaults to the notebook-level
            ``dev_exs`` (the previous hard-wired behaviour). Pass a separate
            training set to avoid optimizing on the examples used for evaluation.
        num_candidate_programs, max_bootstrapped_demos, max_labeled_demos:
            forwarded to the teleprompter; the defaults reproduce the values
            that used to be hard-coded.

    Returns:
        The compiled program returned by the teleprompter.
    """
    if trainset is None:
        trainset = dev_exs
    teleprompter = BootstrapFewShotWithRandomSearch(
        metric=validate_context_and_answer,
        num_candidate_programs=num_candidate_programs,
        max_bootstrapped_demos=max_bootstrapped_demos,
        max_labeled_demos=max_labeled_demos,
    )
    return teleprompter.compile(model, trainset=trainset)


In [None]:
optimized_halu_check_model = bootstrap_rm_optimize(halu_check_model) 
optimized_multihop_halu_model = bootstrap_rm_optimize(multihop_halu_model) 

In [None]:
optimized_openqa_model = bootstrap_rm_optimize(basic_openqa_model)

In [26]:
def create_bakeoff_submission(model, filename=None,
                              output_path="cs224u-openqa-bakeoff-entry.json"):
    """
    Run `model` on every bakeoff question and write the answers to a JSON file.

    The argument `model` is a `dspy.Module`. The return value of its
    `forward` method must have an `answer` attribute.

    Args:
        model: callable invoked as `model(question=...)`.
        filename: path of the unlabeled question file, one question per line.
            Defaults to `data/openqa/cs224u-openqa-test-unlabeled.txt`.
        output_path: where the question -> answer mapping is written as JSON.

    Raises:
        AssertionError: if a question has no entry or an answer is not a str.
    """
    if filename is None:
        filename = os.path.join("data", "openqa", "cs224u-openqa-test-unlabeled.txt")

    # This should become a mapping from questions (str) to response
    # dicts from your system.
    gens = {}

    with open(filename) as f:
        questions = f.read().splitlines()

    # Here we loop over the questions, run the system `model`, and
    # store its `answer` value as the prediction:
    for question in tqdm.tqdm(questions):
        gens[question] = model(question=question).answer

    # Quick tests we advise you to run:
    # 1. Make sure `gens` is a dict with the questions as the keys.
    #    (The check used to test the leftover loop variable `question` for every
    #    `q`, so it only ever verified the last question.)
    assert all(q in gens for q in questions)
    # 2. Make sure the values are str:
    assert all(isinstance(d, str) for d in gens.values())

    # And finally the output file:
    with open(output_path, "wt") as f:
        json.dump(gens, f, indent=4)

In [181]:
# loaded_model = MultiHopRAG()
# loaded_model.load("gpt_optimized_model.sav")
# gpt_optimized_model.save("gpt_optimized_model.sav")
# NOTE(review): neither `gpt_optimized_model` nor `open_qa_model` is assigned in any
# visible cell of this notebook, so this cell presumably relies on leftover kernel
# state and will fail on Restart & Run All — confirm where they come from.
# NOTE(review): the next line replaces `save` on `gpt_optimized_model` with
# `open_qa_model.save`. If that is a bound method, the call below would save
# `open_qa_model`'s state rather than `gpt_optimized_model`'s — TODO confirm intent.
gpt_optimized_model.save = open_qa_model.save
gpt_optimized_model.save("gpt_optimized_model.sav")

### Michael's model (for ref)

In [None]:
def flexible_optimize(model, config_optimizer=None, optimizer=BootstrapFewShotWithRandomSearch, metric=answer_passage_match, trainset=squad_train, devset=squad_dev, n_samples=30):
    """Compile ``model`` with a configurable teleprompter on a random training subset.

    Args:
        model: the dspy module to optimize.
        config_optimizer: extra keyword arguments for the optimizer constructor
            (``None`` means no extra arguments).
        optimizer: teleprompter class, instantiated as
            ``optimizer(metric=metric, **config_optimizer)``.
        metric: metric passed to the optimizer.
        trainset: pool of examples from which the training subset is drawn.
        devset: accepted for interface compatibility; not used by this function.
        n_samples: number of training examples to draw; capped at ``len(trainset)``.

    Returns:
        The compiled program returned by ``optimizer(...).compile``.
    """
    if config_optimizer is None:
        config_optimizer = {}
    # The bare name `sample` is not defined anywhere in this notebook (only the
    # `random` module is imported), so the call raised NameError; use
    # random.sample. The size is capped so a small trainset cannot raise
    # ValueError.
    n_train = min(n_samples, len(trainset))
    train_subset = random.sample(trainset, n_train)
    return optimizer(metric=metric, **config_optimizer).compile(model, trainset=train_subset)

log_format = '%(funcName)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO, stream=sys.stdout, force=True)
logger = logging.getLogger(__name__)

# NOTE: re-declares GenerateSearchQuery; unlike the earlier declarations in this
# notebook, this version takes `question` (not `claim`) as its second input.
class GenerateSearchQuery(dspy.Signature):
    """Write a short and concise search query that will help answer a complex question."""

    # Passages gathered so far.
    context = dspy.InputField(desc="may contain relevant facts")
    # The question the search query should help answer.
    question = dspy.InputField()
    # The generated search query.
    query = dspy.OutputField()
    
class SummarizeSignature(dspy.Signature):
    """Summarize the following text in one paragraph focusing on key facts and statements."""

    # Alternative instruction that was tried:
    # """Reply with a concise summary of the key facts and statements from the following text."""

    # Text to summarize.
    context = dspy.InputField()
    # The resulting summary.
    summary = dspy.OutputField()
    
class ContextQASignature(dspy.Signature):
    """Answer questions with short factoid answers."""

    # NOTE: re-declares ContextQASignature; this version answers a `question`,
    # whereas the earlier declaration in this notebook judges a `claim`.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")
    
class FlexibleRAG(dspy.Module):
    """Configurable retrieve-then-answer pipeline.

    Depending on the constructor arguments it can run several retrieval hops with
    generated search queries, summarize each retrieved passage, summarize the
    whole gathered context, and finally generate an answer from the result.
    """

    def __init__(self, 
                 passages_per_hop=3, 
                 max_hops=2,
                 summarize_retrieved_passages_per_hop=True,
                 summarize_whole_context=False,
                 module_gen_answer=dspy.Predict, 
                 signature_gen_answer=ContextQASignature,
                 module_summarize=dspy.Predict,
                 signature_summarize=SummarizeSignature,
                 module_gen_query=dspy.Predict,
                 signature_gen_query=GenerateSearchQuery,
                 loglevel=logging.INFO):
        """Build the sub-modules.

        Args:
            passages_per_hop: ``k`` passed to ``dspy.Retrieve``.
            max_hops: number of query-generation/retrieval rounds.
            summarize_retrieved_passages_per_hop: summarize every retrieved
                passage before adding it to the context.
            summarize_whole_context: summarize the gathered context before
                answering.
            module_gen_answer / signature_gen_answer: module class and signature
                used to build the answer generator.
            module_summarize / signature_summarize: module class and signature
                for the summarizer; if either is None no summarizer is built.
            module_gen_query / signature_gen_query: module class and signature
                for the query generators; if either is None no query generators
                are built and retrieval uses the question directly.
            loglevel: level set on the logger named ``__name__``.
        """
        super().__init__()    
        # The logger is looked up by module name, so setLevel below applies to
        # that named logger, not to a logger private to this instance.
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(loglevel)    
        self.max_hops = max_hops
        self.summarize_retrieved_passages_per_hop = summarize_retrieved_passages_per_hop
        self.summarize_whole_context = summarize_whole_context
        # Optional components stay None unless both a module class and a signature are given.
        self.summarize = None
        self.generate_query = None
        if module_summarize is not None and signature_summarize is not None:
            self.summarize = module_summarize(signature_summarize)
        if module_gen_query is not None and signature_gen_query is not None:
            # One query generator per hop.
            self.generate_query = [module_gen_query(signature_gen_query) for _ in range(max_hops)]            
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = module_gen_answer(signature_gen_answer)
    
    def get_context(self, question):
        """Gather context passages for ``question``.

        With query generators: for each hop, generate a query from the context so
        far, retrieve passages, optionally summarize each one, and merge them into
        the context via ``deduplicate``. Without query generators: retrieve once
        using the question itself.
        """
        if self.generate_query is not None:
            context = []
            for hop in range(self.max_hops):
                self.logger.debug(f'{hop=}')
                query = self.generate_query[hop](context=context, question=question).query            
                self.logger.debug(f'{query=}')
                passages = self.retrieve(query).passages
                if self.summarize_retrieved_passages_per_hop and self.summarize is not None:
                    # Summarize every passage to ease generation of answer and reduce context size
                    passages = [self.get_summary(passage) for passage in passages]                   
                context = deduplicate(context + passages)
                self.logger.debug(f'{context=}')
        else:
            context = self.retrieve(question).passages
        return context
            
    def get_summary(self, context):
        """Return the summarizer's ``summary`` for ``context``, or ``context``
        unchanged when no summarizer was built."""
        if self.summarize is not None:
            summary = self.summarize(context=context).summary
            self.logger.debug(f'{context=}')
            self.logger.debug(f'{summary=}')
            return summary
        else:
            return context        

    def forward(self, question):
        """Answer ``question``.

        Returns a ``dspy.Prediction`` whose ``context`` is what was fed to the
        answer generator (the whole-context summary when enabled, otherwise the
        gathered context) and whose ``answer`` is the generated answer.
        """
        context = self.get_context(question=question)  
        if self.summarize_whole_context and self.summarize is not None:  
            summary = self.get_summary(context=context)
        else:
            # No whole-context summarization: the gathered context is used as-is.
            summary = context
        prediction = self.generate_answer(context=summary, question=question)        
        self.logger.debug(f'{question=}')
        self.logger.debug(f'{context=}')
        self.logger.debug(f'{summary=}')
        self.logger.debug(f'{prediction.answer=}')
        return dspy.Prediction(context=summary, answer=prediction.answer)    