In [1]:
import os
import tqdm as notebook_tqdm
from dotenv import load_dotenv

import unify
import dsp
import dspy
from dotdict import dotdict
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFinetune, BootstrapFewShotWithRandomSearch

load_dotenv()
unify_api_key = os.getenv("UNIFY_KEY")



In [2]:
unify.list_models()

['deepseek-coder-33b-instruct',
 'gemma-2b-it',
 'gemma-7b-it',
 'claude-3-haiku',
 'gpt-4-turbo',
 'claude-3.5-sonnet',
 'claude-3-opus',
 'claude-3-sonnet',
 'mistral-small',
 'mistral-large',
 'mixtral-8x22b-instruct-v0.1',
 'llama-3-8b-chat',
 'llama-3-70b-chat',
 'gemini-1.5-pro',
 'gemini-1.5-flash',
 'gpt-4o',
 'codellama-70b-instruct',
 'mistral-7b-instruct-v0.2',
 'command-r-plus',
 'mistral-7b-instruct-v0.3',
 'mistral-medium',
 'qwen-2-7b-instruct',
 'qwen-2-72b-instruct',
 'phi-3-medium-4k-instruct',
 'nemotron-4-340b-instruct',
 'gemma-2-9b-it',
 'mixtral-8x7b-instruct-v0.1',
 'mistral-7b-instruct-v0.1',
 'llama-2-7b-chat',
 'codellama-34b-instruct',
 'llama-2-70b-chat',
 'codellama-7b-instruct',
 'codellama-13b-instruct',
 'yi-34b-chat',
 'pplx-7b-chat',
 'gpt-4',
 'pplx-70b-chat',
 'gpt-3.5-turbo',
 'phind-codellama-34b-v2',
 'llama-2-13b-chat']

In [3]:
from dspy.datasets import HotPotQA

# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=200, eval_seed=2023, dev_size=1000, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

len(trainset), len(devset)

(20, 50)

In [4]:
print(devset)
print(trainset)

[Example({'question': 'Are both Cangzhou and Qionghai in the Hebei province of China?', 'answer': 'no', 'gold_titles': {'Qionghai', 'Cangzhou'}}) (input_keys={'question'}), Example({'question': 'Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?', 'answer': 'National Hockey League', 'gold_titles': {'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}}) (input_keys={'question'}), Example({'question': 'The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay Lightning of the National Hockey League (NHL)?', 'answer': 'Steve Yzerman', 'gold_titles': {'Steve Yzerman', '2006–07 Detroit Red Wings season'}}) (input_keys={'question'}), Example({'question': 'What river is near the Crichton Collegiate Church?', 'answer': 'the River Tyne', 'gold_titles': {'Crichton Castle', 'Crichton Collegiate Church'}}) (input_keys={'questi

In [5]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

class QnA(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        return self.generate_answer(question=question)

In [6]:
gemma = dsp.Unify(
    model="gemma-7b-it@anyscale",
    max_tokens=150,
    api_key=unify_api_key,
)

claude = dsp.Unify(
    model="claude-3-haiku@anthropic",
    max_tokens=150,
    api_key=unify_api_key,
)

unify_router = dsp.Unify(
    model="router@q:1|c:1.92e+00|t:7.45e-05|i:1.00e-06",
    max_tokens=150,
    api_key=unify_api_key,
)

turbo = dsp.Unify(
    model="gpt-3.5-turbo@openai", 
    max_tokens=150,
    api_key=unify_api_key,

)

dspy.settings.configure(lm=turbo)

In [7]:
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred, frac=0.6)
    return answer_EM

num_threads = 24

teleprompter_turbo = BootstrapFewShot(metric=validate_context_and_answer)


In [8]:
compiled_qna_turbo = teleprompter_turbo.compile(QnA(), trainset=trainset)

 40%|████      | 8/20 [00:12<00:18,  1.58s/it]


Bootstrapped 4 full traces after 9 examples in round 0.


In [9]:
evaluate_hotpot = Evaluate(devset=devset[:1000], metric=validate_context_and_answer, num_threads=num_threads, display_progress=True, display_table=5)
evaluate_hotpot(compiled_qna_turbo, metric=validate_context_and_answer)

Average Metric: 21 / 50  (42.0): 100%|██████████| 50/50 [00:07<00:00,  7.01it/s]


Unnamed: 0,question,example_answer,gold_titles,rationale,pred_answer,validate_context_and_answer
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Qionghai', 'Cangzhou'}",Answer: No,No,✔️ [True]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}",Answer: George McPhee,George McPhee,False
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'Steve Yzerman', '2006–07 Detroit Red Wings season'}",Answer: Steve Yzerman,Steve Yzerman,✔️ [True]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Castle', 'Crichton Collegiate Church'}",Answer: River Nith,River Nith,False
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}",Answer: Alfred the Great,Alfred the Great,✔️ [True]


42.0

In [10]:
turbo.inspect_history()




Answer questions with short factoid answers.

---

Question: At My Window was released by which American singer-songwriter?
Answer: John Townes Van Zandt

Question: "Everything Has Changed" is a song from an album released under which record label ?
Answer: Big Machine Records

Question: In what year was the club founded that played Manchester City in the 1972 FA Charity Shield
Answer: 1874

Question: Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?
Answer: Aleem Sarwar Dar

Question: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?
Answer: "Outfield of Dreams"

Question: Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?
Answer: Aleksandr Danilovich Aleksandrov

Question: The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas fou

'\n\n\nAnswer questions with short factoid answers.\n\n---\n\nQuestion: At My Window was released by which American singer-songwriter?\nAnswer: John Townes Van Zandt\n\nQuestion: "Everything Has Changed" is a song from an album released under which record label ?\nAnswer: Big Machine Records\n\nQuestion: In what year was the club founded that played Manchester City in the 1972 FA Charity Shield\nAnswer: 1874\n\nQuestion: Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?\nAnswer: Aleem Sarwar Dar\n\nQuestion: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?\nAnswer: "Outfield of Dreams"\n\nQuestion: Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?\nAnswer: Aleksandr Danilovich Aleksandrov\n\nQuestion: The Organisation that allows a community to influence their operation or use and to enjoy t

In [11]:
dspy.settings.configure(lm=claude)
dspy.settings.lm.kwargs

{'temperature': 0.0,
 'max_tokens': 150,
 'top_p': 1,
 'frequency_penalty': 0,
 'presence_penalty': 0,
 'endpoint': 'claude-3-haiku@anthropic'}

In [12]:
teleprompter_claude = BootstrapFewShot(metric=validate_context_and_answer)
compiled_qna_claude = teleprompter_claude.compile(QnA(), trainset=trainset)

 85%|████████▌ | 17/20 [00:35<00:06,  2.08s/it]


Bootstrapped 4 full traces after 18 examples in round 0.


In [34]:
tp = BootstrapFinetune(metric=validate_context_and_answer, teacher_settings=dict({'lm': turbo}))

In [14]:
claude.kwargs

{'temperature': 0.0,
 'max_tokens': 150,
 'top_p': 1,
 'frequency_penalty': 0,
 'presence_penalty': 0,
 'endpoint': 'claude-3-haiku@anthropic'}

In [15]:
student_qna = QnA()

In [16]:
dsp.settings.lm.model

'claude-3-haiku'

In [17]:
QnA().__dict__

{'_compiled': False,
 'generate_answer': self = Predict(StringSignature(question -> rationale, answer
     instructions='Answer questions with short factoid answers.'
     question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
     rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
     answer = Field(annotation=str required=True json_schema_extra={'desc': 'often between 1 and 5 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
 ))}

In [25]:
dir(compiled_qna_turbo)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_assert_failures',
 '_base_init',
 '_compiled',
 '_suggest_failures',
 'activate_assertions',
 'deepcopy',
 'dump_state',
 'forward',
 'generate_answer',
 'load',
 'load_state',
 'map_named_predictors',
 'named_parameters',
 'named_predictors',
 'named_sub_modules',
 'parameters',
 'predictors',
 'reset_copy',
 'save']

In [35]:
finetuned_qna = tp.compile(student=student_qna, teacher=compiled_qna_turbo, trainset=trainset, target='google/flan-t5-base')

100%|██████████| 20/20 [00:30<00:00,  1.53s/it]


Bootstrapped 12 full traces after 20 examples in round 0.
all 12
local_cache/compiler\all.503285c336556a12.jsonl


ModuleNotFoundError: No module named 'evaluate'

In [None]:
evaluate_hotpot(finetuned_qna, metric=validate_context_and_answer)