In [3]:
!pip install dspy-ai

Collecting dspy-ai
  Downloading dspy_ai-2.3.3-py3-none-any.whl (172 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 172.0/172.0 kB 2.9 MB/s eta 0:00:00
Collecting regex
  Using cached regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Collecting backoff~=2.2.1
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting openai<2.0.0,>=0.28.1
  Using cached openai-1.12.0-py3-none-any.whl (226 kB)
Collecting optuna
  Using cached optuna-3.5.0-py3-none-any.whl (413 kB)
Collecting pydantic<3.0.0,>=2.0
  Downloading pydantic-2.6.2-py3-none-any.whl (394 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 394.9/394.9 kB 5.0 MB/s eta 0:00:00
Collecting pandas
  Using cached pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
Collecting ujson
  Using cached ujson-5.9.0-cp310-cp310-manylinux_2_17_x86_64.

## Setup

In [4]:
import dspy
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric

# Create the OpenAI-backed language model and register it as the LM in DSPy's
# global settings.
turbo = dspy.OpenAI(model='gpt-3.5-turbo-instruct', max_tokens=250)
dspy.settings.configure(lm=turbo)

# Load the GSM8K math word problems and keep only the first 10 examples of the
# train and dev splits so the demo runs quickly.
# NOTE(review): the variable is spelled `gms8k` (not `gsm8k`); the name is kept
# as-is in case other cells refer to it.
gms8k = GSM8K()
gsm8k_trainset = gms8k.train[:10]
gsm8k_devset = gms8k.dev[:10]

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 7.94k/7.94k [00:00<00:00, 4.52MB/s]
Downloading data: 100%|██████████| 2.31M/2.31M [00:00<00:00, 4.03MB/s]
Downloading data: 100%|██████████| 419k/419k [00:00<00:00, 1.47MB/s]]
Downloading data files: 100%|██████████| 2/2 [00:00<00:00,  2.17it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 330.74it/s]
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 91801.42 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 81125.71 examples/s]
100%|██████████| 7473/7473 [00:00<00:00, 27114.64it/s]
100%|██████████| 1319/1319 [00:00<00:00, 25369.55it/s]


## Define the Module

In [5]:
class CoT(dspy.Module):
    """Chain-of-thought question-answering program.

    Wraps a single `dspy.ChainOfThought` predictor built from the
    "question -> answer" signature and exposes it through `forward`.
    """

    def __init__(self):
        super().__init__()
        signature = "question -> answer"
        self.prog = dspy.ChainOfThought(signature)

    def forward(self, question):
        """Run the wrapped predictor on `question` and return its result."""
        prediction = self.prog(question=question)
        return prediction

A class is just initialization (its properties, set in `__init__`) together with functions (its methods).

## Compile and Evaluate the Model

In [6]:
from dspy.teleprompt import BootstrapFewShot

# Optimizer settings: we want to "bootstrap" (i.e., self-generate) 4-shot examples
# of our CoT program, so both demo limits are capped at 4.
config = {"max_bootstrapped_demos": 4, "max_labeled_demos": 4}

# Optimize! `gsm8k_metric` is the metric used here; in general, the metric tells
# the optimizer how well it is doing.
teleprompter = BootstrapFewShot(metric=gsm8k_metric, **config)
optimized_cot = teleprompter.compile(
    CoT(),
    trainset=gsm8k_trainset,
    valset=gsm8k_devset,
)

 50%|█████     | 5/10 [00:06<00:06,  1.35s/it]

Bootstrapped 4 full traces after 6 examples in round 0.





## Evaluate

In [7]:
from dspy.evaluate import Evaluate

# Build the evaluator once over the dev set; the same object can be reused to
# score several programs.
evaluate = Evaluate(
    devset=gsm8k_devset,
    metric=gsm8k_metric,
    num_threads=4,
    display_progress=True,
    display_table=0,
)

# Score the compiled `optimized_cot` program. As the last expression of the cell,
# the returned value is displayed as the cell output.
evaluate(optimized_cot)




Average Metric: 7 / 10  (70.0): 100%|██████████| 10/10 [00:03<00:00,  2.51it/s]

Average Metric: 7 / 10  (70.0%)



  df = df.applymap(truncate_cell)


70.0

## Inspect the Model's History

In [8]:
# Show the most recent (n=1) entry from the history of the `turbo` LM, so the
# prompt that was actually sent can be inspected.
turbo.inspect_history(n=1)





Given the fields `question`, produce the fields `answer`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: The result from the 40-item Statistics exam Marion and Ella took already came out. Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella. What is Marion's score?
Reasoning: Let's think step by step in order to find Marion's score. We know that Ella got 4 incorrect answers, which means she got 36 correct answers out of 40. We also know that Marion got 6 more than half of Ella's score, which is 6 more than 36/2 = 18. Therefore, Marion's score is 18 + 6 = 24.
Answer: 24

---

Question: Bridget counted 14 shooting stars in the night sky. Reginald counted two fewer shooting stars than did Bridget, but Sam counted four more shooting stars than did Reginald. How many more shooting stars did Sam count in the night sky than was the average 