# Fine-tuned Expert Ensembling & Routing

In [1]:
%pip install dspy



In [2]:
import dspy
import os
import importlib
from tqdm import tqdm

from dspy.datasets.gsm8k import GSM8K
from dspy.evaluate.evaluate import Evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

## Setup

In [None]:
os.environ["OPENAI_API_KEY"] = ""
os.environ["OPENAI_API_BASE"] = ""

In [None]:
lm = dspy.LM("openai/gpt-4o-mini")
lm("testing")

["It looks like you're testing the system. How can I assist you today?"]

In [51]:
dspy.settings.configure(lm=lm)

In [None]:
gsm8k = GSM8K()
gsm8k_trainset, gsm8k_devset = gsm8k.train, gsm8k.dev
gsm8k_trainset = gsm8k_trainset[:20]
gsm8k_devset = gsm8k_devset[:50]

# Fine-tuning on GSM8K

### In the first part of this notebook, I attempt to fine-tune a single LM on GSM8K to use as a baseline for comparison against ExpertEnsemble (introduced later).

# Attempt 1: Pure PyTorch (w/o HF)

In [9]:
class QA_Dataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize the question and the answer
        input_encodings = self.tokenizer(question, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        target_encodings = self.tokenizer(answer, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

        # Extract the input_ids and attention_mask (since we need to return tensor format)
        input_ids = input_encodings['input_ids'].squeeze(0)
        attention_mask = input_encodings['attention_mask'].squeeze(0)
        target_ids = target_encodings['input_ids'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }

In [None]:
# Example data (questions and answers)
questions = [f"Question: {example.question} <|sep|>" for example in gsm8k_trainset]
answers = [f"Solution: {example.gold_reasoning}. <|sep|> Answer: {example.answer}" for example in gsm8k_trainset]

# Initialize the tokenizer and model
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Prepare the dataset and dataloader
dataset = QA_Dataset(questions, answers, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

model.gradient_checkpointing_enable()

In [13]:
# Training loop
epochs = 2

for epoch in tqdm(range(epochs)):
    model.train()

    total_loss = 0
    for i, batch in tqdm(enumerate(dataloader)):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Compute loss
        loss = outputs.loss
        print(f"Loss on epoch {epoch}, batch {i}: {loss}")
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Print the loss for the current epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

  0%|          | 0/2 [00:00<?, ?it/s]
0it [00:00, ?it/s][A`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Loss on epoch 0, batch 0: 45.229740142822266



1it [00:14, 14.15s/it][A

Loss on epoch 0, batch 1: 45.99446105957031



2it [00:23, 11.29s/it][A

Loss on epoch 0, batch 2: 42.93730545043945



3it [00:33, 10.65s/it][A

Loss on epoch 0, batch 3: 43.78208923339844



4it [00:42, 10.10s/it][A

Loss on epoch 0, batch 4: 37.72266387939453



5it [00:50,  9.43s/it][A

Loss on epoch 0, batch 5: 29.816068649291992



6it [01:00,  9.65s/it][A

Loss on epoch 0, batch 6: 40.585567474365234



7it [01:10,  9.61s/it][A

Loss on epoch 0, batch 7: 45.219268798828125



8it [01:18,  9.13s/it][A

Loss on epoch 0, batch 8: 44.20671081542969



9it [01:27,  9.19s/it][A

Loss on epoch 0, batch 9: 42.08649826049805



10it [01:37,  9.74s/it]
 50%|█████     | 1/2 [01:37<01:37, 97.39s/it]

Epoch 1/2, Loss: 41.7580



0it [00:00, ?it/s][A

Loss on epoch 1, batch 0: 44.45619583129883



1it [00:08,  8.08s/it][A

Loss on epoch 1, batch 1: 41.182186126708984



2it [00:17,  8.84s/it][A

Loss on epoch 1, batch 2: 39.03208541870117



3it [00:26,  8.87s/it][A

Loss on epoch 1, batch 3: 43.45085525512695



4it [00:34,  8.62s/it][A

Loss on epoch 1, batch 4: 40.45376968383789



5it [00:43,  8.83s/it][A

Loss on epoch 1, batch 5: 37.00904846191406



6it [00:52,  8.73s/it][A

Loss on epoch 1, batch 6: 34.58426284790039



7it [01:00,  8.66s/it][A

Loss on epoch 1, batch 7: 33.200008392333984



8it [01:10,  9.00s/it][A

Loss on epoch 1, batch 8: 35.98979187011719



9it [01:19,  8.97s/it][A

Loss on epoch 1, batch 9: 41.87115478515625



10it [01:28,  8.81s/it]
100%|██████████| 2/2 [03:05<00:00, 92.74s/it]

Epoch 2/2, Loss: 39.1229





In [None]:
# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

In [14]:
model.eval()

dev_strs = [f"Question: {example.question} <|sep|>" for example in gsm8k_devset]
test = dev_strs[12]
print(test)

input_ids = tokenizer(test, return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Output:", decoded_output)

Question: Carlos and Diego are racing around the block. Carlos runs around the entire block in 3 minutes. Diego runs around half the block in 2.5 minutes and then is tripped by a dog and can't finish. Assuming he kept up the same speed the entire race, what would've been the average time in seconds for the racers? <|sep|>
Generated Output: Carlos and Diego are racing around the block. Carlos runs around the entire block in 3 minutes. Diego runs around half the block in 2.5 minutes and then is tripped by a dog and can't finish. Assuming he


# Attempt 2: Transformers Trainer

In [59]:
from datasets import load_dataset
from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Load the dataset
dataset = load_dataset("gsm8k", "main")

In [60]:
trainset = {'question': dataset['train']['question'], 'answer': dataset['train']['answer']}
devset = {'question': dataset['test']['question'], 'answer': dataset['test']['answer']}

trainset['question'] = trainset['question'][:20]
trainset['answer'] = trainset['answer'][:20]
devset['question'] = devset['question'][:50]
devset['answer'] = devset['answer'][:50]

for i in range(len(trainset['question'])):
    trainset['question'][i] = f"Question: {trainset['question'][i]} <|sep|>"
for i in range(len(trainset['answer'])):
    trainset['answer'][i] = f"Answer: {trainset['answer'][i]}. <|sep|>"
for i in range(len(devset['question'])):
    devset['question'][i] = f"Question: {devset['question'][i]} <|sep|>"
for i in range(len(devset['answer'])):
    devset['answer'][i] = f"Answer: {devset['answer'][i]}. <|sep|>"

trainset = Dataset.from_dict(trainset)

In [61]:
# Load Google FLAN-T5 model and tokenizer
model_name = "google/flan-t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Preprocess the dataset
def preprocess_function(examples):
    # Tokenize the inputs and labels (you may need to adjust this based on your dataset's format)
    inputs = examples["question"]  # Assuming dataset has a "question" field
    targets = examples["answer"]   # Assuming dataset has an "answer" field

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
train_dataset = trainset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./flan_t5_finetuned",  # output directory
    eval_strategy="no",        # evaluate after every epoch
    learning_rate=2e-5,                # learning rate
    per_device_train_batch_size=4,     # batch size for training
    num_train_epochs=3,                # number of epochs
    weight_decay=0.01,                 # strength of weight decay
    logging_dir="./logs",              # logging directory
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Define Trainer
trainer = Trainer(
    model=model,  # the model to train
    args=training_args,  # training arguments
    train_dataset=train_dataset,  # training dataset
    eval_dataset=None
)

# Start the training
trainer.train()

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Average Metric: 0.0 / 4  (0.0):  20%|██        | 4/20 [45:50<3:03:23, 687.69s/it]
Average Metric: 0.0 / 4  (0.0):  20%|██        | 4/20 [45:29<3:01:58, 682.39s/it]
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhamzaelboudali123[0m ([33mhamzaelboudali123-stanford-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'loss': 40.3016, 'grad_norm': 105.31697845458984, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}
{'train_runtime': 410.5823, 'train_samples_per_second': 0.146, 'train_steps_per_second': 0.037, 'train_loss': 39.4470947265625, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=39.4470947265625, metrics={'train_runtime': 410.5823, 'train_samples_per_second': 0.146, 'train_steps_per_second': 0.037, 'train_loss': 39.4470947265625, 'epoch': 3.0})

In [32]:
# Save the fine-tuned model
model.save_pretrained("./flan_t5_finetuned")
tokenizer.save_pretrained("./flan_t5_finetuned")

('./flan_t5_finetuned/tokenizer_config.json',
 './flan_t5_finetuned/special_tokens_map.json',
 './flan_t5_finetuned/spiece.model',
 './flan_t5_finetuned/added_tokens.json')

In [None]:
# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./flan_t5_finetuned")
tokenizer = T5Tokenizer.from_pretrained("./flan_t5_finetuned")

In [64]:
# Generate predictions
def generate_answer(question, max_length=512):
    inputs = tokenizer(question, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=max_length, num_beams=4, early_stopping=True)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

for i in range(10):
  question = devset["question"][i] + "Answer: "
  gold_answer = devset["answer"][i]
  prediction = generate_answer(question)

  print(f"{question}")
  print(f"Gold Answer: {gold_answer}")
  print(f"Prediction: {prediction}")

Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? <|sep|>Answer: 
Gold Answer: Answer: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18. <|sep|>
Prediction: $4 per fresh duck egg
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take? <|sep|>Answer: 
Gold Answer: Answer: It takes 2/2=<<2/2=1>>1 bolt of white fiber
So the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric
#### 3. <|sep|>
Prediction: 2
Question: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make? <|sep|>Answer: 


# Attempt 3: finetune_hf.py (also uses Trainer)

In [10]:
!pip install evaluate
!pip install rouge_score



In [11]:
import string
import random
import time
import ujson
from datasets.fingerprint import Hasher
import dsp
from dsp.modules.finetuning import finetune_hf

In [12]:
training_data_directory = "training_data_directory"

if not os.path.exists(training_data_directory):
  os.makedirs(training_data_directory)

In [13]:
def finetune(
    samples,
    cluster_id,
    *,
    target="t5-small",
    bsize=12,
    accumsteps=1,
    lr=5e-5,
    epochs=1,
    bf16=False,
    int8=False,
    peft=False,
    path_prefix=None,
):
  # Prepare finetune <prompt, completion> pairs.
  finetune_data = [dict(prompt=sample.question, completion=sample.answer) for sample in samples]

  #
  # Dump as files.
  #

  data = finetune_data
  hashed_name = cluster_id + "." + Hasher.hash(data)
  output_path = os.path.join(training_data_directory, f"{hashed_name}.jsonl")
  print(output_path)

  with open(output_path, "w") as f:
      for line in data:
          f.write(ujson.dumps(line) + "\n")

  finetune_path = output_path

  #
  # Train!
  #
  compiler_config = {
  "save": "".join(
      random.Random(time.time()).choices(string.ascii_uppercase + string.digits, k=13),
  ),  # https://stackoverflow.com/a/2257449/1493011
      "peft": peft,
      "fp16": False,
      "bf16": bf16,
      "int8": int8,
      "fid": False,
      "rationale": False,
      "batch_size": bsize,
      "epochs": epochs,
      "gradient_accumulation_steps": accumsteps,  # 2,
      "lr": lr,
  }

  compiler_config["save"] = (
      os.path.join(path_prefix, compiler_config["save"]) if path_prefix else compiler_config["save"]
  )

  training_data_path = finetune_path
  compiler_config_ = dict(compiler_config)
  compiler_config_["save"] = compiler_config["save"] + "." + cluster_id
  best_ckpt_path = finetune_hf(training_data_path, target, compiler_config_)

  print(f"#> Best checkpoint path: {best_ckpt_path} for {cluster_id}")
  return dsp.HFModel(model=target, checkpoint=best_ckpt_path)

In [14]:
model = finetune(gsm8k_trainset, str(0))

training_data_directory/0.a4d02e1147c27a38.jsonl


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# examples skipped due to parsing error: 0 / 20


Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Dataset statistics: {'max_source_length': 160, 'max_target_length': 3}
Keys of tokenized dataset: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels']
Finetuning dataset: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 18
    })
    test: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,9.572198,0.0,0.0,0.0,0.0,19.0


Best checkpoint of model: ../finetuning_ckpts/7DB76YERRST89.0/checkpoint-2
#> Best checkpoint path: ../finetuning_ckpts/7DB76YERRST89.0/checkpoint-2 for 0


In [21]:
print("Question: ", gsm8k_devset[0].question)
print("Gold Answer: ", gsm8k_devset[0].answer)
model(gsm8k_devset[0].question, max_new_tokens=50)

Question:  20 birds migrate on a seasonal basis from one lake to another, searching for food. If they fly from lake Jim to lake Disney in one season, which is 50 miles apart, then the next season they fly from lake Disney to lake London, 60 miles apart, calculate the combined distance all of the birds have traveled in the two seasons.
Gold Answer:  2200


['migrate on seasonal basis from one lake to another, searching for food. If they fly from lake Jim to lake Disney in one season, which is 50 miles apart, then the next season they fly from lake Disney to lake London, 60 miles']

## Introducing ExpertEnsemble

##### ExpertEnsemble is initialized and used just like other DSPy optimizer. We will demonstrate its usage and then evaluate its performance compared to a single finetuned t5-small model and other optimizers on a simple GSM8K program.

In [7]:
class GenerateAnswer(dspy.Signature):
    """Answer the math problem with a single number."""

    question = dspy.InputField(desc='a math word problem')
    answer = dspy.OutputField(desc='a single number')

class QA(dspy.Module):
    def __init__(self):
        super().__init__()

        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        prediction = self.generate_answer(question=question)
        return dspy.Prediction(answer=prediction.answer)

In [93]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
!pip install fastembed

In [94]:
from dspy.teleprompt import LabeledFewShot
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFinetune
from expert_ensemble_copy import ExpertEnsemble

dspy.settings.experimental = True

In [85]:
teleprompter = LabeledFewShot()
bootstrap_teleprompter = BootstrapFewShot()
finetune_teleprompter = BootstrapFinetune()
expert_teleprompter = ExpertEnsemble()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [52]:
# compiled_vanilla = teleprompter.compile(QA(), trainset=gsm8k_trainset)
# compiled_bootstrap = bootstrap_teleprompter.compile(QA(), trainset=gsm8k_trainset)
compiled_finetune = finetune_teleprompter.compile(QA(), trainset=gsm8k_trainset)

[BootstrapFinetune] Preparing the student and teacher programs...
Ensuring that the student is not compiled
No teacher provided. Using a copy of the student program as the teacher.
[BootstrapFinetune] Bootstrapping data...


Average Metric: 20 / 20  (100.0): 100%|██████████| 20/20 [00:00<00:00, 202.53it/s]
2024/11/15 22:18:03 INFO dspy.evaluate.evaluate: Average Metric: 20 / 20 (100.0%)


[BootstrapFinetune] Preparing the train data...
Using 20 data points for fine-tuning the model: openai/gpt-4o-mini
[BootstrapFinetune] Starting LM fine-tuning...
[BootstrapFinetune] 1 fine-tuning job(s) to start
[BootstrapFinetune] Starting 1 fine-tuning jobs...
[OpenAI Provider] Validating the data format
[OpenAI Provider] Saving the data to a file
[OpenAI Provider] Data saved to /root/.dspy_cache/finetune/f63f20229d84b269.jsonl
[OpenAI Provider] Uploading the data to the provider


2024/11/15 22:18:04 ERROR dspy.clients.lm: Error code: 401 - {'error': {'message': 'Incorrect API key provided: yUb4z5sT****aYlM. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


Job 1/1 completed.
[BootstrapFinetune] Updating the student program with the fine-tuned LMs...
[BootstrapFinetune] BootstrapFinetune has finished compiling the student program


In [86]:
compiled_expert = expert_teleprompter.compile(QA(), trainset=gsm8k_trainset)

Num clusters:  2
Finetuning model on cluster 0 (size of cluster: 13)...
training_data_directory/0.373b21baf6b94d03.jsonl


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

# examples skipped due to parsing error: 0 / 13


Filter:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Dataset statistics: {'max_source_length': 72, 'max_target_length': 3}
Keys of tokenized dataset: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels']
Finetuning dataset: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11
    })
    test: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
{'eval_loss': 13.443631172180176, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_gen_len': 19.0, 'eval_runtime': 3.81, 'eval_samples_per_second': 0.525, 'eval_steps_per_second': 0.262, 'epoch': 1.0}
{'train_runtime': 36.2306, 'train_samples_per_second': 0.304, 'train_steps_per_second': 0.028, 'train_loss': 9.528441429138184, 'epoch': 1.0}
Best checkpoint of model: ../finetuning_ckpts/L6Y16QGTHSALN.0/checkpoint-1
#> Best checkpoint path: ../finetuning_ckpts/L6Y16QGTHSALN.0/checkpoint-1 fo

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

# examples skipped due to parsing error: 0 / 7


Filter:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Dataset statistics: {'max_source_length': 160, 'max_target_length': 3}
Keys of tokenized dataset: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels']
Finetuning dataset: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6
    })
    test: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})
{'eval_loss': 12.016982078552246, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_gen_len': 19.0, 'eval_runtime': 1.1688, 'eval_samples_per_second': 0.856, 'eval_steps_per_second': 0.856, 'epoch': 1.0}
{'train_runtime': 48.2826, 'train_samples_per_second': 0.124, 'train_steps_per_second': 0.021, 'train_loss': 11.43848991394043, 'epoch': 1.0}
Best checkpoint of model: ../finetuning_ckpts/52Q5YRJQVQ388.1/checkpoint-1
#> Best checkpoint path: ../finetuning_ckpts/52Q5YRJQVQ388.1/checkpoint-1 

We can access the kmeans model and the LMs that were created and finetuned in the compilation process for ExpertEnsemble like so:

In [77]:
print(compiled_expert.kmeans)
print(compiled_expert.models)

KMeans(n_clusters=2, random_state=0)
[<dsp.modules.hf.HFModel object at 0x7a6b728136d0>, <dsp.modules.hf.HFModel object at 0x7a6a1be12f80>]


We can also inspect the generated clusters.

In [90]:
for key, val in compiled_expert.cluster_assignments.items():
    print(key)
    for example in val:
        print(example)
        print()

0
Example({'question': "The result from the 40-item Statistics exam Marion and Ella took already came out. Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella. What is Marion's score?", 'gold_reasoning': "Ella's score is 40 items - 4 items = <<40-4=36>>36 items. Half of Ella's score is 36 items / 2 = <<36/2=18>>18 items. So, Marion's score is 18 items + 6 items = <<18+6=24>>24 items.", 'answer': '24'}) (input_keys={'question'})

Example({'question': 'Bridget counted 14 shooting stars in the night sky.  Reginald counted two fewer shooting stars than did Bridget, but Sam counted four more shooting stars than did Reginald.  How many more shooting stars did Sam count in the night sky than was the average number of shooting stars observed for the three of them?', 'gold_reasoning': 'Reginald counted two fewer shooting stars than did Bridget, or a total of 14-2=<<14-2=12>>12 shooting stars. Sam counted 4 more shooting stars than did Reginald, or a total of 12+4=16

## Evaluation

In [53]:
evaluate_on_gsm8k = Evaluate(devset=gsm8k_devset, num_threads=1, display_progress=True, display_table=5)
metric = dspy.evaluate.answer_exact_match

# evaluate_on_gsm8k(compiled_vanilla, metric=metric)
# evaluate_on_gsm8k(compiled_bootstrap, metric=metric)

In [101]:
evaluate_on_gsm8k(compiled_finetune, metric=metric)
compiled_finetune(gsm8k_devset[0].question)

  0%|          | 0/50 [00:00<?, ?it/s]

AttributeError: 'AuthenticationError' object has no attribute 'kwargs'

Now that the program's been compiled by ExpertEnsemble, we can run it on new inputs in inference mode. Behind the scenes, the program is routing each input to the relevant expert fine-tuned LM!

In [96]:
compiled_expert.forward([gsm8k_devset[3]])

[['Rita is reading a five-chapter book with 95 pages. Each chapter has three pages more than the previous one.']]

In [97]:
compiled_expert.forward([gsm8k_devset[18]])

[["are drawing with chalk outside. Another 3 friends join them and ask if they can help with the drawing. Erika loses 2 pieces of chalk as they are counting and the group realizes there isn't enough chalk for everyone to have 3 pieces each."]]

In [104]:
evaluate_on_gsm8k(compiled_expert, metric=metric)


  0%|          | 0/50 [00:00<?, ?it/s]

TypeError: ExpertEnsemble.compile.<locals>.forward() missing 1 required positional argument: 'samples'