In [2]:
#datasets = load_dataset("squad_v2" if squad_v2 else "squad")
import pandas as pd
from tqdm.auto import tqdm
import plotly.express as px
from plotly.subplots import make_subplots
import os
import nest_asyncio
import dspy

from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch
import numpy as np
from utils import f1_score
nest_asyncio.apply()

# Locate the project root by walking up from the kernel's working directory
# until the path ends with "nlp_course", so that the relative dataset paths
# below resolve no matter which sub-directory the notebook was launched from.
start_dir = os.getcwd()
current_dir = start_dir
while not current_dir.endswith("nlp_course"):
    parent_dir = os.path.dirname(current_dir)
    if parent_dir == current_dir:
        # Reached the filesystem root without a match: fail loudly instead of
        # looping forever (the parent of the root is the root itself).
        raise FileNotFoundError(
            f"No directory ending in 'nlp_course' found at or above {start_dir!r}"
        )
    current_dir = parent_dir
# Only change directory once the project root is known, so a failed search
# leaves the working directory untouched.
os.chdir(current_dir)

# Train / validation splits, read relative to the project root.
ds_train = pd.read_parquet("dataset/train_df.parquet")
ds_val = pd.read_parquet("dataset/val_df.parquet")

In [3]:


# Split configuration.
# NOTE(review): the original comment stated "same datasets size for inlang and
# en". Nothing here enforces that: the English pool is capped at EN_POOL_SIZE
# rows, while the in-language pool is every training row with a non-null
# `answer_inlang`, and the two test sets are built differently as well.
# Confirm the sizes really match for this dataset.
EN_POOL_SIZE = 150      # rows of ds_train used for the English train/val pool
EN_TEST_SIZE = 300      # rows of ds_val used as the English test set
TRAIN_FRACTION = 0.8    # share of each pool used for training; the rest is validation


def _to_examples(df, answer_column):
    """Convert a QA dataframe into a list of `dspy.Example` objects.

    Args:
        df: dataframe with `question` and `context` columns plus `answer_column`.
        answer_column: name of the column holding the gold answer
            (`'answer'` for English, `'answer_inlang'` for in-language).

    Returns:
        One example per row, with `question` and `context` marked as inputs
        and the gold answer stored under `answer`.
    """
    return [
        dspy.Example(question=question, context=context, answer=answer).with_inputs('question', 'context')
        for question, context, answer in zip(df['question'], df['context'], df[answer_column])
    ]


# English pool: first EN_POOL_SIZE training rows, split train/val by position.
eng_orig = ds_train[:EN_POOL_SIZE]
en_idx = int(len(eng_orig) * TRAIN_FRACTION)
en_train_df = eng_orig[:en_idx]
en_val_df = eng_orig[en_idx:]
en_test_df = ds_val[:EN_TEST_SIZE]

# In-language pool: every row that has an in-language answer, split the same way.
inlang_orig = ds_train[~ds_train['answer_inlang'].isna()]
idx = int(len(inlang_orig) * TRAIN_FRACTION)

inlang_train_df = inlang_orig[:idx]
inlang_val_df = inlang_orig[idx:]
inlang_test_df = ds_val[~ds_val['answer_inlang'].isna()]

inlang_train = _to_examples(inlang_train_df, 'answer_inlang')
inlang_val = _to_examples(inlang_val_df, 'answer_inlang')
inlang_test = _to_examples(inlang_test_df, 'answer_inlang')

en_train = _to_examples(en_train_df, 'answer')
en_val = _to_examples(en_val_df, 'answer')
en_test = _to_examples(en_test_df, 'answer')


In [5]:
import os
from getpass import getpass

# SECURITY: never hard-code credentials in a notebook. A key that was
# committed in this cell (and echoed in its output) must be treated as
# compromised and rotated.
# Take the key from the environment when it is already set; otherwise prompt
# for it without echoing the input.
key = os.environ.get("FIREWORKS_AI_API_KEY") or getpass("Fireworks AI API key: ")
if not key:
    raise RuntimeError("FIREWORKS_AI_API_KEY is not set and no key was entered.")
os.environ["FIREWORKS_AI_API_KEY"] = key

# Confirm configuration without printing the secret into the cell output.
print("FIREWORKS_AI_API_KEY configured.")


[REDACTED: the API key was printed here; the exposed key should be rotated]


In [10]:

# Build the language-model client for the Fireworks-hosted
# "llama-v3p1-8b-instruct" model and register it as the LM in DSPy's settings.
fireworks_llama3_1_8b_instruct = dspy.LM(
    "fireworks_ai/accounts/fireworks/models/llama-v3p1-8b-instruct"
)
dspy.settings.configure(
    lm=fireworks_llama3_1_8b_instruct,
)

from collections import Counter
from utils import normalize_text

def f1_score(prediction, ground_truth):
    """Token-overlap F1 between a predicted answer and a reference answer.

    Both strings are passed through `normalize_text` and split on whitespace;
    the overlap is the multiset intersection of the two token lists.

    Note: this definition shadows the `f1_score` imported from `utils` in the
    notebook's first cell.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        The integer 0 when no token is shared (including when both sides
        normalise to nothing); otherwise the harmonic mean of token precision
        and recall, as a float.
    """
    pred_toks = normalize_text(prediction).split()
    gold_toks = normalize_text(ground_truth).split()

    if not pred_toks and not gold_toks:
        # Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity.
        print(
            "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")

    # Counter & Counter keeps, for each token, the smaller of the two counts.
    overlap = sum((Counter(pred_toks) & Counter(gold_toks)).values())
    if overlap == 0:
        return 0

    # overlap > 0 guarantees both token lists are non-empty, so no zero division.
    precision = overlap / len(pred_toks)
    recall = overlap / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)

def answer_f1_score(example, pred, trace=None, frac=1.0):
    """DSPy-style metric: token-level F1 of `pred.answer` against `example.answer`.

    Args:
        example: gold example; its `answer` must be exactly a `str`.
        pred: prediction object exposing an `answer` attribute.
        trace: accepted for metric-signature compatibility; not used here.
        frac: accepted for metric-signature compatibility; not used here.

    Returns:
        Whatever `f1_score(pred.answer, example.answer)` returns.

    Raises:
        AssertionError: if `example.answer` is not a plain `str`.
    """
    gold_answer = example.answer
    assert type(gold_answer) is str

    return f1_score(pred.answer, gold_answer)

# Metric aliases shared by the compile and evaluation cells.
metric_EM = dspy.evaluate.answer_exact_match  # exact-match metric taken from dspy.evaluate
metric_F1 = answer_f1_score  # token-level F1 wrapper defined in this cell

# The former "add F1 metric" TODO is resolved: metric_F1 is assigned above.

class CoT(dspy.Module):
    """Question answering over a context passage using a single chain-of-thought step."""

    def __init__(self):
        super().__init__()

        # The chain-of-thought sub-module is stored as an attribute so it can
        # later be compiled (e.g. taught a prompt).
        cot_predictor = dspy.ChainOfThought('question, context -> answer')
        self.generate_answer = cot_predictor

    def forward(self, question, context):
        """Run the chain-of-thought predictor on one question/context pair and return its result."""
        prediction = self.generate_answer(question=question, context=context)
        return prediction
    

In [11]:
from tqdm import tqdm
from itertools import product

# Compile one few-shot CoT program per (language, optimisation metric) pair and
# save each to code/<lang>_compiled_cot_<metric>.json, which the evaluation
# cell loads back.
compile_jobs = list(product(
    [('inlang', inlang_train, inlang_val), ('en', en_train, en_val)],
    [('F1', metric_F1), ('EM', metric_EM)]
))

# Create the output directory before the slow compile runs, so a missing
# folder cannot make save() fail after the optimisation has already finished.
os.makedirs('code', exist_ok=True)

for (lang, train_set, val_set), (metric_name, metric_func) in tqdm(compile_jobs, desc="Compiling models"):
    teleprompter = BootstrapFewShotWithRandomSearch(metric=metric_func, max_bootstrapped_demos=1)
    cot_compiled = teleprompter.compile(CoT(), trainset=train_set, valset=val_set)
    cot_compiled.save(f'code/{lang}_compiled_cot_{metric_name.lower()}.json')


Compiling models:   0%|          | 0/4 [00:00<?, ?it/s]

Going to sample between 1 and 1 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


Average Metric: 2.8904761904761904 / 30  (9.6): 100%|██████████| 30/30 [00:05<00:00,  5.75it/s] 


New best score: 9.63 for seed -3
Scores so far: [9.63]
Best score so far: 9.63


Average Metric: 6.071428571428571 / 30  (20.2): 100%|██████████| 30/30 [00:04<00:00,  6.89it/s] 


New best score: 20.24 for seed -2
Scores so far: [9.63, 20.24]
Best score so far: 20.24


  6%|▌         | 7/120 [00:05<01:27,  1.29it/s]


Bootstrapped 1 full traces after 8 examples in round 0.


Average Metric: 2.4357142857142855 / 30  (8.1): 100%|██████████| 30/30 [00:04<00:00,  6.85it/s] 


Scores so far: [9.63, 20.24, 8.12]
Best score so far: 20.24


  1%|          | 1/120 [00:00<01:36,  1.24it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 4.083333333333333 / 30  (13.6): 100%|██████████| 30/30 [00:04<00:00,  7.13it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61]
Best score so far: 20.24


  1%|          | 1/120 [00:00<01:26,  1.38it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 8.720028011204482 / 30  (29.1): 100%|██████████| 30/30 [00:04<00:00,  7.11it/s] 


New best score: 29.07 for seed 1
Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07]
Best score so far: 29.07


  2%|▎         | 3/120 [00:02<01:26,  1.35it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 6.704761904761904 / 30  (22.3): 100%|██████████| 30/30 [00:04<00:00,  7.30it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35]
Best score so far: 29.07


  1%|          | 1/120 [00:00<01:29,  1.34it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 4.383333333333333 / 30  (14.6): 100%|██████████| 30/30 [00:04<00:00,  7.38it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61]
Best score so far: 29.07


  1%|          | 1/120 [00:00<01:34,  1.26it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 5.452380952380952 / 30  (18.2): 100%|██████████| 30/30 [00:04<00:00,  7.34it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17]
Best score so far: 29.07


  3%|▎         | 4/120 [00:03<01:32,  1.25it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 7.4523809523809526 / 30  (24.8): 100%|██████████| 30/30 [00:04<00:00,  6.69it/s]


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84]
Best score so far: 29.07


  6%|▌         | 7/120 [00:06<01:43,  1.09it/s]


Bootstrapped 1 full traces after 8 examples in round 0.


Average Metric: 6.669047619047619 / 30  (22.2): 100%|██████████| 30/30 [00:04<00:00,  6.32it/s]


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23]
Best score so far: 29.07


  2%|▏         | 2/120 [00:01<01:33,  1.26it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 3.904761904761904 / 30  (13.0): 100%|██████████| 30/30 [00:04<00:00,  6.85it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02]
Best score so far: 29.07


  1%|          | 1/120 [00:00<01:23,  1.42it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 5.702380952380952 / 30  (19.0): 100%|██████████| 30/30 [00:03<00:00,  7.60it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01]
Best score so far: 29.07


  3%|▎         | 4/120 [00:03<01:33,  1.24it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 6.452380952380952 / 30  (21.5): 100%|██████████| 30/30 [00:04<00:00,  7.26it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51]
Best score so far: 29.07


  1%|          | 1/120 [00:00<01:28,  1.34it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 3.285714285714286 / 30  (11.0): 100%|██████████| 30/30 [00:03<00:00,  7.63it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51, 10.95]
Best score so far: 29.07


  1%|          | 1/120 [00:00<01:50,  1.08it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 4.035714285714286 / 30  (13.5): 100%|██████████| 30/30 [00:04<00:00,  7.36it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51, 10.95, 13.45]
Best score so far: 29.07


  2%|▏         | 2/120 [00:01<01:31,  1.29it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 3.9166666666666665 / 30  (13.1): 100%|██████████| 30/30 [00:04<00:00,  7.42it/s]


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51, 10.95, 13.45, 13.06]
Best score so far: 29.07


  2%|▎         | 3/120 [00:02<01:23,  1.39it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 5.716666666666667 / 30  (19.1): 100%|██████████| 30/30 [00:04<00:00,  7.12it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51, 10.95, 13.45, 13.06, 19.06]
Best score so far: 29.07


  2%|▎         | 3/120 [00:02<01:33,  1.25it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 4.071428571428571 / 30  (13.6): 100%|██████████| 30/30 [00:04<00:00,  6.70it/s] 


Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51, 10.95, 13.45, 13.06, 19.06, 13.57]
Best score so far: 29.07


  3%|▎         | 4/120 [00:03<01:32,  1.25it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 6.119047619047619 / 30  (20.4): 100%|██████████| 30/30 [00:04<00:00,  6.82it/s] 
Compiling models:  25%|██▌       | 1/4 [01:58<05:56, 118.91s/it]

Scores so far: [9.63, 20.24, 8.12, 13.61, 29.07, 22.35, 14.61, 18.17, 24.84, 22.23, 13.02, 19.01, 21.51, 10.95, 13.45, 13.06, 19.06, 13.57, 20.4]
Best score so far: 29.07
19 candidate programs found.
[('generate_answer', Predict(StringSignature(question, context -> reasoning, answer
    instructions='Given the fields `question`, `context`, produce the fields `answer`.'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)))]
Going 

Average Metric: 1 / 30  (3.3): 100%|██████████| 30/30 [00:00<00:00, 3101.45it/s]


New best score: 3.33 for seed -3
Scores so far: [3.33]
Best score so far: 3.33


Average Metric: 5 / 30  (16.7): 100%|██████████| 30/30 [00:00<00:00, 4130.01it/s]


New best score: 16.67 for seed -2
Scores so far: [3.33, 16.67]
Best score so far: 16.67


  9%|▉         | 11/120 [00:03<00:32,  3.40it/s]


Bootstrapped 1 full traces after 12 examples in round 0.


Average Metric: 2 / 30  (6.7): 100%|██████████| 30/30 [00:03<00:00,  7.84it/s]


Scores so far: [3.33, 16.67, 6.67]
Best score so far: 16.67


  1%|          | 1/120 [00:00<00:00, 1128.41it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 2 / 30  (6.7): 100%|██████████| 30/30 [00:00<00:00, 3810.92it/s] 


Scores so far: [3.33, 16.67, 6.67, 6.67]
Best score so far: 16.67


  6%|▌         | 7/120 [00:04<01:13,  1.53it/s]


Bootstrapped 1 full traces after 8 examples in round 0.


Average Metric: 5 / 30  (16.7): 100%|██████████| 30/30 [00:04<00:00,  7.39it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67]
Best score so far: 16.67


 22%|██▏       | 26/120 [00:18<01:07,  1.39it/s]


Bootstrapped 1 full traces after 27 examples in round 0.


Average Metric: 4 / 30  (13.3): 100%|██████████| 30/30 [00:04<00:00,  7.25it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33]
Best score so far: 16.67


  1%|          | 1/120 [00:00<00:00, 1305.82it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 3 / 30  (10.0): 100%|██████████| 30/30 [00:00<00:00, 3986.73it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0]
Best score so far: 16.67


  2%|▏         | 2/120 [00:00<00:42,  2.78it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 3 / 30  (10.0): 100%|██████████| 30/30 [00:04<00:00,  7.17it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0]
Best score so far: 16.67


  3%|▎         | 4/120 [00:00<00:00, 1988.76it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 6 / 30  (20.0): 100%|██████████| 30/30 [00:00<00:00, 4411.81it/s]


New best score: 20.0 for seed 5
Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0]
Best score so far: 20.0


  6%|▌         | 7/120 [00:00<00:00, 2947.51it/s]


Bootstrapped 1 full traces after 8 examples in round 0.


Average Metric: 5 / 30  (16.7): 100%|██████████| 30/30 [00:00<00:00, 4546.34it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67]
Best score so far: 20.0


  8%|▊         | 10/120 [00:06<01:06,  1.66it/s]


Bootstrapped 1 full traces after 11 examples in round 0.


Average Metric: 3 / 30  (10.0): 100%|██████████| 30/30 [00:03<00:00,  7.85it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0]
Best score so far: 20.0


  3%|▎         | 4/120 [00:02<01:01,  1.89it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 4 / 30  (13.3): 100%|██████████| 30/30 [00:04<00:00,  7.50it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33]
Best score so far: 20.0


  3%|▎         | 4/120 [00:00<00:00, 2532.03it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 5 / 30  (16.7): 100%|██████████| 30/30 [00:00<00:00, 4970.14it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67]
Best score so far: 20.0


  1%|          | 1/120 [00:00<00:00, 2122.62it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 1 / 30  (3.3): 100%|██████████| 30/30 [00:00<00:00, 5473.21it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67, 3.33]
Best score so far: 20.0


  6%|▌         | 7/120 [00:04<01:08,  1.65it/s]


Bootstrapped 1 full traces after 8 examples in round 0.


Average Metric: 4 / 30  (13.3): 100%|██████████| 30/30 [00:03<00:00,  7.60it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67, 3.33, 13.33]
Best score so far: 20.0


  2%|▏         | 2/120 [00:00<00:00, 2259.86it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 2 / 30  (6.7): 100%|██████████| 30/30 [00:00<00:00, 4618.94it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67, 3.33, 13.33, 6.67]
Best score so far: 20.0


  2%|▎         | 3/120 [00:00<00:00, 2631.31it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 4 / 30  (13.3): 100%|██████████| 30/30 [00:00<00:00, 4973.48it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67, 3.33, 13.33, 6.67, 13.33]
Best score so far: 20.0


  3%|▎         | 4/120 [00:00<00:12,  9.01it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 3 / 30  (10.0): 100%|██████████| 30/30 [00:03<00:00,  7.54it/s]


Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67, 3.33, 13.33, 6.67, 13.33, 10.0]
Best score so far: 20.0


  3%|▎         | 4/120 [00:00<00:00, 3378.42it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 4 / 30  (13.3): 100%|██████████| 30/30 [00:00<00:00, 4545.52it/s]
Compiling models:  50%|█████     | 2/4 [03:11<03:03, 91.61s/it] 

Scores so far: [3.33, 16.67, 6.67, 6.67, 16.67, 13.33, 10.0, 10.0, 20.0, 16.67, 10.0, 13.33, 16.67, 3.33, 13.33, 6.67, 13.33, 10.0, 13.33]
Best score so far: 20.0
19 candidate programs found.
[('generate_answer', Predict(StringSignature(question, context -> reasoning, answer
    instructions='Given the fields `question`, `context`, produce the fields `answer`.'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)))]
Going to sampl

Average Metric: 2.0 / 30  (6.7): 100%|██████████| 30/30 [00:05<00:00,  5.31it/s] 


New best score: 6.67 for seed -3
Scores so far: [6.67]
Best score so far: 6.67


Average Metric: 13.857142857142858 / 30  (46.2): 100%|██████████| 30/30 [00:04<00:00,  6.89it/s]


New best score: 46.19 for seed -2
Scores so far: [6.67, 46.19]
Best score so far: 46.19


  2%|▎         | 3/120 [00:02<01:26,  1.35it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 11.0 / 30  (36.7): 100%|██████████| 30/30 [00:04<00:00,  7.01it/s]


Scores so far: [6.67, 46.19, 36.67]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:31,  1.30it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 12.457142857142857 / 30  (41.5): 100%|██████████| 30/30 [00:04<00:00,  6.52it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52]
Best score so far: 46.19


  2%|▎         | 3/120 [00:02<01:29,  1.30it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 10.857142857142858 / 30  (36.2): 100%|██████████| 30/30 [00:04<00:00,  7.17it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:27,  1.37it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 9.857142857142858 / 30  (32.9): 100%|██████████| 30/30 [00:03<00:00,  7.57it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86]
Best score so far: 46.19


  7%|▋         | 8/120 [00:06<01:28,  1.26it/s]


Bootstrapped 1 full traces after 9 examples in round 0.


Average Metric: 10.5 / 30  (35.0): 100%|██████████| 30/30 [00:04<00:00,  6.27it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:24,  1.40it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 11.357142857142858 / 30  (37.9): 100%|██████████| 30/30 [00:04<00:00,  6.52it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:27,  1.36it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 11.0 / 30  (36.7): 100%|██████████| 30/30 [00:03<00:00,  7.81it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:30,  1.31it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 12.257142857142858 / 30  (40.9): 100%|██████████| 30/30 [00:04<00:00,  7.49it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86]
Best score so far: 46.19


  2%|▏         | 2/120 [00:01<01:25,  1.37it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 9.357142857142858 / 30  (31.2): 100%|██████████| 30/30 [00:04<00:00,  7.37it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:41,  1.18it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 10.5 / 30  (35.0): 100%|██████████| 30/30 [00:03<00:00,  7.61it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:56,  1.02it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 12.457142857142857 / 30  (41.5): 100%|██████████| 30/30 [00:04<00:00,  6.62it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52]
Best score so far: 46.19


  2%|▎         | 3/120 [00:02<01:23,  1.39it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 13.0 / 30  (43.3): 100%|██████████| 30/30 [00:03<00:00,  8.02it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52, 43.33]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:25,  1.39it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 10.857142857142858 / 30  (36.2): 100%|██████████| 30/30 [00:03<00:00,  7.78it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52, 43.33, 36.19]
Best score so far: 46.19


  2%|▏         | 2/120 [00:01<01:35,  1.24it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 12.257142857142858 / 30  (40.9): 100%|██████████| 30/30 [00:04<00:00,  6.87it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52, 43.33, 36.19, 40.86]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:34,  1.26it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 11.257142857142858 / 30  (37.5): 100%|██████████| 30/30 [00:04<00:00,  6.50it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52, 43.33, 36.19, 40.86, 37.52]
Best score so far: 46.19


  1%|          | 1/120 [00:00<01:45,  1.12it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 11.974789915966387 / 30  (39.9): 100%|██████████| 30/30 [00:04<00:00,  6.82it/s]


Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52, 43.33, 36.19, 40.86, 37.52, 39.92]
Best score so far: 46.19


  2%|▎         | 3/120 [00:02<01:32,  1.26it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 11.857142857142858 / 30  (39.5): 100%|██████████| 30/30 [00:04<00:00,  6.85it/s]
Compiling models:  75%|███████▌  | 3/4 [05:00<01:39, 99.54s/it]

Scores so far: [6.67, 46.19, 36.67, 41.52, 36.19, 32.86, 35.0, 37.86, 36.67, 40.86, 31.19, 35.0, 41.52, 43.33, 36.19, 40.86, 37.52, 39.92, 39.52]
Best score so far: 46.19
19 candidate programs found.
[('generate_answer', Predict(StringSignature(question, context -> reasoning, answer
    instructions='Given the fields `question`, `context`, produce the fields `answer`.'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)))]
Going 

Average Metric: 2 / 30  (6.7): 100%|██████████| 30/30 [00:00<00:00, 3615.78it/s] 


New best score: 6.67 for seed -3
Scores so far: [6.67]
Best score so far: 6.67


Average Metric: 13 / 30  (43.3): 100%|██████████| 30/30 [00:00<00:00, 4558.70it/s]


New best score: 43.33 for seed -2
Scores so far: [6.67, 43.33]
Best score so far: 43.33


  3%|▎         | 4/120 [00:00<00:21,  5.45it/s]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 7 / 30  (23.3): 100%|██████████| 30/30 [00:03<00:00,  7.90it/s]


Scores so far: [6.67, 43.33, 23.33]
Best score so far: 43.33


  1%|          | 1/120 [00:00<00:00, 1632.02it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 11 / 30  (36.7): 100%|██████████| 30/30 [00:00<00:00, 4495.02it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67]
Best score so far: 43.33


  2%|▎         | 3/120 [00:00<00:00, 2780.14it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:00<00:00, 4627.60it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33]
Best score so far: 43.33


  1%|          | 1/120 [00:00<00:00, 2576.35it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 9 / 30  (30.0): 100%|██████████| 30/30 [00:00<00:00, 5151.02it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0]
Best score so far: 43.33


  7%|▋         | 8/120 [00:00<00:00, 3762.97it/s]


Bootstrapped 1 full traces after 9 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:00<00:00, 5025.73it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33]
Best score so far: 43.33


  8%|▊         | 9/120 [00:06<01:17,  1.43it/s]


Bootstrapped 1 full traces after 10 examples in round 0.


Average Metric: 8 / 30  (26.7): 100%|██████████| 30/30 [00:04<00:00,  6.08it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67]
Best score so far: 43.33


  1%|          | 1/120 [00:00<00:00, 1189.20it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 11 / 30  (36.7): 100%|██████████| 30/30 [00:00<00:00, 3857.07it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67]
Best score so far: 43.33


  2%|▏         | 2/120 [00:00<00:46,  2.54it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 12 / 30  (40.0): 100%|██████████| 30/30 [00:03<00:00,  7.95it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0]
Best score so far: 43.33


  2%|▏         | 2/120 [00:00<00:00, 2906.66it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 8 / 30  (26.7): 100%|██████████| 30/30 [00:00<00:00, 4431.38it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67]
Best score so far: 43.33


  4%|▍         | 5/120 [00:02<01:07,  1.70it/s]


Bootstrapped 1 full traces after 6 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:03<00:00,  7.52it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33]
Best score so far: 43.33


  1%|          | 1/120 [00:00<00:00, 1107.26it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:00<00:00, 2859.62it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33]
Best score so far: 43.33


  2%|▎         | 3/120 [00:00<00:00, 2227.06it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 13 / 30  (43.3): 100%|██████████| 30/30 [00:00<00:00, 4728.82it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33, 43.33]
Best score so far: 43.33


  1%|          | 1/120 [00:00<00:00, 2563.76it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:00<00:00, 4140.75it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33, 43.33, 33.33]
Best score so far: 43.33


  2%|▏         | 2/120 [00:00<00:00, 2591.48it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 11 / 30  (36.7): 100%|██████████| 30/30 [00:00<00:00, 4012.28it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33, 43.33, 33.33, 36.67]
Best score so far: 43.33


  1%|          | 1/120 [00:00<00:00, 2413.29it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:00<00:00, 4007.42it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33, 43.33, 33.33, 36.67, 33.33]
Best score so far: 43.33


  2%|▎         | 3/120 [00:01<00:54,  2.17it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 10 / 30  (33.3): 100%|██████████| 30/30 [00:03<00:00,  8.04it/s]


Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33, 43.33, 33.33, 36.67, 33.33, 33.33]
Best score so far: 43.33


  2%|▎         | 3/120 [00:00<00:00, 1726.05it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 11 / 30  (36.7): 100%|██████████| 30/30 [00:00<00:00, 4237.67it/s]
Compiling models: 100%|██████████| 4/4 [05:33<00:00, 83.30s/it]

Scores so far: [6.67, 43.33, 23.33, 36.67, 33.33, 30.0, 33.33, 26.67, 36.67, 40.0, 26.67, 33.33, 33.33, 43.33, 33.33, 36.67, 33.33, 33.33, 36.67]
Best score so far: 43.33
19 candidate programs found.
[('generate_answer', Predict(StringSignature(question, context -> reasoning, answer
    instructions='Given the fields `question`, `context`, produce the fields `answer`.'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)))]





In [13]:


# Toggle for the test-set evaluation. Named RUN_EVAL rather than `eval` so the
# builtin `eval` is not shadowed for the rest of the kernel session.
RUN_EVAL = True

if RUN_EVAL:
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single thread instead of crashing on int(None).
    num_threads = os.cpu_count() or 1

    for (lang, test_set), metric_name in tqdm(list(product(
        [('inlang', inlang_test), ('en', en_test)],
        ['F1', 'EM']
    )), desc="Evaluating models"):

        # `metric_name` only selects WHICH compiled program to load (the one
        # optimised for that metric); every program is scored with both EM and F1.
        cot_compiled = CoT()
        cot_compiled.load(f'code/{lang}_compiled_cot_{metric_name.lower()}.json')

        evaluate_exact_match = Evaluate(devset=test_set, metric=metric_EM, num_threads=num_threads, display_progress=True, display_table=15)
        evaluate_f1 = Evaluate(devset=test_set, metric=metric_F1, num_threads=num_threads, display_progress=True, display_table=15)
        res_em = evaluate_exact_match(cot_compiled)
        res_f1 = evaluate_f1(cot_compiled)

        print(lang, metric_name, "exact match", res_em, "f1", res_f1)

Average Metric: 64 / 300  (21.3): 100%|██████████| 300/300 [00:00<00:00, 4602.43it/s]


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_exact_match
0,Kuinka monta väriä on Nigerian lipussa?,The flag of Nigeria is the official flag of the Federal Republic of Nigeria. It was designed in 1959 and was officially put into use...,kaksi,Not supplied for this particular example.,3,
1,Kuka johti konfederaatiota Yhdysvaltojen sisällissodassa?,"New Orleans, however, surrendered to the Union on May 1, 1862. The economy was a problem for the Confederacy throughout its existence. Politicians did not...",Jefferson Davis,Not supplied for this particular example.,Jefferson Davis,✔️ [True]
2,MItä tarkoittaa siirtymäalkuaine?,The transition state is the point on the reaction coordinate of a chemical reaction where the group of atoms has the highest Gibbs energy value....,elementtien siirtymäsarja,Not supplied for this particular example.,alkyliryhmän siirtymäalkuaine,
3,Miksi Liettuan sosialistinen neuvostotasavalta toimi vain vuoden?,"From 1940 to 1990, the Lithuanian Soviet Socialist Republic (, ) or Lithuanian SNT (, ) was one of the Soviet republics belonging to the...",Operaatio Barbarossa,Not supplied for this particular example.,Neuvostoliiton hajoamisen vuoksi,
4,Milloin Dibleyn kirkon herraa aloitettiin esittämään Suomessa?,The Vicar of Dibley () is a British sitcom created by Richard Curtis and written for its lead actress Dawn French. The series was shown...,10. marraskuuta 1994,Not supplied for this particular example.,1994,
5,Milloin Itä-Rooma perustettiin?,"around the 14th century BC. The urbanization of Rome must have started in the 7th century BC. Over the centuries, Rome developed into a world...",395,Not supplied for this particular example.,753 eaa.,
6,Milloin Rooman perustamistarina sai alkunsa?,"The story of the founding of Rome is a fable that describes the founding of the city of Rome. According to the legend, Rome was...",21. huhtikuuta 753 eaa,Not supplied for this particular example.,21. huhtikuuta 753 eaa.,✔️ [True]
7,Missä on Nintendon pääkonttori?,"by October 2, 2008. In addition to its hardware, Nintendo has become known for its many game titles such as ""Mario"", ""Donkey Kong"", ""Kirby"", ""Metroid"",...",Kioto ja Tokio,Not supplied for this particular example.,Kyoto,
8,Mistä hevoset ovat tulleet Suomeen?,"they could not be scheduled. Linguists also consider it possible that already in the Bronze Age they knew how to ride, based on the Germanic...",1200-luvulla,Not supplied for this particular example.,Vikingit,
9,Miten määritellään pienoismalli?,"A miniature model is a significantly smaller copy of an object, building or other object than the original. Common are e.g. Vehicle-themed miniature models, as...","pieni kuva, jota käytetään korostamaan muinaista tai keskiaikaista valaistua käsikirjoitusta",Not supplied for this particular example.,pienempi kuin alkuperäinen,


Average Metric: 76.6908072319837 / 300  (25.6): 100%|██████████| 300/300 [00:00<00:00, 4590.04it/s]  


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_f1_score
0,Kuinka monta väriä on Nigerian lipussa?,The flag of Nigeria is the official flag of the Federal Republic of Nigeria. It was designed in 1959 and was officially put into use...,kaksi,Not supplied for this particular example.,3,
1,Kuka johti konfederaatiota Yhdysvaltojen sisällissodassa?,"New Orleans, however, surrendered to the Union on May 1, 1862. The economy was a problem for the Confederacy throughout its existence. Politicians did not...",Jefferson Davis,Not supplied for this particular example.,Jefferson Davis,✔️ [1.0]
2,MItä tarkoittaa siirtymäalkuaine?,The transition state is the point on the reaction coordinate of a chemical reaction where the group of atoms has the highest Gibbs energy value....,elementtien siirtymäsarja,Not supplied for this particular example.,alkyliryhmän siirtymäalkuaine,
3,Miksi Liettuan sosialistinen neuvostotasavalta toimi vain vuoden?,"From 1940 to 1990, the Lithuanian Soviet Socialist Republic (, ) or Lithuanian SNT (, ) was one of the Soviet republics belonging to the...",Operaatio Barbarossa,Not supplied for this particular example.,Neuvostoliiton hajoamisen vuoksi,
4,Milloin Dibleyn kirkon herraa aloitettiin esittämään Suomessa?,The Vicar of Dibley () is a British sitcom created by Richard Curtis and written for its lead actress Dawn French. The series was shown...,10. marraskuuta 1994,Not supplied for this particular example.,1994,✔️ [0.5]
5,Milloin Itä-Rooma perustettiin?,"around the 14th century BC. The urbanization of Rome must have started in the 7th century BC. Over the centuries, Rome developed into a world...",395,Not supplied for this particular example.,753 eaa.,
6,Milloin Rooman perustamistarina sai alkunsa?,"The story of the founding of Rome is a fable that describes the founding of the city of Rome. According to the legend, Rome was...",21. huhtikuuta 753 eaa,Not supplied for this particular example.,21. huhtikuuta 753 eaa.,✔️ [1.0]
7,Missä on Nintendon pääkonttori?,"by October 2, 2008. In addition to its hardware, Nintendo has become known for its many game titles such as ""Mario"", ""Donkey Kong"", ""Kirby"", ""Metroid"",...",Kioto ja Tokio,Not supplied for this particular example.,Kyoto,
8,Mistä hevoset ovat tulleet Suomeen?,"they could not be scheduled. Linguists also consider it possible that already in the Bronze Age they knew how to ride, based on the Germanic...",1200-luvulla,Not supplied for this particular example.,Vikingit,
9,Miten määritellään pienoismalli?,"A miniature model is a significantly smaller copy of an object, building or other object than the original. Common are e.g. Vehicle-themed miniature models, as...","pieni kuva, jota käytetään korostamaan muinaista tai keskiaikaista valaistua käsikirjoitusta",Not supplied for this particular example.,pienempi kuin alkuperäinen,




inlang F1 exact match 21.33 f1 25.56


Average Metric: 39 / 300  (13.0): 100%|██████████| 300/300 [00:01<00:00, 161.08it/s] 


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_exact_match
0,Kuinka monta väriä on Nigerian lipussa?,The flag of Nigeria is the official flag of the Federal Republic of Nigeria. It was designed in 1959 and was officially put into use...,kaksi,"Nigerian lippu on vihreä, valkoinen ja punainen.",3,
1,Kuka johti konfederaatiota Yhdysvaltojen sisällissodassa?,"New Orleans, however, surrendered to the Union on May 1, 1862. The economy was a problem for the Confederacy throughout its existence. Politicians did not...",Jefferson Davis,Not supplied for this particular example.,Jefferson Davis,✔️ [True]
2,MItä tarkoittaa siirtymäalkuaine?,The transition state is the point on the reaction coordinate of a chemical reaction where the group of atoms has the highest Gibbs energy value....,elementtien siirtymäsarja,Not supplied for this particular example.,"Energian korkeimmillaan oleva reaktioketjun piste, jossa reagoivat aineet ovat muodostuneet reaktiotilavaiheeksi.",
3,Miksi Liettuan sosialistinen neuvostotasavalta toimi vain vuoden?,"From 1940 to 1990, the Lithuanian Soviet Socialist Republic (, ) or Lithuanian SNT (, ) was one of the Soviet republics belonging to the...",Operaatio Barbarossa,Not supplied for this particular example.,1940,
4,Milloin Dibleyn kirkon herraa aloitettiin esittämään Suomessa?,The Vicar of Dibley () is a British sitcom created by Richard Curtis and written for its lead actress Dawn French. The series was shown...,10. marraskuuta 1994,Not supplied for this particular example.,10. marraskuuta 1994,✔️ [True]
5,Milloin Itä-Rooma perustettiin?,"around the 14th century BC. The urbanization of Rome must have started in the 7th century BC. Over the centuries, Rome developed into a world...",395,Not supplied for this particular example.,753 eaa.,
6,Milloin Rooman perustamistarina sai alkunsa?,"The story of the founding of Rome is a fable that describes the founding of the city of Rome. According to the legend, Rome was...",21. huhtikuuta 753 eaa,Not supplied for this particular example.,753 eaa.,
7,Missä on Nintendon pääkonttori?,"by October 2, 2008. In addition to its hardware, Nintendo has become known for its many game titles such as ""Mario"", ""Donkey Kong"", ""Kirby"", ""Metroid"",...",Kioto ja Tokio,Not supplied for this particular example.,"Kyoto, Japan",
8,Mistä hevoset ovat tulleet Suomeen?,"they could not be scheduled. Linguists also consider it possible that already in the Bronze Age they knew how to ride, based on the Germanic...",1200-luvulla,Not supplied for this particular example.,Vikingit,
9,Miten määritellään pienoismalli?,"A miniature model is a significantly smaller copy of an object, building or other object than the original. Common are e.g. Vehicle-themed miniature models, as...","pieni kuva, jota käytetään korostamaan muinaista tai keskiaikaista valaistua käsikirjoitusta",Pienoismalli on pienempi kappale alkuperäisestä kohteesta.,Pienempi kappale alkuperäisestä kohteesta.,


Average Metric: 52.0906277056277 / 300  (17.4): 100%|██████████| 300/300 [00:00<00:00, 4597.84it/s]  


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_f1_score
0,Kuinka monta väriä on Nigerian lipussa?,The flag of Nigeria is the official flag of the Federal Republic of Nigeria. It was designed in 1959 and was officially put into use...,kaksi,"Nigerian lippu on vihreä, valkoinen ja punainen.",3,
1,Kuka johti konfederaatiota Yhdysvaltojen sisällissodassa?,"New Orleans, however, surrendered to the Union on May 1, 1862. The economy was a problem for the Confederacy throughout its existence. Politicians did not...",Jefferson Davis,Not supplied for this particular example.,Jefferson Davis,✔️ [1.0]
2,MItä tarkoittaa siirtymäalkuaine?,The transition state is the point on the reaction coordinate of a chemical reaction where the group of atoms has the highest Gibbs energy value....,elementtien siirtymäsarja,Not supplied for this particular example.,"Energian korkeimmillaan oleva reaktioketjun piste, jossa reagoivat aineet ovat muodostuneet reaktiotilavaiheeksi.",
3,Miksi Liettuan sosialistinen neuvostotasavalta toimi vain vuoden?,"From 1940 to 1990, the Lithuanian Soviet Socialist Republic (, ) or Lithuanian SNT (, ) was one of the Soviet republics belonging to the...",Operaatio Barbarossa,Not supplied for this particular example.,1940,
4,Milloin Dibleyn kirkon herraa aloitettiin esittämään Suomessa?,The Vicar of Dibley () is a British sitcom created by Richard Curtis and written for its lead actress Dawn French. The series was shown...,10. marraskuuta 1994,Not supplied for this particular example.,10. marraskuuta 1994,✔️ [1.0]
5,Milloin Itä-Rooma perustettiin?,"around the 14th century BC. The urbanization of Rome must have started in the 7th century BC. Over the centuries, Rome developed into a world...",395,Not supplied for this particular example.,753 eaa.,
6,Milloin Rooman perustamistarina sai alkunsa?,"The story of the founding of Rome is a fable that describes the founding of the city of Rome. According to the legend, Rome was...",21. huhtikuuta 753 eaa,Not supplied for this particular example.,753 eaa.,✔️ [0.6666666666666666]
7,Missä on Nintendon pääkonttori?,"by October 2, 2008. In addition to its hardware, Nintendo has become known for its many game titles such as ""Mario"", ""Donkey Kong"", ""Kirby"", ""Metroid"",...",Kioto ja Tokio,Not supplied for this particular example.,"Kyoto, Japan",
8,Mistä hevoset ovat tulleet Suomeen?,"they could not be scheduled. Linguists also consider it possible that already in the Bronze Age they knew how to ride, based on the Germanic...",1200-luvulla,Not supplied for this particular example.,Vikingit,
9,Miten määritellään pienoismalli?,"A miniature model is a significantly smaller copy of an object, building or other object than the original. Common are e.g. Vehicle-themed miniature models, as...","pieni kuva, jota käytetään korostamaan muinaista tai keskiaikaista valaistua käsikirjoitusta",Pienoismalli on pienempi kappale alkuperäisestä kohteesta.,Pienempi kappale alkuperäisestä kohteesta.,




inlang EM exact match 13.0 f1 17.36


Average Metric: 149 / 300  (49.7): 100%|██████████| 300/300 [00:23<00:00, 12.62it/s]


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_exact_match
0,Missä maassa Jack Churchill syntyi?,"Churchill was born at Colombo, British Ceylon to Alec Fleming ""Alex"" Churchill (1876–1961), later of Hove, East Sussex and Elinor Elizabeth, daughter of John Alexander...","Colombo, British Ceylon",Not supplied for this particular example.,Ceylon,
1,Mikä on yleisin uskonto maailmassa?,"The five largest religious groups by world population, estimated to account for 5.8 billion people and 84% of the population, are Christianity, Islam, Buddhism, Hinduism...",Christianity,Not supplied for this particular example.,Christianity,✔️ [True]
2,Kuka oli Glee sarjan pääosassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Rachel Barbra Berry,Not supplied for this particular example.,Lea Michele,
3,Milloin Killzone-sarjan peli julkaistiin ensimmäisen kerran?,Killzone is a series of first-person shooter and twin sticks shooter video games for Sony Computer Entertainment's (SCE) video game consoles. The main series and...,November 2004,Peli julkaistiin ensimmäisen kerran PlayStation 2:lle marraskuussa 2004.,2004,
4,Milloin Pennsylvania liitty USA?,The state is one of the 13 original founding states of the United States; it came into being in 1681 as a result of a...,"December 12, 1787",Not supplied for this particular example.,1681,
5,Missä Richard Attenborough kuoli?,"In June 2012, shortly before her 90th birthday, Sheila Sim entered the professional actors' retirement home Denville Hall, for which she and Attenborough had helped...",Denville Hall,Not supplied for this particular example.,London,
6,Milloin ateismi on syntynyt?,"In early ancient Greek, the adjective ' (, from the privative ἀ- + ""god"") meant ""godless"". It was first used as a term of censure...",5th century BCE,Not supplied for this particular example.,5th century BCE,✔️ [True]
7,Mistä tulee nimitys digitelevisio?,"Digital television (DTV) is the transmission of television signals, including the sound channel, using digital encoding, in contrast to the earlier television technology, analog television,...","transmission of television signals, including the sound channel, using digital encoding","Nimitys ""digitelevisio"" tulee siitä, että televisio lähetetään digitaalisesti, eli digitaalisella koodauksella.",digitaalisesta lähetystekniikasta,
8,Ovatko suomen kielen murteet olleet aina olemassa?,Westrobothnian () is a number of closely related non-standardized Scandinavian dialects spoken natively along the coast of the historical province of Westrobothnia in co-existence with...,no,Not supplied for this particular example.,ei,
9,Kuka näytteli pääosaa Glee sarjassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Lea Michele,Lea Michele näytteli Rachel Barbra Berry roolia Glee sarjassa.,Lea Michele,✔️ [True]


Average Metric: 175.30327814469254 / 300  (58.4): 100%|██████████| 300/300 [00:00<00:00, 4994.09it/s]


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_f1_score
0,Missä maassa Jack Churchill syntyi?,"Churchill was born at Colombo, British Ceylon to Alec Fleming ""Alex"" Churchill (1876–1961), later of Hove, East Sussex and Elinor Elizabeth, daughter of John Alexander...","Colombo, British Ceylon",Not supplied for this particular example.,Ceylon,✔️ [0.5]
1,Mikä on yleisin uskonto maailmassa?,"The five largest religious groups by world population, estimated to account for 5.8 billion people and 84% of the population, are Christianity, Islam, Buddhism, Hinduism...",Christianity,Not supplied for this particular example.,Christianity,✔️ [1.0]
2,Kuka oli Glee sarjan pääosassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Rachel Barbra Berry,Not supplied for this particular example.,Lea Michele,
3,Milloin Killzone-sarjan peli julkaistiin ensimmäisen kerran?,Killzone is a series of first-person shooter and twin sticks shooter video games for Sony Computer Entertainment's (SCE) video game consoles. The main series and...,November 2004,Peli julkaistiin ensimmäisen kerran PlayStation 2:lle marraskuussa 2004.,2004,✔️ [0.6666666666666666]
4,Milloin Pennsylvania liitty USA?,The state is one of the 13 original founding states of the United States; it came into being in 1681 as a result of a...,"December 12, 1787",Not supplied for this particular example.,1681,
5,Missä Richard Attenborough kuoli?,"In June 2012, shortly before her 90th birthday, Sheila Sim entered the professional actors' retirement home Denville Hall, for which she and Attenborough had helped...",Denville Hall,Not supplied for this particular example.,London,
6,Milloin ateismi on syntynyt?,"In early ancient Greek, the adjective ' (, from the privative ἀ- + ""god"") meant ""godless"". It was first used as a term of censure...",5th century BCE,Not supplied for this particular example.,5th century BCE,✔️ [1.0]
7,Mistä tulee nimitys digitelevisio?,"Digital television (DTV) is the transmission of television signals, including the sound channel, using digital encoding, in contrast to the earlier television technology, analog television,...","transmission of television signals, including the sound channel, using digital encoding","Nimitys ""digitelevisio"" tulee siitä, että televisio lähetetään digitaalisesti, eli digitaalisella koodauksella.",digitaalisesta lähetystekniikasta,
8,Ovatko suomen kielen murteet olleet aina olemassa?,Westrobothnian () is a number of closely related non-standardized Scandinavian dialects spoken natively along the coast of the historical province of Westrobothnia in co-existence with...,no,Not supplied for this particular example.,ei,
9,Kuka näytteli pääosaa Glee sarjassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Lea Michele,Lea Michele näytteli Rachel Barbra Berry roolia Glee sarjassa.,Lea Michele,✔️ [1.0]




en F1 exact match 49.67 f1 58.43


Average Metric: 149 / 300  (49.7): 100%|██████████| 300/300 [00:00<00:00, 4455.86it/s]


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_exact_match
0,Missä maassa Jack Churchill syntyi?,"Churchill was born at Colombo, British Ceylon to Alec Fleming ""Alex"" Churchill (1876–1961), later of Hove, East Sussex and Elinor Elizabeth, daughter of John Alexander...","Colombo, British Ceylon",Not supplied for this particular example.,Ceylon,
1,Mikä on yleisin uskonto maailmassa?,"The five largest religious groups by world population, estimated to account for 5.8 billion people and 84% of the population, are Christianity, Islam, Buddhism, Hinduism...",Christianity,Not supplied for this particular example.,Christianity,✔️ [True]
2,Kuka oli Glee sarjan pääosassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Rachel Barbra Berry,Not supplied for this particular example.,Lea Michele,
3,Milloin Killzone-sarjan peli julkaistiin ensimmäisen kerran?,Killzone is a series of first-person shooter and twin sticks shooter video games for Sony Computer Entertainment's (SCE) video game consoles. The main series and...,November 2004,Peli julkaistiin ensimmäisen kerran PlayStation 2:lle marraskuussa 2004.,2004,
4,Milloin Pennsylvania liitty USA?,The state is one of the 13 original founding states of the United States; it came into being in 1681 as a result of a...,"December 12, 1787",Not supplied for this particular example.,1681,
5,Missä Richard Attenborough kuoli?,"In June 2012, shortly before her 90th birthday, Sheila Sim entered the professional actors' retirement home Denville Hall, for which she and Attenborough had helped...",Denville Hall,Not supplied for this particular example.,London,
6,Milloin ateismi on syntynyt?,"In early ancient Greek, the adjective ' (, from the privative ἀ- + ""god"") meant ""godless"". It was first used as a term of censure...",5th century BCE,Not supplied for this particular example.,5th century BCE,✔️ [True]
7,Mistä tulee nimitys digitelevisio?,"Digital television (DTV) is the transmission of television signals, including the sound channel, using digital encoding, in contrast to the earlier television technology, analog television,...","transmission of television signals, including the sound channel, using digital encoding","Nimitys ""digitelevisio"" tulee siitä, että televisio lähetetään digitaalisesti, eli digitaalisella koodauksella.",digitaalisesta lähetystekniikasta,
8,Ovatko suomen kielen murteet olleet aina olemassa?,Westrobothnian () is a number of closely related non-standardized Scandinavian dialects spoken natively along the coast of the historical province of Westrobothnia in co-existence with...,no,Not supplied for this particular example.,ei,
9,Kuka näytteli pääosaa Glee sarjassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Lea Michele,Lea Michele näytteli Rachel Barbra Berry roolia Glee sarjassa.,Lea Michele,✔️ [True]


Average Metric: 175.30327814469257 / 300  (58.4): 100%|██████████| 300/300 [00:00<00:00, 2699.86it/s]


Unnamed: 0,question,context,example_answer,reasoning,pred_answer,answer_f1_score
0,Missä maassa Jack Churchill syntyi?,"Churchill was born at Colombo, British Ceylon to Alec Fleming ""Alex"" Churchill (1876–1961), later of Hove, East Sussex and Elinor Elizabeth, daughter of John Alexander...","Colombo, British Ceylon",Not supplied for this particular example.,Ceylon,✔️ [0.5]
1,Mikä on yleisin uskonto maailmassa?,"The five largest religious groups by world population, estimated to account for 5.8 billion people and 84% of the population, are Christianity, Islam, Buddhism, Hinduism...",Christianity,Not supplied for this particular example.,Christianity,✔️ [1.0]
2,Kuka oli Glee sarjan pääosassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Rachel Barbra Berry,Not supplied for this particular example.,Lea Michele,
3,Milloin Killzone-sarjan peli julkaistiin ensimmäisen kerran?,Killzone is a series of first-person shooter and twin sticks shooter video games for Sony Computer Entertainment's (SCE) video game consoles. The main series and...,November 2004,Peli julkaistiin ensimmäisen kerran PlayStation 2:lle marraskuussa 2004.,2004,✔️ [0.6666666666666666]
4,Milloin Pennsylvania liitty USA?,The state is one of the 13 original founding states of the United States; it came into being in 1681 as a result of a...,"December 12, 1787",Not supplied for this particular example.,1681,
5,Missä Richard Attenborough kuoli?,"In June 2012, shortly before her 90th birthday, Sheila Sim entered the professional actors' retirement home Denville Hall, for which she and Attenborough had helped...",Denville Hall,Not supplied for this particular example.,London,
6,Milloin ateismi on syntynyt?,"In early ancient Greek, the adjective ' (, from the privative ἀ- + ""god"") meant ""godless"". It was first used as a term of censure...",5th century BCE,Not supplied for this particular example.,5th century BCE,✔️ [1.0]
7,Mistä tulee nimitys digitelevisio?,"Digital television (DTV) is the transmission of television signals, including the sound channel, using digital encoding, in contrast to the earlier television technology, analog television,...","transmission of television signals, including the sound channel, using digital encoding","Nimitys ""digitelevisio"" tulee siitä, että televisio lähetetään digitaalisesti, eli digitaalisella koodauksella.",digitaalisesta lähetystekniikasta,
8,Ovatko suomen kielen murteet olleet aina olemassa?,Westrobothnian () is a number of closely related non-standardized Scandinavian dialects spoken natively along the coast of the historical province of Westrobothnia in co-existence with...,no,Not supplied for this particular example.,ei,
9,Kuka näytteli pääosaa Glee sarjassa?,"Rachel Barbra Berry (Lea Michele) is the lead character and is a ""strong, driven"" member of the glee club, who is misunderstood by her peers....",Lea Michele,Lea Michele näytteli Rachel Barbra Berry roolia Glee sarjassa.,Lea Michele,✔️ [1.0]


Evaluating models: 100%|██████████| 4/4 [00:26<00:00,  6.74s/it]

en EM exact match 49.67 f1 58.43



