In [1]:
from typing import Literal

import logfire
from dotenv import load_dotenv
from langfuse import Langfuse
from langfuse.decorators import langfuse_context
from pydantic_ai import Agent

# Load environment variables from a .env file that lives in the sibling "smol" directory.
# NOTE(review): presumably this supplies the Langfuse and Gemini credentials used
# below — confirm. The relative path depends on the notebook's working directory.
load_dotenv("../smol/.env")

# Configure logfire with send_to_logfire=False, i.e. without shipping data to the
# Logfire service.
logfire.configure(send_to_logfire=False)

# Automatically re-import edited modules before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
# Langfuse client.
# NOTE(review): presumably configured from the environment variables loaded
# in the first cell — confirm.
langfuse = Langfuse()
dataset_name = "football_facts"
langfuse.create_dataset(name=dataset_name)

# Question / reference-answer pairs that make up the evaluation dataset.
# NOTE(review): "ronaldo" is ambiguous — Ronaldo Nazário won the World Cup with
# Brazil, Cristiano Ronaldo has not. Confirm that "yes" is the intended label or
# disambiguate the question.
items = [
    {"input": "did messi win the fifa world cup?", "expected_output": "yes"},
    {"input": "did ronaldo win the fifa world cup?", "expected_output": "yes"},
]

# One-off seeding switch. Left off by default so that re-running the notebook
# does not upload the items again.
# NOTE(review): items created without an explicit id are presumably appended as
# new rows on every call — confirm against the Langfuse docs.
# Set to True only when the dataset needs to be populated.
SEED_DATASET_ITEMS = False

if SEED_DATASET_ITEMS:
    for item in items:
        langfuse.create_dataset_item(
            dataset_name=dataset_name,  # reuse the variable instead of repeating the literal
            input=item["input"],
            expected_output=item["expected_output"],
        )


In [3]:
# Agent under evaluation: it is run once per dataset item in run_experiment.
agent = Agent(
    model="google-gla:gemini-1.5-flash",
    name="football_expert",
    system_prompt="You are a football(soccer) expert",
    # NOTE(review): presumably this is what produces the "football_expert run" /
    # "chat gemini-1.5-flash" spans seen in the experiment cell's output — confirm.
    instrument=True,
    # Result is typed as one of the two labels used for expected_output in `items`,
    # so it can be compared with a plain equality check.
    result_type=Literal["yes", "no"],  # type: ignore
)

In [4]:
def simple_evaluation(output, expected_output):
    """Exact-match check between a produced answer and its reference answer.

    Args:
        output: The value produced by the system under test.
        expected_output: The reference value to compare against.

    Returns:
        The result of ``output == expected_output``, returned unchanged.
    """
    is_exact_match = output == expected_output
    return is_exact_match

In [5]:
async def run_experiment(experiment_name):
    """Run the agent over every item of the dataset and score each answer.

    For each dataset item this:
      1. opens a logfire span named ``experiment_name``;
      2. formats that span's trace id as a 32-character hex string, prints it,
         and passes it to ``item.observe`` as ``trace_id``;
      3. awaits ``agent.run`` on the item's input;
      4. records an ``exact_match`` score on that trace.

    Items are processed one after another.

    Args:
        experiment_name: Used both as the logfire span name and as the
            ``run_name`` passed to ``item.observe``.
    """
    football_dataset = langfuse.get_dataset(dataset_name)

    for item in football_dataset.items:
        with logfire.span(experiment_name) as span:
            # Zero-padded, lowercase hex form of the integer trace id.
            trace_id = f"{span.get_span_context().trace_id:032x}"
            print(trace_id)

            with item.observe(run_name=experiment_name, trace_id=trace_id):
                result = await agent.run(item.input)
                is_match = simple_evaluation(result.data, item.expected_output)
                langfuse.score(trace_id=trace_id, name="exact_match", value=is_match)

In [6]:
# Run name for this experiment: passed to run_experiment (which uses it as the
# span name and the dataset run_name) and used later to fetch the run back.
experiment_name = "football_expert_run_1"

In [7]:
# Top-level await: relies on the notebook kernel's running event loop.
await run_experiment(experiment_name=experiment_name)

15:59:38.084 football_expert_run_1
0195767bc664429c1bcd0bd97f7e68ad
15:59:38.088   football_expert run
15:59:38.090     preparing model request params
15:59:38.092     chat gemini-1.5-flash
15:59:41.012 football_expert_run_1
0195767bd1d448b4130a56614da8fddb
15:59:41.013   football_expert run
15:59:41.014     preparing model request params
15:59:41.014     chat gemini-1.5-flash


In [8]:
# Flush before reading the run back in the following cells.
# NOTE(review): presumably events are buffered and sent in the background — confirm.
# NOTE(review): langfuse_context is not otherwise used in the visible cells, so
# its flush may be unnecessary — confirm before removing.
langfuse_context.flush()
langfuse.flush()

In [9]:
# List the runs recorded for this dataset.
dataset_runs = langfuse.get_dataset_runs(dataset_name=dataset_name)

In [10]:
# Fetch the run that was just executed, looked up by dataset name and run name.
run = langfuse.get_dataset_run(dataset_name=dataset_name, dataset_run_name=experiment_name)

In [11]:
# Display the run listing as a plain dict.
dataset_runs.dict()

{'data': [{'id': 'cm80e486p03v0ad0643h8qw3n',
   'name': 'football_expert_run_1',
   'description': None,
   'metadata': None,
   'datasetId': 'cm809dde0042pad07jzu49ikf',
   'datasetName': 'football_facts',
   'createdAt': datetime.datetime(2025, 3, 8, 15, 59, 40, 658000, tzinfo=datetime.timezone.utc),
   'updatedAt': datetime.datetime(2025, 3, 8, 15, 59, 40, 658000, tzinfo=datetime.timezone.utc)}],
 'meta': {'page': 1, 'limit': 50, 'totalItems': 1, 'totalPages': 1}}

In [12]:
# Display the fetched run as a plain dict; its datasetRunItems carry the trace id of each item.
run.dict()

{'id': 'cm80e486p03v0ad0643h8qw3n',
 'name': 'football_expert_run_1',
 'datasetId': 'cm809dde0042pad07jzu49ikf',
 'datasetName': 'football_facts',
 'createdAt': datetime.datetime(2025, 3, 8, 15, 59, 40, 658000, tzinfo=datetime.timezone.utc),
 'updatedAt': datetime.datetime(2025, 3, 8, 15, 59, 40, 658000, tzinfo=datetime.timezone.utc),
 'datasetRunItems': [{'id': 'cm80e486x03v2ad06lzny8t2h',
   'datasetRunId': 'cm80e486p03v0ad0643h8qw3n',
   'datasetRunName': 'football_expert_run_1',
   'datasetItemId': '7ec945c3-57a9-408f-8a5b-46ab0e6b26f3',
   'traceId': '0195767bc664429c1bcd0bd97f7e68ad',
   'observationId': None,
   'createdAt': datetime.datetime(2025, 3, 8, 15, 59, 40, 665000, tzinfo=datetime.timezone.utc),
   'updatedAt': datetime.datetime(2025, 3, 8, 15, 59, 40, 665000, tzinfo=datetime.timezone.utc)},
  {'id': 'cm80e49ct03ydad06er9ao723',
   'datasetRunId': 'cm80e486p03v0ad0643h8qw3n',
   'datasetRunName': 'football_expert_run_1',
   'datasetItemId': 'fd52e28e-8e6f-49c0-8e6b-bd57

Doesn't it depend on the flow/use case?
If you have 4 tasks that take 5 minutes each but need to be done in sequence:
- Agent A does tasks 1 -> 2 (10 minutes)
- Agent A "hands off" to Agent B
- Agent B does tasks 3 -> 4 (10 minutes)
It will take 20 minutes.
They can't run concurrently because each task's input depends on the output of the previous task.