In [1]:
import base64
import os
from collections.abc import Callable

import logfire
from langfuse import Langfuse
from langfuse.client import DatasetItemClient
from langfuse.decorators import langfuse_context
from langfuse.model import TextPromptClient
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models import KnownModelName

# Langfuse credentials are taken from the environment so that no secret is ever
# stored in (or committed with) the notebook. Export LANGFUSE_PUBLIC_KEY and
# LANGFUSE_SECRET_KEY before starting the kernel.
_missing_credentials = [
    name for name in ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY") if not os.environ.get(name)
]
if _missing_credentials:
    raise RuntimeError(
        "Missing Langfuse credentials; set the environment variable(s): " + ", ".join(_missing_credentials)
    )
LANGFUSE_PUBLIC_KEY = os.environ["LANGFUSE_PUBLIC_KEY"]
LANGFUSE_SECRET_KEY = os.environ["LANGFUSE_SECRET_KEY"]

# Configure the OTLP exporter: Langfuse's OTEL endpoint plus an HTTP Basic
# Authorization header built from "public_key:secret_key", base64-encoded.
_basic_auth_token = base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode()
os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel"
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {_basic_auth_token}"

# Do not ship telemetry to the Logfire backend.
# NOTE(review): the OTEL_* variables above are presumably what routes spans to Langfuse — confirm.
logfire.configure(send_to_logfire=False)

# IPython autoreload extension in mode 2: reload imported modules before running each cell
%load_ext autoreload
%autoreload 2

In [2]:
# Shared Langfuse client used by every helper below.
# NOTE(review): constructed without arguments — presumably configured from the
# LANGFUSE_* environment variables set in the first cell; confirm.
langfuse = Langfuse()
# Name of the Langfuse dataset the experiment items are stored under
DATASET_NAME = "rfp_evals"
# Agent name of the extractor; also used to derive prompt and experiment names
CRITERIA_EXTRACTOR_NAME = "criteria_extractor"
# Model identifier used as the default for both the extractor and the evaluator agents
DEFAULT_MODEL = "google-gla:gemini-2.0-flash"
# Baseline system prompt text (not referenced by the cells visible in this notebook)
DEFAULT_SYSTEM_PROMPT = "Given a datasheet, terms of reference, and position, extract the job requirements."

In [3]:
# Structured result returned by the criteria extractor agent (passed as its result_type).
# NOTE(review): documented with comments rather than a class docstring on purpose — a
# docstring on a pydantic model may become part of the schema handed to the LLM; confirm
# before adding one.
class JobRequirements(BaseModel):
    education: str  # education requirement, free text (e.g. "BS in Mechanical Engineering")
    experience: str  # experience requirement, free text (e.g. "Skyscraper Project")


# Structured verdict returned by the evaluator agent (passed as its result_type).
# NOTE(review): kept docstring-free on purpose — a class docstring on a pydantic model
# may become part of the schema handed to the LLM; confirm before adding one.
class Evaluation(BaseModel):
    correct: bool  # True when the evaluator judged the extracted requirements correct
    # Free-text feedback; defaults to "" and, per its description, is only expected for incorrect results
    feedback: str = Field(
        default="",
        description="Feedback for the AI Agent so that it can improve its performance. Only needed if the AI Agent is incorrect.",
    )


def add_dataset_items(items: list[dict], dataset_name: str):
    """Create the Langfuse dataset ``dataset_name`` and upload ``items`` into it.

    Args:
        items: Dicts that each carry the keys ``"input"``, ``"expected_output"``
            and ``"metadata"``; a missing key raises ``KeyError``.
        dataset_name: Name of the dataset to create and to attach every item to.
    """
    langfuse.create_dataset(name=dataset_name)
    for entry in items:
        # Pull the three fields out first so a malformed entry fails before any upload call
        item_fields = {
            "input": entry["input"],
            "expected_output": entry["expected_output"],
            "metadata": entry["metadata"],
        }
        langfuse.create_dataset_item(dataset_name=dataset_name, **item_fields)


def dataset_item_client_to_prompt(item: DatasetItemClient) -> str:
    """Render a dataset item as the user prompt shared by the extractor and the evaluator.

    Args:
        item: Dataset item whose ``input`` holds ``"datasheet"`` and ``"tor"`` and
            whose ``metadata`` holds ``"position"``.

    Returns:
        Three newline-terminated ``"<label>: <value>"`` lines, in the order
        datasheet, terms of reference, position.
    """
    labelled_fields = (
        ("Datasheet", item.input["datasheet"]),
        ("Terms of Reference", item.input["tor"]),
        ("Position", item.metadata["position"]),  # type: ignore
    )
    return "".join(f"{label}: {value}\n" for label, value in labelled_fields)


def get_or_create_prompt(
    agent_name: str,
    prompt_name: str,
    prompt: str,
    prompt_version: int | None = None,
    model: KnownModelName = DEFAULT_MODEL,
    labels: list[str] | None = None,
) -> TextPromptClient:
    """Return the Langfuse text prompt ``prompt_name``, creating it when the lookup fails.

    Args:
        agent_name: Stored in the prompt config under ``"agent_name"`` on creation.
        prompt_name: Name of the prompt to fetch or create.
        prompt: Prompt text used only when a new prompt has to be created.
        prompt_version: Version to fetch; ``None`` is passed through to ``get_prompt``.
        model: Stored in the prompt config under ``"model"`` on creation.
        labels: Labels attached on creation; defaults to ``["production"]``.

    Returns:
        The fetched prompt, or the newly created one.
    """
    # None sentinel instead of a mutable list default: a list literal in the signature
    # is shared across calls. Copy so the caller's list is never handed to the SDK.
    prompt_labels = ["production"] if labels is None else list(labels)
    try:
        return langfuse.get_prompt(name=prompt_name, version=prompt_version, type="text")
    except Exception:
        # NOTE(review): every failure (including auth/network errors) is treated as
        # "prompt not found" — consider narrowing once the SDK's error type is confirmed.
        created_prompt = langfuse.create_prompt(
            name=prompt_name,
            type="text",
            prompt=prompt,
            config={"model": model, "agent_name": agent_name},
            labels=prompt_labels,
        )
        # Flush both the decorator context and the client right after creating the prompt
        langfuse_context.flush()
        langfuse.flush()
        return created_prompt


async def extract_criteria(
    model: KnownModelName, agent_name: str, system_prompt: str, item: DatasetItemClient
) -> JobRequirements:
    """Run the criteria extractor agent on one dataset item.

    Args:
        model: Model identifier the agent runs on.
        agent_name: Name given to the agent.
        system_prompt: System prompt for the agent.
        item: Dataset item rendered into the user prompt.

    Returns:
        The ``JobRequirements`` produced by the agent run.
    """
    criteria_extractor = Agent(
        model=model,
        name=agent_name,
        system_prompt=system_prompt,
        instrument=True,
        result_type=JobRequirements,
    )
    user_prompt = dataset_item_client_to_prompt(item)
    run_result = await criteria_extractor.run(user_prompt)
    return run_result.data


async def evaluate_criteria(item: DatasetItemClient, output: JobRequirements) -> Evaluation:
    """Have an LLM evaluator judge the extracted requirements against the expected output.

    Args:
        item: Dataset item providing the prompt inputs and ``expected_output``.
        output: Requirements produced by the extractor agent.

    Returns:
        The evaluator's ``Evaluation``. When the verdict is incorrect, ``feedback``
        is replaced by a self-contained report (inputs, extracted requirements,
        actual requirements, evaluator feedback); when correct it is ``""``.
    """
    evaluator = Agent(
        model=DEFAULT_MODEL,
        name="criteria_evaluator",
        system_prompt=(
            "Given a datasheet, terms of reference, position, and actual job requirements, "
            "evaluate if the extracted job requirements by an AI Agent are correct.\n"
            "Be ultra strict. You are a strict evaluator. "
            "Your feedback will be used to improve the model to eventually generate exactly what the expected output is"
        ),
        instrument=True,
        result_type=Evaluation,
    )
    # The system prompt promises the evaluator the actual (expected) requirements, so
    # they must be part of the user prompt; otherwise it grades without the ground truth.
    user_prompt = (
        f"{dataset_item_client_to_prompt(item)}"
        f"Extracted Job Requirements: {output.model_dump_json()}\n"
        f"Actual Job Requirements: {item.expected_output}\n"
    )
    evaluation = (await evaluator.run(user_prompt)).data
    if evaluation.correct:
        # No feedback is kept for correct answers
        evaluation.feedback = ""
    else:
        # Prefix the evaluator's feedback with the full context so it is readable on its own
        evaluation.feedback = f"{user_prompt}Feedback: {evaluation.feedback}"
    return evaluation


async def add_criteria_extractor_score(
    item: DatasetItemClient, trace_id: str, score_name: str, prompt: TextPromptClient
):
    """Extract, evaluate, and record a boolean score for one dataset item.

    Args:
        item: Dataset item to run the extractor and evaluator on.
        trace_id: Trace the score is attached to.
        score_name: Name under which the score is stored.
        prompt: Langfuse prompt supplying the system prompt text and the
            ``"model"`` / ``"agent_name"`` config entries.
    """
    extractor_kwargs = {
        "model": prompt.config["model"],
        "agent_name": prompt.config["agent_name"],
        "system_prompt": prompt.prompt.strip(),
        "item": item,
    }
    extracted = await extract_criteria(**extractor_kwargs)
    verdict = await evaluate_criteria(item=item, output=extracted)

    # An empty feedback string is stored as "no comment"
    score_comment = verdict.feedback if verdict.feedback else None
    langfuse.score(
        trace_id=trace_id,
        name=score_name,
        value=verdict.correct,
        comment=score_comment,
        data_type="BOOLEAN",
    )


async def run_experiment(
    experiment_name: str,
    dataset_name: str,
    scoring_function: Callable,
    agent_name: str,
    prompt: str,
    prompt_name: str = "",
    prompt_version: int | None = None,
    model: KnownModelName = DEFAULT_MODEL,
):
    """Run ``scoring_function`` over every item of a dataset as one named experiment run.

    Args:
        experiment_name: Used as the span name and as the dataset run name.
        dataset_name: Langfuse dataset whose items are iterated.
        scoring_function: Awaited per item with the keyword arguments ``item``,
            ``trace_id``, ``score_name`` and ``prompt``.
        agent_name: Agent name; also the stem of the default prompt name and score name.
        prompt: Prompt text used if the prompt has to be created.
        prompt_name: Prompt name; when empty, ``"<agent_name>_system_prompt"`` is used.
        prompt_version: Prompt version to fetch, if any.
        model: Model stored in the prompt config on creation.
    """
    effective_prompt_name = prompt_name if prompt_name else f"{agent_name}_system_prompt"
    created_prompt = get_or_create_prompt(
        agent_name=agent_name,
        prompt_name=effective_prompt_name,
        prompt=prompt,
        prompt_version=prompt_version,
        model=model,
    )
    score_name = f"{agent_name}_evaluation"

    dataset = langfuse.get_dataset(dataset_name)

    for item in dataset.items:
        with logfire.span(experiment_name) as span:
            # Render the span's integer trace id as 32 zero-padded hex digits
            trace_id = format(span.get_span_context().trace_id, "032x")  # type: ignore
            print(f"trace_id: {trace_id}")

            # Link the dataset item to this run under the same trace id
            with item.observe(
                run_name=experiment_name,
                trace_id=trace_id,
                run_metadata={"langfuse_prompt": created_prompt},
            ):
                await scoring_function(
                    item=item,
                    trace_id=trace_id,
                    score_name=score_name,
                    prompt=created_prompt,
                )
    # Flush once after all items have been processed
    langfuse_context.flush()
    langfuse.flush()


def get_feedback_from_dataset_run(dataset_name: str, experiment_name: str) -> str:
    """Collect the comments of all falsy-valued scores recorded for a dataset run.

    Args:
        dataset_name: Dataset the run belongs to.
        experiment_name: Name of the dataset run.

    Returns:
        One ``"Run:\\n<comment>\\n---\\n"`` section per score that has a falsy
        value and a non-empty comment, concatenated in run-item order; ``""`` if none.
    """
    dataset_run = langfuse.get_dataset_run(dataset_name=dataset_name, dataset_run_name=experiment_name)
    failure_comments = [
        score.comment
        for run_item in dataset_run.dataset_run_items
        for score in langfuse.get_trace(run_item.trace_id).scores
        if not score.value and score.comment
    ]
    return "".join(f"Run:\n{comment}\n---\n" for comment in failure_comments)


def update_prompt_with_feedback(
    dataset_name: str,
    experiment_name: str,
    agent_name: str = "",
    prompt_name: str = "",
    prompt: str = "",
    prompt_version: int | None = None,
    model: KnownModelName = DEFAULT_MODEL,
    labels: list[str] | None = None,
) -> TextPromptClient:
    """Append feedback from a dataset run to a prompt and fetch or create the result.

    When the run has feedback and ``prompt`` is given, ``agent_name``,
    ``prompt_name`` and ``prompt_version`` must be given too. When ``prompt`` is
    empty, those values (and the model) are read from the ``"langfuse_prompt"``
    entry of the run metadata, and the stored version is incremented by one.

    Args:
        dataset_name: Dataset the run belongs to.
        experiment_name: Name of the dataset run to read feedback from.
        agent_name: Agent name stored in the prompt config.
        prompt_name: Name of the prompt to fetch or create.
        prompt: Base prompt text the feedback is appended to.
        prompt_version: Version to fetch.
        model: Model stored in the prompt config on creation.
        labels: Labels attached on creation; defaults to ``["production"]``.

    Returns:
        The prompt returned by ``get_or_create_prompt``.

    Raises:
        AssertionError: If ``prompt`` is given together with feedback but one of
            ``agent_name``, ``prompt_name`` or ``prompt_version`` is falsy.
    """
    # None sentinel instead of a mutable list default; resolved up front so the
    # logic below sees the same value the old default provided.
    labels = ["production"] if labels is None else labels
    feedback = get_feedback_from_dataset_run(dataset_name=dataset_name, experiment_name=experiment_name)
    if feedback:
        if prompt:
            assert agent_name and prompt_name and prompt and prompt_version, (
                "agent_name, prompt_name, prompt, and prompt_version must be provided if prompt is provided"
            )
        else:
            # No prompt supplied: recover everything from the metadata stored with the run
            run = langfuse.get_dataset_run(dataset_name=dataset_name, dataset_run_name=experiment_name)
            langfuse_prompt: dict = run.metadata["langfuse_prompt"]  # type: ignore
            agent_name = langfuse_prompt["config"]["agent_name"]
            prompt_name = langfuse_prompt["name"]
            prompt = langfuse_prompt["prompt"]
            prompt_version = langfuse_prompt["version"] + 1
            model = langfuse_prompt["config"].get("model", model)
            # Stored labels are only consulted when the caller passed an empty list
            labels = labels or langfuse_prompt["config"].get("labels", labels)
        prompt = prompt.strip() + "\n\nSome feedback from previous runs you got wrong:\n" + feedback
    return get_or_create_prompt(
        agent_name=agent_name,
        prompt_name=prompt_name,
        prompt=prompt,
        prompt_version=prompt_version,
        model=model,
        labels=labels,
    )


In [4]:
# Example evaluation items. Each entry has the three keys add_dataset_items reads:
# "input" (datasheet + terms of reference), "expected_output" (ground-truth
# requirements) and "metadata" (the position, used when building the prompt).
dataset_items = [
    {
        "input": {
            "datasheet": "Must have a bachelor's degree in mechanical engineering or a related field.",
            "tor": "Must have done a project for skyscrapers",
        },
        "expected_output": {"education": "BS in Mechanical Engineering", "experience": "Skyscraper Project"},
        "metadata": {"position": "Mechanical Engineer"},
    },
    {
        "input": {
            "datasheet": "Must have a master's degree in data science or a related field.",
            "tor": "Must have experience with machine learning algorithms",
        },
        "expected_output": {"education": "MS in Data Science", "experience": "Machine Learning"},
        "metadata": {"position": "Data Scientist"},
    },
    {
        "input": {
            "datasheet": "Must have a PhD in physics or a related field.",
            "tor": "Must have published research in quantum mechanics",
        },
        "expected_output": {"education": "PhD in Physics", "experience": "Quantum Mechanics Research"},
        "metadata": {"position": "Quantum Physicist"},
    },
]

# Clean system prompt for the extractor (same wording as DEFAULT_SYSTEM_PROMPT plus a trailing newline)
system_prompt = "Given a datasheet, terms of reference, and position, extract the job requirements.\n"

# Deliberately sabotaged variant: asks the extractor to make mistakes so the evaluator has something to catch
messy_system_prompt = (
    system_prompt
    + "(mess up the output a bit (like MS instead of BS). i wanna see if the evaluator catches it).\n"
)


In [5]:
# Create the dataset and upload the example items.
# NOTE(review): re-running this cell presumably uploads the same items again — confirm before re-executing.
add_dataset_items(items=dataset_items, dataset_name=DATASET_NAME)

In [17]:
# Name of the extractor's system prompt in Langfuse
prompt_name = f"{CRITERIA_EXTRACTOR_NAME}_system_prompt"

# Bump experiment_number to record results under a new run name
experiment_number = 1
experiment_name = f"{CRITERIA_EXTRACTOR_NAME}_run_{experiment_number}"
# None: no specific prompt version is requested
prompt_version = None

In [20]:
# Fetch the prompt named prompt_name; if the lookup fails it is created from messy_system_prompt
created_system_prompt = get_or_create_prompt(
    agent_name=CRITERIA_EXTRACTOR_NAME,
    prompt_name=prompt_name,
    prompt=messy_system_prompt,
    prompt_version=prompt_version,
    model=DEFAULT_MODEL,
)

In [None]:
# Show the prompt text actually returned (may differ from messy_system_prompt if the prompt already existed)
print(created_system_prompt.prompt)

In [None]:
# Run the extractor + evaluator over every dataset item and record one score per item
await run_experiment(
    experiment_name=experiment_name,
    dataset_name=DATASET_NAME,
    scoring_function=add_criteria_extractor_score,
    agent_name=CRITERIA_EXTRACTOR_NAME,
    prompt_name=prompt_name,
    prompt=messy_system_prompt,
    prompt_version=prompt_version,
    model=DEFAULT_MODEL,
)


In [None]:
# Build the next prompt from the clean system_prompt plus feedback of the run above.
# Because `prompt` is passed, the helper uses these arguments instead of the run metadata.
# NOTE(review): prompt_version=2 is hard-coded; if version 2 already exists it is
# returned as-is rather than being recreated with new feedback.
updated_system_prompt = update_prompt_with_feedback(
    dataset_name=DATASET_NAME,
    experiment_name=experiment_name,
    agent_name=CRITERIA_EXTRACTOR_NAME,
    prompt=system_prompt,
    prompt_name=prompt_name,
    prompt_version=2,
)

In [None]:
# Show the prompt text returned by the update step
print(updated_system_prompt.prompt)


In [None]:
# Inspect the raw feedback collected for run 1.
# NOTE(review): the run name is spelled out here instead of reusing experiment_name;
# the two are identical only while experiment_number is 1.
feedback = get_feedback_from_dataset_run(
    dataset_name=DATASET_NAME, experiment_name=f"{CRITERIA_EXTRACTOR_NAME}_run_1"
)
print(feedback)
