# Evaluating Trajectory

In [None]:
from langsmith import Client

In [None]:
# Shared LangSmith client; the cells below use it to create datasets and run evaluations.
client = Client()

## 1. Single step evaluator

### Create dataset

In [None]:
# Single-step dataset: each example pairs a conversation (`messages`) with the
# route the intent classifier is expected to choose for it (`route`).
examples = [
    {
        "messages": [
            {"role": "user", "content": "i bought some tracks recently and i dont like them"},
        ],
        "route": "refund_agent",
    },
    {
        "messages": [
            {"role": "user", "content": "I was thinking of purchasing some Rolling Stones tunes, any recommendations?"},
        ],
        "route": "question_answering_agent",
    },
    {
        # Multi-turn conversation: the route is judged on the latest user request.
        "messages": [
            {"role": "user", "content": "i want a refund on purchase 237"},
            {"role": "assistant", "content": "I've refunded you a total of $1.98. How else can I help you today?"},
            {"role": "user", "content": "did prince release any albums in 2000?"},
        ],
        "route": "question_answering_agent",
    },
    {
        "messages": [
            {"role": "user", "content": "i purchased a cover of Yesterday recently but can't remember who it was by, which versions of it do you have?"},
        ],
        "route": "question_answering_agent",
    },
    {
        "messages": [
            {"role": "user", "content": "Can I get my money back? I bought an album from the store last week, but it was the wrong one."},
        ],
        "route": "refund_agent",
    },
]

dataset_name = "KGBot Evaluation: Single Step"

# Skip creation when a dataset with this name already exists.
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        dataset_id=dataset.id,
        inputs=[{"messages": example["messages"]} for example in examples],
        outputs=[{"route": example["route"]} for example in examples],
    )

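### Define application logic to be evaluated

> **Note:** the cells below call a `graph` object that is not defined in the cells shown here. Define or import the compiled agent graph before running them.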

In [None]:
# Target function for running the relevant step
async def run_intent_classifier(inputs: dict) -> dict:
    """Run only the intent-classifier step and report the route it picked.

    Args:
        inputs: Payload passed unchanged to the node's ``ainvoke``.

    Returns:
        ``{"route": <value of the node result's ``goto`` attribute>}``.

    NOTE(review): relies on a module-level ``graph`` that is not defined in the
    cells visible here -- make sure it is created/imported before this runs.
    """
    # A single node of the graph can be looked up by name and invoked on its own.
    classifier_node = graph.nodes['intent_classifier']
    result = await classifier_node.ainvoke(inputs)
    return {"route": result.goto}

: 

### Define evaluator

In [None]:
# Evaluator
def correct(outputs: dict, reference_outputs: dict) -> bool:
    """Return True when the predicted route equals the reference route.

    Args:
        outputs: Result of the target function; must contain ``"route"``.
        reference_outputs: Expected outputs from the dataset; must contain ``"route"``.
    """
    predicted_route = outputs["route"]
    expected_route = reference_outputs["route"]
    return predicted_route == expected_route

### Run evaluation

In [None]:
# Evaluate the intent-classifier step on every example of the single-step dataset,
# scoring each run with the `correct` evaluator.
experiment_results = await client.aevaluate(
    run_intent_classifier,
    evaluators=[correct],
    data=dataset_name,
    max_concurrency=4,
    experiment_prefix="sql-agent-gpt4o-intent-classifier",
)
# Last expression of the cell: show the results as a table.
experiment_results.to_pandas()

## 2. Trajectory evaluator

### Create dataset

In [None]:
from langsmith import Client

client = Client()

# Trajectory dataset: each example pairs a user question with the ordered list of
# node / tool names the agent is expected to pass through while answering it.
examples = [
    {
        "question": "How many songs do you have by James Brown",
        "trajectory": [
            "intent_classifier",
            "question_answering_agent",
            "agent",
            "tools",
            "lookup_track",
            "agent",
            "compile_followup",
        ],
    },
    {
        "question": "My name is Aaron Mitchell and I'd like a refund.",
        "trajectory": [
            "intent_classifier",
            "refund_agent",
            "gather_info",
            "compile_followup",
        ],
    },
    {
        "question": "My name is Aaron Mitchell and I'd like a refund on my Led Zeppelin purchases. My number is +1 (204) 452-6452",
        "trajectory": [
            "intent_classifier",
            "refund_agent",
            "gather_info",
            "lookup",
            "compile_followup",
        ],
    },
    {
        "question": "Who recorded Wish You Were Here again? What other albums by them do you have?",
        "trajectory": [
            "intent_classifier",
            "question_answering_agent",
            "agent",
            "tools",
            "lookup_track",
            "agent",
            "tools",
            "lookup_album",
            "agent",
            "compile_followup",
        ],
    },
    {
        "question": "My name is Aaron Mitchell. My number is +1 (204) 452-6452 and I want a full refund for invoice id 237",
        "trajectory": [
            "intent_classifier",
            "refund_agent",
            "gather_info",
            "refund",
            "compile_followup",
        ],
    },
]

dataset_name = "Chinook Customer Service Bot: Trajectory"

# Skip creation when a dataset with this name already exists.
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        dataset_id=dataset.id,
        inputs=[{"question": example["question"]} for example in examples],
        outputs=[{"trajectory": example["trajectory"]} for example in examples],
    )

### Define application logic to be evaluated

In [None]:
async def run_graph(inputs: dict) -> dict:
    """Run the graph on one question and record the trajectory it takes.

    Args:
        inputs: Dataset example inputs; ``inputs['question']`` is sent to the
            graph as a single user message.

    Returns:
        ``{"trajectory": [...]}`` -- the names of the nodes entered, in order.
        Each time the node named ``"tools"`` is entered, the names of the tool
        calls on its latest input message are appended right after it.

    NOTE(review): relies on a module-level ``graph`` that is not defined in the
    cells visible here -- make sure it is created/imported before this runs.
    """
    user_turn = {"role": "user", "content": inputs['question']}
    trajectory = []
    # Set subgraphs=True to stream events from subgraphs of the main graph: https://langchain-ai.github.io/langgraph/how-tos/streaming-subgraphs/
    # Set stream_mode="debug" to stream all possible events: https://langchain-ai.github.io/langgraph/concepts/streaming
    async for chunk in graph.astream(
        {"messages": [user_turn]}, subgraphs=True, stream_mode="debug"
    ):
        # The second element of each streamed chunk is the debug event itself.
        event = chunk[1]
        # Only "task" events (a node being entered) contribute to the trajectory.
        if event['type'] != 'task':
            continue
        node_name = event['payload']['name']
        trajectory.append(node_name)
        # Given how we defined our dataset, we also need to track which specific tools
        # are called by our question answering ReAct agent. These tool calls are found
        # when the ToolNode (named "tools") is entered, by looking at the `tool_calls`
        # of the latest input message.
        if node_name == 'tools':
            latest_message = event['payload']['input']['messages'][-1]
            trajectory.extend(call['name'] for call in latest_message.tool_calls)
    return {"trajectory": trajectory}

### Define evaluators

In [None]:
def evaluate_extra_steps(outputs: dict, reference_outputs: dict) -> dict:
    """Score how many more steps the agent took than the reference trajectory.

    The score is ``len(outputs['trajectory']) - len(reference_outputs['trajectory'])``,
    so it is negative when the agent took fewer steps than the reference.

    Returns:
        A feedback dict with ``"key": "extra_steps"`` and the integer ``"score"``.
    """
    taken = len(outputs['trajectory'])
    expected = len(reference_outputs['trajectory'])
    return {"key": "extra_steps", "score": taken - expected}

def evaluate_unmatched_steps(outputs: dict, reference_outputs: dict) -> dict:
    """Count the steps in the agent's trajectory that do not match the reference.

    The output trajectory is walked once, in order, while a cursor tracks the
    next expected reference step. An output step equal to the expected step
    advances the cursor; any other output step counts as unmatched. Once the
    reference is exhausted, every remaining output step counts as unmatched.
    Reference steps that the output never reaches are NOT counted.

    Example: reference ``["step1", "step2", "step3"]`` against output
    ``["step3", "step2", "step1"]`` scores 2.

    Returns:
        A feedback dict with ``"key": "unmatched_steps"`` and the integer ``"score"``.
    """
    expected_steps = reference_outputs['trajectory']
    actual_steps = outputs['trajectory']

    cursor = 0  # index of the next reference step we are waiting to see
    mismatches = 0
    for step in actual_steps:
        if cursor < len(expected_steps) and expected_steps[cursor] == step:
            cursor += 1
        else:
            mismatches += 1

    return {"key": "unmatched_steps", "score": mismatches}

### Run evaluation

In [None]:
# Run the full graph on every example of the trajectory dataset and score each run
# with both trajectory evaluators.
experiment_results = await client.aevaluate(
    run_graph,
    evaluators=[evaluate_extra_steps, evaluate_unmatched_steps],
    data=dataset_name,
    num_repetitions=1,
    max_concurrency=4,
    experiment_prefix="sql-agent-gpt4o-trajectory",
)
# Last expression of the cell: show the results as a table.
experiment_results.to_pandas()