In [2]:
# !pip install agentevals

# Strict match

Strict режим гарантирует, что траектории содержат идентичные сообщения в одном и том же порядке при одинаковых вызовах инструментов, хотя и допускает различия в содержании сообщений. 

In [7]:
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.match import create_trajectory_match_evaluator
from langchain_ollama import ChatOllama

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

model = ChatOllama(model="gpt-oss:20b-cloud")

agent = create_agent(model, tools=[get_weather])

evaluator = create_trajectory_match_evaluator(  
    trajectory_match_mode="strict",  
)  

def test_weather_tool_called_strict():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in San Francisco?")]
    })

    reference_trajectory = [
        HumanMessage(content="What's the weather in San Francisco?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_weather", "args": {"city": "San Francisco"}}
        ]),
        ToolMessage(content="It's 75 degrees and sunny in San Francisco.", tool_call_id="call_1"),
        AIMessage(content="The weather in San Francisco is 75 degrees and sunny."),
    ]

    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory
    )
    assert evaluation["score"] is True
    return evaluation

In [8]:
test_weather_tool_called_strict()

{'key': 'trajectory_strict_match',
 'score': True,
 'comment': None,
 'metadata': None}

# Unordered match

Неупорядоченный режим позволяет вызывать один и тот же инструмент в любом порядке, что полезно, когда вы хотите проверить, была ли получена конкретная информация, но не заботитесь о последовательности. Например, агенту может потребоваться проверить погоду и события в городе, но порядок не имеет значения.

In [41]:
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.match import create_trajectory_match_evaluator
from langchain_ollama import ChatOllama

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

@tool
def get_events(city: str):
    """Get events happening in a city."""
    return f"Concert at the park in {city} tonight."


system_prompt = """You are a helpful assistant.
    Use your tools to answer questions. If you do not have a tool to
    answer the question, say so. 
    Return only the answers of tools - concatenate them.
    E.g.
    Human: What weather in SF and what happening in SF today?
    AI: It's 75 degrees and sunny in SF. Concert at the park in SF tonight.
    """


model = ChatOllama(model="gpt-oss:20b-cloud")
agent = create_agent(model, tools=[get_weather, get_events], system_prompt=system_prompt)

evaluator = create_trajectory_match_evaluator(  
    trajectory_match_mode="unordered",  
)  

def test_multiple_tools_any_order():
    result = agent.invoke({
        "messages": [HumanMessage(content="What weather and wat's happening in SF today?")]
    })

    # Reference shows tools called in different order than actual execution
    reference_trajectory = [
        HumanMessage(content="What weather and wat's happening in SF today?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_events", "args": {"city": "SF"}},
            {"id": "call_2", "name": "get_weather", "args": {"city": "SF"}},
        ]),
        ToolMessage(content="Concert at the park in SF tonight.", tool_call_id="call_1"),
        ToolMessage(content="It's 75 degrees and sunny in SF.", tool_call_id="call_2"),
        AIMessage(content="It's 75 degrees and sunny in San Francisco.Concert at the park in San Francisco tonight."),
    ]

    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory,
    )
    assert evaluation["score"] is True
    return evaluation, result

In [42]:
evaluation, result = test_multiple_tools_any_order()

In [44]:
evaluation

{'key': 'trajectory_unordered_match',
 'score': True,
 'comment': None,
 'metadata': None}

In [45]:
for message in result["messages"]:
    print(message.pretty_print())
    print()


What weather and wat's happening in SF today?
None

Tool Calls:
  get_weather (58ee548b-08fc-4aaf-a6b2-a11bcf92a609)
 Call ID: 58ee548b-08fc-4aaf-a6b2-a11bcf92a609
  Args:
    city: SF
None

Name: get_weather

It's 75 degrees and sunny in SF.
None

Tool Calls:
  get_events (55e8072e-55bb-49ab-879d-61ef20881021)
 Call ID: 55e8072e-55bb-49ab-879d-61ef20881021
  Args:
    city: SF
None

Name: get_events

Concert at the park in SF tonight.
None


It's 75 degrees and sunny in SF. Concert at the park in SF tonight.
None



# Subset & Superset match

Режимы Superset и Subset соответствуют частичным траекториям. Режим Superset проверяет, вызывал ли агент по крайней мере инструменты в базовой траектории, что позволяет вызывать дополнительные инструменты. Режим Subset гарантирует, что агент не вызывал никаких инструментов, кроме тех, которые указаны в референсе.

In [49]:
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.match import create_trajectory_match_evaluator
from langchain_ollama import ChatOllama

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

@tool
def get_detailed_forecast(city: str):
    """Get detailed weather forecast for a city."""
    return f"Detailed forecast for {city}: sunny all week."


system_prompt = """You are a helpful assistant.
    Use your tools to answer questions. If you do not have a tool to
    answer the question, say so. 
    Return only the answers of tools - concatenate them.
    E.g.
    Human: What's weather in SF today?
    AI: It's 75 degrees and sunny in SF.
    """


model = ChatOllama(model="gpt-oss:20b-cloud")
agent = create_agent(model, tools=[get_weather, get_detailed_forecast])

evaluator = create_trajectory_match_evaluator(  
    trajectory_match_mode="superset",  
)  

def test_agent_calls_required_tools_plus_extra():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in Boston?")]
    })

    # Reference only requires get_weather, but agent may call additional tools
    reference_trajectory = [
        HumanMessage(content="What's the weather in Boston?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_weather", "args": {"city": "Boston"}},
        ]),
        ToolMessage(content="It's 75 degrees and sunny in Boston.", tool_call_id="call_1"),
        AIMessage(content="It's 75 degrees and sunny in Boston."),
    ]

    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory,
    )
    assert evaluation["score"] is True
    return evaluation, result

In [50]:
evaluation, result = test_agent_calls_required_tools_plus_extra()

In [51]:
evaluation

{'key': 'trajectory_superset_match',
 'score': True,
 'comment': None,
 'metadata': None}

In [52]:
for message in result["messages"]:
    print(message.pretty_print())
    print()


What's the weather in Boston?
None

Tool Calls:
  get_weather (c264a9e4-7784-4d8c-9ef6-7f3149756d2d)
 Call ID: c264a9e4-7784-4d8c-9ef6-7f3149756d2d
  Args:
    city: Boston
None

Name: get_weather

It's 75 degrees and sunny in Boston.
None


It’s 75 °F and sunny in Boston.
None



# LLM as a Judge

Можно использовать LLM для оценки пути выполнения агента с помощью функции create_trajectory_llm_as_judge. В отличие от средств оценки соответствия траектории, для этого не требуется эталонная траектория, но она может быть предоставлена при наличии.

In [98]:
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage
from agentevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT
from langchain_ollama import ChatOllama

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

system_prompt = """You are a helpful assistant.
    Use your tools to answer questions. If you do not have a tool to
    answer the question, say so. 
    Return only the answers of tools - concatenate them.
    E.g.
    Human: What's weather in SF today?
    AI: It's 75 degrees and sunny in SF.
    """


model = ChatOllama(model="gpt-oss:20b-cloud")
agent = create_agent(model, tools=[get_weather])


evaluator_model = ChatOllama(model="gpt-oss:20b-cloud")
evaluator = create_trajectory_llm_as_judge(
    judge=evaluator_model,
    prompt='You are a helpful judge. You should evaluate the output of another agent according to these rules:' + TRAJECTORY_ACCURACY_PROMPT +'Return the evaluation result **only** in the following JSON structure with field "score", containing evaluation True or False based on how model completed the task and field "reasoning" with reason why you made such evaluation',
)

def test_trajectory_quality():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in Seattle?")]
    })

    evaluation = evaluator(
        outputs=result["messages"],
    )
    assert evaluation["score"] is True
    return evaluation, result

In [99]:
evaluation, result = test_trajectory_quality()

In [100]:
evaluation

{'key': 'trajectory_accuracy',
 'score': True,
 'comment': 'The assistant accurately interpreted the user’s request, made a relevant tool call, received the correct weather information, and provided a concise, correct response. All steps logically progress toward the goal and are efficient. This trajectory satisfies the rubric for an accurate and efficient solution.',
 'metadata': None}

In [101]:
for message in result["messages"]:
    print(message.pretty_print())
    print()


What's the weather in Seattle?
None

Tool Calls:
  get_weather (c77050b2-a798-478f-bf3f-cd53ebbff6eb)
 Call ID: c77050b2-a798-478f-bf3f-cd53ebbff6eb
  Args:
    city: Seattle
None

Name: get_weather

It's 75 degrees and sunny in Seattle.
None


It’s 75 °F and sunny in Seattle.
None

