In [None]:
from csagent.supervisor.graph import supervisor_graph
from utils import run_langsmith_eval
from csagent.configuration import Configuration
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model
from typing import Literal
import time

In [None]:
# Shared Configuration instance built with default arguments. Later cells pass
# it to the supervisor graph as its context and read `model_small` and `model`
# from it.
configuration = Configuration()


def target_function(config: Configuration, *, delay_seconds: float = 15):
    """
    Build the evaluation target that runs the Supervisor Graph.

    Args:
        config: Configuration handed to the graph as its ``context``.
        delay_seconds: Pause, in seconds, applied after each successful graph
            invocation before the result is returned. Defaults to 15 (the
            previously hard-coded value); pass 0 to disable the pause.

    Returns:
        A callable that takes an example's ``inputs`` dict and returns the
        graph result, or an empty dict if the invocation raised.
    """

    def eval_supervisor(inputs: dict) -> dict:
        """
        Invoke the Supervisor Graph on one example.

        Best-effort: any exception raised by the graph is printed and an empty
        dict is returned so that one failing example does not abort the run.
        """

        # Keep the try body limited to the graph call so that only invocation
        # failures are turned into the empty-dict fallback.
        try:
            result = supervisor_graph.invoke(
                inputs,
                context=config,
            )
        except Exception as e:
            print(f"Error in eval_supervisor: {e}")
            return {}

        # NOTE(review): the pause presumably throttles consecutive calls to
        # stay under a provider rate limit - confirm. As before, it is applied
        # only after a successful invocation.
        if delay_seconds > 0:
            time.sleep(delay_seconds)
        return result

    return eval_supervisor


In [None]:
class EvaluationModel(BaseModel):
    """Structured output of the evaluation."""

    # NOTE: the class docstring and the Field descriptions below are part of
    # the model's JSON schema, which the output parser turns into format
    # instructions for the judge prompt. Treat them as runtime text.

    # Short explanation produced by the judge.
    rationale: str = Field(
        ...,
        description="Rationale that explains the alignment between the AI output and the ground truth.",
    )

    # Binary verdict; only the values 0 and 1 are accepted.
    score: Literal[0, 1] = Field(
        ...,
        description="0 means the AI output is completely wrong and not related to the ground truth, 1 means the AI output is completely correct and aligned with the ground truth.",
    )


def LLM_judge(
    inputs: str, ai_output: str, ground_truth: str, model: str
) -> "EvaluationModel":
    """Judge the AI output based on the ground truth.

    Args:
        inputs: The human question the AI output responds to.
        ai_output: The AI-generated answer to evaluate.
        ground_truth: The reference answer to compare against.
        model: Model identifier passed to ``init_chat_model``.

    Returns:
        An ``EvaluationModel`` holding the judge's rationale and binary score.

    Raises:
        Whatever the model call or the output parser raises, e.g. when the
        model reply cannot be parsed into an ``EvaluationModel``.
    """

    instructions = """
        You are given a human question and a pair consisting of a ground truth and an AI-generated output. Your task is to evaluate how well the AI output aligns with the ground truth in the context of the human question.
        1. Provide a brief reasoning (1-2 sentences) explaining the degree of alignment between the AI output and the ground truth.
        2. Assign a binary score:
            - 1 if the AI output aligns with the ground truth.
            - 0 if the AI output does not align with the ground truth.
        Keep your reasoning concise, objective, and focused only on the alignment. Do not add extra commentary, suggestions, or subjective opinions.

        Human Question: {inputs}
        AI output: {ai_output}
        Ground truth: {ground_truth}

        Format Instruction:
        {format_instructions}
    """

    parser = PydanticOutputParser(pydantic_object=EvaluationModel)

    # The format instructions depend only on the parser, so bind them once;
    # the per-call values are supplied at invoke time.
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("human", instructions),
        ]
    ).partial(format_instructions=parser.get_format_instructions())

    # temperature=0 is requested so the judge is as repeatable as the
    # provider allows.
    llm = init_chat_model(model, temperature=0)

    # Pipe the model reply straight into the parser instead of calling
    # parser.parse(response.content): message content is not guaranteed to be
    # a plain string (it may be a list of content blocks), and the parser
    # extracts the text from the message itself when used as a runnable.
    judge_chain = chat_prompt | llm | parser
    return judge_chain.invoke(
        {
            "inputs": inputs,
            "ai_output": ai_output,
            "ground_truth": ground_truth,
        }
    )


In [None]:
def _message_content(message):
    """Return the content of a message given as a dict or as an object with ``.content``."""
    if isinstance(message, dict):
        return message["content"]
    return message.content


def llm_alignment_evaluator(
    inputs: dict, outputs: dict, reference_outputs: dict
) -> list:
    """LLM-as-judge alignment evaluator.

    Args:
        inputs: Example inputs; the last entry of ``inputs["messages"]`` is
            used as the human question.
        outputs: Target outputs; the last entry of ``outputs["messages"]`` is
            used as the AI answer. May be empty when the target failed.
        reference_outputs: Reference example; ``reference_outputs["content"]``
            is used as the ground truth.

    Returns:
        Two feedback entries: ``alignment_score`` (0 or 1) and
        ``alignment_reasoning`` (the judge's rationale). When ``outputs``
        carries no messages the score is 0 and the judge is not called.
    """

    # The evaluation target returns an empty dict when the graph invocation
    # fails. Score that as a miss instead of raising KeyError, so one failed
    # example does not break the evaluator.
    produced_messages = (outputs or {}).get("messages")
    if not produced_messages:
        return [
            {"key": "alignment_score", "score": 0},
            {
                "key": "alignment_reasoning",
                "value": "No AI output was produced (the target returned no messages).",
            },
        ]

    # Accept both dict-style and object-style messages on either side.
    verdict = LLM_judge(
        _message_content(inputs["messages"][-1]),
        _message_content(produced_messages[-1]),
        reference_outputs["content"],
        configuration.model_small,
    )
    return [
        {
            "key": "alignment_score",
            "score": verdict.score,
        },
        {"key": "alignment_reasoning", "value": verdict.rationale},
    ]


In [None]:
# Kick off the evaluation: the supervisor-graph target built from the shared
# configuration is scored by the LLM-as-judge alignment evaluator.
# NOTE(review): the meaning of the positional arguments is defined by
# utils.run_langsmith_eval, which is not visible here. "CS Agent Evaluation"
# is presumably a dataset or experiment name and configuration.model a model
# identifier for the run - confirm against the helper.
run_langsmith_eval(
    target_function(config=configuration),
    "CS Agent Evaluation",
    [
        llm_alignment_evaluator,
    ],
    configuration.model,
)
