Automated LLM Evaluation

In [7]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
from langsmith import Client
from langsmith.evaluation import RunEvaluator, EvaluationResult
from langchain.evaluation import load_evaluator
from langchain.smith import RunEvalConfig
from nurse_chat_prime import Nurse2NurseChatbotSummarize


from typing import Optional, TypedDict, Annotated
from langchain_openai import ChatOpenAI

In [75]:
# Single evaluation example: the question sent to the chain plus the
# hand-written reference answer ("label") the QA evaluator grades against.
example = {
    "inputs": {
        "prompt": "What was the minimum reading for heart rate, respiratory rate and body temperature and when did they each occur? ",
        "patient_file": "Kate_Data.json",  # This file is used to load the patient data in your chain.
        # Optionally, you can provide a retrieved context (e.g., from your reference document) if you want to evaluate groundedness.
        "retrieved_context": "Vital sign ranges are provided in the document",
        "patient_context": "Patient Data: [List of patient readings extracted from the JSON file].",
    },
    "outputs": {
        # The lines are joined with explicit newlines. Adjacent string literals
        # are concatenated with no separator, which would fuse fields and
        # timestamps together (e.g. "113 bpmOccurred at: ..." or
        # "00:26:332024-11-29 01:16:33") and garble the reference answer.
        "label": "\n".join([
            "Minimum Recorded Vital Signs and Corresponding Timestamps:",
            "",
            "Minimum Heart Rate: 113 bpm",
            "Occurred at: 2024-11-29 00:31:33",
            "Clinical Note: Bradycardia episode, below the normal range for preterm neonates (141-171 bpm).",
            "",
            "Minimum Respiratory Rate: 28 breaths/min",
            "Occurred at:",
            "2024-11-29 00:26:33",
            "2024-11-29 01:16:33",
            "2024-11-29 03:06:33",
            "2024-11-29 04:46:33",
            "2024-11-29 07:36:33",
            "Clinical Note: Bradypnea observed at multiple instances, indicating possible periodic breathing instability.",
            "",
            "Minimum Body Temperature: 36.5 °C",
            "Occurred at:",
            "2024-11-29 02:16:33",
            "2024-11-29 04:51:33",
            "2024-11-29 07:26:33",
        ]),
    },
}

In [76]:
# Create a LangSmith client and make sure the evaluation dataset exists.
client = Client()
dataset_name = "NICU Nurse Chat Evaluation - Minimum Readings"

# `create_dataset` fails when a dataset with this name already exists, so a
# second run of this cell (or a Restart & Run All) would break. Reuse the
# existing dataset instead, and only seed the example when the dataset is
# newly created so that re-runs do not add duplicate examples.
# NOTE: an existing dataset is reused as-is; delete it in LangSmith to re-seed.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        inputs=[example["inputs"]],
        outputs=[example["outputs"]],
        dataset_id=dataset.id,
    )

In [86]:

# Custom evaluator for relevance: compares the generated answer to the original prompt.
class RelevanceEvaluator(RunEvaluator):
    """Score how well the chain's answer addresses the user's prompt.

    Wraps a LangChain ``labeled_score_string`` evaluator configured with a
    single "relevance" criterion and ``normalize_by=10``.

    Args:
        eval_llm: Judge model used by the evaluator. When omitted, a
            ``ChatOpenAI(model="o3-mini", temperature=1)`` instance is created.
    """

    def __init__(self, eval_llm: Optional[ChatOpenAI] = None):
        # If no LLM is provided, instantiate one with "o3-mini"
        if eval_llm is None:
            eval_llm = ChatOpenAI(model="o3-mini", temperature=1)
        self.eval_llm = eval_llm

        # Create a labeled_score_string evaluator with the specified LLM
        self.evaluator = load_evaluator(
            "labeled_score_string",
            criteria={"relevance": "How well does the generated response address the initial user input?"},
            normalize_by=10,
            llm=self.eval_llm,  # Pass the LLM to the evaluator
        )

    def evaluate_run(self, run, example=None) -> EvaluationResult:
        """Grade one run's answer for relevance to its prompt.

        Args:
            run: The traced run; ``run.inputs["prompt"]`` and
                ``run.outputs["answer"]`` are read.
            example: Unused; accepted to satisfy the evaluator interface.

        Returns:
            An ``EvaluationResult`` keyed ``"labeled_criteria:relevance"``
            carrying the fields returned by the underlying evaluator.
        """
        # A failed run can have `outputs` set to None and a partial run can
        # lack keys; fall back to empty strings (as the other evaluators in
        # this notebook do) instead of raising inside the evaluator.
        run_outputs = run.outputs or {}
        run_inputs = run.inputs or {}
        prompt = run_inputs.get("prompt", "")

        # NOTE(review): the prompt is passed as both `input` and `reference`
        # because the labeled evaluator requires a reference. Confirm that
        # grading against the question itself is intended.
        res = self.evaluator.evaluate_strings(
            prediction=run_outputs.get("answer", ""),
            input=prompt,
            reference=prompt,
        )
        return EvaluationResult(key="labeled_criteria:relevance", **res)


# Custom evaluator for groundedness: combines both the retrieved context and patient context.
class GroundednessEvaluator(RunEvaluator):
    """Score how well the chain's answer agrees with the context it was given.

    Wraps a LangChain ``labeled_score_string`` evaluator configured with a
    single "groundedness" criterion and ``normalize_by=10``. The reference
    text is the run's ``retrieved_context`` followed by its
    ``patient_context``.

    Args:
        eval_llm: Judge model used by the evaluator. When omitted, a
            ``ChatOpenAI(model="o3-mini", temperature=1)`` instance is created.
    """

    def __init__(self, eval_llm: Optional[ChatOpenAI] = None):
        if eval_llm is None:
            eval_llm = ChatOpenAI(model="o3-mini", temperature=1)
        self.eval_llm = eval_llm

        self.evaluator = load_evaluator(
            "labeled_score_string",
            criteria={"groundedness": "To what extent does the generated response agree with the retrieved context?"},
            normalize_by=10,
            llm=self.eval_llm,  # Pass the LLM to the evaluator
        )

    @staticmethod
    def _document_text(item) -> str:
        """Return ``item.page_content`` when it is a string, else ``str(item)``."""
        page_content = getattr(item, "page_content", None)
        return page_content if isinstance(page_content, str) else str(item)

    @staticmethod
    def _context_to_text(context, item_to_text=str) -> str:
        """Flatten a context value (None, string, list, or other) to one string."""
        if context is None:
            return ""
        if isinstance(context, str):
            return context
        if isinstance(context, list):
            return "\n".join(item_to_text(item) for item in context)
        # Any other type would break the "\n".join below, so stringify it.
        return str(context)

    def evaluate_run(self, run, example=None) -> EvaluationResult:
        """Grade one run's answer for agreement with its contexts.

        Args:
            run: The traced run; ``answer``, ``retrieved_context`` and
                ``patient_context`` are read from ``run.outputs``.
            example: Unused; accepted to satisfy the evaluator interface.

        Returns:
            An ``EvaluationResult`` keyed ``"labeled_criteria:groundedness"``
            carrying the fields returned by the underlying evaluator.
        """
        # A failed run can have `outputs` set to None.
        run_outputs = run.outputs or {}

        # Convert contexts to strings. List items of the patient context use
        # `page_content` when available (falling back to str() rather than
        # raising AttributeError); retrieved-context items are stringified.
        retrieved_context = self._context_to_text(run_outputs.get("retrieved_context", ""))
        patient_context = self._context_to_text(
            run_outputs.get("patient_context", ""),
            item_to_text=self._document_text,
        )

        combined_context = "\n".join([retrieved_context, patient_context]).strip()

        res = self.evaluator.evaluate_strings(
            prediction=run_outputs.get("answer", ""),
            input="",
            reference=combined_context,
        )
        return EvaluationResult(key="labeled_criteria:groundedness", **res)


# Custom QA evaluator that maps the output keys correctly
class QAEvaluator(RunEvaluator):
    """Grade the chain's answer against the dataset's reference label.

    Wraps LangChain's ``"qa"`` evaluator and maps this project's keys onto
    it: the run's ``answer`` is the prediction, the run's ``prompt`` is the
    input, and the example's ``label`` is the reference.

    Args:
        eval_llm: Judge model used by the evaluator. When omitted, a
            ``ChatOpenAI(model="o3-mini", temperature=1)`` instance is created.
    """

    def __init__(self, eval_llm: Optional[ChatOpenAI] = None):
        if eval_llm is None:
            eval_llm = ChatOpenAI(model="o3-mini", temperature=1)
        self.eval_llm = eval_llm

        self.evaluator = load_evaluator(
            "qa",
            llm=self.eval_llm,
        )

    def evaluate_run(self, run, example=None) -> EvaluationResult:
        """Grade one run's answer against the example's reference label.

        Args:
            run: The traced run; ``run.outputs["answer"]`` and
                ``run.inputs["prompt"]`` are read.
            example: The dataset example; ``example.outputs["label"]`` is the
                reference. May be None, in which case the reference is "".

        Returns:
            An ``EvaluationResult`` keyed ``"qa"`` carrying the fields
            returned by the underlying evaluator.
        """
        # A failed run can have `outputs` set to None, and the example (or
        # its outputs) may be absent; fall back to empty mappings so the
        # `.get` defaults below apply instead of raising AttributeError.
        run_outputs = run.outputs or {}
        run_inputs = run.inputs or {}
        example_outputs = (example.outputs if example is not None else None) or {}

        res = self.evaluator.evaluate_strings(
            prediction=run_outputs.get("answer", ""),
            input=run_inputs.get("prompt", ""),
            reference=example_outputs.get("label", ""),
        )
        return EvaluationResult(key="qa", **res)


# Judge model shared by every custom evaluator configured below.
custom_eval_llm = ChatOpenAI(model="o3-mini", temperature=1)

# One instance of each custom evaluator (QA, relevance, groundedness, in that
# order), all backed by the same judge LLM.
eval_config = RunEvalConfig(
    input_key="prompt",
    reference_key="label",
    custom_evaluators=[
        evaluator_cls(eval_llm=custom_eval_llm)
        for evaluator_cls in (QAEvaluator, RelevanceEvaluator, GroundednessEvaluator)
    ],
)


# Wrap your summarization chain so that it returns both the answer and the contexts.
def summarization_chain(inputs, session_id="min10"):
    """Run the nurse-chat summarization pipeline for one dataset example.

    Args:
        inputs: Mapping of example inputs; must provide ``"patient_file"``
            and ``"prompt"``. Other keys are ignored.
        session_id: Session identifier forwarded to the pipeline. Defaults to
            ``"min10"``, the value this notebook has always used.

    Returns:
        Whatever ``Nurse2NurseChatbotSummarize`` returns. The evaluators in
        this notebook expect a dict with the keys ``"answer"``,
        ``"retrieved_context"`` and ``"patient_context"``.

    Raises:
        ValueError: If ``"patient_file"`` or ``"prompt"`` is missing or None.
    """
    # Fail fast with a clear message instead of silently passing None into
    # the pipeline.
    missing = [key for key in ("patient_file", "prompt") if inputs.get(key) is None]
    if missing:
        raise ValueError(
            "summarization_chain is missing required input(s): " + ", ".join(missing)
        )

    # Directly call the pipeline function with the example's file and prompt.
    return Nurse2NurseChatbotSummarize(inputs["patient_file"], session_id, inputs["prompt"])


# Execute the chain on the dataset and score each run with the evaluators
# configured in `eval_config`.
results = client.run_on_dataset(
    dataset_name=dataset_name,
    evaluation=eval_config,
    llm_or_chain_factory=summarization_chain,
)

print("Evaluation Results:", results)

View the evaluation results for project 'slight-profit-40' at:
https://smith.langchain.com/o/5d77eab9-aadf-5109-87bb-380ed3a68b7e/datasets/e9dfbf8d-fe92-41d1-a31a-4e8df2e29763/compare?selectedSessions=10dd8c4b-7ffa-4747-b984-3575ee7a680a

View all tests for Dataset NICU Nurse Chat Evaluation - Minimum Readings at:
https://smith.langchain.com/o/5d77eab9-aadf-5109-87bb-380ed3a68b7e/datasets/e9dfbf8d-fe92-41d1-a31a-4e8df2e29763
[------------------------------------------------->] 1/1Evaluation Results: {'project_name': 'slight-profit-40', 'results': {'2e84a6ee-c68a-410d-8ae5-4b49f8fb4ec7': {'input': {'prompt': 'What was the minimum reading for heart rate, respiratory rate and body temperature and when did they each occur? ', 'patient_file': 'Kate_Data.json', 'patient_context': 'Patient Data: [List of patient readings extracted from the JSON file].', 'retrieved_context': 'Vital sign ranges are provided in the document'}, 'feedback': [EvaluationResult(key='qa', score=0, value='INCORRECT', c