# Summary Evaluators

### Setup

In [37]:
# You can set them inline. Leave a value as "" to keep whatever is already
# present in the process environment instead of overwriting it with a blank.
import os

_inline_settings = {
    "OPENAI_API_KEY": "",
    "ANTHROPIC_API_KEY": "",  # the toxicity classifier in this notebook calls Anthropic
    "LANGSMITH_API_KEY": "",
    "LANGSMITH_TRACING": "true",
    "LANGSMITH_PROJECT": "langsmith-academy",
}

for _name, _value in _inline_settings.items():
    # An unfilled placeholder must never clobber a real credential.
    if _value:
        os.environ[_name] = _value

In [4]:
# Or you can use a .env file
import os
from dotenv import load_dotenv

# Values from the .env file take precedence over anything already set
# (override=True).
load_dotenv(override=True, dotenv_path="../../../.env")

# Extra process-level settings applied after the .env file has been loaded.
os.environ.update(
    {
        "USER_AGENT": "496",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [5]:
import warnings
import numpy as np

# Hide RuntimeWarnings whose originating module matches 'sklearn'.
warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='sklearn')

# Tell NumPy to stay silent on overflow, invalid-value and divide-by-zero
# floating-point errors. Kept as the last expression so the cell still
# displays the value returned by the call.
np.seterr(over='ignore', invalid='ignore', divide='ignore')

{'divide': 'ignore', 'over': 'ignore', 'under': 'ignore', 'invalid': 'ignore'}

### Task

Our task here is to analyze the toxicity of random statements, classifying them as `Toxic` or `Not toxic`.

Take a look at our dataset!

In [6]:
# Cell 3: Clone the public dataset
from langsmith import Client

# Public share link of the toxicity dataset used throughout this notebook.
PUBLIC_DATASET_URL = "https://smith.langchain.com/public/89ef0d44-a252-4011-8bb8-6a114afc1522/d"

client = Client()
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)

This is a simple toxicity classifier!

In [7]:
# Cell 4: Toxicity Classifier using Anthropic
from anthropic import Anthropic
from pydantic import BaseModel, Field
import json

# Module-level Anthropic client shared by the classifier defined below.
# No arguments are passed; presumably the SDK picks up credentials
# (e.g. ANTHROPIC_API_KEY) from the environment - TODO confirm.
anthropic_client = Anthropic()

class Toxicity(BaseModel):
    # Schema describing a single classification result: one string field
    # holding the label. NOTE(review): not referenced by the classifier in the
    # visible part of this notebook - confirm whether it is still needed.
    toxicity: str = Field(
        description="'Toxic' if the statement is toxic, 'Not toxic' if the statement is not toxic."
    )

# Canonical labels keyed by their lower-cased form, and the label returned
# whenever the model reply cannot be interpreted.
_CANONICAL_LABELS = {"toxic": "Toxic", "not toxic": "Not toxic"}
_FALLBACK_LABEL = "Not toxic"


def _parse_toxicity_label(response_text) -> str:
    """Extract a canonical toxicity label from the model's raw reply text.

    The reply is expected to contain a JSON object with a ``"toxicity"`` key,
    possibly surrounded by extra prose. The label is matched
    case-insensitively and ignoring surrounding whitespace, so the result is
    always exactly ``"Toxic"`` or ``"Not toxic"``.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        ``"Toxic"`` or ``"Not toxic"``. Anything that cannot be parsed, is not
        a JSON object, or carries an unrecognized label yields ``"Not toxic"``.
    """
    if not isinstance(response_text, str):
        return _FALLBACK_LABEL

    # Take the outermost {...} span so prose around the JSON is tolerated.
    start = response_text.find("{")
    end = response_text.rfind("}") + 1
    if start == -1 or end <= start:
        return _FALLBACK_LABEL

    try:
        parsed = json.loads(response_text[start:end])
        raw_label = parsed.get("toxicity", _FALLBACK_LABEL)
    except (ValueError, AttributeError):
        # ValueError covers json.JSONDecodeError; AttributeError covers valid
        # JSON that is not an object (no .get).
        return _FALLBACK_LABEL

    if not isinstance(raw_label, str):
        return _FALLBACK_LABEL
    return _CANONICAL_LABELS.get(raw_label.strip().lower(), _FALLBACK_LABEL)


def good_classifier(inputs: dict) -> dict:
    """Classify a statement as 'Toxic' or 'Not toxic' using the Anthropic client.

    Args:
        inputs: Example inputs; must contain the key ``"statement"``.

    Returns:
        ``{"class": label}`` where ``label`` is exactly ``"Toxic"`` or
        ``"Not toxic"``. An unreadable or unrecognized model reply falls back
        to ``"Not toxic"``.

    Raises:
        KeyError: If ``inputs`` has no ``"statement"`` key.
        Errors raised by the API call itself are not caught here.
    """
    prompt = f"""Classify this statement as either 'Toxic' or 'Not toxic'.

Statement: {inputs['statement']}

Return your response as JSON: {{"toxicity": "Toxic"}} or {{"toxicity": "Not toxic"}}"""

    response = anthropic_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=200,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    # Only the reply-shape errors are handled; KeyboardInterrupt and friends
    # propagate instead of being swallowed by a bare except.
    try:
        response_text = response.content[0].text
    except (AttributeError, IndexError, TypeError):
        return {"class": _FALLBACK_LABEL}

    return {"class": _parse_toxicity_label(response_text)}

### Summary Evaluator

These are the fields that summary evaluator functions get access to:
- `inputs: list[dict]`: A list of inputs from the examples in our dataset
- `outputs: list[dict]`: A list of the dict outputs produced from running our target over each input
- `reference_outputs: list[dict]`: A list of reference_outputs from the examples in our dataset
- `runs: list[Run]`: A list of the Run objects from running our target over the dataset.
- `examples: list[Example]`: A list of the full dataset Examples, including the example inputs, outputs (if available), and metadata (if available).

Now we'll define our summary evaluator! Here, we'll compute the F1 score, which is the harmonic mean of precision and recall.

This sort of metric can only be computed over all of the examples in our experiment, so our evaluator takes in a list of outputs, and a list of reference_outputs.

In [8]:
# Cell 5: F1 Score Summary Evaluator (NO CHANGES NEEDED)
def f1_score_summary_evaluator(outputs: list[dict], reference_outputs: list[dict]) -> dict:
    """Compute the F1 score for the 'Toxic' class over a whole experiment.

    Args:
        outputs: One dict per example, each with the predicted label under "class".
        reference_outputs: One dict per example, each with the expected label
            under "class", in the same order as ``outputs``.

    Returns:
        ``{"key": "f1_score", "score": <float>}``. The score is 0.0 when there
        are no true positives.
    """
    # Pair every prediction with its reference label once, then count the
    # three outcome combinations that feed precision and recall.
    label_pairs = [
        (predicted["class"], expected["class"])
        for predicted, expected in zip(outputs, reference_outputs)
    ]
    tp = label_pairs.count(("Toxic", "Toxic"))
    fp = label_pairs.count(("Toxic", "Not toxic"))
    fn = label_pairs.count(("Not toxic", "Toxic"))

    score = 0.0
    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        score = 2 * (precision * recall) / (precision + recall)

    return {"key": "f1_score", "score": score}

Note that we pass in `f1_score_summary_evaluator` as a summary evaluator!

In [9]:
# Cell 6: Run Evaluation with Summary Evaluator
# Launch the LangSmith experiment for good_classifier on the cloned dataset.
# The F1 evaluator goes in through `summary_evaluators` because it takes the
# full lists of outputs and reference outputs rather than a single example.
experiment_summary_evaluators = [f1_score_summary_evaluator]

results = client.evaluate(
    good_classifier,
    data=dataset,
    experiment_prefix="Good classifier",
    summary_evaluators=experiment_summary_evaluators,
)

View the evaluation results for experiment: 'Good classifier-d5bff65a' at:
https://smith.langchain.com/o/072e35aa-3a5b-404d-bd5a-459a19c5e651/datasets/a13f9e86-1c7e-449a-9210-b9b353b40762/compare?selectedSessions=02fd6d64-a283-400f-be20-bb0b3f366949




0it [00:00, ?it/s]