# Pairwise Experiments

### Setup

In [None]:
# You can set them inline
import os

# Fill in your own keys below; an empty string is only a placeholder.
os.environ["OPENAI_API_KEY"] = ""
# The summarizers and evaluators in this notebook call Anthropic() without
# passing an api_key, so the Anthropic key has to be supplied via the environment.
os.environ["ANTHROPIC_API_KEY"] = ""
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "langsmith-academy"

In [1]:
# Or you can use a .env file
import os
from dotenv import load_dotenv

# Values from the .env file take precedence over variables already set (override=True).
load_dotenv(override=True, dotenv_path="../../../.env")

# Extra process-level settings applied after the .env file is loaded.
os.environ.update(
    {
        "USER_AGENT": "496",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [2]:
import warnings
import numpy as np

# Ignore RuntimeWarnings that originate from sklearn modules.
warnings.filterwarnings(action="ignore", category=RuntimeWarning, module="sklearn")

# Make NumPy ignore divide, invalid-value and overflow floating-point errors.
# seterr() returns the previous settings, which is what the cell displays.
np.seterr(over="ignore", invalid="ignore", divide="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

### Task

Let's set up a new task! Here, we have a salesperson named Bob. Bob has a lot of deals, so he wants to summarize what happened in these deals based on some meeting transcripts.

Bob is iterating on a few different prompts that will give him nice, concise summaries of his deals.

Bob has curated a dataset of his deal transcripts, so let's go ahead and load that in. You can take a look at the dataset as well if you're curious! Note that this is not a golden dataset: there are no reference outputs here.

In [3]:
# Cell 3: Clone the public dataset
from langsmith import Client

# Share link of the public dataset that gets cloned below.
PUBLIC_DATASET_URL = "https://smith.langchain.com/public/9078d2f1-7bef-4ba7-b795-210a17682ef9/d"

client = Client()
# `dataset` is passed as `data=` to the experiment runs later in the notebook.
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)

### Experiments

Now, let's run some experiments on this dataset using two different prompts. Let's add an evaluator that tries to score how good our summaries are!

In [4]:
# Cell 4: Create evaluator using Anthropic
from pydantic import BaseModel, Field
from anthropic import Anthropic
import json

# Shared Anthropic client used by the evaluators and summarizers in this notebook.
# NOTE(review): no api_key is passed here, so credentials presumably come from
# the environment (ANTHROPIC_API_KEY) — confirm it is set before running.
anthropic_client = Anthropic()

# System prompt for the single-summary judge.
SUMMARIZATION_SYSTEM_PROMPT = """You are a judge, aiming to score how well a summary summarizes the content of a transcript"""

# User-message template for the judge. It is filled in with str.format using the
# `transcript` and `summary` fields; the doubled braces on the last line are
# format escapes that render as literal braces in the JSON example.
SUMMARIZATION_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}
[The Start of Summarization] {summary} [The End of Summarization]

Provide a score from 1-5 ranking how good the summarization is:
- 1: Bad summary
- 5: Great summary

Return JSON format: {{"score": <number>}}"""

class SummarizationScore(BaseModel):
    # Schema describing the judge's verdict: a single integer rating of the summary.
    score: int = Field(
        description="A score from 1-5 ranking how good the summarization is for the provided transcript, with 1 being a bad summary, and 5 being a great summary",
    )

def _parse_summary_score(response_text, default=3):
    """Extract the judge's score from its reply, clamped to the 1-5 scale.

    The reply is scanned for the first JSON object that has a "score" key, so
    explanatory text around the JSON (even text containing braces) is tolerated.

    Args:
        response_text: Raw text returned by the judge model.
        default: Value returned when no usable score can be found.

    Returns:
        The score as an int when it is a whole number, otherwise a float;
        `default` when the reply has no parseable numeric score.
    """
    if not isinstance(response_text, str):
        return default

    decoder = json.JSONDecoder()
    brace = response_text.find("{")
    while brace != -1:
        try:
            # raw_decode parses one JSON value and ignores whatever follows it.
            candidate, _ = decoder.raw_decode(response_text[brace:])
        except json.JSONDecodeError:
            candidate = None

        if isinstance(candidate, dict) and "score" in candidate:
            try:
                score = float(candidate["score"])
            except (TypeError, ValueError, OverflowError):
                return default
            if score != score:  # NaN never compares equal to itself
                return default
            # Keep the feedback on the 1-5 scale the prompt asks for.
            score = min(5.0, max(1.0, score))
            return int(score) if score.is_integer() else score

        brace = response_text.find("{", brace + 1)

    return default


def summary_score_evaluator(inputs: dict, outputs: dict) -> dict:
    """Ask the judge model to rate a summary against its transcript.

    Args:
        inputs: Example inputs; the transcript is read from ``inputs["transcript"]``.
        outputs: Run outputs; the summary is read from ``outputs["output"]``
            ("N/A" is used when that key is missing).

    Returns:
        ``{"key": "summary_score", "score": <score>}``. The score falls back to 3
        when the judge's reply contains no usable score.
    """
    user_message = SUMMARIZATION_HUMAN_PROMPT.format(
        transcript=inputs["transcript"],
        summary=outputs.get("output", "N/A"),
    )

    response = anthropic_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=500,
        system=SUMMARIZATION_SYSTEM_PROMPT,
        messages=[
            {"role": "user", "content": user_message}
        ]
    )

    # Only the failures that mean "the reply has no text" are treated as
    # best-effort; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        response_text = response.content[0].text
    except (IndexError, AttributeError):
        response_text = ""

    summary_score = _parse_summary_score(response_text, default=3)

    return {"key": "summary_score", "score": summary_score}

First, we'll run our experiment with a good version of our prompt!

In [5]:
# Cell 5: Good Summarizer Experiment
def good_summarizer(inputs: dict):
    """Summarize a meeting transcript with the detailed three-sentence prompt.

    Args:
        inputs: Example inputs; the transcript is read from ``inputs["transcript"]``.

    Returns:
        The text of the first content block in the model's reply.
    """
    prompt = f"Concisely summarize this meeting in 3 sentences. Make sure to include all of the important events. Meeting: {inputs['transcript']}"
    user_turn = {"role": "user", "content": prompt}

    completion = anthropic_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=1024,
        messages=[user_turn],
    )

    first_block = completion.content[0]
    return first_block.text

# Run the "Good Summarizer" experiment over the cloned dataset and score every
# output with the summary judge. The call is left as the last expression so the
# notebook displays the returned results.
client.evaluate(
    good_summarizer,
    experiment_prefix="Good Summarizer",
    evaluators=[summary_score_evaluator],
    data=dataset,
)

View the evaluation results for experiment: 'Good Summarizer-7a0b855d' at:
https://smith.langchain.com/o/072e35aa-3a5b-404d-bd5a-459a19c5e651/datasets/a8705f7f-27ba-4efa-80f6-1a495efb564d/compare?selectedSessions=9cc63e1a-194b-4869-883c-6d2968ea1952




0it [00:00, ?it/s]

Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob successfully sold Mr. Carter a Ford F-150 ...,,5,4.02556,07a2ffe8-0e46-430b-b580-27f92372dcea,13cf8802-971b-4dab-a8f0-898cc6bd6954
1,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Bob helped Mr. Johnson find a family SUV at Fo...,,5,3.92078,0ee6e17f-93d3-4389-93d6-fadeafbf3e63,cbaff9fe-03f7-4571-bfef-a8ac85a31145
2,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob helped Mr. Patel find a midsize sedan for ...,,5,3.989744,53d1cd6d-7655-4dcf-80a7-f1c6ab4da323,956baf72-6d4f-4367-aaa8-56192bda541f
3,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,Bob met with Ms. Nguyen who was interested in ...,,5,3.762214,b7c66290-4922-4f8a-895d-21dee75b35d3,a9b6d796-4a26-4e3d-acc7-fdbe217b3d0d
4,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...",Ms. Thompson visited Ford Motors to browse SUV...,,5,4.225877,c0ad1714-d1b3-40f6-8006-5c33ed217821,8339edb9-3f19-4e14-a5c1-7288a994059d


Now, we'll run an experiment with a worse version of our prompt, to highlight the difference.

In [6]:
# Cell 6: Bad Summarizer Experiment
def bad_summarizer(inputs: dict):
    """Summarize a meeting transcript with the terse one-sentence prompt.

    Args:
        inputs: Example inputs; the transcript is read from ``inputs["transcript"]``.

    Returns:
        The text of the first content block in the model's reply.
    """
    prompt = f"Summarize this in one sentence. {inputs['transcript']}"
    user_turn = {"role": "user", "content": prompt}

    completion = anthropic_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=1024,
        messages=[user_turn],
    )

    first_block = completion.content[0]
    return first_block.text

# Run the "Bad Summarizer" experiment over the same dataset with the same judge,
# so the two experiments can be compared. The call is left as the last
# expression so the notebook displays the returned results.
client.evaluate(
    bad_summarizer,
    experiment_prefix="Bad Summarizer",
    evaluators=[summary_score_evaluator],
    data=dataset,
)

View the evaluation results for experiment: 'Bad Summarizer-3ba1406c' at:
https://smith.langchain.com/o/072e35aa-3a5b-404d-bd5a-459a19c5e651/datasets/a8705f7f-27ba-4efa-80f6-1a495efb564d/compare?selectedSessions=773e7fe1-3a19-4a4f-b9ea-04767af2a262




0it [00:00, ?it/s]

Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob successfully sold Mr. Carter a Ford F-150 ...,,5,3.664967,07a2ffe8-0e46-430b-b580-27f92372dcea,722de86b-b415-4fc2-b853-bf1a5630adc3
1,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Bob successfully sold Mr. Johnson a Ford Explo...,,5,3.106818,0ee6e17f-93d3-4389-93d6-fadeafbf3e63,e636eb3b-9073-4a26-acb6-3b478d77265f
2,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob successfully sold Mr. Patel a Ford Fusion ...,,5,3.058527,53d1cd6d-7655-4dcf-80a7-f1c6ab4da323,c3d27edf-cff2-4ff5-8144-45c29e20921b
3,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,"Bob, a car salesman, tries to sell Ms. Nguyen ...",,5,3.810377,b7c66290-4922-4f8a-895d-21dee75b35d3,f9512cb9-6280-4bc9-afb2-2cd679aafb07
4,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...",Bob gave Ms. Thompson information about the Fo...,,4,3.282824,c0ad1714-d1b3-40f6-8006-5c33ed217821,63bed034-f595-46a7-a534-afc809870362


### Pairwise Experiment

Let's define a function that will compare our two experiments. These are the fields that pairwise evaluator functions get access to:
- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
- `outputs: list[dict]`: A list of the dict outputs produced by each experiment on the given inputs.
- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
- `runs: list[Run]`: A list of the full Run objects generated by the experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
- `example: Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).

First, let's give our LLM-as-Judge some instructions. In our case, we're just going to directly use LLM-as-judge to grade which of the summarizers is the most helpful.

It might be hard to grade our summarizers without a ground truth reference, but here, comparing different prompts head to head will give us a sense of which is better!

In [7]:
# Cell 7: Define Judge Prompts

# System prompt for the pairwise judge: it asks for an impartial comparison of
# two summarizations and a short explanation, and warns against position bias.
JUDGE_SYSTEM_PROMPT = """
Please act as an impartial judge and evaluate the quality of the summarizations provided by two AI summarizers to the meeting transcript below.
Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their summarizations.
Begin your evaluation by comparing the two summarizations and provide a short explanation.
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision.
Do not favor certain names of the assistants.
Be as objective as possible."""

# User-message template for the pairwise judge. It is filled in with str.format
# using `transcript`, `answer_a` and `answer_b`; the doubled braces on the last
# line are format escapes that render as literal braces in the JSON example.
# The verdict encoding is 1 = Assistant A, 2 = Assistant B, 0 = tie.
JUDGE_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}

[The Start of Assistant A's Summarization] {answer_a} [The End of Assistant A's Summarization]

[The Start of Assistant B's Summarization] {answer_b} [The End of Assistant B's Summarization]

Which assistant provided a better summarization?
- Output 1 if Assistant A is better
- Output 2 if Assistant B is better
- Output 0 if it's a tie

Return JSON format: {{"preference": <number>}}"""

Our function will take in an `inputs` dictionary, and a list of `outputs` dictionaries for the different experiments that we want to compare.

In [8]:
# Cell 8: Pairwise Evaluator Function
from pydantic import BaseModel, Field

class Preference(BaseModel):
    # Schema describing the pairwise verdict: 1 (Assistant A), 2 (Assistant B) or 0 (tie).
    preference: int = Field(
        description="1 if Assistant A answer is better, 2 if Assistant B answer is better, 0 if tie",
    )

def _parse_preference(response_text) -> int:
    """Extract the judge's verdict from its reply.

    The reply is scanned for the first JSON object that has a "preference" key,
    so an explanation before or after the JSON (even one containing braces)
    does not break parsing.

    Args:
        response_text: Raw text returned by the judge model.

    Returns:
        1 if Assistant A was preferred, 2 if Assistant B was preferred, and 0
        (tie) for an explicit tie or when no valid verdict can be found.
    """
    if not isinstance(response_text, str):
        return 0

    decoder = json.JSONDecoder()
    brace = response_text.find("{")
    while brace != -1:
        try:
            # raw_decode parses one JSON value and ignores whatever follows it.
            candidate, _ = decoder.raw_decode(response_text[brace:])
        except json.JSONDecodeError:
            candidate = None

        if isinstance(candidate, dict) and "preference" in candidate:
            try:
                # Accept 1, 1.0 and "1" alike.
                verdict = int(candidate["preference"])
            except (TypeError, ValueError, OverflowError):
                return 0
            return verdict if verdict in (1, 2) else 0

        brace = response_text.find("{", brace + 1)

    return 0


def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    """Ask the judge model which of two experiment outputs is the better summary.

    Args:
        inputs: Example inputs; the transcript is read from ``inputs["transcript"]``.
        outputs: The outputs of the two experiments being compared. ``outputs[0]``
            is shown to the judge as Assistant A and ``outputs[1]`` as Assistant B;
            each summary is read from the ``"output"`` key ("N/A" when missing).

    Returns:
        A two-element list of scores in the same order as ``outputs``:
        ``[1, 0]`` when A wins, ``[0, 1]`` when B wins, and ``[0, 0]`` for a tie
        or when the judge's reply contains no usable verdict.
    """
    user_message = JUDGE_HUMAN_PROMPT.format(
        transcript=inputs["transcript"],
        answer_a=outputs[0].get("output", "N/A"),
        answer_b=outputs[1].get("output", "N/A")
    )

    response = anthropic_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=500,
        system=JUDGE_SYSTEM_PROMPT,
        messages=[
            {"role": "user", "content": user_message}
        ]
    )

    # Only the failures that mean "the reply has no text" are treated as
    # best-effort; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        response_text = response.content[0].text
    except (IndexError, AttributeError):
        response_text = ""

    preference_score = _parse_preference(response_text)

    if preference_score == 1:
        scores = [1, 0]
    elif preference_score == 2:
        scores = [0, 1]
    else:
        scores = [0, 0]

    return scores

Now let's run our pairwise experiment with `evaluate()`

In [9]:
# Cell 9: Run Pairwise Experiment
from langsmith import evaluate

# Experiment names get a random suffix on every run (e.g. "Good Summarizer-7a0b855d"),
# so hardcoding them only works in the workspace that produced them. Resolve the
# most recent experiment for each prefix on this dataset at run time instead.
def _latest_experiment_name(prefix: str) -> str:
    """Return the name of the most recently started experiment on `dataset` whose name contains `prefix`.

    Raises:
        ValueError: If no matching experiment exists yet (run cells 5 and 6 first).
    """
    experiments = list(
        client.list_projects(reference_dataset_id=dataset.id, name_contains=prefix)
    )
    if not experiments:
        raise ValueError(
            f"No experiment containing {prefix!r} was found for this dataset; "
            "run the Good/Bad Summarizer cells first."
        )
    return max(experiments, key=lambda experiment: experiment.start_time).name


# Order matters: the first experiment is presented to the judge as Assistant A.
evaluate(
    (
        _latest_experiment_name("Good Summarizer"),
        _latest_experiment_name("Bad Summarizer"),
    ),
    evaluators=[ranked_preference]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/072e35aa-3a5b-404d-bd5a-459a19c5e651/datasets/a8705f7f-27ba-4efa-80f6-1a495efb564d/compare?selectedSessions=9cc63e1a-194b-4869-883c-6d2968ea1952%2C773e7fe1-3a19-4a4f-b9ea-04767af2a262&comparativeExperiment=3f29345d-5813-4fa2-bf50-798d407db006




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x1127aa320>