In [12]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the LangSmith / OpenAI credentials
# used by the clients in later cells come from — confirm against the .env file.
load_dotenv()

True

In [13]:
from langsmith import Client

# LangSmith client; reused below to run the evaluations.
client = Client()
# Clone the public dataset identified by this share URL. The returned object
# is what the later client.evaluate() calls receive as `data=`.
dataset = client.clone_public_dataset(
  "https://smith.langchain.com/public/9078d2f1-7bef-4ba7-b795-210a17682ef9/d"
)

In [14]:
from pydantic import BaseModel, Field
from openai import OpenAI

# Shared OpenAI client, used both by the judge evaluators and by the
# summarizer functions under test.
openai_client = OpenAI()

# System message for the single-summary judge (sent by summary_score_evaluator).
SUMMARIZATION_SYSTEM_PROMPT = """You are a witty executive assistant who's seen it all in corporate meetings. Judge how well this summary captures the essence, drama, and key decisions of the meeting transcript. Consider clarity, completeness, and whether someone reading it would actually understand what happened!"""

# User-message template for the judge. The {transcript} and {summary}
# placeholders are filled in with str.format() by summary_score_evaluator.
SUMMARIZATION_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}
[The Start of Summarization] {summary} [The End of Summarization]"""

class SummarizationScore(BaseModel):
    # Structured-output schema for the judge's reply: a single integer rating.
    # Kept as a comment rather than a class docstring so the schema generated
    # from this model stays exactly as before.
    score: int = Field(
        description=(
            "A score from 1-5 ranking how good the summarization is for the "
            "provided transcript, with 1 being a bad summary, and 5 being a "
            "great summary"
        ),
    )
    
def summary_score_evaluator(inputs: dict, outputs: dict) -> dict:
    """Grade one summary against its meeting transcript with an LLM judge.

    Args:
        inputs: Dataset example inputs; must contain a ``"transcript"`` key.
        outputs: Output of the summarizer under test. The summary is read from
            the ``"output"`` key; ``"N/A"`` is sent to the judge if it is absent.

    Returns:
        A feedback dict ``{"key": "summary_score", "score": <int>}`` where the
        score is the integer the judge placed in ``SummarizationScore.score``.

    Raises:
        KeyError: If ``inputs`` has no ``"transcript"`` entry.
        ValueError: If the judge response carries no parsed
            ``SummarizationScore`` (for example, the model refused to answer).
    """
    judge_messages = [
        {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": SUMMARIZATION_HUMAN_PROMPT.format(
                transcript=inputs["transcript"],
                summary=outputs.get("output", "N/A"),
            ),
        },
    ]
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=judge_messages,
        response_format=SummarizationScore,
    )

    message = completion.choices[0].message
    # `parsed` can be None when the model did not return schema-conforming
    # output; fail with a descriptive error instead of an AttributeError on
    # `None.score`.
    if message.parsed is None:
        raise ValueError(
            "Judge model returned no parsed SummarizationScore"
            f" (refusal: {getattr(message, 'refusal', None)!r})"
        )
    return {"key": "summary_score", "score": message.parsed.score}

In [15]:
# Strategic Summarizer - The Professional
def good_summarizer(inputs: dict):
    """Ask gpt-4o for a three-sentence, executive-level meeting summary.

    A single user message is sent containing the instructions followed by
    ``inputs["transcript"]``; the text content of the first returned choice
    is handed back unchanged.
    """
    prompt = f"You're the CEO's trusted advisor. Create a sharp, executive-level summary of this meeting in 3 clear sentences. Focus on decisions made, action items, and key outcomes that matter for business success. Meeting: {inputs['transcript']}"
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Run good_summarizer over the examples in `dataset` and score every output
# with summary_score_evaluator. The experiment name starts with the prefix
# below (a suffix is appended, as the printed output shows). Left as the
# cell's final bare expression so the notebook displays the result.
client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Strategic Summarizer Pro"
)

View the evaluation results for experiment: 'Strategic Summarizer Pro-79223355' at:
https://smith.langchain.com/o/bcaa99f9-3a04-43e7-9c9d-c034ad1da1ea/datasets/0c9dc9f8-5736-4515-9918-4b1351fd5536/compare?selectedSessions=db04cb64-799c-47c4-bcf4-0d1549f94ea4




5it [00:17,  3.40s/it]
5it [00:17,  3.40s/it]


Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,"In today's meeting, a successful transaction w...",,4,2.784464,1c5a3e62-b34c-4e8d-baf4-aec97438d7ef,79e80ee2-dd9a-428c-9543-1b9b5c8aa9ef
1,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,"In today's meeting, Bob and Ms. Nguyen explore...",,5,2.343763,7a0eb143-32ed-409a-bb11-97449f879c02,5d2ee7ac-7c2c-4e40-a896-56cd3a18fd9e
2,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...","During the meeting with Ms. Thompson, no immed...",,4,1.723244,9adcb37b-d5de-4b50-8731-0d4fd3e9b48c,6856e0f0-bb43-4622-877e-1cd69d57ba0c
3,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,"In today's meeting, a successful deal was clos...",,4,2.106777,d5264c1a-f3f0-41a0-9ac8-2f9a5ee9f2ad,67b9d57b-e211-4ee5-9d49-629486f971cb
4,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...","In today's meeting, Bob successfully closed a ...",,5,2.302376,eb093aeb-4b8e-472f-b000-91e02986244e,2e9b1ebf-e34e-4c9a-ad27-472f51ee50e7


In [16]:
# Lazy Intern Summarizer - The Minimalist
def bad_summarizer(inputs: dict):
    """Ask gpt-4o for a deliberately low-effort, one-line meeting summary.

    Serves as the weak baseline: one user message with the "skim it" prompt
    plus ``inputs["transcript"]`` is sent, and the first choice's text content
    is returned as-is.
    """
    skim_prompt = f"Quickly skim this meeting and write a super brief one-liner summary. Don't worry about details: {inputs['transcript']}"
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": skim_prompt}],
    }
    reply = openai_client.chat.completions.create(**request)
    return reply.choices[0].message.content

# Same evaluation as for the strategic summarizer, but with the low-effort
# baseline: run bad_summarizer over `dataset` and score each output with
# summary_score_evaluator. Left as the cell's final bare expression so the
# notebook displays the result.
client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Lazy Intern Summarizer"
)

View the evaluation results for experiment: 'Lazy Intern Summarizer-82c15aeb' at:
https://smith.langchain.com/o/bcaa99f9-3a04-43e7-9c9d-c034ad1da1ea/datasets/0c9dc9f8-5736-4515-9918-4b1351fd5536/compare?selectedSessions=d87eb58c-9b05-4bc2-a830-482d1ea88f06




5it [00:10,  2.17s/it]
5it [00:10,  2.17s/it]


Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob successfully closed a deal with Mr. Carter...,,4,1.142802,1c5a3e62-b34c-4e8d-baf4-aec97438d7ef,38ad78e3-6d74-414e-9cbd-bcf86473ce23
1,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,"Bob discussed vehicle options with Ms. Nguyen,...",,4,1.493121,7a0eb143-32ed-409a-bb11-97449f879c02,ee180b4a-f31c-4b5b-be10-5c0a74aa5449
2,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...",Bob introduced Ms. Thompson to car options at ...,,3,1.208054,9adcb37b-d5de-4b50-8731-0d4fd3e9b48c,5e851fd6-2ea5-4709-861c-f5d79c7d7144
3,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Bob successfully closed a deal with Mr. Johnso...,,3,0.922313,d5264c1a-f3f0-41a0-9ac8-2f9a5ee9f2ad,0912d03b-da0b-42ff-be8e-9b5824c5cdea
4,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob successfully closed a deal with Mr. Patel ...,,2,0.917976,eb093aeb-4b8e-472f-b000-91e02986244e,5fa368f1-0844-43e6-9a88-1ab7e2981ba1


In [17]:
# System message for the pairwise judge (sent by ranked_preference): lists the
# comparison criteria and tells the model not to let presentation order
# influence its choice.
JUDGE_SYSTEM_PROMPT = """
You are a seasoned business consultant who evaluates meeting summaries for Fortune 500 companies. Your job is to determine which summary would be more valuable to a busy executive.

Evaluate based on: 
📊 Completeness - Does it capture all key decisions and action items?
🎯 Clarity - Would someone who missed the meeting understand what happened?
⚡ Actionability - Are next steps and responsibilities clear?
🔍 Accuracy - Does it reflect the actual content without distortion?

Be ruthlessly objective. The order of presentation should not influence your judgment. Choose the summary that would better serve someone making important business decisions."""

# User-message template for the pairwise judge. {transcript}, {answer_a} and
# {answer_b} are filled in with str.format() by ranked_preference.
JUDGE_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}


[The Start of Assistant A's Summarization] {answer_a} [The End of Assistant A's Summarization][The Start of Assistant B's Summarization] {answer_b} [The End of Assistant B's Summarization]"""


In [18]:
from pydantic import BaseModel, Field

class Preference(BaseModel):
    # Structured-output schema for the pairwise judge's verdict.
    # Kept as a comment rather than a class docstring so the schema generated
    # from this model stays exactly as before.
    preference: int = Field(
        description=(
            "1 if Assistant A answer is better based upon the factors above.\n"
            "2 if Assistant B answer is better based upon the factors above.\n"
            "Output 0 if it is a tie."
        ),
    )
    
def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    """Pairwise LLM judge: pick the better of two summaries of one transcript.

    Args:
        inputs: Dataset example inputs; must contain a ``"transcript"`` key.
        outputs: The two candidate outputs, in order ``[A, B]``. Each summary
            is read from the ``"output"`` key; ``"N/A"`` is sent to the judge
            when that key is absent.

    Returns:
        A two-element score list aligned with ``outputs``: ``[1, 0]`` when the
        judge prefers A, ``[0, 1]`` when it prefers B, and ``[0, 0]`` for a tie
        or any other value.

    Raises:
        KeyError: If ``inputs`` has no ``"transcript"`` entry.
        IndexError: If ``outputs`` holds fewer than two entries.
        ValueError: If the judge response carries no parsed ``Preference``
            (for example, the model refused to answer).
    """
    judge_messages = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": JUDGE_HUMAN_PROMPT.format(
                transcript=inputs["transcript"],
                answer_a=outputs[0].get("output", "N/A"),
                answer_b=outputs[1].get("output", "N/A"),
            ),
        },
    ]
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=judge_messages,
        response_format=Preference,
    )

    message = completion.choices[0].message
    # `parsed` can be None when the model did not return schema-conforming
    # output; fail with a descriptive error instead of an AttributeError on
    # `None.preference`.
    if message.parsed is None:
        raise ValueError(
            "Judge model returned no parsed Preference"
            f" (refusal: {getattr(message, 'refusal', None)!r})"
        )

    # Built per call so every caller receives its own fresh score list.
    scores_by_preference = {1: [1, 0], 2: [0, 1]}
    return scores_by_preference.get(message.parsed.preference, [0, 0])

In [20]:
from langsmith import evaluate

# Pairwise comparison of two existing experiments, referenced by name and
# judged with ranked_preference.
# NOTE(review): both names are copied by hand from the printed output of the
# two client.evaluate() cells earlier in this notebook. Re-running those cells
# presumably creates experiments with different suffixes, so this cell would
# then compare stale runs (or fail) until the names are updated — consider
# capturing the results objects from those calls and using their names here.
evaluate(
    ("Strategic Summarizer Pro-79223355", "Lazy Intern Summarizer-82c15aeb"),
    evaluators=[ranked_preference]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/bcaa99f9-3a04-43e7-9c9d-c034ad1da1ea/datasets/0c9dc9f8-5736-4515-9918-4b1351fd5536/compare?selectedSessions=db04cb64-799c-47c4-bcf4-0d1549f94ea4%2Cd87eb58c-9b05-4bc2-a830-482d1ea88f06&comparativeExperiment=1a784ba6-5ccd-4407-a11c-df0c3dcd31f0




100%|██████████| 5/5 [00:04<00:00,  1.17it/s]



<langsmith.evaluation._runner.ComparativeExperimentResults at 0x107fb6d50>