### Pairwise Evaluation
일부 평가에서는 두 개 이상의 LLM 생성물을 서로 비교하고자 합니다.

Chatbot Arena 나 LLM 리더보드에서 어렵지 않게 접할 수 있는 비교 평가 방식입니다.

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain_teddynote import logging

# 프로젝트 이름을 입력합니다.
logging.langsmith("CH16-Evaluations")

LangSmith 추적을 시작합니다.
[프로젝트명]
CH16-Evaluations


In [4]:
from langchain import hub

from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import evaluate


def evaluate_pairwise(runs: list, example) -> dict:
    """
    A simple evaluator for pairwise answers to score based on  engagement
    """

    # 점수 저장
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i

    # 각 예제에 대한 실행 쌍
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    # 함수 호출이 있는 LLM, 최고 성능 모델 사용
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # 구조화된 프롬프트
    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        Better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.
        
        # Question:
        {question}
        
        #Answer A: 
        {answer_a}
        
        #Answer B: 
        {answer_b}
        
        Output should be either `A` or `B`. Pick the answer that is better.
        
        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    # 점수 획득
    score = answer_grader.invoke(
        {
            "question": question,
            "answer_a": answer_a,
            "answer_b": answer_b,
        }
    )
    # score = score["Preference"]

    # 점수에 따른 실행 할당 매핑
    if score == "A":  # Assistant A 선호
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B":  # Assistant B 선호
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0

    return {"key": "ranked_preference", "scores": scores}

In [6]:
from langsmith.evaluation import evaluate_comparative

# 실험 이름 또는 ID 배열 교체 - 랭스미스가서 확인해야한다.
evaluate_comparative(
    ["MODEL_COMPARE_EVAL-2e17983d", "MODEL_COMPARE_EVAL-b7c9d702"],
    # 평가자 배열
    evaluators=[evaluate_pairwise],
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/2d0ce887-3f7f-59af-8d5e-12c1371ef5d5/datasets/334d943e-3086-4d03-91ae-4b5145224ffc/compare?selectedSessions=c9c22a10-e18c-45d5-91cd-e9edd1154040%2C43196ad0-54f8-43a0-aa03-e01c4b8e865c&comparativeExperiment=035867dc-d261-4733-a0db-f8246280c356




  0%|          | 0/39 [00:00<?, ?it/s]

KeyError: 'answer'