### Pairwise Evaluation
* 두 개 이상의 LLM 생성물을 서로 비교한다.

In [9]:
from dotenv import load_dotenv

# Read key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI / LangSmith credentials the
# cells below rely on — confirm against the project's .env.
load_dotenv()

True

In [10]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def evaluate_pairwise(runs: list, example) -> dict:
    """Ask an LLM judge which of two runs answered the example's question better.

    The judge (gpt-4o-mini, temperature 0) is shown the question together with
    both answers and is instructed to reply with `A` or `B`.

    Args:
        runs: Runs to compare. ``runs[0]`` is shown to the judge as Answer A and
            ``runs[1]`` as Answer B. Each run must expose ``id`` and
            ``outputs["answer"]``.
        example: Dataset example exposing ``inputs["question"]``.

    Returns:
        ``{"key": "ranked_preference", "scores": {run_id: score}}``. The run the
        judge prefers scores 1 and every other run scores 0. When the judge's
        reply cannot be read as A or B, every run scores 0.

    Raises:
        ValueError: If fewer than two runs are supplied.
    """
    import re

    if len(runs) < 2:
        raise ValueError(
            f"evaluate_pairwise needs at least two runs to compare, got {len(runs)}"
        )

    # Every run starts at 0 so that runs beyond the first two can never end up
    # with a higher score than the judged winner.
    scores = {run.id: 0 for run in runs}

    # The pair of runs produced for this example.
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        Better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        
        # Question:
        {question}
        
        #Answer A: 
        {answer_a}
        
        #Answer B: 
        {answer_b}
        
        Output should be either `A` or `B`. Pick the answer that is better.
        
        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    raw_preference = answer_grader.invoke(
        {
            "question": question,
            "answer_a": answer_a,
            "answer_b": answer_b
        }
    )

    # Chat models often wrap the verdict in whitespace, backticks, quotes or an
    # "Answer " prefix; accept those forms instead of silently scoring a tie.
    verdict = re.fullmatch(
        r"\W*(?:answer\s*)?([AB])\W*", raw_preference, flags=re.IGNORECASE
    )
    preferred = verdict.group(1).upper() if verdict else None

    if preferred == "A":  # Answer A (runs[0]) was judged better.
        scores[runs[0].id] = 1
    elif preferred == "B":  # Answer B (runs[1]) was judged better.
        scores[runs[1].id] = 1
    # Otherwise the verdict is unreadable and every run keeps its score of 0.

    return {"key": "ranked_preference", "scores": scores}

In [11]:
from rag import PDFRAG
from langchain_openai import ChatOpenAI

def ask_question_with_llm(llm):
    """Create a question-answering callable backed by a PDF RAG pipeline.

    A ``PDFRAG`` is built over ``data/snow-white.pdf`` with the given ``llm``;
    its retriever and chain are created once and reused by the returned
    callable.

    Args:
        llm: Chat model handed to ``PDFRAG``.

    Returns:
        A function taking ``{"question": ...}`` and returning a dict with the
        keys ``question``, ``context`` (page contents of the retrieved
        documents joined by newlines) and ``answer`` (the chain's output).
    """
    pdf_rag = PDFRAG("data/snow-white.pdf", llm)
    doc_retriever = pdf_rag.create_retriever()
    answer_chain = pdf_rag.create_chain(doc_retriever)

    def _ask_question(inputs: dict):
        query = inputs["question"]

        # Retrieve first so the context is reported alongside the answer.
        retrieved_docs = doc_retriever.invoke(query)
        joined_context = "\n".join(doc.page_content for doc in retrieved_docs)

        generated_answer = answer_chain.invoke(query)

        return {
            "question": query,
            "context": joined_context,
            "answer": generated_answer,
        }

    return _ask_question

In [12]:
from langchain_openai import ChatOpenAI

# Create a GPT-3.5-turbo chat model at temperature 0 and send it a single
# greeting; the reply is displayed as the cell output.
# NOTE(review): looks like a connectivity / credentials smoke test — confirm.
gpt3 = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

gpt3.invoke("안녕하세요?")

AIMessage(content='안녕하세요! 무엇을 도와드릴까요?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a252bbab-e5b5-4fe7-b03a-3dfc7fcbe857-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

* Ollama 사용 시 참고

In [13]:
# install package
!pip install -q langchain-ollama

In [14]:
from langchain_ollama import ChatOllama

# Load the Ollama model (gemma2:9b).
ollama = ChatOllama(model="gemma2:9b")

# Call the Ollama model with a single greeting; the reply is the cell output.
ollama.invoke("안녕하세요?")

AIMessage(content='안녕하세요! 👋 어떻게 도와드릴까요? 😊', additional_kwargs={}, response_metadata={'model': 'gemma2:9b', 'created_at': '2024-11-04T05:40:39.7260268Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 2688307300, 'load_duration': 47355000, 'prompt_eval_count': 13, 'prompt_eval_duration': 431396000, 'eval_count': 17, 'eval_duration': 2207953000}, id='run-7688f77e-3d09-4846-809b-a21c193960f1-0', usage_metadata={'input_tokens': 13, 'output_tokens': 17, 'total_tokens': 30})

In [15]:
# Build one question-answering callable per model under comparison.
# Note: despite its name, gpt4o_chain is backed by gpt-4o-mini.
gpt4o_chain = ask_question_with_llm(ChatOpenAI(model="gpt-4o-mini", temperature=0))
gpt3_chain = ask_question_with_llm(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))

# When using Ollama instead (fill in the model name):
# ollama_chain = ask_question_with_llm(ChatOllama(model=""))

In [16]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

def _cot_qa_fields(run, example):
    """Map a run and its dataset example onto the fields the cot_qa evaluator reads."""
    return {
        "prediction": run.outputs["answer"],
        # The context returned by the run is used as the reference text.
        "reference": run.outputs["context"],
        "input": example.inputs["question"],
    }


# The judge model passed to the cot_qa evaluator.
cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": ChatOpenAI(model="gpt-4o-mini", temperature=0)},
    prepare_data=_cot_qa_fields,
)

dataset_name = "RAG_EVALUATION_DATASET"


def _run_cot_qa_experiment(chain, variant):
    """Evaluate `chain` on the dataset with the cot_qa evaluator, tagged with `variant`."""
    return evaluate(
        chain,
        data=dataset_name,
        evaluators=[cot_qa_evaluator],
        experiment_prefix="MODEL_COMPARE_EVALUATION",
        metadata={"variant": variant},
    )


# Same dataset, evaluator and prefix for both models; only the chain and the
# variant label differ.
experiment_result1 = _run_cot_qa_experiment(gpt3_chain, "GPT-3.5-turbo 평가 (cot_qa)")

experiment_result2 = _run_cot_qa_experiment(gpt4o_chain, "GPT-4o-mini 평가 (cot_qa)")

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-d25c5fa6' at:
https://smith.langchain.com/o/63dae880-79dc-48d0-99fd-982e8e046f1a/datasets/df6590c3-71b6-4515-bda0-f9cb0b1ca7ab/compare?selectedSessions=7d602e22-e9a5-4d69-89ea-96d25a03ff00




0it [00:00, ?it/s]

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-48586494' at:
https://smith.langchain.com/o/63dae880-79dc-48d0-99fd-982e8e046f1a/datasets/df6590c3-71b6-4515-bda0-f9cb0b1ca7ab/compare?selectedSessions=8ac85d45-54e4-4b01-a7ff-f0533abf063f




0it [00:00, ?it/s]

In [18]:
from langsmith.evaluation import evaluate_comparative

# Compare the two experiments created by the evaluate() calls above.
# The names are read from the result objects rather than hard-coded: each
# evaluate() call appends a fresh random suffix to the experiment prefix, so
# literal names copied from an earlier run would point at stale experiments
# after a kernel restart and re-run.
evaluate_comparative(
    [experiment_result1.experiment_name, experiment_result2.experiment_name],
    # Pairwise judge applied to each pair of runs.
    evaluators=[evaluate_pairwise],
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/63dae880-79dc-48d0-99fd-982e8e046f1a/datasets/df6590c3-71b6-4515-bda0-f9cb0b1ca7ab/compare?selectedSessions=7d602e22-e9a5-4d69-89ea-96d25a03ff00%2C8ac85d45-54e4-4b01-a7ff-f0533abf063f&comparativeExperiment=ffbc0600-0fbe-4869-9c5f-5e20b51dde36




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x1b390299f10>