In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from langchain_teddynote import logging

logging.langsmith("Evaluations")

LangSmith 추적을 시작합니다.
[프로젝트명]
Evaluations


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 
from langchain_upstage import UpstageEmbeddings
from langchain_upstage import ChatUpstage


# 질문에 대한 답변하는 함수를 생성
def ask_question_with_llm(llm):
    """Build a question-answering callable backed by a FAISS index of the PDF.

    The PDF is loaded, split into chunks, embedded and indexed once, when this
    factory is called. The returned callable then answers one question per call.

    Args:
        llm: Chat model placed after the prompt in the answer chain.

    Returns:
        A function taking ``{"question": str}`` and returning a dict with the
        keys ``"question"``, ``"context"`` (the retrieved chunks joined with
        newlines) and ``"answer"`` (the model's reply as a string).
    """
    loader = PyMuPDFLoader("SPRI_AI_Brief_2023년12월호_F.pdf")
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    split_documents = text_splitter.split_documents(docs)

    # NOTE(review): "embedding-query" is used here to index document chunks —
    # confirm this is the intended embedding model for passages.
    embeddings = UpstageEmbeddings(model="embedding-query")
    vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)

    retriever = vectorstore.as_retriever()

    prompt = PromptTemplate.from_template(
        """You are an assistant for question-answering tasks. 
        Use the following pieces of retrieved context to answer the question. 
        If you don't know the answer, just say that you don't know. 
        Answer in Korean.

        #Question: 
        {question} 
        #Context: 
        {context} 

        #Answer:"""
    )

    # The chain takes an already-built context instead of running the retriever
    # itself, so the model answers from exactly the text returned to the caller.
    answer_chain = prompt | llm | StrOutputParser()

    def _ask_question(inputs: dict):
        """Retrieve context for ``inputs["question"]`` and generate an answer."""
        question = inputs["question"]
        # Retrieve once; the same documents feed both the prompt and the result.
        retrieved_docs = retriever.invoke(question)
        # Join the retrieved chunks into a single context string.
        context = "\n".join(doc.page_content for doc in retrieved_docs)
        answer = answer_chain.invoke({"question": question, "context": context})
        return {
            "question": question,
            "context": context,
            "answer": answer,
        }

    return _ask_question

In [4]:
# Chat model that generates the answers (Upstage "solar-mini").
llm = ChatUpstage(model="solar-mini")

# Build the question-answering callable that is passed to evaluate() below.
upstage_chain = ask_question_with_llm(llm)

In [5]:
from langchain_upstage import UpstageGroundednessCheck

# Create the Upstage groundedness checker used by the evaluators below.
upstage_groundedness_check = UpstageGroundednessCheck()

In [10]:
# Run the groundedness checker on an answer that agrees with the context.
request_input = {
    "context": "테디의 성별은 남자이며, 테디노트 유튜브 채널을 운영하고 있습니다.",
    "answer": "테디는 남자다.",
}

response = upstage_groundedness_check.invoke(request_input)
print(response)

grounded


In [11]:
# Run the groundedness checker on an answer that contradicts the context.
request_input = {
    "context": "테디의 성별은 남자이며, 테디노트 유튜브 채널을 운영하고 있습니다.",
    "answer": "테디는 여자다.",
}

response = upstage_groundedness_check.invoke(request_input)
print(response)

notGrounded


In [6]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate


def upstage_groundness_check_evaluator(run: Run, example: Example) -> dict:
    """Score one run by whether its answer is grounded in its retrieved context.

    Args:
        run: Run to score. Its ``outputs`` are read for the ``"answer"`` and
            ``"context"`` keys; a missing key is treated as an empty string.
        example: Dataset example for the run. Not used: the check compares the
            generated answer with the retrieved context, not a reference answer.

    Returns:
        ``{"key": "groundness_score", "score": s}`` where ``s`` is 1 when the
        checker's verdict is ``"grounded"`` and 0 for any other verdict.
    """
    # ``run.outputs`` is optional and may be None; fall back to an empty dict
    # so the lookups below cannot raise AttributeError.
    outputs = run.outputs or {}

    # Fetch the generated answer and the context it was generated from.
    answer = outputs.get("answer", "")
    context = outputs.get("context", "")

    # Ask the groundedness checker for a verdict on this answer/context pair.
    verdict = upstage_groundedness_check.invoke(
        {"answer": answer, "context": context}
    )
    is_grounded = verdict == "grounded"

    return {"key": "groundness_score", "score": int(is_grounded)}

In [7]:
from langsmith.evaluation import evaluate

# Name of the dataset to evaluate against.
dataset_name = "RAG_EVAL_DATASET"

# Run the evaluation with the per-run groundedness evaluator.
experiment_results = evaluate(
    upstage_chain,
    data=dataset_name,
    evaluators=[
        upstage_groundness_check_evaluator,
    ],
    experiment_prefix="GROUNDEDNESS-EVAL",
    # Experiment metadata.
    metadata={
        "variant": "Upstage Groundness Checker 를 활용한 Hallucination 평가",
    },
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'GROUNDEDNESS-EVAL-15d34997' at:
https://smith.langchain.com/o/8d57949a-053b-4992-8dd3-1ac178b342de/datasets/c32f8c4b-5e6c-4adc-81b0-fbe60afdddfb/compare?selectedSessions=56f80871-ec3d-42ce-8202-cde5c0717712




5it [00:18,  3.67s/it]


# summary evaluators를 활용한 데이터셋 전체에 대한 종합 평가

In [8]:
from typing import List
from langsmith.schemas import Example, Run


def upstage_groundness_check_summary_evaluator(
    runs: List[Run], examples: List[Example]
) -> dict:
    """Report the fraction of runs whose answer is grounded in its context.

    Args:
        runs: Runs to score. Each run's ``outputs`` are read for the
            ``"context"`` and ``"answer"`` keys; missing outputs or keys are
            treated as empty strings.
        examples: Dataset examples for the runs. Not used: the check compares
            each answer with its retrieved context, not a reference answer.

    Returns:
        ``{"key": "groundness_score", "score": ratio}`` where ``ratio`` is the
        number of runs the checker judged ``"grounded"`` divided by
        ``len(runs)``, or ``0.0`` when ``runs`` is empty.
    """
    # With no runs there is nothing to average; avoid dividing by zero.
    if not runs:
        return {"key": "groundness_score", "score": 0.0}

    def is_grounded(run: Run) -> bool:
        """Return True when the checker's verdict for this run is "grounded"."""
        # ``run.outputs`` is optional and may be None; use the same tolerant
        # lookups as the per-run evaluator so one bad run cannot abort the summary.
        outputs = run.outputs or {}
        verdict = upstage_groundedness_check.invoke(
            {
                "context": outputs.get("context", ""),
                "answer": outputs.get("answer", ""),
            }
        )
        return verdict == "grounded"

    grounded_count = sum(1 for run in runs if is_grounded(run))
    return {"key": "groundness_score", "score": grounded_count / len(runs)}

In [10]:
from langsmith.evaluation import evaluate

# Run the evaluation with the dataset-level (summary) groundedness evaluator.
experiment_result1 = evaluate(
    upstage_chain,
    data=dataset_name,
    summary_evaluators=[
        upstage_groundness_check_summary_evaluator,
    ],
    experiment_prefix="GROUNDNESS_UPSTAGE_SUMMARY_EVAL",
    # Experiment metadata.
    metadata={
        "variant": "Upstage Groundness Checker 를 활용한 Hallucination 평가",
    },
)

View the evaluation results for experiment: 'GROUNDNESS_UPSTAGE_SUMMARY_EVAL-5993caed' at:
https://smith.langchain.com/o/8d57949a-053b-4992-8dd3-1ac178b342de/datasets/c32f8c4b-5e6c-4adc-81b0-fbe60afdddfb/compare?selectedSessions=4982092d-9e3c-4222-bc1b-ae67ca42446b




5it [00:16,  3.35s/it]
