In [1]:
import os
import sys
import json
import time
import requests

from dotenv import load_dotenv

def setup_env():
    """Load environment variables from the ``.env`` file one directory above the CWD.

    The candidate path is built as ``<current working directory>/../.env``.
    When that path exists it is handed to ``load_dotenv`` and a confirmation
    line is printed; otherwise an error line is printed and the interpreter is
    asked to exit with status 1.

    Returns:
        None.

    Raises:
        SystemExit: with code 1 when the ``.env`` path does not exist.
    """
    dotenv_file = os.path.join(os.getcwd(), '../.env')

    # Guard clause: nothing to load without the file, so report it and stop here.
    if not os.path.exists(dotenv_file):
        print("\033[91mError: .env file not found. Please create one with your OPENAI_API_KEY.\033[0m")
        sys.exit(1)

    load_dotenv(dotenv_path=dotenv_file)

    # ANSI escape codes colour the path in the confirmation message.
    print(f"Loaded environment variables from: \033[94m{dotenv_file}\033[0m")


In [2]:
# Import from langchain_core, the same package the later prompt cells in this
# notebook import their prompt classes from.
from langchain_core.prompts import ChatPromptTemplate

# System message (Korean) for the question-answering assistant. It tells the model to
# answer only from the supplied context, to reply with a fixed "not found" sentence
# when the context has no answer, to leave technical terms and names untranslated,
# to include the source (page, source) in the answer, and to answer in Korean.
system = """당신은 질문-답변(Question-Answering)을 수행하는 친절한 AI 어시스턴트입니다. 당신의 임무는 주어진 문맥(context) 에서 주어진 질문(question) 에 답하는 것입니다.
검색된 다음 문맥(context) 을 사용하여 질문(question) 에 답하세요. 만약, 주어진 문맥(context) 에서 답을 찾을 수 없다면, 답을 모른다면 `주어진 정보에서 질문에 대한 정보를 찾을 수 없습니다` 라고 답하세요.
기술적인 용어나 이름은 번역하지 않고 그대로 사용해 주세요. 출처(page, source)를 답변에 포함하세요. 답변은 한글로 답변해 주세요."""

# Human message template with the `{question}` and `{context}` placeholders.
# Spelled out line by line so the trailing space before each newline is explicit.
human = (
    "#Question: \n"
    "{question} \n"
    "\n"
    "#Context: \n"
    "{context} \n"
    "\n"
    "#Answer:"
)

# Two-message chat prompt: the system instructions followed by the human turn.
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

In [3]:
# Print the assembled chat prompt (its input variables and message templates) as a quick sanity check.
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='당신은 질문-답변(Question-Answering)을 수행하는 친절한 AI 어시스턴트입니다. 당신의 임무는 주어진 문맥(context) 에서 주어진 질문(question) 에 답하는 것입니다.\n검색된 다음 문맥(context) 을 사용하여 질문(question) 에 답하세요. 만약, 주어진 문맥(context) 에서 답을 찾을 수 없다면, 답을 모른다면 `주어진 정보에서 질문에 대한 정보를 찾을 수 없습니다` 라고 답하세요.\n기술적인 용어나 이름은 번역하지 않고 그대로 사용해 주세요. 출처(page, source)를 답변헤 포함하세요. 답변은 한글로 답변해 주세요.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='#Question: \n{question} \n\n#Context: \n{context} \n\n#Answer:'), additional_kwargs={})]


In [5]:
# LLM Evaluation

from langchain_core.prompts import PromptTemplate
from langchain import hub

# LLM-as-judge prompt: asks the model to grade an answer on accuracy,
# comprehensiveness and context precision (0-10 each) and to reply with just the
# normalized score, (sum of the three) / 30.
# Placeholders: `{question}`, `{answer}`, `{context}`.
# The opening paragraph previously asked for four labelled score lines while the
# closing lines ask for the final number only; the opening now states that the
# sub-scores are computed internally so both instructions agree.
# NOTE: this rebinds the name `prompt`, which an earlier cell assigned to the
# question-answering ChatPromptTemplate.
prompt = PromptTemplate.from_template(
    """
As an LLM evaluator (judge), please assess the LLM's response to the given question. Evaluate the response's accuracy, comprehensiveness, and context precision based on the provided context. Score each criterion internally using the grading rubric below, then return only the final normalized score.
Scores to compute:
Accuracy: [score]
Comprehensiveness: [score]
Context Precision: [score]
Final: [normalized score]
Grading rubric:

Accuracy (0-10 points):
Evaluate how well the answer aligns with the information provided in the given context.

0 points: The answer is completely inaccurate or contradicts the provided context
4 points: The answer partially aligns with the context but contains significant inaccuracies
7 points: The answer mostly aligns with the context but has minor inaccuracies or omissions
10 points: The answer fully aligns with the provided context and is completely accurate


Comprehensiveness (0-10 points):

0 points: The answer is completely inadequate or irrelevant
3 points: The answer is accurate but too brief to fully address the question
7 points: The answer covers main aspects but lacks detail or misses minor points
10 points: The answer comprehensively covers all aspects of the question


Context Precision (0-10 points):
Evaluate how precisely the answer uses the information from the provided context.

0 points: The answer doesn't use any information from the context or uses it entirely incorrectly
4 points: The answer uses some information from the context but with significant misinterpretations
7 points: The answer uses most of the relevant context information correctly but with minor misinterpretations
10 points: The answer precisely and correctly uses all relevant information from the context


Final Normalized Score:
Calculate by summing the scores for accuracy, comprehensiveness, and context precision, then dividing by 30 to get a score between 0 and 1.
Formula: (Accuracy + Comprehensiveness + Context Precision) / 30

#Given question:
{question}

#LLM's response:
{answer}

#Provided context:
{context}

Please evaluate the LLM's response according to the criteria above. 

In your output, include only the numerical scores for FINAL NORMALIZED SCORE without any additional explanation or reasoning.
ex) 0.81

#Final Normalized Score(Just the number):

"""
)

In [7]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt: system instructions for a lenient, binary ('yes'/'no') grader that judges
# whether a retrieved document is relevant to the user question.
# Each line is an explicit literal so the embedded newlines and spaces are visible;
# the garbled "semantic m   eaning" has been corrected to "semantic meaning".
system = (
    "You are a grader assessing relevance of a retrieved document to a user question. \n \n"
    "    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n\n"
    "    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n\n"
    "    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."
)

# Chat prompt pairing the grader instructions with a human turn that carries the
# `{context}` (retrieved document) and `{question}` placeholders.
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {context} \n\n User question: {question}"),
    ]
)

In [8]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt: system instructions for a lenient, binary ('yes'/'no') grader that judges
# whether a retrieved document is relevant to a generated answer.
# NOTE: this rebinds `system` and `grade_prompt`, which the earlier
# document-vs-question grader cell also assigns.
# The literals below spell out every newline and space of the instruction text.
system = (
    "You are a grader assessing relevance of a retrieved document to the answer. \n \n"
    "    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n\n"
    "    If the document contains keyword(s) or semantic meaning related to the answer, grade it as relevant. \n\n"
    "    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the answer."
)

# Human turn carries the `{context}` (retrieved document) and `{answer}` placeholders.
grade_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Retrieved document: \n\n {context} \n\n Answer: {answer}"),
])