In [None]:
import os, json, pandas as pd
from pathlib import Path
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Directory passed to Chroma as its persist_directory.
VECTOR_DIR = "data/vectorstore"

# Embedding model used by the vector store for similarity search.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the Chroma store located at VECTOR_DIR with the embeddings above.
store = Chroma(embedding_function=hf_embeddings, persist_directory=VECTOR_DIR)

# Similarity-search retriever configured with k=3.
retriever = store.as_retriever(search_kwargs={"k": 3}, search_type="similarity")

# Chat model used by generate_answer().
# NOTE(review): some OpenAI model families only accept the default
# temperature — confirm "gpt-5-nano" accepts temperature=0.
llm = ChatOpenAI(temperature=0, model="gpt-5-nano")

def rag(question: str):
    """Run retrieval for *question* and bundle the hits into a dict.

    The returned dict has three keys, in this order:
      - "question": the input string, unchanged
      - "contexts": ``page_content`` of every retrieved document
      - "metadata": ``metadata`` of every retrieved document
    Both lists follow the order returned by the module-level ``retriever``.
    """
    hits = retriever.invoke(question)
    payload = {"question": question}
    payload["contexts"] = [hit.page_content for hit in hits]
    payload["metadata"] = [hit.metadata for hit in hits]
    return payload

# Two-message chat template consumed by generate_answer().
# Template variables: {question} and {ctx}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            # System message (partly Korean): answer in the language of the
            # question, use only the supplied contexts, and reply with a fixed
            # "cannot be confirmed from the provided material" sentence when
            # the contexts do not cover the question.
            "system",
            "You are a helpful assistant. "
            "답변은 질문에 입력된 언어로 하며, 아래 제공된 contexts에 있는 정보만 사용해. "
            "contexts에 없는 내용은 추정하지 말고 '제공된 자료로는 확인할 수 없습니다'라고 답해.",
        ),
        (
            # Human message: the question followed by the retrieved contexts.
            "human",
            "Question:\n{question}\n\nContexts (use ONLY this info):\n{ctx}\n\n",
        ),
    ]
)

def generate_answer(question: str, contexts: list[str]) -> str:
    """Answer *question* with the LLM, grounded on *contexts*.

    Args:
        question: The user question inserted into the prompt.
        contexts: Retrieved text chunks; each becomes one "- " bullet line.

    Returns:
        The model's reply with surrounding whitespace stripped, or a fixed
        Korean fallback sentence when *contexts* is empty (no LLM call is
        made in that case).
    """
    if contexts:
        # One bullet per context chunk, joined with newlines.
        bullet_block = "\n".join([f"- {chunk}" for chunk in contexts])
        chat_messages = prompt.format_messages(question=question, ctx=bullet_block)
        reply = llm.invoke(chat_messages)
        return reply.content.strip()
    return "제공된 자료로는 확인할 수 없습니다."

In [None]:
import glob

# Collect every data/scenario_*.jsonl file, in sorted order, as the dataset list.
DATASET_PATHS = glob.glob("data/scenario_*.jsonl")
DATASET_PATHS.sort()

# Directory that the evaluation results CSV is written into.
OUTDIR = Path("results")

def load_gold_rows(paths: list[str]):
    """Read JSON Lines files and return all parsed records as one list.

    Args:
        paths: Paths of the files to read, processed in the given order.

    Returns:
        A list with one parsed JSON value per non-blank line, preserving
        file order and line order. Whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If a file cannot be opened.
    """
    records = []
    for raw_path in paths:
        with Path(raw_path).open("r", encoding="utf-8") as handle:
            # Stream the file line by line; blank lines carry no record.
            records.extend(json.loads(text) for text in handle if text.strip())
    return records


import datetime

# Load every gold (question, ground_truth) record from the scenario files.
gold = load_gold_rows(DATASET_PATHS)
print(f"Loaded {len(gold)} questions from {len(DATASET_PATHS)} files.")
if not gold:
    # Fail fast with a clear message instead of handing an empty,
    # column-less dataset to the evaluator further down.
    raise ValueError(
        "No evaluation rows loaded; expected at least one non-empty "
        "data/scenario_*.jsonl file."
    )

# 1) Run retrieval + generation for every question.
preds = []
for item in gold:
    q  = item["question"]
    gt = item["ground_truth"]
    r  = rag(q)                # {"question","contexts","metadata"}
    ctxs = r.get("contexts", [])
    ans  = generate_answer(q, ctxs)
    preds.append({
        "question": q,
        "answer": ans,
        "contexts": ctxs,
        "ground_truth": gt,
    })

# 2) Convert the predictions to a HuggingFace Dataset.
df = pd.DataFrame(preds)
ds = Dataset.from_pandas(df)

# 3) Compute the RAGAS metrics.
result = evaluate(
    ds,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

result_df = result.to_pandas()
display(result_df)

# Timestamp used as the output file name.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Add one "<metric>_mean" column per metric; the scalar mean is broadcast
# to every row so the CSV carries both per-question and aggregate scores.
for metric_name in (
    "faithfulness",
    "answer_relevancy",
    "context_precision",
    "context_recall",
):
    result_df[f"{metric_name}_mean"] = result_df[metric_name].mean()

csv_filename = f"{current_time}.csv"
csv_path = OUTDIR / csv_filename
# to_csv does not create missing directories; make sure OUTDIR exists so the
# (expensive) evaluation results are not lost to an OSError at write time.
OUTDIR.mkdir(parents=True, exist_ok=True)
result_df.to_csv(csv_path, index=False, encoding='utf-8')
print(f"Results saved to: {csv_path}")