### **Import libraries**

In [1]:
import sys
import os

# Go up one directory level from the notebook folder so the project packages
# imported below (`src.*`) can be resolved.
# NOTE(review): relies on the kernel's working directory being the notebook folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
  sys.path.append(project_root)

In [2]:
import re
import ast
import json
import time
import pandas as pd
from enum import Enum
from typing import List
from pydantic import BaseModel, Field

from src.utils import dump_json
from src.rag import (
  chunk_pdf, 
  get_llm
)

from langchain.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Define constants
pdf_file = "../data/halueval.pdf"
chunk_size = 1000
# Overlap between chunks is fixed at 10% of the chunk size
chunk_overlap = int(0.1 * chunk_size)

# Create QA folder (exist_ok avoids the check-then-create race and is safe to re-run)
qa_folder_path = "../data/qa"
os.makedirs(qa_folder_path, exist_ok=True)

### **Generate QA dataset**

**Chunk document**

In [4]:
# Split the PDF into chunks using the configured size and overlap
raw_chunks = chunk_pdf(pdf_file=pdf_file, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Retain only meaningful chunks: skip the hand-picked positions listed below
# and anything that is 200 characters or shorter.
remove_chunks = [4, 7, 8, 15, 16, 21, 27, 49, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, 66]
chunks = []
for position, text in enumerate(raw_chunks):
  if position in remove_chunks:
    continue
  if len(text) > 200:
    chunks.append(text)

**Define prompt**

In [None]:
# Prompt used to generate QA pairs from a single document chunk.
# It asks for three Vietnamese question-answer pairs per chunk and exposes two
# placeholders: {generate_format} (the expected output structure) and {chunk}
# (the document text). Later cells look chunks up with `index // 3`, which
# relies on the "three pairs" wording here.
qa_gen_prompt = PromptTemplate(
  template="""
    You are an expert at creating Vietnamese question-answer pairs to evaluate a RAG system.
    Given the following document, generate three relevant question-answer pairs that mimic realistic questions a user might ask in a search query.
    
    Requirements:
      - Include different question types (factual, inferential, analytical).
      - Vary in difficulty (basic recall, complex reasoning).
      - Test understanding of key concepts, relationships, and implications.
    
    The questions should be directly answerable from the documentation and should not require any external knowledge.
    The questions and answers MUST NOT include phrases like "theo đoạn văn" or "theo tài liệu" or "theo ngữ cảnh".
    The questions should sound natural and self-contained, as if a real user is asking without having seen the original text.
    The questions and answers MUST be in Vietnamese.
    The questions MUST NOT be too long or too short.
    
    Structure QA pairs following this format: {generate_format}

    Document: {chunk}
  """,
  # Must list exactly the placeholders used in the template above
  input_variables=["generate_format", "chunk"]
)

**Define output format**

In [None]:
# Question categories a generated QA pair can be tagged with. The members
# mirror the question types requested in the generation prompt
# (factual, inferential, analytical).
# Documented with comments rather than a docstring so the class text that may
# feed the generated JSON schema stays as it was.
class QTypeEnum(str, Enum):
  factual = "factual"
  inferential = "inferential"
  analytical = "analytical"

# Metadata the generator is asked to attach to every QA pair. The Field
# descriptions are part of the schema shown to the model, so they act as
# instructions to it as well as documentation.
class QAPairMetaData(BaseModel):
  # One of the QTypeEnum categories
  question_type: QTypeEnum = Field(
    description="Question type"
  )
  # Supporting quote taken from the source chunk
  required_context: str = Field(
    description="Specific quote string from the chunk needed to answer the question"
  )
  # Short justification linking the context to the answer
  reasoning: str = Field(
    description="A brief description of how you arrived at the answer from the context"
  )

# A single generated question-answer pair together with its metadata.
# Later cells read the "question", "answer" and "metadata" keys of each stored pair.
class QAPair(BaseModel):
  metadata: QAPairMetaData
  question: str
  answer: str

# Top-level output structure for one generation call. Its JSON schema is passed
# to the generation prompt as {generate_format}, and the parsing helper reads
# the "qa_pairs" key from the model's reply.
class QAPairList(BaseModel):
  qa_pairs: List[QAPair]

**Generate QA pairs**

In [None]:
def process_gen_output(gen_result, output_path: str) -> None:
  """
  Parse the generation output and dump each QA pair to a JSON file.

  Args:
    gen_result: Object whose ``.content`` attribute holds the model's reply as
      text (presumably a LangChain message - confirm against ``get_llm``).
      The reply may be wrapped in a Markdown json code fence.
    output_path: Path handed to ``dump_json`` for every extracted pair.

  Raises:
    KeyError: If the parsed reply has no ``"qa_pairs"`` key.
    ValueError, SyntaxError: If the reply is neither valid JSON nor a Python literal.
  """
  # Strip the Markdown code fence around the payload, if any
  raw_text = re.sub(r"```json\n|\n```", "", gen_result.content).strip()

  try:
    # The model is asked for JSON, so parse it as JSON first: this handles
    # true/false/null and JSON string escapes, which ast.literal_eval cannot.
    payload = json.loads(raw_text)
  except json.JSONDecodeError:
    # Fall back to Python-literal syntax (e.g. single-quoted keys)
    payload = ast.literal_eval(raw_text)

  for pair in payload["qa_pairs"]:
    dump_json(data=pair, output_path=output_path)

In [None]:
llm = get_llm()
qa_gen_chain = qa_gen_prompt | llm

# Loop-invariant inputs, computed once instead of on every iteration
generate_format = QAPairList.model_json_schema()
raw_qa_path = f"{qa_folder_path}/raw_qa_dataset.json"

for i, chunk in enumerate(chunks):
  # Avoid rate limit: pause for 60 seconds after every 15 requests
  if i != 0 and i % 15 == 0:
    time.sleep(60)

  gen_result = qa_gen_chain.invoke({"generate_format": generate_format, "chunk": chunk})
  # The keyword must match the function's parameter name (`gen_result`);
  # passing `result=` raises TypeError.
  process_gen_output(gen_result=gen_result, output_path=raw_qa_path)

### **Filter QA dataset**

The generated question-answer pairs may contain some imperfections; therefore, we run a second LLM pass (an LLM-as-judge prompt that reuses the same `llm` instance) to filter them and retain only high-quality pairs for evaluating the RAG system.

**Define prompt**

In [None]:
# Prompt used to judge one generated QA pair against its source chunk.
# Placeholders: {filter_format} (expected response structure), {chunk} (the
# document text the pair was generated from) and {qa_pair} (the formatted
# "Question: ... / Answer: ..." text).
qa_filter_prompt = PromptTemplate(
  template="""
    You are an expert at filtering high-quality generated question-answer pairs to evaluate a RAG system.
    Given a question-answer pair and it's relevant document, assess whether it meets the following quality criteria.

    The quality criteria:
      - Language Constraint: The question and answer MUST be in Vietnamese, allowing technical terms to be in English.
      - Context Independence: Avoid phrases like "theo đoạn văn" or "theo tài liệu" or "theo ngữ cảnh"; the question and answer MUST be self-contained.
      - Question Clarity: The question MUST be well-formed, unambiguous, grammatically correct, and natural—like something a real user would ask.
      - Hallucination: The answer MUST be fully grounded in the provided document. If any part of the answer is not explicitly stated or logically inferred from the document, or appears to be guessed or assumed by the LLM, it is considered a hallucination and should be rejected.
      - Answer Quality: The answer MUST be accurate, complete, concise, and grammatically correct.

    Structure your response following this format: {filter_format}
    
    Document chunk: {chunk} 

    Now evaluate the question-answer pair below:
    {qa_pair}
  """,
  # Must list exactly the placeholders used in the template above
  input_variables=["filter_format", "qa_pair", "chunk"]
)

**Define output format**

In [None]:
# Identifiers the judge model may list in `violate_criteria`. Each value is
# meant to correspond to one criterion named in qa_filter_prompt.
# Documented with comments rather than a docstring so that only the intended
# change (the added member) affects the schema built from this class.
class QualityCriteria(str, Enum):
  language_constraint = "language_constraint"
  context_independence = "context_independence"
  question_clarity = "question_clarity"
  # Kept so existing results that use this value remain valid
  answerability = "answerability"
  # Corresponds to the "Hallucination" criterion in qa_filter_prompt, which
  # otherwise has no value the judge could report
  hallucination = "hallucination"
  answer_quality = "answer_quality"

# Response structure requested from the judge model for one QA pair. Its JSON
# schema is passed to the filter prompt as {filter_format}; later cells read
# the "reasoning", "violate_criteria" and "judge" fields from the stored results.
class QAEval(BaseModel):
  # Free-text justification for the verdict
  reasoning: str = Field(description="A brief explanation of your judge")
  # Criteria the pair fails; empty when nothing is violated
  violate_criteria: List[QualityCriteria] = Field(description="A list of quality criteria that are violated")
  # Binary verdict: 1 = keep the pair, 0 = reject it
  judge: int = Field(description="0: if the QA pair violates one of those criteria, 1: if the QA pair meets all criteria")

**Filter QA pairs**

In [None]:
def process_filter_output(filter_result, output_path: str) -> None:
  """
  Parse the judge model's reply and dump the verdict to a JSON file.

  Args:
    filter_result: Object whose ``.content`` attribute holds the model's reply
      as text (presumably a LangChain message - confirm against ``get_llm``).
      The reply may be wrapped in a Markdown json code fence.
    output_path: Path handed to ``dump_json`` for the parsed verdict.

  Raises:
    ValueError, SyntaxError: If the reply is neither valid JSON nor a Python literal.
  """
  # Strip the Markdown code fence around the payload, if any
  clean_output = re.sub(r"```json\n|\n```", "", filter_result.content).strip()

  try:
    # The model is asked for JSON, so parse it as JSON first: this handles
    # true/false/null and JSON string escapes, which ast.literal_eval cannot.
    data = json.loads(clean_output)
  except json.JSONDecodeError:
    # Fall back to Python-literal syntax (e.g. single-quoted keys)
    data = ast.literal_eval(clean_output)

  dump_json(data=data, output_path=output_path)

In [None]:
# Read back the QA pairs written during the generation step
raw_dataset_file = f"{qa_folder_path}/raw_qa_dataset.json"
with open(raw_dataset_file, mode="r", encoding="utf-8") as handle:
  qa_pairs = json.loads(handle.read())

In [None]:
filter_chain = qa_filter_prompt | llm

# Loop-invariant inputs, computed once instead of on every iteration
filter_format = QAEval.model_json_schema()
filter_results_path = f"{qa_folder_path}/filter_results.json"

# The lookup `chunks[i // 3]` below assumes the dataset holds three pairs per
# chunk, in chunk order. Check the bound up front so a mismatch fails before
# any API call is made rather than with an IndexError partway through the run.
assert len(qa_pairs) <= 3 * len(chunks), (
  f"{len(qa_pairs)} QA pairs cannot be mapped onto {len(chunks)} chunks at 3 pairs per chunk"
)

# Evaluate each QA pair
for i, pair in enumerate(qa_pairs):
  # Avoid rate limit: pause for 60 seconds after every 15 requests
  if i != 0 and i % 15 == 0:
    time.sleep(60)

  question = pair["question"]
  answer = pair["answer"]
  qa_pair_text = f"Question: {question}\nAnswer: {answer}"
  filter_result = filter_chain.invoke({"filter_format": filter_format, "qa_pair": qa_pair_text, "chunk": chunks[i // 3]})
  process_filter_output(filter_result=filter_result, output_path=filter_results_path)

In [None]:
# Read back the verdicts written during the filtering step
verdicts_file = f"{qa_folder_path}/filter_results.json"
with open(verdicts_file, mode="r", encoding="utf-8") as handle:
  filter_results = json.loads(handle.read())

# Every QA pair must have exactly one verdict
assert len(filter_results) == len(qa_pairs)

In [7]:
# QA pairs as a frame, with question_type lifted out of the nested metadata dict
qa_df = (
  pd.DataFrame(qa_pairs)
  .assign(question_type=lambda frame: frame["metadata"].map(lambda meta: meta["question_type"]))
  .drop(columns="metadata")
)

# Verdicts as a frame, attached to the pairs by matching row index
verdict_df = pd.DataFrame(filter_results)
filter_df = pd.merge(qa_df, verdict_df, how="inner", left_index=True, right_index=True)
filter_df.head()

Unnamed: 0,question,answer,question_type,reasoning,violate_criteria,judge
0,Halueval được tạo ra để làm gì?,Halueval là một chuẩn đánh giá quy mô lớn về h...,factual,The question and answer are both in Vietnamese...,[],1
1,Có bao nhiêu nhà nghiên cứu đã tham gia vào cô...,Có sáu nhà nghiên cứu đã tham gia vào công trì...,inferential,The question is clear and in Vietnamese. The a...,[],1
2,Đại học Renmin của Trung Quốc tham gia vào ngh...,Đại học Renmin của Trung Quốc tham gia thông q...,analytical,The question and answer are both in Vietnamese...,[],1
3,Các mô hình ngôn ngữ lớn như ChatGPT có xu hướ...,Các mô hình ngôn ngữ lớn như ChatGPT có xu hướ...,factual,The question and answer are both in Vietnamese...,[],1
4,Nội dung do các mô hình ngôn ngữ lớn tạo ra đư...,Nội dung được coi là 'ảo giác' khi nó mâu thuẫ...,inferential,The question and answer are both in Vietnamese...,[],1


In [8]:
# Select rejected QA pairs (judge == 0)
rejected_df = filter_df[filter_df["judge"] == 0]
print(f"Number of rejected QA pairs: {rejected_df.shape[0]}\n")

# Print some rejected QA pairs.
# sample(n=1) raises ValueError on an empty frame, so only sample when at
# least one pair was rejected; otherwise the loop simply does not run.
preview_df = rejected_df if rejected_df.empty else rejected_df.sample(n=1)
for index, row in preview_df.iterrows():
  print("=====================================================")
  # index // 3 maps the pair's row index to its source chunk (three pairs per chunk)
  print(f"Chunk: {chunks[index // 3]}\n")
  print(f"Question: {row.question}")
  print(f"Answer: {row.answer}")
  print(f"Violated criteria: {', '.join(row.violate_criteria)}")
  print(f"Reasoning: {row.reasoning}\n")

Number of rejected QA pairs: 8

Chunk: **3.2.2** **improvement strategies**

in this part, we design several strategies to improve
the ability of llms to recognize hallucination. the
results are shown in table 8.

**knowledge retrieval.** retrieving relevant knowledge is a widely used strategy to eliminate hallucination (lewis et al., 2020; li et al., 2023a). therefore, we supply chatgpt with the knowledge facts
retrieved from wikipedia (except for that summarization does not need external information besides

Question: Ngoại trừ tóm tắt, những loại nhiệm vụ nào khác có thể hưởng lợi từ thông tin bên ngoài?
Answer: Các nhiệm vụ khác ngoài tóm tắt có thể hưởng lợi từ thông tin bên ngoài.
Violated criteria: answer_quality
Reasoning: The answer is incomplete and does not provide specific tasks that benefit from external information. The question is clear, but the answer is too vague and does not fulfill the prompt.



In [None]:
# Keep only the accepted pairs (judge == 1), drop the judge's columns,
# and write the remaining records to the final dataset file
accepted_mask = filter_df["judge"] == 1
judge_columns = ["reasoning", "violate_criteria", "judge"]
quality_df = filter_df[accepted_mask].drop(columns=judge_columns)
quality_df.to_json(f"{qa_folder_path}/qa_dataset.json", orient="records", force_ascii=False, indent=2)