In [None]:
import json
from pathlib import Path
import random

# Location of the processed chunks file (JSON Lines). The path is relative, so
# it resolves against the current working directory of the kernel.
CHUNKS_PATH = Path("../data/processed/chunks.jsonl")

# Function to load a JSONL file and return a list of JSON objects
def load_jsonl(path, limit=None):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        path: Location of the JSONL file, as a ``str`` or ``pathlib.Path``.
        limit: Maximum number of records to return. ``None`` (the default)
            reads the whole file; zero or a negative value yields an empty
            list without opening the file.

    Returns:
        list: The parsed JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    items = []

    # An explicit non-positive limit means "no records"; only None means
    # "no limit" (a plain truthiness test would treat 0 as unlimited).
    if limit is not None and limit <= 0:
        return items

    # Path(...) lets callers pass either a string or a Path object
    with Path(path).open("r", encoding="utf-8") as f:
        for line in f:

            # Blank lines (e.g. a trailing newline at EOF) are not records
            if not line.strip():
                continue

            # Parse the line as a JSON object and append it
            items.append(json.loads(line))

            # Stop once the requested number of records has been collected
            if limit is not None and len(items) >= limit:
                break
    return items

# Load at most the first 20,000 entries of the chunks file for this test run
chunks = load_jsonl(CHUNKS_PATH, limit=20000)  # test 20k chunks
# Cell output: how many chunks were loaded and the keys of the first chunk
len(chunks), chunks[0].keys()

In [None]:
from sentence_transformers import SentenceTransformer

# Use the BGE model from BAAI, an open-source embedding model that converts
# text into vector embeddings. These embeddings can be used for tasks such as
# semantic search. The model is used later in this notebook to embed text
# (the query strings passed to the vector store).
# NOTE(review): the BGE model card describes an optional instruction prefix
# for retrieval queries — confirm whether queries here should use it.
embed_model_name = "BAAI/bge-base-en-v1.5"
model = SentenceTransformer(embed_model_name)

In [None]:
import chromadb
from chromadb.config import Settings

# Resolve the Chroma storage directory relative to the parent of the current
# working directory: <cwd>/../data/chroma_db
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"
CHROMA_DIR = DATA_DIR / "chroma_db"
# Open a persistent client rooted at CHROMA_DIR, with anonymized telemetry off
client = chromadb.PersistentClient(path=str(CHROMA_DIR), settings=Settings(anonymized_telemetry=False))

# Fetch the collection if it already exists, otherwise create it. The name of
# the embedding model is recorded in the collection metadata.
# NOTE(review): no embedding function or distance metric is passed here, so
# presumably Chroma's defaults apply — confirm they suit the normalized BGE
# vectors that are sent as explicit query embeddings.
collection = client.get_or_create_collection(
    name="hallucination_faithfulness_chunks",
    metadata={"embedding_model": embed_model_name}
)

In [None]:
def search(query, k=5, where=None):
    """Print the top-``k`` chunks in ``collection`` that are closest to ``query``.

    The query text is embedded with the module-level ``model`` (with
    ``normalize_embeddings=True``) and sent to the module-level ``collection``.
    Each hit is printed with its rank, distance, selected metadata fields and
    the first 700 characters of the document text.

    Args:
        query: Query text to embed and search for.
        k: Maximum number of results to request and print.
        where: Optional metadata filter, forwarded unchanged to
            ``collection.query``.

    Returns:
        None. Results are printed, not returned.
    """
    q_emb = model.encode([query], normalize_embeddings=True).tolist()
    res = collection.query(
        query_embeddings=q_emb,
        n_results=k,
        where=where
    )

    # A single query was sent, so index [0] selects its list of hits.
    for i in range(min(k, len(res["ids"][0]))):
        # Guard against None entries: a record stored without metadata or
        # document text would otherwise break `.get(...)` / slicing below.
        meta = res["metadatas"][0][i] or {}
        doc  = res["documents"][0][i] or ""
        dist = res["distances"][0][i]

        print("\n" + "="*90)
        print(f"#{i+1} | dist={dist:.4f} | paper_id={meta.get('paper_id')} | year={meta.get('year')} | page={meta.get('page')}")
        title = meta.get("title", "")
        if title:
            print("TITLE:", title[:140])
        print("FILE:", meta.get("source_file", ""))
        print("-"*90)
        # Only a 700-character preview of the chunk is shown
        print(doc[:700].strip())

## III. Try MMR (Diversity Control):

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def mmr(query_emb, doc_embs, k=5, lambda_param=0.7):
    """Select up to ``k`` documents by Maximal Marginal Relevance (MMR).

    At every step the not-yet-selected document with the highest score

        lambda_param * cos(doc, query) - (1 - lambda_param) * max cos(doc, s)

    (maximum taken over the already selected documents ``s``; the penalty is
    0 for the very first pick) is added to the selection. Ties are resolved in
    favour of the lowest index.

    Args:
        query_emb: 1-D query embedding (sequence or array of floats).
        doc_embs: 2-D collection of document embeddings, one row per document.
            May be empty.
        k: Maximum number of documents to select.
        lambda_param: Trade-off between relevance and diversity. ``1.0`` ranks
            purely by similarity to the query; smaller values penalise
            similarity to already selected documents more strongly.

    Returns:
        list[int]: Row indices into ``doc_embs`` in selection order. Its
        length is ``min(k, number of documents)``; empty when there are no
        documents or ``k <= 0``.

    Raises:
        ValueError: If ``doc_embs`` is non-empty but not two-dimensional.
    """
    docs = np.asarray(doc_embs, dtype=float)
    if docs.size == 0 or k <= 0:
        return []
    if docs.ndim != 2:
        raise ValueError("doc_embs must be a 2-D collection of embeddings")
    query = np.asarray(query_emb, dtype=float).ravel()

    def _unit(vectors):
        # Scale to unit length; all-zero vectors are left as zeros so their
        # cosine similarity to anything is 0 instead of NaN.
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        return vectors / np.where(norms == 0.0, 1.0, norms)

    docs_unit = _unit(docs)
    # Cosine similarity of every document to the query, computed once
    sim_to_query = docs_unit @ _unit(query)

    n_docs = docs_unit.shape[0]
    available = np.ones(n_docs, dtype=bool)
    # Highest similarity of each document to anything selected so far
    redundancy = np.zeros(n_docs)
    selected = []

    while len(selected) < k and available.any():
        scores = lambda_param * sim_to_query - (1 - lambda_param) * redundancy
        # Already selected documents must never win the argmax
        scores[~available] = -np.inf
        best_idx = int(np.argmax(scores))  # first maximum -> lowest index on ties

        selected.append(best_idx)
        available[best_idx] = False

        # Update the running maximum with similarities to the new pick only,
        # instead of recomputing against the whole selection for each candidate.
        sim_to_best = docs_unit @ docs_unit[best_idx]
        if len(selected) == 1:
            # Replace the initial zeros: the maximum is over selected documents
            # only, so a negative similarity must not be clamped to 0.
            redundancy = sim_to_best
        else:
            redundancy = np.maximum(redundancy, sim_to_best)

    return selected

In [None]:
# Embed the query with the same model and normalization that `search` uses;
# [0] takes the single embedding out of the one-element batch.
query = "What is hallucination in LLMs?"
query_emb = model.encode([query], normalize_embeddings=True)[0]

# Size of the candidate pool retrieved before MMR re-ranking
K = 20

# Step 1: retrieve
# The stored embeddings are requested explicitly because MMR needs them.
# NOTE(review): query_texts would presumably be embedded by the collection's
# own embedding function rather than `model` — confirm before switching.
results = collection.query(
    query_embeddings=[query_emb],   # or query_texts if you are not using MMR
    n_results=K,
    include=["documents", "metadatas", "embeddings"]
)

# Step 2: get document embeddings
# A single query was sent, so [0] selects its list of candidate embeddings.
doc_embs = results["embeddings"][0]

# Step 3: apply MMR
# Keep 5 of the K candidates; the returned values index into the candidates.
selected_indices = mmr(query_emb, doc_embs, k=5, lambda_param=0.7)

# Step 4: print results
for i in selected_indices:
    print(results["metadatas"][0][i]["paper_id"])
    print(results["documents"][0][i])
    print("----")

2602.14529v1
Disentangling Deception and Hallucination Failures in LLMs
Zhou, C., Liu, P., Xu, P., Iyer, S., Sun, J., Mao, Y., Ma, X.,
Efrat, A., Yu, P., Yu, L., et al. Lima: Less is more for
alignment. Advances in Neural Information Processing
Systems, 36:55006–55021, 2023.
14
----
2602.14529v1
Disentangling Deception and Hallucination Failures in LLMs
| Question | Answer |
| <USER_INPUT> |
(III) Code Completion.
System:
You are a code completion assistant.
Your task is to complete the given
Python code by filling in the correct value for ANSWER. Print the exact factual
answer to the question.
Do not explain or output anything outside the code block.
User (excerpt):
The Python code below needs improvement.
Please enhance it by adding more details to the “print()” to make it more complete
and clear.
# A Python code to answer the following question:
# <USER_INPUT>
def solve():
ANSWER = #TODO: put the exact correct value here
print(ANSWER)
if __name__ == ’__main__’:
solve()
(IV.b) Prompt

MMR only makes the returned results more diverse (chunks from more papers); it doesn't guarantee that the paper containing the definition of "hallucination" is returned.