## I. Build “chunk dataset” (PDF → chunks.jsonl)

In [8]:
# !pip -q install pymupdf pandas tqdm tiktoken

from pathlib import Path
import pandas as pd
from tqdm import tqdm
import fitz # fitz is the PyMuPDF library for PDF processing
import json
import re

PROJECT_ROOT = Path.cwd().parent

PDF_DIR = Path("../data/raw/papers")
META_CSV = Path("../data/metadata/papers.csv")

# create the output directory if not exists
OUT_DIR = Path("../data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(META_CSV)
df.head(2)

Unnamed: 0,paper_id,title,year,first_author,authors,published,updated,categories,summary,pdf_url,arxiv_url,file_name,file_path
0,2602.16704v1,Reinforced Fast Weights with Next-Sequence Pre...,2026,Hee Seung Hwang,"Hee Seung Hwang, Xindi Wu, Sanghyuk Chun, Olga...",2026-02-18T18:53:18+00:00,2026-02-18T18:53:18+00:00,cs.CL,Fast weight architectures offer a promising al...,https://arxiv.org/pdf/2602.16704v1,http://arxiv.org/abs/2602.16704v1,2026_hee-seung-hwang_reinforced-fast-weights-w...,data\raw\papers\2026_hee-seung-hwang_reinforce...
1,2602.16671v1,SPARC: Scenario Planning and Reasoning for Aut...,2026,Jaid Monwar Chowdhury,"Jaid Monwar Chowdhury, Chi-An Fu, Reyhaneh Jab...",2026-02-18T18:09:03+00:00,2026-02-18T18:09:03+00:00,"cs.SE, cs.AI",Automated unit test generation for C remains a...,https://arxiv.org/pdf/2602.16671v1,http://arxiv.org/abs/2602.16671v1,2026_jaid-monwar-chowdhury_sparc-scenario-plan...,data\raw\papers\2026_jaid-monwar-chowdhury_spa...


In [None]:
# This code block defines funcitons to clean text and extract text from
# PDF files

# Function takes a string and performs several cleaning operations
def clean_text(s: str) -> str:
    # replace null character by space
    s = s.replace("\x00", " ")

    # delete whitespace before \n, e.g. "word \n" -> "word\n"
    s = re.sub(r"\s+\n", "\n", s)

    # if there are 3+ empty lines, replace them with 2 empty lines
    s = re.sub(r"\n{3,}", "\n\n", s)

    # replace long whitespace with a single whitespace
    s = re.sub(r"[ \t]{2,}", " ", s)

    # remove beginning and ending whitespaces
    return s.strip()

# Function to read a PDF and return the cleaned text of each page
def pdf_pages_text(pdf_path: Path):
    doc = fitz.open(pdf_path) # read the pdf file 
    for i, page in enumerate(doc):

        # retrieve plain text from the page
        text = page.get_text("text")

        # return page number starting from 1 and clean the text
        yield (i+1, clean_text(text)) # yeild is a generator
    doc.close()

import string

import re

REF_HDR = re.compile(r"^\s*(references|bibliography)\s*$", re.IGNORECASE | re.MULTILINE)

# Function to check if a page is low signal based on heuristics
def looks_like_prompt_block(text: str) -> bool:
    t = text.lower()
    patterns = [
        "site:github.com", "output format", "query constraints",
        "follow initial rules", "================", "strict",
        "do not", "must:"
    ]
    hits = sum(p in t for p in patterns)
    return hits >= 2

# Function to check if a page is a valid chunk of text based on heuristics
def is_valid_chunk(text: str, min_chars=200):
    if len(text) < min_chars:
        return False

    letters = sum(c.isalpha() for c in text)
    digits  = sum(c.isdigit() for c in text)
    if letters / max(len(text), 1) < 0.45:
        return False
    if digits / max(len(text), 1) > 0.35:
        return False
    if looks_like_prompt_block(text):
        return False

    return True

# Function to check if a page looks like a prompt based on heuristics
def looks_like_prompt(text: str) -> bool:
    t = text.lower()
    bad = ["site:github.com", "output format", "query constraints", "follow initial rules", "do not", "must:"]
    hits = sum(b in t for b in bad)
    return hits >= 2

# Function to check if a page is low signal, which means it likely
# contains acknowledgements or impact statements rather than main content
def is_low_signal_page(page_text: str) -> bool:
    head = page_text[:1200].lower()
    return ("impact statement" in head) or ("acknowledgements" in head)

import re

REF_HEADER = re.compile(r"^\s*(references|bibliography)\s*$", re.IGNORECASE | re.MULTILINE)

# Function to check if a page is a references page based on heuristics

def is_references_page(page_text: str) -> bool:
    # check early part of page for header
    head = page_text[:1200].lower()
    return "references" in head.splitlines()[:20] or bool(REF_HEADER.search(page_text[:1200]))

In [None]:
import tiktoken # a tokenizer library from OpenAI

# tokenizer for gpt-4, gpt-4o
enc = tiktoken.get_encoding("cl100k_base")

# Function to chunk text into pieces of a certain token size 
# with some overlap
def chunk_by_tokens(text: str, chunk_size=450, overlap=80):
    
    # encode the text into tokens
    tokens = enc.encode(
        text,
        disallowed_special=()  # allow all special tokens as normal text
    )
    chunks = []
    start = 0
    
    # loop to create chunks of tokens with specified size and overlap
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = enc.decode(chunk_tokens).strip()
        if chunk_text:
            chunks.append(chunk_text)
        if end == len(tokens):
            break
        start = max(0, end - overlap)
    return chunks

In [5]:
# Define where the output JSONL file will be saved
OUT_JSONL = OUT_DIR / "chunks.jsonl"
# JSONL (JSON Lines) is chosen instead of JSON because:
# - with JSONL, it's not required to load all data into RAM
# - easier to add a new chunk, while with JSON, you need to 
# load the whole file, add the chunk, then write back the whole file
# - JSONL is more robust to errors, e.g. if the process is interrupted 
# in the middle of writing, the already written lines are still valid JSON
# objects, while with JSON, the whole file would be corrupted
# JSON is better for hierarchical data, while JSONL is better for flat data 
# like our chunks.

written = 0
with OUT_JSONL.open("w", encoding="utf-8") as f:

    # Loop through each papers in metada/papers.csv (120 in total)
    for _, row in tqdm(df.iterrows(), total=len(df)):

        # For each paper, read the corresponding PDF file
        pdf_path = PDF_DIR / row["file_name"]
        if not pdf_path.exists():
            continue
        
        # Extract metadata
        paper_id = row["paper_id"]
        title = row.get("title", "")
        year = int(row.get("year", 0))
        source_file = pdf_path.name

        # For each page in the PDF
        for page_num, page_text in pdf_pages_text(pdf_path):
            if is_low_signal_page(page_text):
                continue
            if is_references_page(page_text):
                continue

            if len(page_text) < 200:  # skip tiny pages
                continue

            # Split the page text into chunks of ~450 tokens
            # Each chunk will repeat the last 80 tokens of the previous
            # chunk, to help maintain context across chunks
            # page_text is the cleaned text of the page, which is a string,
            # this means overlap only applies to the text within a page, not across pages
            page_chunks = chunk_by_tokens(page_text, chunk_size=450, overlap=80)

            # For each chunk
            for j, ch in enumerate(page_chunks):
                if not is_valid_chunk(ch):
                    continue

                # Create a record, which is a dictionary containing the chunk 
                # text and its metadata
                chunk_id = f"{paper_id}_p{page_num:02d}_c{j:03d}"

                if looks_like_prompt(ch):
                    continue

                rec = {
                    "chunk_id": chunk_id,
                    "paper_id": paper_id,
                    "title": title,
                    "year": year,
                    "page": page_num,
                    "text": ch,
                    "source_file": source_file,
                }

                # Write the record as a JSON line in the output file
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
                written += 1

print("Wrote chunks:", written)
print("Saved:", OUT_JSONL)

100%|██████████| 120/120 [00:19<00:00,  6.11it/s]

Wrote chunks: 6319
Saved: ..\data\processed\chunks.jsonl





In [6]:
# show 2 random chunks
import random, itertools
lines = OUT_JSONL.read_text(encoding="utf-8").splitlines()
for line in random.sample(lines, 2):
    obj = json.loads(line)
    print(obj["chunk_id"], obj["page"])
    print(obj["text"][:400], "\n---\n")

2602.15909v1_p14_c001 14
amil W´ojcicki, and Benjamin Shannon. The importance of phase in speech
enhancement. speech communication, 53(4):465–494, 2011.
Daniel S Park, William Chan, Yu Zhang, Chung-Cheng Chiu, Barret Zoph, Ekin D Cubuk, and
Quoc V Le. Specaugment: A simple data augmentation method for automatic speech recognition.
arXiv preprint arXiv:1904.08779, 2019.
William Peebles and Saining Xie. Scalable diffusion m 
---

2602.16660v1_p04_c000 4
Published as a conference paper at ICLR 2026
representations (c.f., Section 3.1). The overall framework, including the linear extractor used to ob-
tain representations from hidden states, as well as the joint optimization objective, is subsequently
elucidated in Section 3.2.
3.1
REGULATING MULTILINGUAL CONSISTENCY WITH SINGULAR ANALYSIS
Representation consistency in queries. Recent studies have d 
---



In [7]:
import json
from pathlib import Path

CHUNKS = Path("../data/processed/chunks.jsonl")

def noise_score(text: str):
    n = max(len(text), 1)
    letters = sum(c.isalpha() for c in text) / n
    digits  = sum(c.isdigit() for c in text) / n
    return letters, digits

total = 0
digit_heavy = 0
letter_light = 0

with CHUNKS.open("r", encoding="utf-8") as f:
    for line in f:
        total += 1
        obj = json.loads(line)
        letters, digits = noise_score(obj["text"])
        if digits > 0.30:      # nhiều số
            digit_heavy += 1
        if letters < 0.50:     # ít chữ
            letter_light += 1

print("Total chunks:", total)
print("Digit-heavy (>30% digits):", digit_heavy, f"({digit_heavy/total:.2%})")
print("Letter-light (<50% letters):", letter_light, f"({letter_light/total:.2%})")

Total chunks: 6319
Digit-heavy (>30% digits): 3 (0.05%)
Letter-light (<50% letters): 143 (2.26%)


In [8]:
import json
import numpy as np
from pathlib import Path

CHUNKS = Path("../data/processed/chunks.jsonl")
lens = []

with CHUNKS.open("r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        lens.append(len(obj["text"]))

arr = np.array(lens)
print("chunks:", len(arr))
print("min/median/p90/max:", arr.min(), int(np.median(arr)), int(np.percentile(arr, 90)), arr.max())
print("too short (<200 chars):", (arr < 200).sum(), f"({(arr<200).mean():.2%})")

chunks: 6319
min/median/p90/max: 202 1488 2096 3035
too short (<200 chars): 0 (0.00%)


## II. Build Vector Index (Chroma) + test retrieval

In [9]:
# !pip -q install chromadb sentence-transformers

In [5]:
import json
from pathlib import Path
import random

CHUNKS_PATH = Path("../data/processed/chunks.jsonl")

# Function to load a JSONL file and return a list of JSON objects
def load_jsonl(path, limit=None):

    # Define an empty list to store the JSON objects
    items = []

    # Open the JSONL file and read it line by line
    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):

            # Parse the line as a JSON object and append it
            items.append(json.loads(line))

            # If a limit is specified, stop after reading that many lines
            if limit and (i+1) >= limit:
                break
    return items

chunks = load_jsonl(CHUNKS_PATH, limit=20000)  # test 20k chunks
len(chunks), chunks[0].keys()

(6319,
 dict_keys(['chunk_id', 'paper_id', 'title', 'year', 'page', 'text', 'source_file']))

In [6]:
from sentence_transformers import SentenceTransformer

# Use the BGE model from BAAI, which is a strong open-source embedding model 
# that converts text into vector embeddings. These embeddings can be used for
# tasks like semantic search, etc. We will use this model later to convert
# our text chunks into embeddings
embed_model_name = "BAAI/bge-base-en-v1.5"
model = SentenceTransformer(embed_model_name)

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 886.68it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [9]:
import chromadb
from chromadb.config import Settings

DATA_DIR = PROJECT_ROOT / "data"
CHROMA_DIR = DATA_DIR / "chroma_db"
client = chromadb.PersistentClient(path=str(CHROMA_DIR), settings=Settings(anonymized_telemetry=False))

collection = client.get_or_create_collection(
    name="hallucination_faithfulness_chunks",
    metadata={"embedding_model": embed_model_name}
)

In [None]:
from tqdm import tqdm

# Function that takes an iterable and yields it in batches of size n
def batch(iterable, n=256):
    for i in range(0, len(iterable), n):
        yield iterable[i:i+n]

# Prepare the data for upserting into ChromaDB. We need three lists:
# - docs: the text of each chunk
# - ids: a unique ID for each chunk, which we will use the "chunk_id"
# - metas: a dictionary of metadata for each chunk
docs = [c["text"] for c in chunks]
ids = [c["chunk_id"] for c in chunks]
metas = [
    {
        "paper_id": c["paper_id"],
        "title": c.get("title", ""),
        "year": int(c.get("year", 0)),
        "page": int(c.get("page", 0)),
        "source_file": c.get("source_file", "")
    }
    for c in chunks
]

# Upsert in batches
for b_docs, b_ids, b_metas in tqdm(list(zip(batch(docs), batch(ids), batch(metas))), total=(len(docs)+255)//256):
    emb = model.encode(b_docs, normalize_embeddings=True).tolist()
    collection.upsert(
        ids=b_ids,
        documents=b_docs,
        metadatas=b_metas,
        embeddings=emb
    )

print("Inserted:", collection.count())

  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [41:01<00:00, 98.46s/it] 

Inserted: 6319





In [None]:
# Function to perform a search query on the ChromaDB collection.
def search(query, k=5, where=None):
    
    # Encode the query into an embedding
    q_emb = model.encode([query], normalize_embeddings=True).tolist()
    
    # Query the collection for the top k most similar chunks to the 
    # query embedding,
    res = collection.query(
        query_embeddings=q_emb,
        n_results=k,
        where=where
    )

    # Loop through the search results and print the metadata and 
    # text of each chunk
    for i in range(min(k, len(res["ids"][0]))):
        meta = res["metadatas"][0][i]
        doc  = res["documents"][0][i]
        dist = res["distances"][0][i]

        print("\n" + "="*90)
        print(f"#{i+1} | dist={dist:.4f} | paper_id={meta.get('paper_id')} | year={meta.get('year')} | page={meta.get('page')}")
        title = meta.get("title", "")
        if title:
            print("TITLE:", title[:140])
        print("FILE:", meta.get("source_file", ""))
        print("-"*90)
        print(doc[:700].strip())

In [15]:
queries = [
    "What is hallucination in large language models?",
    "How do papers define faithfulness or groundedness?",
    "What metrics are used to measure factuality or faithfulness?",
    "How does retrieval (RAG) help reduce hallucination?",
    "What are common causes of hallucination?"
]

for q in queries:
    print("\n\n" + "#"*30)
    print("QUERY:", q)
    search(q, k=5)



##############################
QUERY: What is hallucination in large language models?

#1 | dist=0.3085 | paper_id=2602.14259v1 | year=2026 | page=7
TITLE: Detecting LLM Hallucinations via Embedding Cluster Geometry: A Three-Type Taxonomy with Measurable Signatures
FILE: 2026_matic-korun_detecting-llm-hallucinations-via-embedding-cluster-geometry-_2602.142591.pdf
------------------------------------------------------------------------------------------
prerequisites, but hallucination detection in prac-
tice operates on contextual hidden states, which
may exhibit different geometry. Second, the cross-
model survey covers 11 models but no models
above 1.5B parameters; extending to larger mod-
els requires GPU resources. Third, the detection
architecture is proposed but not yet benchmarked
against established datasets such as HaluEval (Li
et al., 2023). Fourth, the α statistic is limited by
the number of clusters containing ≥2 co-clustered
antonym pairs from our curated set; sample siz

In [16]:
keywords = [
    "we define hallucination",
    "hallucination refers to",
    "hallucination occurs when",
    "hallucination can be understood",
    "hallucination denotes",
]

with open("../data/processed/chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        text = item["text"].lower()
        if any(k in text for k in keywords):
            print(item["paper_id"], item["page"])
            print(item["text"])
            print("----")

2602.14419v1 3
Thus, the entire model resides in
Lp(Ω, F, P)
(6)
The norm of this Lp space corresponds to the Lebesgue integral representing the expectation of the
probability distribution
∥f∥p =
Z
Ω
|f(ω)|p dP(ω)
1/p
(7)
This expression indicates that the model optimizes not pointwise truth values, but plausible average
optimality over distributions.
2.3
Norms and the Minkowski Inequality
In Lp spaces, the Minkowski inequality
∥f + g∥p ≤∥f∥p + ∥g∥p
(8)
holds. This implies that different semantic components, hypotheses, and contexts can be linearly combined
in the embedding space, while the norm remains stable (i.e., no gradient explosion).
This property allows semantic superposition, which is a source of the flexibility of generative AI.
On the other hand, the combination of this linearity and incompleteness inherently contains a structure
that can generate non-existent propositions (erroneous outputs).
3
Mathematical Definition of Hallucination
3.1
Definition
Definition 1 (Hallucin

The query "what is hallucination..." doesn't return the definition of hallucination. However, by scanning chunks.jsonl and match lines with keywords, 1 formal definition of hallucination is found. The solutions include:
- Find 5-10 papers about hallucination, benchmark papers, or foundational papers
- Create index for chunks. For example, chunks that contain terms like "we define", "refers to", "definition" can be saved to "definition_index".
- MMR(Diversity Control) to avoid top-k from 1 paper.

In [17]:
print("Filter: year >= 2024")
search("faithfulness metric", k=5, where={"year": {"$gte": 2024}})

Filter: year >= 2024

#1 | dist=0.5143 | paper_id=2602.16154v1 | year=2026 | page=6
TITLE: Balancing Faithfulness and Performance in Reasoning via Multi-Listener Soft Execution
FILE: 2026_nithin-sivakumaran_balancing-faithfulness-and-performance-in-reasoning-via-mult_2602.161541.pdf
------------------------------------------------------------------------------------------
Balancing Faithfulness and Performance in Reasoning via Multi-Listener Soft Execution
Table 2. Qwen3-14B AOC metrics for measuring faithfulness using truncated CoT and after adding mistakes to the CoT, following
Lanham et al. (2023).
Method
Truncated CoT Answering
Adding Mistake
BBEH
ZLB
MuSR
FOLIO
BBEH
ZLB
MuSR
FOLIO
Original
0.580
0.520
0.332
0.284
0.621
0.793
0.708
0.667
MAT-Steer
0.644
0.527
0.336
0.320
0.649
0.808
0.722
0.674
Faithfulness Only
0.665
0.587
0.330
0.350
0.672
0.838
0.731
0.714
Correctness Only
0.518
0.474
0.317
0.247
0.598
0.778
0.694
0.648
Balanced Rewards
0.574
0.512
0.328
0.299
0.614
0.790
0.712


In [None]:
paper_id_example = "2602.16660v1"
print(f"Filter: paper_id == {paper_id_example}")
search("reward gap supervision signal", k=5, where={"paper_id": paper_id_example})

Filter: paper_id == 2602.16660v1

#1 | dist=0.5244 | paper_id=2602.16660v1 | year=2026 | page=18
TITLE: Align Once, Benefit Multilingually: Enforcing Multilingual Consistency for LLM Safety Alignment
FILE: 2026_yuyan-bu_align-once-benefit-multilingually-enforcing-multilingual-con_2602.166601.pdf
------------------------------------------------------------------------------------------
PO leverages the reward gap in a dominant language as a high-quality supervision signal to guide
multilingual safety alignment. Specifically, it minimizes the discrepancy in reward gaps across
different languages to enable effective transfer of alignment signals. We reproduce MPO using the
same PKU-SafeRLHF dataset as in our main experiments 11. The only deviation from the original
setup lies in the data source: instead of using their original dataset, we adopt the prompt, chosen,
and rejected fields from PKU-SafeRLHF, and translate both prompts and responses into the target
languages to construct the tra

MMR just return more papers, it doesn't guarantee that the paper containing the definition of "hallucination" is returned.