In [4]:
import os
import json
import hashlib
from datetime import datetime
from collections import defaultdict

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter

PDF_PATH = "NU.pdf"
OUT_PATH = "chunks_output.jsonl"

# --- Helpers ---------------------------------------------------------------

def sha1_of_file(path, buf_size=1024 * 1024):
    h = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            b = f.read(buf_size)
            if not b:
                break
            h.update(b)
    return h.hexdigest()

def make_chunk_id(source_sha1: str, page: int, global_idx: int, page_idx: int) -> str:
    # Deterministic, human-readable-ish ID
    core = f"{source_sha1[:12]}:p{page}:g{global_idx}:k{page_idx}"
    return hashlib.sha1(core.encode("utf-8")).hexdigest()

# --- Extract per-page text -------------------------------------------------

with pdfplumber.open(PDF_PATH) as pdf:
    total_pages = len(pdf.pages)
    page_texts = []
    for i, page in enumerate(pdf.pages, start=1):
        txt = page.extract_text() or ""
        if txt.strip():
            page_texts.append((i, txt))

# --- Splitter (per page to keep page provenance) ---------------------------

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Build a global list of (page, chunk_text)
chunks_with_pages = []
page_local_counts = defaultdict(int)

for page_num, page_text in page_texts:
    # Split this page's text; chunks won't cross pages
    page_chunks = splitter.split_text(page_text)
    for ch in page_chunks:
        page_local_counts[page_num] += 1
        chunks_with_pages.append((page_num, page_local_counts[page_num], ch))

# --- File-level provenance -------------------------------------------------

source_name = os.path.basename(PDF_PATH)
source_size = os.path.getsize(PDF_PATH)
source_sha1 = sha1_of_file(PDF_PATH)
created_at = datetime.utcnow().isoformat(timespec="seconds") + "Z"

# --- Write JSONL with rich metadata ---------------------------------------

with open(OUT_PATH, "w", encoding="utf-8") as f:
    for global_idx, (page_num, idx_in_page, chunk_text) in enumerate(chunks_with_pages):
        meta = {
            # Identifiers
            "id": make_chunk_id(source_sha1, page_num, global_idx, idx_in_page),
            "chunk_number": global_idx + 1,              # 1-based global index
            "chunk_index_global": global_idx,            # 0-based
            "chunk_index_in_page": idx_in_page,          # 1-based within page

            # Content
            "text": chunk_text,
            "chunk_char_count": len(chunk_text),

            # Provenance (file-level)
            "source_file": source_name,
            "source_path": os.path.abspath(PDF_PATH),
            "source_type": "pdf",
            "source_size_bytes": source_size,
            "source_sha1": source_sha1,

            # Provenance (page-level)
            "page_start": page_num,
            "page_end": page_num,
            "page_total": total_pages,

            # Process info
            "created_at": created_at,
            "splitter": {
                "type": "RecursiveCharacterTextSplitter",
                "chunk_size": 1000,
                "chunk_overlap": 200
            },
        }
        f.write(json.dumps(meta, ensure_ascii=False) + "\n")

print(f"Wrote {len(chunks_with_pages)} chunks with rich metadata to {OUT_PATH}")

Wrote 418 chunks with rich metadata to chunks_output.jsonl


  created_at = datetime.utcnow().isoformat(timespec="seconds") + "Z"


Embedding and Index Store

In [3]:
import json
from pathlib import Path

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Import Document (handles both old/new LangChain versions)
try:
    from langchain.schema import Document
except Exception:
    from langchain_core.documents import Document

# --- Read chunks *with* metadata from JSONL and build Documents ---
def read_jsonl_as_documents(file_path: str):
    docs = []
    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            obj = json.loads(line)
            text = obj.pop("text", "")  # remove text from metadata dict
            # Ensure a chunk_number exists even if not provided
            obj.setdefault("chunk_number", i)
            docs.append(Document(page_content=text, metadata=obj))
    return docs

# Paths
chunks_path = "chunks_output.jsonl"
index_dir = "vector_databases.index"

# Load docs
docs = read_jsonl_as_documents(chunks_path)

# Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build FAISS *with metadata* (docstore will keep it)
vectorstore = FAISS.from_documents(docs, embedding_model)

# Save FAISS index + docstore (includes metadata)
Path(index_dir).mkdir(parents=True, exist_ok=True)
vectorstore.save_local(index_dir)

print(f"FAISS index saved to '{index_dir}' with {len(docs)} documents (metadata included).")

# --- Optional: write a human-readable sidecar of metadata for quick inspection ---
sidecar = "chunks_metadata_preview.jsonl"
with open(sidecar, "w", encoding="utf-8") as out:
    for d in docs:
        out.write(json.dumps(
            {
                # Short preview to avoid huge files
                "text_preview": d.page_content[:160],
                **d.metadata
            },
            ensure_ascii=False
        ) + "\n")
print(f"Metadata preview written to '{sidecar}'.")

# --- Example: how to load and retrieve later (metadata comes back in results) ---
# from langchain.vectorstores import FAISS
# vectorstore = FAISS.load_local(index_dir, embedding_model, allow_dangerous_deserialization=True)
# hits = vectorstore.similarity_search("message from the president", k=3)
# for h in hits:
#     print(h.metadata, h.page_content[:120])


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


FAISS index saved to 'vector_databases.index' with 391 documents (metadata included).
Metadata preview written to 'chunks_metadata_preview.jsonl'.


In [None]:
Retrieval 

In [21]:
# Load the FAISS index from disk
vectorstore = FAISS.load_local("vector_databases.index", embedding_model, allow_dangerous_deserialization=True)

# Query the index
query = "message from the president and what is it about"
results = vectorstore.similarity_search(query, k=5)  # Get top 5 most similar chunks

# Display the results
for idx, doc in enumerate(results, start=1):
    print(f"[{idx}] {doc.page_content}\n")

[1] Table of Contents
Message from the President ................................................................................................................. v
Privacy Statement ................................................................................................................................. vi
History ................................................................................................................................................ viii
National University Hymn ......................................................................................................................x
School Logo, Colors and Motto ............................................................................................................. xi
Vision, Mission and Dynamic Filipinism ............................................................................................... xii

[2] the University.
Aside from the norms in this handbook, bulletin board postings, electronic ann

In [None]:
Query

In [22]:
import ollama

# Define the model name
model = "mistral:instruct"

# Prepare the retrieved content for the Mistral model prompt
retrieved_text = "\n\n".join([doc.page_content for doc in results])

# Formulate the prompt including the retrieved context
prompt = f"Here are some documents related to your query:\n\n{retrieved_text}\n\nBased on the information above, answer the following question: {query}"

# Send the prompt to the Mistral model
response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])

# Print the response content
print(response['message']['content'])

 The message from the President in the provided documents is a welcome message for the 2022 Student Handbook. In this message, President RENATO CARLOS H. ERMITA, JR. welcomes the students to the National University and provides information about the handbook, its purpose, and when it will take effect. The handbook serves as a guide for students on the rules, policies, and procedures of the university.
