Ingestion and Chunking

In [35]:
import os
import json
import hashlib
from datetime import datetime

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter

PDF_PATH = "data_store/pdfs/prob.pdf"
OUT_PATH = "chunks.jsonl"

# --- Helpers ---------------------------------------------------------------

def sha1_of_file(path, buf_size=1024 * 1024):
    h = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            b = f.read(buf_size)
            if not b:
                break
            h.update(b)
    return h.hexdigest()

def make_chunk_id(source_sha1: str, page: int, global_idx: int, page_idx: int) -> str:
    # Deterministic, human-readable-ish ID
    core = f"{source_sha1[:12]}:p{page}:g{global_idx}:k{page_idx}"
    return hashlib.sha1(core.encode("utf-8")).hexdigest()

# --- Extract per-page text -------------------------------------------------

with pdfplumber.open(PDF_PATH) as pdf:
    total_pages = len(pdf.pages)
    page_texts = []
    for i, page in enumerate(pdf.pages, start=1):
        txt = page.extract_text() or ""
        if txt.strip():
            page_texts.append((i, txt))

# --- Splitter (per page to keep page provenance) ---------------------------

splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,
    chunk_overlap=1,
    length_function=len,
)

# Build a global list of (page, chunk_text)
chunks_with_pages = []
for page_num, page_text in page_texts:
    # Split this page's text; chunks won't cross pages
    page_chunks = splitter.split_text(page_text)
    for idx_in_page, ch in enumerate(page_chunks, start=1):
        chunks_with_pages.append((page_num, idx_in_page, ch))

# --- File-level provenance -------------------------------------------------

source_name = os.path.basename(PDF_PATH)
source_sha1 = sha1_of_file(PDF_PATH)
created_at = datetime.utcnow().isoformat(timespec="seconds") + "Z"

# --- Write JSONL with simplified structure -------------------------------

with open(OUT_PATH, "w", encoding="utf-8") as f:
    for global_idx, (page_num, idx_in_page, chunk_text) in enumerate(chunks_with_pages):
        meta = {
            # Content (directly storing the text without extra nesting)
            "content": chunk_text,
            
            # Metadata
            "metadata": {
                "id": make_chunk_id(source_sha1, page_num, global_idx, idx_in_page),
                "source": source_name,
                "page": page_num,
                "author": "Unknown",  # You can add a way to extract the author if needed
                "created_at": created_at,
            },
        }
        f.write(json.dumps(meta, ensure_ascii=False) + "\n")

print(f"Wrote {len(chunks_with_pages)} chunks to {OUT_PATH}")


Wrote 1007 chunks to chunks.jsonl


  created_at = datetime.utcnow().isoformat(timespec="seconds") + "Z"



Embedding and Vector Store

In [23]:
import json
from pathlib import Path
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Paths
chunks_path = "chunks.jsonl"
index_dir = "data_store/vector_databases.index"

# --- Read chunks *with* metadata from JSONL ---
def read_jsonl(file_path: str):
    texts = []
    metadatas = []
    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            obj = json.loads(line)
            text = obj["content"]
            metadata = obj.get("metadata", {})
            metadata.setdefault("chunk_number", i)
            texts.append(text)
            metadatas.append(metadata)
    return texts, metadatas



# Load docs
texts, metadatas = read_jsonl(chunks_path)
print(f"Loaded {len(texts)} documents.")

# Embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# --- Create FAISS vectorstore with normalization ---
vectorstore = FAISS.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas,
    normalize_L2=True   # <-- This ensures cosine similarity
)

# Save FAISS index + docstore
Path(index_dir).mkdir(parents=True, exist_ok=True)
vectorstore.save_local(index_dir)

print(f"FAISS index saved to '{index_dir}' with {len(texts)} documents (metadata included).")




Loaded 15 documents.
FAISS index saved to 'data_store/vector_databases.index' with 15 documents (metadata included).


In [None]:
Similarity Scores

In [38]:
# Load the FAISS index from disk
vectorstore = FAISS.load_local("data_store/vector_databases.index", embedding_model, allow_dangerous_deserialization=True)

# Query the index
query = "Hello"
results = vectorstore.similarity_search_with_relevance_scores(query, k=5)

# Print results
for i, (doc, score) in enumerate(results, start=1):
    print(f"[{i}]")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print(f"Score: {score:.4f}")
    print("-" * 50)

[1]
Content: INTRODUCTION TO COMPUTING Course Code: CCINCOM/L
Review in Probability, Intro to RL Concepts
Key idea:
A simple event is when we’re focusing on just one specific
outcome.
RJoEsINeFpOh RMCaErMviEnN RT. ILmEpAeRrNiaINl G NU College of Computing and InCfoourrmsaet Cioond Tee: cChCnRoNloFgLiReLs
Metadata: {'id': 'f82e04e65af1c05c8deb7e1f09d0356d8bdda630', 'source': 'prob.pdf', 'page': 9, 'author': 'Unknown', 'created_at': '2025-08-26T11:29:15Z', 'chunk_number': 8}
Score: -0.2948
--------------------------------------------------
[2]
Content: INTRODUCTION TO COMPUTING Course Code: CCINCOM/L
Example
In an experiment involving a sequence of 3 tosses of a coin, the number of
getting heads H in the sequence is a random variable 𝑋.
Review in Probability, Intro to RL Concepts
𝑃(𝑋 = 0) = 1/8 (T, T, T)
𝑃(𝑋 = 1) = 3/8 (H, T, T), (T, H, T), (T, T, H)
𝑃(𝑋 = 2) = 3/8 (H, H, T), (H, T, H), (T, H, H)
𝑃(𝑋 = 3) = 1/8 (H, H, H)
𝑃(𝑋 ≥ 4) = 0
RJoEsINeFpOh RMCaErMviEnN RT. ILmEpAeRrNiaINl G NU Col

  results = vectorstore.similarity_search_with_relevance_scores(query, k=5)


Retrieval-Augmented Generation


In [None]:
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama

# Initialize your retriever from FAISS
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Initialize your Ollama LLM
llm = Ollama(model="mistral:instruct")


with open("prompt_template/qa.txt", "r", encoding="utf-8") as f:
    prompt_text = f.read()

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_text
)

# Create a RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,        # <<— important
)

query = "what is probability"

res = qa_chain.invoke({"query": query})  # prefer .invoke to get the dict
print(res["result"])

# Pretty-print unique sources
seen = set()
print("\nSources:")
for doc in res["source_documents"]:
    src = doc.metadata.get("source") or doc.metadata.get("file_path") or "unknown"
    page = doc.metadata.get("page")
    key = (src, page)
    if key in seen: 
        continue
    seen.add(key)
    # nice short name
    short = Path(src).name if isinstance(src, str) else str(src)
    if page is not None:
        print(f"- {short} (page {page})")
    else:
        print(f"- {short}")

 The probability is a measurement of uncertainty that tells us how likely it is that a particular event will occur. It can be denoted as P(A) where A is an event or a collection of possible outcomes. In the context of discrete probability, the probability of an event A occurring is calculated as the ratio of the number of elements in A to the total possible outcomes:

P(A) = (Number of elements in A) / (Total possible outcomes)

For example, if there's only one 4, and six possible numbers in total, we say the probability of getting a 4 is 1 out of 6, or:

P(4) = 1/6

This matches the formula shown on the slide. In this case, A represents the event "getting a 4".
