In [2]:
#Start

In [3]:
import fitz

if not hasattr(fitz.Page, "find_tables"):
    raise RuntimeError("This PyMuPDF version does not support the table feature")

# Functions

In [4]:
DOC_PATH = r".\documents\curated\American-Red-Cross-First-Aid-CPR-AED-Participants-Manual-CURATED.pdf"

In [5]:
# lets see the diffent police sizes in the document
def get_font_sizes(doc_path, page):
    doc = fitz.open(doc_path)
    page = doc.load_page(page)
    blocks = page.get_text("dict")["blocks"]
    font_sizes = set()
    for block in blocks:
        if block["type"] == 0:  # text block
            for line in block["lines"]:
                for span in line["spans"]:
                    font_sizes.add(span["size"])
    return sorted(font_sizes)

    

In [6]:
font_sizes = get_font_sizes(DOC_PATH, 0)
print(font_sizes)
#print exepmle of the text with the different font sizes
def get_text_by_font_size(doc_path, page, font_size):
    doc = fitz.open(doc_path)
    page = doc.load_page(page)
    blocks = page.get_text("dict")["blocks"]
    texts = []
    for block in blocks:
        if block["type"] == 0:  # text block
            for line in block["lines"]:
                for span in line["spans"]:
                    if span["size"] == font_size:
                        texts.append(span["text"])
    return texts
for size in font_sizes:
    texts = get_text_by_font_size(DOC_PATH, 0, size)
    print(f"Font size: {size}, Example text: {texts[:3]}")

[6.49656867980957, 8.662057876586914, 10.0, 17.819089889526367, 29.698484420776367, 46.71791458129883, 75.23616790771484]
Font size: 6.49656867980957, Example text: ['CHAPTER']
Font size: 8.662057876586914, Example text: [' ', ' ', ' 1    ']
Font size: 10.0, Example text: ['edical emergencies can happen every day, in any setting. People are injured in situations like falls or ', 'motor-vehicle accidents, or they develop sudden illnesses, such as heart attack or stroke.', 'The statistics are sobering. For example, about 900,000 people in the United States die each year from some ']
Font size: 17.819089889526367, Example text: ['CHAPTER     ']
Font size: 29.698484420776367, Example text: ['Before Giving Care and Checking ', 'an Injured or Ill Person']
Font size: 46.71791458129883, Example text: ['M']
Font size: 75.23616790771484, Example text: ['1']


In [7]:
CHAPTER_FONT_SIZE = 29

In [8]:
import os
import fitz  # PyMuPDF

def get_chapters(doc_source, chapter_font_size):
    """
    Extract chapters based on font size.
    Each chapter contains page-wise separated content.
    """

    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)

        current_chapter = None

        for page_index, page in enumerate(doc):
            page_number = page_index + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        # ---- CHAPTER DETECTION ----
                        if int(span["size"]) == chapter_font_size:

                            # Save previous chapter
                            if current_chapter is not None:
                                all_chapters.append(current_chapter)

                            # Start new chapter
                            current_chapter = {
                                "doc_name": doc_name,
                                "title": text,
                                "pages": {}
                            }
                            continue

                        # ---- TITLE EXTENSION (same page, no content yet) ----
                        if (
                            current_chapter is not None
                            and not current_chapter["pages"]
                            and text.isupper()
                        ):
                            current_chapter["title"] += " " + text
                            continue

                        # ---- NORMAL CONTENT ----
                        if current_chapter is not None:
                            current_chapter["pages"].setdefault(page_number, "")
                            current_chapter["pages"][page_number] += text + " "

        # Save last chapter
        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    # Clean whitespace
    for chapter in all_chapters:
        for p in chapter["pages"]:
            chapter["pages"][p] = chapter["pages"][p].strip()

    return all_chapters
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)
print(f"Extracted {len(chapters)} chapters.")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")


Extracted 15 chapters.

Chapter 1: Before Giving Care and Checking

Chapter 2: an Injured or Ill Person CHAPTER
  Page 1: 1 M edical emergencies can happen every day, in any setting. People are injured in situations like f...
  Page 2: 2 First Aid/CPR/AED | Participant’s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in maki...
  Page 3: CHAPTER 1 | Before Giving Care and Checking an Injured or Ill Person 3 Clutching the chest or throat...
  Page 4: 4 First Aid/CPR/AED | Participant’s Manual Fear of Being Sued Sometimes people worry that they might...
  Page 5: CHAPTER 1 | Before Giving Care and Checking an Injured or Ill Person 5 Getting Permission to Give Ca...
  Page 6: 6 First Aid/CPR/AED | Participant’s Manual FOCUS ON PREPAREDNESS Important Information Keep medical ...
  Page 7: CHAPTER 1 | Before Giving Care and Checking an Injured or Ill Person 7 Avoid handling any of your pe...
  Page 8: 8 First Aid/CPR/AED | Participant’s Manual to 1 gallon of fresh water (1 part bl

In [None]:
import os
import fitz
import re
from collections import Counter

def get_chapters(doc_source, chapter_font_size):

    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)
        current_chapter = None

        for page_num, page in enumerate(doc):
            page_number = page_num + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    # Build the text of the line from spans first
                    line_text_parts = []
                    line_font_sizes = []
                    
                    for span in line["spans"]:
                        span_text = span["text"].strip()
                        if not span_text:
                            continue
                        line_text_parts.append(span_text)
                        
                        # Use the first valid span size for chapter detection logic 
                        # (simplified, usually headers are consistent)
                        if span_text:
                            line_font_sizes.append(span["size"])

                    if not line_text_parts:
                        continue
                        
                    full_line_text = " ".join(line_text_parts)
                    primary_size = max(line_font_sizes) if line_font_sizes else 0
                    
                    # ---- CHAPTER TITLE DETECTION ----
                    # Check if any span in the line hits the chapter font size
                    if any(abs(s - chapter_font_size) < 0.5 for s in line_font_sizes): # Tolerance for float size

                        if current_chapter is not None:
                            merged_text = "".join(
                                current_chapter["pages"].values()
                            ).strip()

                            if merged_text == "":
                                current_chapter["title"] += " " + full_line_text
                            else:
                                all_chapters.append(current_chapter)
                                current_chapter = {
                                    "doc_name": doc_name,
                                    "title": full_line_text,
                                    "pages": {}
                                }
                        else:
                            current_chapter = {
                                "doc_name": doc_name,
                                "title": full_line_text,
                                "pages": {}
                            }
                        continue # Skip adding title to page content

                    # ---- NORMAL CONTENT ----
                    if current_chapter is not None:
                        current_chapter["pages"].setdefault(page_number, "")
                        # Add NEWLINE after every visual line to preserve structure for cleaning
                        current_chapter["pages"][page_number] += full_line_text + "\n"

        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    return all_chapters

In [None]:
import unicodedata

def normalize(text):
    # Expand ligatures (ﬁ -> fi, ﬂ -> fl)
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()
    # Remove specific artifacts
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text) # Control chars
    text = re.sub(r'[■•]', '', text) 
    
    # Keep generally valid content
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def is_noise(line, chapter_title):
    l = line.lower().strip()
    
    if not l: return True
    if len(l) < 3: return True # too short

    # Page numbers
    if re.match(r'^\d+$', l): return True
    
    # Headers/Footers specific to this manual
    if "participant’s manual" in l or "participants manual" in l: return True
    if "first aid/cpr/aed" in l: return True
    if "american red cross" in l: return True
    
    # Chapter references relative to current chapter
    # Only remove "Chapter X" lines, not sentences containing the word chapter
    if re.match(r'^chapter \d+(\s+\|)?.*', l) and chapter_title.lower() not in l:
        return True

    return False

def clean_chapter(chapter, redundancy_ratio=0.5):
    pages = chapter["pages"]
    cleaned_pages = {}
    
    # Pass 1: Line-based cleaning
    for page, text in pages.items():
        # normalize unicode first to fix ligatures across the whole block
        text = unicodedata.normalize('NFKC', text)
        
        lines = text.split("\n")
        kept_lines = []
        
        for line in lines:
            # Pre-clean weird symbols from line before checking noise
            line = line.replace('ﬁ', 'fi').replace('ﬂ', 'fl')
            line = re.sub(r'[■]', '', line).strip()
            
            if is_noise(line, chapter["title"]):
                continue
                
            kept_lines.append(line)
            
        cleaned_pages[page] = "\n".join(kept_lines)

    # Pass 4: replace chapter name mentioned with ''
    # (Optional: Only do this if it truly interrupts reading flow)
    chapter_title_norm = chapter["title"].lower()
    
    final_pages = {}
    for page, text in cleaned_pages.items():
         # Case insensitive replace of title
         # Escape is important for titles with special chars
         clean_text = re.sub(re.escape(chapter_title_norm), '', text, flags=re.IGNORECASE)
         
         # Final whitespace cleanup
         clean_text = re.sub(r'\s+', ' ', clean_text).strip()
         final_pages[page] = clean_text

    chapter["pages"] = final_pages
    return chapter

In [None]:
# Re-extract and clean with improved logic
print("Extracting chapters with structural preservation...")
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)

print(f"Cleaning {len(chapters)} chapters...")
chapters = [clean_chapter(ch) for ch in chapters]

print(f"After redundancy removal, sample chapter content:")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")

After redundancy removal, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER 1 M edical emergencies can happen every day, in any setting. People are injured in situation...
  Page 2: 2 First Aid/CPR/AED | Participant’s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in maki...
  Page 3: CHAPTER 1 | Before Giving Care and Checking an Injured or Ill Person 3 Clutching the chest or throat...
  Page 4: 4 First Aid/CPR/AED | Participant’s Manual Fear of Being Sued Sometimes people worry that they might...
  Page 5: CHAPTER 1 | Before Giving Care and Checking an Injured or Ill Person 5 Getting Permission to Give Ca...
  Page 6: 6 First Aid/CPR/AED | Participant’s Manual FOCUS ON PREPAREDNESS Important Information Keep medical ...
  Page 7: CHAPTER 1 | Before Giving Care and Checking an Injured or Ill Person 7 Avoid handling any of your pe...
  Page 8: 8 First Aid/CPR/AED | Participant’s Manual to 1 gallon of fresh water (1 

In [35]:
chapters = [clean_chapter(ch) for ch in chapters]

print(f"After cleaning, sample chapter content:")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")

After cleaning, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER 1 M edical emergencies can happen every day, in any setting. People are injured in situation...
  Page 2: 2 First Aid/CPR/AED | Participant’s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in maki...
  Page 3: CHAPTER 1 |  3 Clutching the chest or throat  A person doubled over in pain  Slurred, confused or ...
  Page 4: 4 First Aid/CPR/AED | Participant’s Manual Fear of Being Sued Sometimes people worry that they might...
  Page 5: CHAPTER 1 |  5 Getting Permission to Give Care People have a basic right to decide what can and cann...
  Page 6: 6 First Aid/CPR/AED | Participant’s Manual FOCUS ON PREPAREDNESS Important Information Keep medical ...
  Page 7: CHAPTER 1 |  7 Avoid handling any of your personal items, such as ■ pens or combs, while giving care...
  Page 8: 8 First Aid/CPR/AED | Participant’s Manual to 1 gallon of fresh water (1 part bleac

In [36]:
import json

# --- 1. Save Chapters to JSON ---
output_json_path = "processed_chapters.json"
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(chapters, f, indent=4)
print(f"Saved {len(chapters)} chapters to {output_json_path}")

Saved 11 chapters to processed_chapters.json


In [37]:
import uuid
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List, Dict

# --- 2. Setup ChromaDB & Embedding ---
# Using the same model as your src/rag_engine.py
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
DB_PATH = "./chroma_db_custom_chapters_2"

print("Initializing ChromaDB and Embeddings...")
client = chromadb.PersistentClient(path=DB_PATH)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Custom embedding wrapper for Chroma
class LocalHuggingFaceEmbedding(chromadb.EmbeddingFunction):
    def __init__(self, model):
        self.model = model
    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        return self.model.encode(input, convert_to_tensor=False).tolist()

collection = client.get_or_create_collection(
    name="chapter_knowledge_base",
    embedding_function=LocalHuggingFaceEmbedding(embedding_model)
)

# --- 3. Chunking & Ingestion ---
def split_text_recursive(text: str, chunk_size=1000, overlap=200):
    """Simple splitting function to chunk large chapter texts."""
    chunks = []
    if not text: return chunks
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += (chunk_size - overlap)
    return chunks

print("Chunking and Indexing...")
ids = []
documents = []
metadatas = []

for chapter in chapters:
    # Merge pages back into one text for chunking context, or chunk per page.
    # Here we merge to ensure continuity across pages within a chapter.
    full_text = "\n".join(chapter["pages"].values())
    
    text_chunks = split_text_recursive(full_text)
    
    for i, chunk in enumerate(text_chunks):
        chunk_id = str(uuid.uuid4())
        ids.append(chunk_id)
        documents.append(chunk)
        metadatas.append({
            "doc_name": chapter["doc_name"],
            "title": chapter["title"],  # Vital for weighting
            "chunk_index": i,
            "source": "custom_chunking"
        })

# Add to Chroma in batches to be safe
BATCH_SIZE = 100
for i in range(0, len(ids), BATCH_SIZE):
    collection.upsert(
        ids=ids[i:i+BATCH_SIZE],
        documents=documents[i:i+BATCH_SIZE],
        metadatas=metadatas[i:i+BATCH_SIZE]
    )
print(f"Indexed {len(ids)} chunks into ChromaDB.")

Initializing ChromaDB and Embeddings...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 225.81it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Chunking and Indexing...
Indexed 526 chunks into ChromaDB.


In [38]:
# --- 4. Weighted Retrieval Function ---
def weighted_retrieval(query: str, n_results=5, title_weight=0.5):
    """
    Retrieves documents based on semantic similarity + title weight.
    
    Args:
        query: The search query.
        n_results: Number of final results to return.
        title_weight: 0.0 to 1.0. How much boosting exact words in title gives.
    """
    # 1. Get more candidates than needed (to allow re-ranking)
    results = collection.query(
        query_texts=[query],
        n_results=n_results * 3,
        include=["documents", "metadatas", "distances"]
    )
    
    scored_results = []
    
    # Process the first batch of results
    if results['ids']:
        for i in range(len(results['ids'][0])):
            doc_id = results['ids'][0][i]
            content = results['documents'][0][i]
            metadata = results['metadatas'][0][i]
            distance = results['distances'][0][i]
            
            # Convert Chroma distance (L2) to Similarity (approximate for ranking)
            # Lower distance = Higher similarity. 
            # We can invert distance or just treat it as a base score.
            semantic_score = 1.0 / (1.0 + distance)
            
            # Calculate Title Match Score
            title = metadata.get('title', '').lower()
            query_words = set(query.lower().split())
            title_words = set(title.split())
            
            # Jaccard-like or Overlap score for title
            overlap = len(query_words.intersection(title_words))
            title_score = 0
            if query_words:
                title_score = overlap / len(query_words) # How much of query is found in title
                
            # Final Weighted Score
            # semantic_score is usually dominant, we boost it if title matches
            final_score = semantic_score + (title_score * title_weight)
            
            scored_results.append({
                "id": doc_id,
                "content": content,
                "metadata": metadata,
                "score": final_score,
                "semantic_score": semantic_score,
                "title_score": title_score
            })
    
    # Sort by descending final score
    scored_results.sort(key=lambda x: x["score"], reverse=True)
    
    return scored_results[:n_results]

In [26]:
# --- Test ---
test_query = "cpr procedure" 
print(f"\n--- Testing Retrieval for: '{test_query}' ---")
hits = weighted_retrieval(test_query, title_weight=0.8)

for hit in hits:
    print(f"\n[Score: {hit['score']:.4f} | Title Match: {hit['title_score']:.2f}]")
    print(f"Title: {hit['metadata']['title']}")
    print(f"Content: {hit['content'][:150]}...")


--- Testing Retrieval for: 'cpr procedure' ---

[Score: 0.9496 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: cal care and transport the person to a hospital. For each minute that CPR and deﬁ brillation are delayed, the chance for survival is reduced by about ...

[Score: 0.9414 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: g the infant’s mouth and nose with your mouth (Fig. 2-13, B). Each rescue breath should last about 1 second and make the chest clearly rise. Continue ...

[Score: 0.9375 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: lth care. ADVANCE DIRECTIVES ( Continued )

CHAPTER 2 |  39 Continue cycles of 30 chest compressions and 2 rescue breaths. Do not stop CPR except in o...

[Score: 0.9166 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: KILL SHEET AFTER CHECKING THE SCENE AND THE INJURED OR ILL CHILD: GIVE 30 CHEST COMPRESSIONS Push hard, push fast in the center of the chest about 2 i...

[Score: 0.9

In [39]:
# --- Test ---
test_query = "cpr procedure" 
print(f"\n--- Testing Retrieval for: '{test_query}' ---")
hits = weighted_retrieval(test_query, title_weight=0.8)

for hit in hits:
    print(f"\n[Score: {hit['score']:.4f} | Title Match: {hit['title_score']:.2f}]")
    print(f"Title: {hit['metadata']['title']}")
    print(f"Content: {hit['content'][:150]}...")


--- Testing Retrieval for: 'cpr procedure' ---

[Score: 1.0024 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: y long enough to allow the heart to spontaneously develop an effective rhythm on its own. Without early CPR and early deﬁ brillation, the chances of s...

[Score: 0.9696 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: and lift the chin up.  Pinch the nose shut then make a complete seal  over the person’s mouth. Blow in for about 1 second to make the chest  clearl...

[Score: 0.9646 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: in the durable power of attorney for health care. ADVANCE DIRECTIVES ( Continued )
38 First Aid/CPR/AED | Participant’s Manual CPR for a Child If duri...

[Score: 0.9625 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content:  take over. ■ You are too exhausted to continue. ■ The scene becomes unsafe. If at any time you notice the infant begin to breathe, stop CPR, keep the...

[Score: 0.9