# Start*

# Loadings

In [33]:
# Calling the font size jason file to use the chapter font size in the custom chunking process.


import json


with open('chapter_font_sizes.json', 'r') as f:
    chapter_font_sizes = json.load(f)

chapter_font_sizes

[{'doc_path': 'American-Red-Cross-First-Aid-CPR-AED-Participants-Manual-CURATED',
  'chapter_font_size': 29},
 {'doc_path': 'general-principles-2021-01-21-final-CURATED',
  'chapter_font_size': 21},
 {'doc_path': 'Guidelines on Core Components of Infection Prevention and Control Programmes-CURATED',
  'chapter_font_size': 13},
 {'doc_path': 'Healthy-WHO-CURATED', 'chapter_font_size': 32},
 {'doc_path': 'Outpatient-Guide-508-CURATED', 'chapter_font_size': 18}]

In [None]:
all_processed_chapters = []

for item in chapter_font_sizes:
    # Construct full path if needed, assuming base dir is 'documents/curated'
    full_path = os.path.join(r".\documents\curated", item['doc_path'] + ".pdf")
    
    print(f"Processing {item['doc_path']} with font size {item['chapter_font_size']}...")
    
    # Extract
    chapters = get_chapters(full_path, item['chapter_font_size'])
    
    # Clean
    cleaned_chapters = [clean_chapter(ch) for ch in chapters]
    
    all_processed_chapters.extend(cleaned_chapters)


In [34]:
import os
import fitz  # PyMuPDF

def get_chapters(doc_source, chapter_font_size):
    """
    Extract chapters based on font size.
    Each chapter contains page-wise separated content.
    """

    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)

        current_chapter = None

        for page_index, page in enumerate(doc):
            page_number = page_index + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        # ---- CHAPTER DETECTION ----
                        if int(span["size"]) == chapter_font_size:

                            # Save previous chapter
                            if current_chapter is not None:
                                all_chapters.append(current_chapter)

                            # Start new chapter
                            current_chapter = {
                                "doc_name": doc_name,
                                "title": text,
                                "pages": {}
                            }
                            continue

                        # ---- TITLE EXTENSION (same page, no content yet) ----
                        if (
                            current_chapter is not None
                            and not current_chapter["pages"]
                            and text.isupper()
                        ):
                            current_chapter["title"] += " " + text
                            continue

                        # ---- NORMAL CONTENT ----
                        if current_chapter is not None:
                            current_chapter["pages"].setdefault(page_number, "")
                            current_chapter["pages"][page_number] += text + " "

        # Save last chapter
        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    # Clean whitespace
    for chapter in all_chapters:
        for p in chapter["pages"]:
            chapter["pages"][p] = chapter["pages"][p].strip()

    return all_chapters
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)
print(f"Extracted {len(chapters)} chapters.")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")


Extracted 3 chapters.

Chapter 1: FUNDAMENTAL ELEMENTS NEEDED TO PREVENT

Chapter 2: TRANSMISSION OF INFECTIOUS AGENTS IN

Chapter 3: OUTPATIENT SETTINGS
  Page 1: Dedicate Resources to Infection Prevention (Administrative Resources) Infection prevention must be m...
  Page 2: 7 Version 2.3 - September 2016 Key administrative recommendations for outpatient settings: 1. Develo...
  Page 3: 8 Version 2.3 - September 2016 At a minimum, outpatient facilities need to adhere to local, state, a...
  Page 4: 9 Version 2.3 - September 2016 because of its activity against a broad spectrum of epidemiologically...
  Page 5: 10 Version 2.3 - September 2016 Key recommendations for use of PPE in outpatient settings: 1. Facili...
  Page 6: 11 Version 2.3 - September 2016 The Campaign is led by the Centers for Disease Control and Preventio...
  Page 7: 12 Version 2.3 - September 2016 Complete guidance for the cleaning and disinfection of environmental...
  Page 8: 13 Version 2.3 - September 2016 Cleani

In [35]:
import os
import fitz
import re
from collections import Counter

def get_chapters(doc_source, chapter_font_size):

    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)
        current_chapter = None

        for page_num, page in enumerate(doc):
            page_number = page_num + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        # ---- CHAPTER TITLE DETECTION (UNCHANGED LOGIC) ----
                        if int(span["size"]) == chapter_font_size:

                            if current_chapter is not None:
                                merged_text = "".join(
                                    current_chapter["pages"].values()
                                ).strip()

                                # ---- YOUR TITLE EXTENSION LOGIC ----
                                if merged_text == "":
                                    current_chapter["title"] += " " + text
                                    continue
                                else:
                                    all_chapters.append(current_chapter)

                            current_chapter = {
                                "doc_name": doc_name,
                                "title": text,
                                "pages": {}
                            }
                            continue

                        # ---- NORMAL CONTENT ----
                        if current_chapter is not None:
                            current_chapter["pages"].setdefault(page_number, "")
                            current_chapter["pages"][page_number] += text + " "

        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    return all_chapters


In [36]:


def normalize(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def is_noise(line, chapter_title):
    l = line.lower()

    if re.match(r'^\d+\s+', l):
        return True

    if "chapter" in l and chapter_title.lower() not in l:
        return True

    if "participant" in l or "first aid/cpr/aed" in l:
        return True

    return False


def clean_chapter(chapter, redundancy_ratio=0.5):
    pages = chapter["pages"]
    total_pages = len(pages)

    line_freq = Counter()
    page_lines = {}

    # ---- Pass 1: clean obvious noise ----
    for page, text in pages.items():
        lines = [l.strip() for l in text.split("\n") if len(l.strip()) > 3]
        kept = []

        for line in lines:
            if is_noise(line, chapter["title"]):
                continue
            norm = normalize(line)
            kept.append((line, norm))
            line_freq[norm] += 1

        page_lines[page] = kept

    # # ---- Pass 2: cross-page redundancy ----
    # redundant = {
    #     l for l, c in line_freq.items()
    #     if c / total_pages >= redundancy_ratio and len(l.split()) < 12
    # }

    # # ---- Pass 3: rebuild pages ----
    # chapter["pages"] = {
    #     page: "\n".join(
    #         line for line, norm in pairs if norm not in redundant
    #     ).strip()
    #     for page, pairs in page_lines.items()
    # }

    # ---- Pass 4: replace chapter name mentioned with '' ----
    chapter_title_norm = normalize(chapter["title"])

    chapter["pages"] = {
        page: re.sub(re.escape(chapter_title_norm), '', text, flags=re.IGNORECASE).strip()
        for page, text in chapter["pages"].items()
    }

    return chapter


In [40]:
import os
import re

# Directory containing abbreviation JSONs
ABREVS_DIR = r".\documents\abrevs"

def expand_abbreviations(text, abrev_map):
    """
    Replaces abbreviations in text with their expanded forms.
    Uses regex word boundaries \b to avoid replacing partial words 
    (e.g., avoiding 'WHO' matching inside 'WHOLE').
    """
    # Sort keys by length (descending) to handle substring cases correctly
    sorted_keys = sorted(abrev_map.keys(), key=len, reverse=True)
    
    # Create a giant regex pattern: \b(WHO|USA|UK)\b
    pattern = r'\b(' + '|'.join(re.escape(k) for k in sorted_keys) + r')\b'
    
    def replace(match):
        return abrev_map[match.group(0)]
    
    return re.sub(pattern, replace, text)

In [41]:
# Visual Validation of Abbreviation Expansion
print("\n--- Testing Abbreviation Expansion ---")

# Let's search for expanded terms in the processed chapters
target_terms = ["World Health Organization", "United States of America", "Intensive care unit"]
found_examples = 0

for chapter in all_processed_chapters:
    for page, content in chapter["pages"].items():
        for term in target_terms:
            if term in content:
                print(f"\n[Found Expanded Term: '{term}']")
                print(f"Doc: {chapter['doc_name']}")
                print(f"Prop: {chapter['title']}")
                # Show context around the term
                start_idx = max(0, content.find(term) - 50)
                end_idx = min(len(content), content.find(term) + len(term) + 50)
                print(f"Context: ...{content[start_idx:end_idx]}...")
                found_examples += 1
                if found_examples >= 3: break # Just show a few examples
        if found_examples >= 3: break
    if found_examples >= 3: break

if found_examples == 0:
    print("No expanded abbreviations found in the sample scan. Check if source docs contained the abbreviation keys (e.g. WHO, USA).")


--- Testing Abbreviation Expansion ---

[Found Expanded Term: 'World Health Organization']
Doc: general-principles-2021-01-21-final-CURATED.pdf
Prop: 6. Key analytical concepts
Context: ...idance for programme managers. Section 4. Geneva: World Health Organization; 2020....

[Found Expanded Term: 'World Health Organization']
Doc: general-principles-2021-01-21-final-CURATED.pdf
Prop: 7. Presentation and communication
Context: ...h information systems. Health Metrics Network and World Health Organization. Geneva: World Health Organization; 2012 ( https:...

[Found Expanded Term: 'World Health Organization']
Doc: Outpatient-Guide-508-CURATED.pdf
Prop: FUNDAMENTAL ELEMENTS NEEDED TO PREVENT TRANSMISSION OF INFECTIOUS AGENTS IN OUTPATIENT SETTINGS
Context: ...thcare settings is recommended by the CDC and the World Health Organization (WHO)...


# All chapters

In [37]:
all_processed_chapters = []

for item in chapter_font_sizes:
    # Construct full path if needed, assuming base dir is 'documents/curated'
    full_path = os.path.join(r".\documents\curated", item['doc_path'] + ".pdf")
    
    print(f"Processing {item['doc_path']} with font size {item['chapter_font_size']}...")
    
    # Extract
    chapters = get_chapters(full_path, item['chapter_font_size'])
    
    # Clean
    cleaned_chapters = [clean_chapter(ch) for ch in chapters]
    
    all_processed_chapters.extend(cleaned_chapters)

Processing American-Red-Cross-First-Aid-CPR-AED-Participants-Manual-CURATED with font size 29...
Processing general-principles-2021-01-21-final-CURATED with font size 21...
Processing Guidelines on Core Components of Infection Prevention and Control Programmes-CURATED with font size 13...
Processing Healthy-WHO-CURATED with font size 32...
Processing Outpatient-Guide-508-CURATED with font size 18...


In [None]:

all_processed_chapters = []

for item in chapter_font_sizes:
    doc_name = item['doc_path']
    full_path = os.path.join(r".\documents\curated", doc_name + ".pdf")
    
    print(f"Processing {doc_name} with font size {item['chapter_font_size']}...")
    
    # 1. Extract
    chapters = get_chapters(full_path, item['chapter_font_size'])
    
    # 2. Clean
    cleaned_chapters = [clean_chapter(ch) for ch in chapters]
    
    # 3. Check for Abbreviation Map
    abrev_file = os.path.join(ABREVS_DIR, doc_name + ".json")
    abrev_map = {}
    
    if os.path.exists(abrev_file):
        print(f"Found abbreviation map for {doc_name}, applying expansion...")
        try:
            with open(abrev_file, 'r', encoding='utf-8') as f:
                abrev_map = json.load(f)
        except Exception as e:
            print(f"Error loading abbreviation file: {e}")
            
    # 4. Apply Abbreviations (if map exists)
    if abrev_map:
        for chapter in cleaned_chapters:
            for page_num, content in chapter["pages"].items():
                chapter["pages"][page_num] = expand_abbreviations(content, abrev_map)
    
    all_processed_chapters.extend(cleaned_chapters)

print(f"Total processed chapters: {len(all_processed_chapters)}")

In [38]:
all_processed_chapters[1:5]

[{'doc_name': 'American-Red-Cross-First-Aid-CPR-AED-Participants-Manual-CURATED.pdf',
  'title': 'Cardiac Emergencies and CPR',
  'pages': {29: 'CHAPTER 2 CHAPTER 2 |  29 C ardiac emergencies are life threatening. Heart attack and cardiac arrest are major causes of illness and death in the United States. Every day in U.S. homes, parks and workplaces someone will have a heart attack or go into cardiac arrest. Recognizing the signals of a heart attack and cardiac arrest, calling 9-1-1 or the local emergency number and giving immediate care in a cardiac emergency saves lives. Performing CPR and using an automated external deﬁ brillator (AED) immediately after a person goes into cardiac arrest can greatly increase his or her chance of survival. In this chapter you will ﬁ nd out what signals to look for if you suspect a person is having a heart attack or has gone into cardiac arrest. This chapter also discusses how to care for a person having a heart attack and how to perform CPR for a pers

In [None]:
# # Re-extract and clean with improved logic
# print("Extracting chapters with structural preservation...")
# chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)

# print(f"Cleaning {len(chapters)} chapters...")
# chapters = [clean_chapter(ch) for ch in chapters]

# print(f"After redundancy removal, sample chapter content:")
# for i, chapter in enumerate(chapters[:3]):
#     print(f"\nChapter {i+1}: {chapter['title']}")
#     for page_num, content in chapter["pages"].items():
#         print(f"  Page {page_num}: {content[:100]}...")

Extracting chapters with structural preservation...
Cleaning 11 chapters...
After redundancy removal, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER 1 M edical emergencies can happen every day, in any setting. People are injured in situation...
  Page 2: 2 First Aid/CPR/AED | Participant’s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in maki...
  Page 3: CHAPTER 1 |  3 Clutching the chest or throat  A person doubled over in pain  Slurred, confused or ...
  Page 4: 4 First Aid/CPR/AED | Participant’s Manual Fear of Being Sued Sometimes people worry that they might...
  Page 5: CHAPTER 1 |  5 Getting Permission to Give Care People have a basic right to decide what can and cann...
  Page 6: 6 First Aid/CPR/AED | Participant’s Manual FOCUS ON PREPAREDNESS Important Information Keep medical ...
  Page 7: CHAPTER 1 |  7 Avoid handling any of your personal items, such as ■ pens or combs, while giving care...
  Page 

In [None]:
# chapters = [clean_chapter(ch) for ch in chapters]

# print(f"After cleaning, sample chapter content:")
# for i, chapter in enumerate(chapters[:3]):
#     print(f"\nChapter {i+1}: {chapter['title']}")
#     for page_num, content in chapter["pages"].items():
#         print(f"  Page {page_num}: {content[:100]}...")

After cleaning, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER 1 M edical emergencies can happen every day, in any setting. People are injured in situation...
  Page 2: 2 First Aid/CPR/AED | Participant’s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in maki...
  Page 3: CHAPTER 1 |  3 Clutching the chest or throat  A person doubled over in pain  Slurred, confused or ...
  Page 4: 4 First Aid/CPR/AED | Participant’s Manual Fear of Being Sued Sometimes people worry that they might...
  Page 5: CHAPTER 1 |  5 Getting Permission to Give Care People have a basic right to decide what can and cann...
  Page 6: 6 First Aid/CPR/AED | Participant’s Manual FOCUS ON PREPAREDNESS Important Information Keep medical ...
  Page 7: CHAPTER 1 |  7 Avoid handling any of your personal items, such as ■ pens or combs, while giving care...
  Page 8: 8 First Aid/CPR/AED | Participant’s Manual to 1 gallon of fresh water (1 part bleac

In [39]:
import json

# --- 1. Save Chapters to JSON ---
output_json_path = "all_processed_chapters.json"
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(all_processed_chapters, f, indent=4)
print(f"Saved {len(all_processed_chapters)} chapters to {output_json_path}")

Saved 35 chapters to all_processed_chapters.json


# Chunking

In [37]:
import uuid
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List, Dict

# --- 2. Setup ChromaDB & Embedding ---
# Using the same model as your src/rag_engine.py
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
DB_PATH = "./chroma_db_custom_chapters_2"

print("Initializing ChromaDB and Embeddings...")
client = chromadb.PersistentClient(path=DB_PATH)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Custom embedding wrapper for Chroma
class LocalHuggingFaceEmbedding(chromadb.EmbeddingFunction):
    def __init__(self, model):
        self.model = model
    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        return self.model.encode(input, convert_to_tensor=False).tolist()

collection = client.get_or_create_collection(
    name="chapter_knowledge_base",
    embedding_function=LocalHuggingFaceEmbedding(embedding_model)
)

# --- 3. Chunking & Ingestion ---
def split_text_recursive(text: str, chunk_size=1000, overlap=200):
    """Simple splitting function to chunk large chapter texts."""
    chunks = []
    if not text: return chunks
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += (chunk_size - overlap)
    return chunks

print("Chunking and Indexing...")
ids = []
documents = []
metadatas = []

for chapter in chapters:
    # Merge pages back into one text for chunking context, or chunk per page.
    # Here we merge to ensure continuity across pages within a chapter.
    full_text = "\n".join(chapter["pages"].values())
    
    text_chunks = split_text_recursive(full_text)
    
    for i, chunk in enumerate(text_chunks):
        chunk_id = str(uuid.uuid4())
        ids.append(chunk_id)
        documents.append(chunk)
        metadatas.append({
            "doc_name": chapter["doc_name"],
            "title": chapter["title"],  # Vital for weighting
            "chunk_index": i,
            "source": "custom_chunking"
        })

# Add to Chroma in batches to be safe
BATCH_SIZE = 100
for i in range(0, len(ids), BATCH_SIZE):
    collection.upsert(
        ids=ids[i:i+BATCH_SIZE],
        documents=documents[i:i+BATCH_SIZE],
        metadatas=metadatas[i:i+BATCH_SIZE]
    )
print(f"Indexed {len(ids)} chunks into ChromaDB.")

Initializing ChromaDB and Embeddings...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 225.81it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Chunking and Indexing...
Indexed 526 chunks into ChromaDB.


In [38]:
# --- 4. Weighted Retrieval Function ---
def weighted_retrieval(query: str, n_results=5, title_weight=0.5):
    """
    Retrieves documents based on semantic similarity + title weight.
    
    Args:
        query: The search query.
        n_results: Number of final results to return.
        title_weight: 0.0 to 1.0. How much boosting exact words in title gives.
    """
    # 1. Get more candidates than needed (to allow re-ranking)
    results = collection.query(
        query_texts=[query],
        n_results=n_results * 3,
        include=["documents", "metadatas", "distances"]
    )
    
    scored_results = []
    
    # Process the first batch of results
    if results['ids']:
        for i in range(len(results['ids'][0])):
            doc_id = results['ids'][0][i]
            content = results['documents'][0][i]
            metadata = results['metadatas'][0][i]
            distance = results['distances'][0][i]
            
            # Convert Chroma distance (L2) to Similarity (approximate for ranking)
            # Lower distance = Higher similarity. 
            # We can invert distance or just treat it as a base score.
            semantic_score = 1.0 / (1.0 + distance)
            
            # Calculate Title Match Score
            title = metadata.get('title', '').lower()
            query_words = set(query.lower().split())
            title_words = set(title.split())
            
            # Jaccard-like or Overlap score for title
            overlap = len(query_words.intersection(title_words))
            title_score = 0
            if query_words:
                title_score = overlap / len(query_words) # How much of query is found in title
                
            # Final Weighted Score
            # semantic_score is usually dominant, we boost it if title matches
            final_score = semantic_score + (title_score * title_weight)
            
            scored_results.append({
                "id": doc_id,
                "content": content,
                "metadata": metadata,
                "score": final_score,
                "semantic_score": semantic_score,
                "title_score": title_score
            })
    
    # Sort by descending final score
    scored_results.sort(key=lambda x: x["score"], reverse=True)
    
    return scored_results[:n_results]

In [26]:
# --- Test ---
test_query = "cpr procedure" 
print(f"\n--- Testing Retrieval for: '{test_query}' ---")
hits = weighted_retrieval(test_query, title_weight=0.8)

for hit in hits:
    print(f"\n[Score: {hit['score']:.4f} | Title Match: {hit['title_score']:.2f}]")
    print(f"Title: {hit['metadata']['title']}")
    print(f"Content: {hit['content'][:150]}...")


--- Testing Retrieval for: 'cpr procedure' ---

[Score: 0.9496 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: cal care and transport the person to a hospital. For each minute that CPR and deﬁ brillation are delayed, the chance for survival is reduced by about ...

[Score: 0.9414 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: g the infant’s mouth and nose with your mouth (Fig. 2-13, B). Each rescue breath should last about 1 second and make the chest clearly rise. Continue ...

[Score: 0.9375 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: lth care. ADVANCE DIRECTIVES ( Continued )

CHAPTER 2 |  39 Continue cycles of 30 chest compressions and 2 rescue breaths. Do not stop CPR except in o...

[Score: 0.9166 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: KILL SHEET AFTER CHECKING THE SCENE AND THE INJURED OR ILL CHILD: GIVE 30 CHEST COMPRESSIONS Push hard, push fast in the center of the chest about 2 i...

[Score: 0.9

In [39]:
# --- Test ---
test_query = "cpr procedure" 
print(f"\n--- Testing Retrieval for: '{test_query}' ---")
hits = weighted_retrieval(test_query, title_weight=0.8)

for hit in hits:
    print(f"\n[Score: {hit['score']:.4f} | Title Match: {hit['title_score']:.2f}]")
    print(f"Title: {hit['metadata']['title']}")
    print(f"Content: {hit['content'][:150]}...")


--- Testing Retrieval for: 'cpr procedure' ---

[Score: 1.0024 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: y long enough to allow the heart to spontaneously develop an effective rhythm on its own. Without early CPR and early deﬁ brillation, the chances of s...

[Score: 0.9696 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: and lift the chin up.  Pinch the nose shut then make a complete seal  over the person’s mouth. Blow in for about 1 second to make the chest  clearl...

[Score: 0.9646 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content: in the durable power of attorney for health care. ADVANCE DIRECTIVES ( Continued )
38 First Aid/CPR/AED | Participant’s Manual CPR for a Child If duri...

[Score: 0.9625 | Title Match: 0.50]
Title: Cardiac Emergencies and CPR
Content:  take over. ■ You are too exhausted to continue. ■ The scene becomes unsafe. If at any time you notice the infant begin to breathe, stop CPR, keep the...

[Score: 0.9

# Iamges+ nlmlkt cleaning

In [52]:
pip install nltk pillow


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
                                              0.0/1.5 MB ? eta -:--:--
     --                                       0.1/1.5 MB 2.3 MB/s eta 0:00:01
     ---                                      0.1/1.5 MB 1.4 MB/s eta 0:00:01
     ------                                   0.2/1.5 MB 1.9 MB/s eta 0:00:01
     --------                                 0.3/1.5 MB 1.7 MB/s eta 0:00:01
     -----------                              0.5/1.5 MB 2.0 MB/s eta 0:00:01
     ---------------                          0.6/1.5 MB 2.1 MB/s eta 0:00:01
     ------------------                       0.7/1.5 MB 2.2 MB/s eta 0:00:01
     --------------------                     0.8/1.5 MB 2.2 MB/s eta 0:00:01
     -----------------------                  0.9/1.5 MB 2.0 MB/s eta 0:00:01
     -------------------------                1.0/1.5 MB 2.1 MB/s eta 0:00:01
     ----------------------------             1.1/1.5 MB 2.2 MB/s eta 0:00:01
    


[notice] A new release of pip is available: 23.1.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
import nltk
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\THiNKBooK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\THiNKBooK\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [59]:
import re

try:
    from nltk.tokenize import sent_tokenize
    NLTK_AVAILABLE = True
except Exception:
    NLTK_AVAILABLE = False


ALLOWED_PUNCT = r"\.\,\;\:\?\!\'\"\(\)\-"

def nltk_clean(text):
    # --- Sentence split ---
    if NLTK_AVAILABLE:
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            sentences = re.split(r"[.!?]", text)
    else:
        sentences = re.split(r"[.!?]", text)

    cleaned = []

    for s in sentences:
        s = re.sub(rf"[^a-zA-Z0-9{ALLOWED_PUNCT}\s]", " ", s)
        s = re.sub(r"\s+", " ", s).strip()
        if len(s) > 3:
            cleaned.append(s)

    return " ".join(cleaned)


In [55]:
from PIL import Image
import io

def extract_images(doc, doc_name, image_root="images"):
    image_map = {}

    doc_folder = os.path.join(image_root, doc_name.replace(".pdf", ""))
    os.makedirs(doc_folder, exist_ok=True)

    for page_index in range(len(doc)):
        page = doc[page_index]
        page_number = page_index + 1
        image_list = []

        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base = doc.extract_image(xref)
            image_bytes = base["image"]
            ext = base["ext"]

            img_name = f"page_{page_number}_img_{img_index + 1}.{ext}"
            img_path = os.path.join(doc_folder, img_name)

            with open(img_path, "wb") as f:
                f.write(image_bytes)

            image_list.append(img_path)

        if image_list:
            image_map[page_number] = image_list

    return image_map


In [60]:
def get_chapters(doc_source, chapter_font_size):

    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)

        # ---- IMAGE EXTRACTION (ONCE PER DOC) ----
        image_map = extract_images(doc, doc_name)

        current_chapter = None

        for page_num, page in enumerate(doc):
            page_number = page_num + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        raw_text = span["text"].strip()
                        if not raw_text:
                            continue

                        # ---- CHAPTER TITLE (UNCHANGED) ----
                        if int(span["size"]) == chapter_font_size:

                            if current_chapter is not None:
                                merged_text = "".join(
                                    current_chapter["pages"].values()
                                ).strip()

                                if merged_text == "":
                                    current_chapter["title"] += " " + raw_text
                                    continue
                                else:
                                    all_chapters.append(current_chapter)

                            current_chapter = {
                                "doc_name": doc_name,
                                "title": raw_text,
                                "pages": {},
                                "images": {}
                            }
                            continue

                        # ---- NORMAL CONTENT ----
                        if current_chapter is not None:
                            clean_text = nltk_clean(raw_text)
                            if clean_text:
                                current_chapter["pages"].setdefault(page_number, "")
                                current_chapter["pages"][page_number] += clean_text + " "

                            # ---- MAP IMAGES TO CHAPTER ----
                            if page_number in image_map:
                                current_chapter["images"].setdefault(
                                    page_number, image_map[page_number]
                                )

        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    return all_chapters


In [61]:
# Re-extract and clean with improved logic
print("Extracting chapters with structural preservation...")
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)

print(f"Cleaning {len(chapters)} chapters...")
chapters = [clean_chapter(ch) for ch in chapters]

print(f"After redundancy removal, sample chapter content:")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")

Extracting chapters with structural preservation...
Cleaning 11 chapters...
After redundancy removal, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER edical emergencies can happen every day, in any setting. People are injured in situations li...
  Page 2: First Aid CPR AED Participant s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in making t...
  Page 3: CHAPTER  Clutching the chest or throat A person doubled over in pain Slurred, confused or hesitant s...
  Page 4: First Aid CPR AED Participant s Manual Fear of Being Sued Sometimes people worry that they might be ...
  Page 5: CHAPTER  Getting Permission to Give Care People have a basic right to decide what can and cannot be ...
  Page 6: First Aid CPR AED Participant s Manual FOCUS ON PREPAREDNESS Important Information Keep medical info...
  Page 7: CHAPTER  Avoid handling any of your personal items, such as pens or combs, while giving care or befo...
  Page 

In [62]:
chapters = [clean_chapter(ch) for ch in chapters]

print(f"After cleaning, sample chapter content:")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")

After cleaning, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER edical emergencies can happen every day, in any setting. People are injured in situations li...
  Page 2: First Aid CPR AED Participant s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in making t...
  Page 3: CHAPTER  Clutching the chest or throat A person doubled over in pain Slurred, confused or hesitant s...
  Page 4: First Aid CPR AED Participant s Manual Fear of Being Sued Sometimes people worry that they might be ...
  Page 5: CHAPTER  Getting Permission to Give Care People have a basic right to decide what can and cannot be ...
  Page 6: First Aid CPR AED Participant s Manual FOCUS ON PREPAREDNESS Important Information Keep medical info...
  Page 7: CHAPTER  Avoid handling any of your personal items, such as pens or combs, while giving care or befo...
  Page 8: First Aid CPR AED Participant s Manual to 1 gallon of fresh water (1 part bleach pe

In [63]:
import json
import os

def save_chapters_to_json(chapters, output_path="chapters.json"):
    # Convert page numbers to strings for JSON compatibility
    serializable = []

    for ch in chapters:
        serializable.append({
            "doc_name": ch["doc_name"],
            "title": ch["title"],
            "pages": {str(k): v for k, v in ch["pages"].items()},
            "images": {
                str(k): v for k, v in ch.get("images", {}).items()
            }
        })

    os.makedirs(os.path.dirname(output_path), exist_ok=True) if os.path.dirname(output_path) else None

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(serializable)} chapters to {output_path}")


In [64]:
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)
chapters = [clean_chapter(ch) for ch in chapters]

save_chapters_to_json(chapters, "output/chapters.json")


✅ Saved 11 chapters to output/chapters.json
