# Start*

# Loadings

In [1]:
import os
import sys

# Add project root to sys.path for absolute imports
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
	sys.path.insert(0, project_root)

from src.utils import get_file_name

# test
print(get_file_name(r"data\documents\curated\doc_002.pdf"))

doc_002


In [2]:
# Calling the font size jason file to use the chapter font size in the custom chunking process.
import json
with open('chapter_font_sizes.json', 'r') as f:
    chapter_font_sizes = json.load(f)

chapter_font_sizes

[{'doc_path': 'doc_005', 'chapter_font_size': 21},
 {'doc_path': 'doc_004', 'chapter_font_size': 18},
 {'doc_path': 'doc_003', 'chapter_font_size': 32},
 {'doc_path': 'doc_002', 'chapter_font_size': 13},
 {'doc_path': 'doc_001', 'chapter_font_size': 29}]

In [98]:
import os
import re
import json

ABREVS_DIR = r".\documents\abrevs"
class AbbreviationExpander:
    def __init__(self, file_name):
        abrev_file = os.path.join(ABREVS_DIR, f"{file_name}.json")

        if not os.path.exists(abrev_file):
            print(f"No abbreviation file found for {file_name} document. Abbreviations will not be expanded.")
            self.abrev_map = {}
            self.pattern = re.compile(r"(?!x)x")  # Matches nothing
            return
        with open(abrev_file, "r", encoding="utf-8") as f:
            self.abrev_map = json.load(f)

        sorted_keys = sorted(self.abrev_map.keys(), key=len, reverse=True)

        self.pattern = re.compile(
            r"\b(" + "|".join(re.escape(k) for k in sorted_keys) + r")\b"
        )

    def expand(self, text):
        return self.pattern.sub(
            lambda m: self.abrev_map[m.group(0)], text
        )
# test
expander = AbbreviationExpander("doc_002")
print(expander.expand("Dr. AMR is here."))


Dr. antimicrobial resistance is here.


In [103]:
import os
import fitz
import re
from collections import Counter

def get_chapters(doc_source, chapter_font_size):
    file_name = get_file_name(doc_source)
    
    expander = AbbreviationExpander(file_name)


    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)
        current_chapter = None

        for page_num, page in enumerate(doc):
            page_number = page_num + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        # ---- CHAPTER TITLE DETECTION (UNCHANGED LOGIC) ----
                        if int(span["size"]) == chapter_font_size:

                            if current_chapter is not None:
                                merged_text = "".join(
                                    current_chapter[file_name]["pages"].values()
                                ).strip()

                                # ---- YOUR TITLE EXTENSION LOGIC ----
                                if merged_text == "":
                                    current_chapter[file_name]["title"] += " " + text
                                    continue
                                else:
                                    all_chapters.append(current_chapter)

                            current_chapter = {file_name :{
                                # "doc_name": doc_name,
                                "title": text,
                                "pages": {}
                            }}
                            continue

                        # ---- NORMAL CONTENT ----
                        if current_chapter is not None:
                            current_chapter[file_name]["pages"].setdefault(page_number, "")
                            current_chapter[file_name]["pages"][page_number] += expander.expand(text) + " "

        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    return all_chapters
# test
CURATED_PATH = r".\documents\curated"
for item in chapter_font_sizes:
    doc_path = os.path.join(CURATED_PATH, item["doc_path"] + ".pdf")
    chapters = get_chapters(doc_path, item["chapter_font_size"])
    print(f"Chapters for {item['doc_path']}:")
    doc_name = get_file_name(doc_path)
    for chapter in chapters:
        print(f"  - {chapter[doc_name]['title']} (Pages: {list(chapter[doc_name]['pages'].keys())})")



Chapters for doc_005:
  - 1. Introduction (Pages: [1, 2, 3, 4, 5, 6])
  - 2. Core indicators (Pages: [6, 7])
  - 3. Facility representation in RHIS (Pages: [7, 8])
  - 4. Data quality (Pages: [8, 9, 10, 11, 12, 13, 14])
  - 5. Population estimates/denominators (Pages: [14, 15, 16])
  - 6. Key analytical concepts (Pages: [16, 17, 18, 19, 20])
  - 7. Presentation and communication (Pages: [20, 21, 22, 23, 24, 25, 26, 27])
  - 8. Interpretation and use (Pages: [27, 28])
No abbreviation file found for doc_004 document. Abbreviations will not be expanded.
Chapters for doc_004:
  - FUNDAMENTAL ELEMENTS NEEDED TO PREVENT TRANSMISSION OF INFECTIOUS AGENTS IN OUTPATIENT SETTINGS (Pages: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
No abbreviation file found for doc_003 document. Abbreviations will not be expanded.
Chapters for doc_003:
  - Key facts (Pages: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
Chapters for doc_002:
  - Core component 1: Infection prevention and control programmes 1a. Health care facility level

In [102]:
chapter

{'doc_005.pdf': {'title': '1. Introduction',
  'pages': {1: '1.1 Toolkit for analysis and use of routine health facility data Rationale for the toolkit Routine health facility data comprise data that are reported at regular intervals from facilities providing health services. The system of regular recording, reporting, analysis and presentation of health facility data is known as the routine health information system (routine health information system) . 1 routine health information system data provide a picture of the services delivered in health facilities and the health status of the people using the services. The data can be used to assess the performance of individual facilities and also to assess service utilization and coverage of interventions in defined populations. routine health information system data serve multiple users and a wide range of purposes including patient/client management, facility management, disease surveillance, monitoring of service provision and resource 

In [196]:
import re
from collections import Counter

WORDS = 5
# Window isn't used in learn_noise directly anymore but kept for reference
WINDOW = 20 

def normalize(text):
    # Remove leading numbers/whitespace to help noise learning merge "32 Header" and "33 Header"
    text_no_num = re.sub(r'^\s*\d+\s+', '', text)
    return re.sub(r'\s+', ' ', text_no_num.lower()).strip()

def make_start_pattern(noise_text):
    """Creates a regex pattern to match the noise text at the start of a string."""
    words = noise_text.split()
    # Updated: Allow optional leading digits and whitespace to match text like "32 Header"
    return re.compile(r"^\s*(?:\d+\s+)?\s*" + r"\s+".join(map(re.escape, words)), re.IGNORECASE)

def make_end_pattern(noise_text):
    """Creates a regex pattern to match the noise text at the end of a string."""
    words = noise_text.split()
    return re.compile(r"\s+".join(map(re.escape, words)) + r"\s*$", re.IGNORECASE)

def learn_noise_direction(pages_text, direction='start'):
    """
    Learns noise patterns for a specific direction (start or end).
    pages_text: list of normalized strings.
    """
    noise_patterns = []
    
    # We operate on a working copy to peel layers
    current_texts = list(pages_text)
    total_docs = len(current_texts)
    
    if total_docs == 0:
        return []

    max_iterations = 10 
    
    for _ in range(max_iterations):
        found_layer = False
        best_candidate = None
        
        # Check varying lengths
        for length in range(WORDS, 0, -1):
            candidates = []
            for text in current_texts:
                words = text.split()
                if not words: continue
                
                phrase = None
                if direction == 'start':
                    if len(words) >= length:
                        phrase = " ".join(words[:length])
                else: # end
                    if len(words) >= length:
                        phrase = " ".join(words[-length:])
                
                if phrase:
                    candidates.append(phrase)
            
            if not candidates:
                continue
                
            counts = Counter(candidates)
            cand, count = counts.most_common(1)[0]
            
            # Threshold: > 15% of pages (Global Document Level)
            if count / total_docs > 0.15:
                best_candidate = cand
                found_layer = True
                break
        
        if found_layer and best_candidate:
            noise_patterns.append(best_candidate)
            
            # OUT: Strip this noise from current_texts to reveal next layer
            for i, text in enumerate(current_texts):
                if direction == 'start':
                    if text.startswith(best_candidate):
                         current_texts[i] = text[len(best_candidate):].strip()
                else:
                    if text.endswith(best_candidate):
                         current_texts[i] = text[:-len(best_candidate)].strip()
        else:
            break
            
    return noise_patterns

def learn_noise(pages_map):
    # Learn from ALL pages in the document
    page_nums = sorted(pages_map.keys())
    
    # Normalized copy for learning
    normalized_pages = [normalize(pages_map[p]) for p in page_nums]
    
    noise_starts = learn_noise_direction(normalized_pages, 'start')
    noise_ends = learn_noise_direction(normalized_pages, 'end')
    
    return noise_starts, noise_ends

def apply_noise(pages_map, noise_starts, noise_ends):
    cleaned = {}
    page_nums = sorted(pages_map.keys())
    
    # Pre-compile regex patterns 
    start_patterns = [make_start_pattern(n) for n in noise_starts]
    end_patterns = [make_end_pattern(n) for n in noise_ends]

    for p in page_nums:
        text = pages_map[p]

        changed = True
        while changed:
            changed = False

            for pattern in start_patterns:
                match = pattern.match(text)
                if match:
                    text = text[match.end():].lstrip()
                    changed = True
            
            for pattern in end_patterns:
                match = pattern.search(text)
                if match:
                    text = text[:match.start()].rstrip()
                    changed = True

        cleaned[p] = text

    return cleaned

def clean_document_chapters(chapters):
    """
    Cleans a list of chapters belonging to a SINGLE document.
    Aggregates pages to learn noise globally, then applies cleaning.
    """
    if not chapters:
        return []
        
    # --- 1. Aggregation ---
    global_page_map = {}
    
    for ch in chapters:
        # Extract pages dict safely
        pages = None
        if "pages" in ch:
            pages = ch["pages"]
        else:
            # Handle potential nested format { "doc_id": { "pages": ... } }
            keys = list(ch.keys())
            if keys:
                first_val = ch[keys[0]]
                if isinstance(first_val, dict) and "pages" in first_val:
                    pages = first_val["pages"]
        
        if pages:
            for p_num, p_text in pages.items():
                global_page_map[p_num] = p_text
                
    if not global_page_map:
        return chapters

    # --- 2. Global Learning ---
    noise_starts, noise_ends = learn_noise(global_page_map)
    
    # --- 3. Global Application ---
    cleaned_global_pages = apply_noise(global_page_map, noise_starts, noise_ends)
    
    # --- 4. Post-processing & Redistribution ---
    leading_digit_pattern = re.compile(r'^\s*\d+\s+')
    leading_non_letter_pattern = re.compile(r'^[^a-zA-Z]+')
    
    # Use 10 words logic for last page
    
    for ch in chapters:
        # Locate pages dict again (same logic as above)
        pages_ref = None
        if "pages" in ch:
            pages_ref = ch["pages"]
        else:
            keys = list(ch.keys())
            if keys:
                first_val = ch[keys[0]]
                if isinstance(first_val, dict) and "pages" in first_val:
                    pages_ref = first_val["pages"]
                    
        if pages_ref is None:
            continue
            
        # Update content
        pages_to_remove = []
        for p_num in list(pages_ref.keys()):
            if p_num in cleaned_global_pages:
                new_text = cleaned_global_pages[p_num]
                
                # --- Post-clean regex ---
                new_text = leading_digit_pattern.sub('', new_text)
                new_text = leading_non_letter_pattern.sub('', new_text)
                
                pages_ref[p_num] = new_text

        # --- Remove Last Page of Chapter if garbage ---
        if pages_ref:
            # Ensure keys are comparable (int)
            # get_chapters usually uses Ints.
            try:
                # Filter for int keys if mixed (shouldn't be)
                int_keys = [k for k in pages_ref.keys() if isinstance(k, int)]
                if not int_keys:
                     # Try casting keys if they are strings
                     int_keys = [int(k) for k in pages_ref.keys()]
                     # (But pages_ref is dict, re-keying is messy. Assume int keys from fitz)
                     # get_chapters uses: current_chapter[...]["pages"][page_number(int)] = ...
                     # So keys ARE ints.
                
                if int_keys:
                    max_key = max(int_keys)
                    if max_key in pages_ref:
                        last_text = pages_ref[max_key]
                        if len(last_text.split()) < 10:
                            del pages_ref[max_key]
            except Exception as e:
                # Fallback or silent fail
                pass
                
    return chapters

# Backward compatibility shim
def clean_chapter(chapter, path=None):
    return clean_document_chapters([chapter])[0]

# All chapters

In [197]:
all_processed_chapters = []
all_processed_chapters_dirty = []

for item in chapter_font_sizes:
    # Construct full path if needed, assuming base dir is 'documents/curated'

    full_path = os.path.join(r".\documents\curated", item['doc_path'] + ".pdf")
    
    print(f"Processing {item['doc_path']} with font size {item['chapter_font_size']}...")
    
    # Extract
    chapters = get_chapters(full_path, item['chapter_font_size'])
    from copy import deepcopy
    all_processed_chapters_dirty.extend(deepcopy(chapters))
    
    # Clean - Process entire document at once to learn noise global statistics
    cleaned_chapters = clean_document_chapters(chapters)
    
    all_processed_chapters.extend(cleaned_chapters)

Processing doc_005 with font size 21...
Processing doc_004 with font size 18...
No abbreviation file found for doc_004 document. Abbreviations will not be expanded.
Processing doc_003 with font size 32...
No abbreviation file found for doc_003 document. Abbreviations will not be expanded.
Processing doc_002 with font size 13...
Processing doc_001 with font size 29...
No abbreviation file found for doc_001 document. Abbreviations will not be expanded.


In [198]:
all_processed_chapters

[{'doc_005': {'title': '1. Introduction',
   'pages': {1: 'Toolkit for analysis and use of routine health facility data Rationale for the toolkit Routine health facility data comprise data that are reported at regular intervals from facilities providing health services. The system of regular recording, reporting, analysis and presentation of health facility data is known as the routine health information system (routine health information system) . 1 routine health information system data provide a picture of the services delivered in health facilities and the health status of the people using the services. The data can be used to assess the performance of individual facilities and also to assess service utilization and coverage of interventions in defined populations. routine health information system data serve multiple users and a wide range of purposes including patient/client management, facility management, disease surveillance, monitoring of service provision and resource use, a

In [194]:
all_processed_chapters_dirty

[{'doc_005': {'title': '1. Introduction',
   'pages': {1: '1.1 Toolkit for analysis and use of routine health facility data Rationale for the toolkit Routine health facility data comprise data that are reported at regular intervals from facilities providing health services. The system of regular recording, reporting, analysis and presentation of health facility data is known as the routine health information system (routine health information system) . 1 routine health information system data provide a picture of the services delivered in health facilities and the health status of the people using the services. The data can be used to assess the performance of individual facilities and also to assess service utilization and coverage of interventions in defined populations. routine health information system data serve multiple users and a wide range of purposes including patient/client management, facility management, disease surveillance, monitoring of service provision and resource us

In [199]:
import json
from collections import defaultdict

# --- 1. Group Chapters by Document Name ---
# We want a dictionary structure:
# {
#    "doc_001": [ { "title": "...", "pages": ... }, ... ],
#    "doc_002": [ ... ]
# }

grouped_data = defaultdict(list)

for entry in all_processed_chapters:
    # Handle { "doc_name": { ...content... } } format (Output of get_chapters)
    # The keys[0] is dynamic (e.g., "doc_002")
    keys = list(entry.keys())
    if not keys:
         continue
         
    # Check if this entry is in the flat format (doc_name inside) or nested format
    if "doc_name" in entry and isinstance(entry["doc_name"], str): 
        # It's already flattened, use it directly (but we want to group it)
        doc_id = entry["doc_name"]
        chapter_content = {k: v for k, v in entry.items() if k != "doc_name"}
        grouped_data[doc_id].append(chapter_content)
        
    else:
        # It's likely { "doc_005": { "title": ..., "pages": ... } }
        doc_id = keys[0]
        chapter_content = entry[doc_id]
        
        # Ensure we capture title and pages clearly
        clean_chapter = {
            "title": chapter_content.get("title", "Unknown"),
            "pages": chapter_content.get("pages", {})
        }
        if "images" in chapter_content:
            clean_chapter["images"] = chapter_content["images"]
            
        grouped_data[doc_id].append(clean_chapter)

# Convert defaultdict to regular dict for clean JSON
final_output = dict(grouped_data)

# --- 2. Save grouped data ---
output_json_path = "all_processed_chapters.json"
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(final_output, f, indent=4)

print(f"Saved chapters for {len(final_output)} documents to {output_json_path}")
print("Documents found:", list(final_output.keys()))

# Preview structure
if final_output:
    first_doc = list(final_output.keys())[0]
    print(f"\nStructure for '{first_doc}': List of {len(final_output[first_doc])} chapters")

Saved chapters for 5 documents to all_processed_chapters.json
Documents found: ['doc_005', 'doc_004', 'doc_003', 'doc_002', 'doc_001']

Structure for 'doc_005': List of 8 chapters


In [9]:
#    load chapters from the saved JSON to verifypo
import json
output_json_path = "all_processed_chapters.json"
with open(output_json_path, 'r', encoding='utf-8') as f:
    loaded_chapters = json.load(f)
print(f"Loaded chapters for {len(loaded_chapters)} documents from {output_json_path}")

Loaded chapters for 5 documents from all_processed_chapters.json


In [10]:
loaded_chapters

{'doc_005': [{'title': '1. Introduction',
   'pages': {'1': 'Toolkit for analysis and use of routine health facility data Rationale for the toolkit Routine health facility data comprise data that are reported at regular intervals from facilities providing health services. The system of regular recording, reporting, analysis and presentation of health facility data is known as the routine health information system (routine health information system) . 1 routine health information system data provide a picture of the services delivered in health facilities and the health status of the people using the services. The data can be used to assess the performance of individual facilities and also to assess service utilization and coverage of interventions in defined populations. routine health information system data serve multiple users and a wide range of purposes including patient/client management, facility management, disease surveillance, monitoring of service provision and resource use,

In [75]:
for chapter in loaded_chapters['doc_001']:
    # check if page "36" exists in the chapter
    if "pages" in chapter and "36" in chapter["pages"]:
        print(f"Page 36 content: {chapter['pages']['36'][550:]}...")
        break

Page 36 content: ths should take about 24 seconds. Minimize the interruption of chest compressions. If Two Responders Are Available If two responders trained in CPR are at the scene, both should identify themselves as being trained. One should call 9-1-1 or the local emergency number for help while the other performs CPR. If the ﬁ rst responder is tired and needs help: The ﬁ rst responder should tell the second responder ■ to take over. The second responder should immediately take over ■ CPR, beginning with chest compressions. When to Stop CPR Once you begin CPR, do not stop except in one of these situations: You notice an obvious sign of life, such as ■ breathing. An AED is available and ready to use. ■ Another trained responder or EMS personnel take ■ over (Fig. 2-10). You are too exhausted to continue. ■ The scene becomes unsafe. ■ If at any time you notice that the person is breathing, stop CPR. Keep his or her airway open and continue to monitor the person’s breathing and for any 

In [77]:
clean_text_post(chapter['pages']['36'][550:])

'ths should take about 24 seconds. Minimize the interruption of chest compressions. If Two Responders Are Available If two responders trained in CPR are at the scene, both should identify themselves as being trained. One should call 9-1-1 or the local emergency number for help while the other performs CPR. If the fi rst responder is tired and needs help: The fi rst responder should tell the second responder to take over. The second responder should immediately take over CPR, beginning with chest compressions. When to Stop CPR Once you begin CPR, do not stop except in one of these situations: You notice an obvious sign of life, such as breathing. An AED is available and ready to use. Another trained responder or EMS personnel take over (Fig. 2-10). You are too exhausted to continue. The scene becomes unsafe. If at any time you notice that the person is breathing, stop CPR. Keep his or her airway open and continue to monitor the person s breathing and for any changes in the person s cond

# §Post cleaning

In [11]:
import re
import unicodedata
from copy import deepcopy

# allowed punctuation
ALLOWED_PUNCT = set(".,;:?!'\"()-")
import re

def restore_word_boundaries(text: str) -> str:
    """
    Restore missing spaces between words caused by PDF extraction cleaning.
    """

    if not text:
        return text

    # letter followed by uppercase (example: healthFacility → health Facility)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    # letter followed by digit
    text = re.sub(r'([a-zA-Z])([0-9])', r'\1 \2', text)

    # digit followed by letter
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)

    # punctuation followed by letter
    text = re.sub(r'([.,;:?!])([A-Za-z])', r'\1 \2', text)

    # fix cases like "surveyalso" using dictionary-free heuristic
    text = re.sub(r'([a-z])([A-Z][a-z])', r'\1 \2', text)

    # normalize spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
def clean_text_post(text: str) -> str:

    text = unicodedata.normalize("NFKC", text)

    cleaned_chars = []

    for char in text:

        if char.isalnum():
            cleaned_chars.append(char)

        elif char.isspace():
            cleaned_chars.append(" ")

        elif char in ".,;:?!'\"()-":
            cleaned_chars.append(char)

        else:
            cleaned_chars.append(" ")

    text = "".join(cleaned_chars)

    text = restore_word_boundaries(text)

    text = re.sub(r"\s+", " ", text)

    return text.strip()



def post_clean_document_structure(data: dict) -> dict:
    """
    Apply post cleaning to your full document structure.
    """

    cleaned_data = deepcopy(data)

    for doc_id, sections in cleaned_data.items():

        for section in sections:

            # clean title
            section["title"] = clean_text_post(section["title"])

            # clean pages
            for page_num, content in section["pages"].items():
                section["pages"][page_num] = clean_text_post(content)

    return cleaned_data
# Apply post cleaning
final_cleaned_chapters = post_clean_document_structure(loaded_chapters)

In [78]:
final_cleaned_chapters
for chapter in final_cleaned_chapters['doc_001']:
    # check if page "36" exists in the chapter
    if "pages" in chapter and "36" in chapter["pages"]:
        print(f"Page 36 content: {chapter['pages']['36'][550:]}...")
        break

Page 36 content: ake about 24 seconds. Minimize the interruption of chest compressions. If Two Responders Are Available If two responders trained in CPR are at the scene, both should identify themselves as being trained. One should call 9-1-1 or the local emergency number for help while the other performs CPR. If the fi rst responder is tired and needs help: The fi rst responder should tell the second responder to take over. The second responder should immediately take over CPR, beginning with chest compressions. When to Stop CPR Once you begin CPR, do not stop except in one of these situations: You notice an obvious sign of life, such as breathing. An AED is available and ready to use. Another trained responder or EMS personnel take over (Fig. 2-10). You are too exhausted to continue. The scene becomes unsafe. If at any time you notice that the person is breathing, stop CPR. Keep his or her airway open and continue to monitor the person s breathing and for any changes in the person s 

In [79]:
clean_ligatures(chapter['pages']['36'][550:])

'akeabout 24 seconds. Minimizetheinterruptionofchestcompressions. If Two Responders Are Available Iftworesponderstrainedin CPR areatthescene, bothshouldidentifythemselvesasbeingtrained. Oneshouldcall 9-1-1 orthelocalemergencynumberforhelpwhiletheotherperforms CPR. Ifthefirstresponderistiredandneedshelp: Thefirstrespondershouldtellthesecondrespondertotakeover. Thesecondrespondershouldimmediatelytakeover CPR, beginningwithchestcompressions. Whento Stop CPR Onceyoubegin CPR, donotstopexceptinoneofthesesituations: Younoticeanobvioussignoflife, suchasbreathing. An AED isavailableandreadytouse. Anothertrainedresponderor EMS personneltakeover (Fig. 2-10). Youaretooexhaustedtocontinue. Thescenebecomesunsafe. Ifatanytimeyounoticethatthepersonisbreathing, stop CPR. Keephisorherairwayopenandcontinuetomonitorthepersons breathingandforanychangesinthepersons conditionuntil EMS personneltakeover (Fig. 2-11). Cardiac Emergenciesin Childrenand Infants Itisrarefora childoraninfanttoinitiallysuffera card

# Chunking

In [None]:
import uuid
import chromadb
from sentence_transformers import SentenceTransformer
import re

# --- 2. Setup ChromaDB & Embedding ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
DB_PATH = "./chroma_db_chap_based"

print("Initializing ChromaDB and Embeddings...")
client = chromadb.PersistentClient(path=DB_PATH)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

class LocalHuggingFaceEmbedding(chromadb.EmbeddingFunction):
    def __init__(self, model):
        self.model = model
    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        return self.model.encode(input, convert_to_tensor=False).tolist()

collection = client.get_or_create_collection(
    name="chapter_knowledge_base",
    embedding_function=LocalHuggingFaceEmbedding(embedding_model)
)


def split_text_smart(text: str, chunk_size=1000, overlap=200):
    if not text: return []
    
    # 1. Pre-clean ligatures
    
    chunks = []
    start = 0
    text_len = len(text)
    
    while start < text_len:
        end = min(start + chunk_size, text_len)
        
        # Determine Cut Point (End)
        if end < text_len:
            chunk_str = text[start:end]
            min_search = int(len(chunk_str) * 0.6)
            break_index = -1
            
            # Priority 1: Double Newline
            last_para = chunk_str.rfind('\n\n')
            if last_para > min_search:
                break_index = last_para + 2
                
            # Priority 2: Sentence End
            if break_index == -1:
                for i in range(len(chunk_str) - 1, min_search, -1):
                    if chunk_str[i] in '.!?' and (i + 1 == len(chunk_str) or chunk_str[i+1].isspace()):
                        break_index = i + 1
                        break
            
            # Priority 3: Space
            if break_index == -1:
                last_space = chunk_str.rfind(' ')
                if last_space > min_search:
                    break_index = last_space + 1
            
            # Fallback
            if break_index != -1:
                end = start + break_index

        # Add Chunk
        valid_chunk = text[start:end].strip()
        if valid_chunk:
            chunks.append(valid_chunk)
            
        # Determine Next Start (Overlap)
        if end == text_len:
            break
            
        next_target = end - overlap
        
        # Align `next_target` to the START of a sentence or word
        # Look backwards from next_target for a punctuation or newline
        align_found = False
        
        # 1. Look for paragraph break before target
        para_start = text.rfind('\n\n', start, next_target)
        if para_start != -1:
             start = para_start + 2
             align_found = True
        
        # 2. Look for sentence end
        if not align_found:
            # Simple scan for '. '
            sent_start = -1
            for i in range(next_target, start, -1):
                if text[i] in '.!?' and (i+1 < text_len and text[i+1].isspace()):
                    sent_start = i + 1
                    break
            if sent_start != -1:
                start = sent_start
                while start < text_len and text[start].isspace():
                    start += 1
                align_found = True
                
        # 3. Fallback to space
        if not align_found:
            space_start = text.rfind(' ', start, next_target)
            if space_start != -1:
                start = space_start + 1
            else:
                start = next_target

    return chunks

print("Chunking and Indexing...")
ids = []
documents = []
metadatas = []

# Ensure we use the loaded_chapters grouped structure
data_source = final_cleaned_chapters if 'final_cleaned_chapters' in globals() else {}
if not data_source and 'chapters' in globals():
    # Fallback to single doc 'chapters' but structured for loop
    data_source = {"current_doc": chapters}

for doc_name, values_list in data_source.items():
    print(f"Chunking document: {doc_name}")
    for chapter in values_list:
        # Merge pages - join with double newline to simulate paragraph breaks between pages
        if isinstance(chapter.get("pages"), dict):
            # Sort pages by page number (key)
            sorted_pages = sorted(chapter["pages"].items(), key=lambda x: int(x[0]) if str(x[0]).isdigit() else x[0])
            full_text = "\n\n".join([str(p[1]) for p in sorted_pages]) # Ensure strings
        else:
            full_text = ""
            
        text_chunks = split_text_smart(full_text)
        
        for i, chunk in enumerate(text_chunks):
            if not chunk.strip(): continue
            
            chunk_id = str(uuid.uuid4())
            ids.append(chunk_id)
            documents.append(chunk)
            
            title = chapter.get("title", "")
            if not isinstance(title, str): title = str(title)

            metadatas.append({
                "doc_name": doc_name,
                "title": title,
                "page_range": f"{sorted_pages[0][0]}-{sorted_pages[-1][0]}" if isinstance(chapter.get("pages"), dict) and sorted_pages else "unknown",
                "chunk_index": i,
                "source": "custom_chunking"
            })

# Add to Chroma in batches
BATCH_SIZE = 256
print(f"Upserting {len(ids)} chunks...")
for i in range(0, len(ids), BATCH_SIZE):
    collection.upsert(
        ids=ids[i:i+BATCH_SIZE],
        documents=documents[i:i+BATCH_SIZE],
        metadatas=metadatas[i:i+BATCH_SIZE]
    )
print(f"Indexed {len(ids)} chunks into ChromaDB.")

Initializing ChromaDB and Embeddings...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 281.86it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Chunking and Indexing...
Chunking document: doc_005
Chunking document: doc_004
Chunking document: doc_003
Chunking document: doc_002
Chunking document: doc_001
Upserting 1455 chunks...
Indexed 1455 chunks into ChromaDB.


In [30]:
print("Fetching all documents from ChromaDB to build Sparse Index...")
all_data = collection.get() # Fetches everything
ids = all_data['ids']
documents = all_data['documents']
print('len ids', len(ids))

if not ids:
    raise ValueError("ChromaDB collection is empty! Please run your ingestion notebook 'custom_chunking.ipynb' first.")


Fetching all documents from ChromaDB to build Sparse Index...
len ids 1455


In [38]:
# lets visualize a a smaple for eachn doc chunk from chromadb
for doc_name in final_cleaned_chapters.keys():
    print(f"\nSample chunk for document: {doc_name}")
    results = collection.query(
        query_texts=["sample"],
        n_results=1,
        where={"doc_name": doc_name}
    )
    if results and results['documents']:
        print("Chunk:", results['documents'][0][0])
        print("Metadata:", results['metadatas'][0][0])
    else:
        print("No chunks found for this document.")



Sample chunk for document: doc_005
Chunk: However, surveysalsohavelimitations, including: timelag: theremaybea periodofseveralyearsbetweenthetimethataneventoccurredandtheavailabilityofthesurveyresults; limitedgeographicdisaggregation: surveysamplesizesarerarelylargeenoughtoprovidevalidestimatesforlowersubnationallevelssuchasdistricts; qualityproblems: surveysvaryintheiradherencetosamplingandinterviewprotocolsandotherqualitystandards; allsurveyscontainsamplingerrors, whichincreaseatsubnationallevelsduetosmallsamplesizes; 17 non-samplingerrorsmayalsooccur, e. g. selectionbiasorrecallbias. 18 Healthfacilityassessments Healthfacilityassessmentssurveysareusedtocollectdatanotusuallyreportedthroughtheroutinehealthinformationsystem, suchasdataonquality-of-careandavailabilityofequipment, medicinesandhumanresources. Facilityassessmentscanalsobeusedtoverifyroutinehealthinformationsystemdata.
Metadata: {'chunk_index': 14, 'doc_name': 'doc_005', 'source': 'custom_chunking', 'title': '6. Key analyt

In [19]:
import numpy as np

# --- 4. Weighted Retrieval Function ---
def weighted_retrieval(query: str, n_results=5, title_weight=0.1, diversity_ratio=0.2):
    """
    Retrieves documents with:
    1. Content Semantic Similarity (Base)
    2. Title Semantic Similarity (Boost)
    3. Diversity Enforcement (Max 20% of results from same chapter/title by default)
    
    Args:
        query: Search string
        n_results: Total desired chunks
        title_weight: How much title match affects score (0.1 means 10% influence)
        diversity_ratio: Max fraction of n_results allowed from a single chapter (0.2 = 20%)
    """
    # Embed query once for title comparison
    query_embedding = embedding_model.encode(query, convert_to_tensor=False)
    
    # 1. Fetch Candidates (fetch extra to allow for diversity filtering)
    # We fetch sufficient candidates to try and fill the quota with diverse items
    candidate_k = min(n_results * 5, collection.count()) 
    results = collection.query(
        query_texts=[query],
        n_results=candidate_k, 
        include=["documents", "metadatas", "distances"]
    )
    
    if not results['ids']: return []

    scored_results = []
    
    # We will cache title embeddings to avoid re-computing for same chapter across chunks
    title_embedding_cache = {}
    
    # Pre-compute query norm for efficiency
    q_norm = np.linalg.norm(query_embedding)
    
    # Process results (flattening the list of lists that Chroma returns)
    ids = results['ids'][0]
    docs = results['documents'][0]
    metas = results['metadatas'][0]
    dists = results['distances'][0]

    for i in range(len(ids)):
        doc_id = ids[i]
        content = docs[i]
        metadata = metas[i]
        distance = dists[i]  # L2 distance
        
        # A. Content Score (1 / 1+L2) - converts distance to similarity [0, 1]
        content_score = 1.0 / (1.0 + distance)
        
        # B. Title Semantic Score
        title = metadata.get('title', '')
        if title not in title_embedding_cache:
            title_emb = embedding_model.encode(title, convert_to_tensor=False)
            title_embedding_cache[title] = title_emb
        else:
            title_emb = title_embedding_cache[title]
            
        # Cosine Similarity: (A . B) / (|A| * |B|)
        t_norm = np.linalg.norm(title_emb)
        if t_norm > 0 and q_norm > 0:
            title_semantic_score = np.dot(query_embedding, title_emb) / (t_norm * q_norm)
        else:
            title_semantic_score = 0.0
            
        # Clip to 0-1 range
        title_semantic_score = max(0.0, min(1.0, title_semantic_score))
        
        # C. Combined Score
        final_score = content_score + (title_semantic_score * title_weight)
        
        scored_results.append({
            "id": doc_id,
            "content": content,
            "metadata": metadata,
            "score": final_score,
            "title_score": title_semantic_score,
            "title_txt": title
        })
    
    # Sort by Final Combined Score
    scored_results.sort(key=lambda x: x["score"], reverse=True)
    
    # D. Diversity Filter
    # Calc max items per title (e.g. 50 * 0.2 = 10 items max per chapter)
    limit_per_title = max(1, int(n_results * diversity_ratio))
    
    final_selection = []
    title_counts = {}
    
    for item in scored_results:
        t = item['title_txt']
        current_count = title_counts.get(t, 0)
        
        if current_count < limit_per_title:
            final_selection.append(item)
            title_counts[t] = current_count + 1
            
        if len(final_selection) >= n_results:
            break
            
    # NOTE: Fallback loop has been removed to strictly enforce the diversity limit.
    # The function may return fewer than n_results if not enough diverse chapters are found.
    
    return final_selection

In [1]:
hit

NameError: name 'hit' is not defined

In [20]:
# --- Test ---
test_query = "cpr procedure" 
print(f"\n--- Testing Retrieval for: '{test_query}' ---")
hits = weighted_retrieval(test_query, title_weight=0.1, n_results=50)

for idx, hit in enumerate(hits):
    print(f"\n[Rank: {idx+1} | Score: {hit['score']:.4f} | Title Match: {hit['title_score']:.2f}]")
    print(f"Document Name: {hit['metadata']['doc_name']}")
    print(f"Title: {hit['metadata']['title']}")
    print(f"Content: {hit['content'][:15]}...")


--- Testing Retrieval for: 'cpr procedure' ---

[Rank: 1 | Score: 0.6290 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: Cardiac Emergen...

[Rank: 2 | Score: 0.6285 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: CPR for a Child...

[Rank: 3 | Score: 0.6166 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: rst link in the...

[Rank: 4 | Score: 0.6073 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: Cardiac Emergen...

[Rank: 5 | Score: 0.6066 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: Once you have g...

[Rank: 6 | Score: 0.6052 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: Cardiac Emergen...

[Rank: 7 | Score: 0.5974 | Title Match: 0.67]
Document Name: doc_001
Title: Cardiac Emergencies and CPR
Content: AFTER CHECKING ...

[Rank: 8 | Score: 0.

In [53]:
# install bm25
%pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import subprocess
import sys

# Install Reranking & Sparse Retrieval Deps
packages = ["rank-bm25", "sentence-transformers"]
for package in packages:
    try:
        __import__(package.replace("-", "_"))
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np

print("Initializing Hybrid Search Components...")

# 1. Build Sparse Index (BM25)
# We rely on the global 'documents' list from the Chunking cell
if 'documents' in globals() and documents:
    # Use simple whitespace tokenization for speed
    tokenized_corpus = [doc.lower().split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    print(f"✅ BM25 Sparse Index built for {len(documents)} chunks.")
else:
    print("⚠️ 'documents' list missing. Please re-run the Chunking/Ingestion cell.")

# 2. Load Cross-Encoder (Reranker)
# MS MARCO MiniLM is fast and effective
rerank_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
reranker = CrossEncoder(rerank_model_name)
print(f"✅ Reranker loaded: {rerank_model_name}")

Initializing Hybrid Search Components...
✅ BM25 Sparse Index built for 281 chunks.


Loading weights: 100%|██████████| 105/105 [00:00<00:00, 306.89it/s, Materializing param=classifier.weight]                                    
BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


✅ Reranker loaded: cross-encoder/ms-marco-MiniLM-L-6-v2


In [27]:

def hybrid_rerank_retrieval(query: str, n_results=5, diversity_ratio=0.2, title_weight=0.1):
    """
    Advanced Retrieval Function:
    1. Hybrid Search (Sparse BM25 + Dense ChromaDB) with RRF Fusion
    2. Cross-Encoder Reranking
    3. Explicit Title Semantic Boost
    4. Diversity Filtering
    """
    
    # --- Step 1: Hybrid Retrieval (Candidate Generation) ---
    k_candidates = n_results * 4 # Fetch extra for reranking & diversity
    
    # A. Dense Search (Chroma)
    chroma_res = collection.query(
        query_texts=[query], 
        n_results=k_candidates,
    )
    
    # B. Sparse Search (BM25)
    tokenized_query = query.lower().split()
    # Get indices of top BM25 matches
    bm25_scores = bm25.get_scores(tokenized_query)
    top_bm25_indices = np.argsort(bm25_scores)[::-1][:k_candidates]
    
    # C. Reciprocal Rank Fusion (RRF)
    # We use global IDs to merge results
    rrf_scores = {}
    rrf_k = 60
    
    # Process Dense Ranks
    if chroma_res['ids']:
        for rank, doc_id in enumerate(chroma_res['ids'][0]):
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + (1 / (rrf_k + rank + 1))
            
    # Process Sparse Ranks (Map index -> ID)
    for rank, idx in enumerate(top_bm25_indices):
        if idx < len(ids): # Boundary check
            doc_id = ids[idx]
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + (1 / (rrf_k + rank + 1))
            
    # Select Top Candidates for Reranking
    sorted_candidates = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:k_candidates]
    candidate_ids = [x[0] for x in sorted_candidates]
    
    if not candidate_ids: return []
    
    # --- Step 2: Fetch Data & Prepare for Reranker ---
    # Retrieve actual content for candidates from Chroma
    docs_data = collection.get(ids=candidate_ids, include=['documents', 'metadatas'])
    
    # Map 'get' results (unordered) back to candidates
    # Create dict: id -> {content, metadata}
    data_map = {}
    for i, doc_id in enumerate(docs_data['ids']):
        data_map[doc_id] = {
            'content': docs_data['documents'][i],
            'metadata': docs_data['metadatas'][i]
        }
    
    rerank_pairs = []
    final_items = []
    
    for doc_id in candidate_ids:
        if doc_id in data_map:
            item = data_map[doc_id]
            # Input for CrossEncoder: (Query, Document Text)
            rerank_pairs.append([query, item['content']])
            item['id'] = doc_id
            final_items.append(item)
            
    # --- Step 3: Reranking ---
    if final_items:
        rerank_scores = reranker.predict(rerank_pairs)
        
        # Pre-calc Query Embedding for Title Boost
        q_emb = embedding_model.encode(query, convert_to_tensor=False)
        q_norm = np.linalg.norm(q_emb)

        for i, item in enumerate(final_items):
            # Base Score from Reranker (Logits, usually -10 to 10)
            # Sigmoid normalization for cleaner combination
            raw_score = rerank_scores[i]
            normalized_rerank = 1 / (1 + np.exp(-raw_score))
            
            # Title Semantic Boost
            title = item['metadata'].get('title', '')
            t_emb = embedding_model.encode(title, convert_to_tensor=False)
            t_norm = np.linalg.norm(t_emb)
            
            if q_norm > 0 and t_norm > 0:
                title_sim = np.dot(q_emb, t_emb) / (q_norm * t_norm)
            else:
                title_sim = 0
            
            title_sim = max(0.0, min(1.0, title_sim))
            
            # Final Hybrid Score
            item['final_score'] = normalized_rerank + (title_sim * title_weight)
            item['title_score'] = title_sim
            item['title_txt'] = title
            
        # Re-sort by Final Score
        final_items.sort(key=lambda x: x['final_score'], reverse=True)
        
    # --- Step 4: Diversity Filtering ---
    limit_per_title = max(1, int(n_results * diversity_ratio))
    selection = []
    title_counts = {}
    
    for item in final_items:
        t = item['title_txt']
        if title_counts.get(t, 0) < limit_per_title:
            selection.append(item)
            title_counts[t] = title_counts.get(t, 0) + 1
            
        if len(selection) >= n_results:
            break
            
    return selection

# --- Test Hybrid ---
print("\n--- Testing Hybrid Reranking Search ---")
hits = hybrid_rerank_retrieval("cpr procedure", n_results=10)

for idx, hit in enumerate(hits):
    print(f"\n[Rank: {idx+1} | Score: {hit['final_score']:.4f} | Title: {hit['title_score']:.2f}]")
    print(f"Doc: {hit['metadata']['doc_name']} | Title: {hit['title_txt']}")
    print(f"metadata: {hit['metadata']}")
    print(f"Content: {hit['content'][:100]}...")


--- Testing Hybrid Reranking Search ---

[Rank: 1 | Score: 0.5100 | Title: 0.67]
Doc: doc_001 | Title: Cardiac Emergencies and CPR
metadata: {'source': 'page_chunking', 'chunk_index': 12, 'page_range': '41-42', 'title': 'Cardiac Emergencies and CPR', 'doc_name': 'doc_001'}
Content: Cardiac Emergencies and CPR 41 PUTTING IT ALL TOGETHER Cardiac emergencies are life threatening. Eve...

[Rank: 2 | Score: 0.5100 | Title: 0.67]
Doc: doc_001 | Title: Cardiac Emergencies and CPR
metadata: {'title': 'Cardiac Emergencies and CPR', 'page_range': '41-42', 'doc_name': 'doc_001', 'chunk_index': 12, 'source': 'page_chunking'}
Content: Cardiac Emergencies and CPR 41 PUTTING IT ALL TOGETHER Cardiac emergencies are life threatening. Eve...

[Rank: 3 | Score: 0.4392 | Title: 0.09]
Doc: doc_001 | Title: AED
metadata: {'source': 'page_chunking', 'title': 'AED', 'page_range': '54-55', 'chunk_index': 9, 'doc_name': 'doc_001'}
Content: STAND CLEAR Make sure no one, including you, is touching the child or i

# Iamges+ nlmlkt cleaning

In [52]:
pip install nltk pillow


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
                                              0.0/1.5 MB ? eta -:--:--
     --                                       0.1/1.5 MB 2.3 MB/s eta 0:00:01
     ---                                      0.1/1.5 MB 1.4 MB/s eta 0:00:01
     ------                                   0.2/1.5 MB 1.9 MB/s eta 0:00:01
     --------                                 0.3/1.5 MB 1.7 MB/s eta 0:00:01
     -----------                              0.5/1.5 MB 2.0 MB/s eta 0:00:01
     ---------------                          0.6/1.5 MB 2.1 MB/s eta 0:00:01
     ------------------                       0.7/1.5 MB 2.2 MB/s eta 0:00:01
     --------------------                     0.8/1.5 MB 2.2 MB/s eta 0:00:01
     -----------------------                  0.9/1.5 MB 2.0 MB/s eta 0:00:01
     -------------------------                1.0/1.5 MB 2.1 MB/s eta 0:00:01
     ----------------------------             1.1/1.5 MB 2.2 MB/s eta 0:00:01
    


[notice] A new release of pip is available: 23.1.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
import nltk
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\THiNKBooK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\THiNKBooK\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [93]:
import re

try:
    from nltk.tokenize import sent_tokenize
    NLTK_AVAILABLE = True
except Exception:
    NLTK_AVAILABLE = False


ALLOWED_PUNCT = r"\.\,\;\:\?\!\'\"\(\)\-"

def nltk_clean(text):
    # --- Sentence split ---
    if NLTK_AVAILABLE:
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            sentences = re.split(r"[.!?]", text)
    else:
        sentences = re.split(r"[.!?]", text)

    cleaned = []

    for s in sentences:
        s = re.sub(rf"[^a-zA-Z0-9{ALLOWED_PUNCT}\s]", " ", s)
        s = re.sub(r"\s+", " ", s).strip()
        if len(s) > 3:
            cleaned.append(s)

    return " ".join(cleaned)


In [94]:
from PIL import Image
import io

def extract_images(doc, doc_name, image_root="images"):
    image_map = {}

    doc_folder = os.path.join(image_root, doc_name.replace(".pdf", ""))
    os.makedirs(doc_folder, exist_ok=True)

    for page_index in range(len(doc)):
        page = doc[page_index]
        page_number = page_index + 1
        image_list = []

        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base = doc.extract_image(xref)
            image_bytes = base["image"]
            ext = base["ext"]

            img_name = f"page_{page_number}_img_{img_index + 1}.{ext}"
            img_path = os.path.join(doc_folder, img_name)

            with open(img_path, "wb") as f:
                f.write(image_bytes)

            image_list.append(img_path)

        if image_list:
            image_map[page_number] = image_list

    return image_map


In [60]:
def get_chapters(doc_source, chapter_font_size):

    files_to_process = []
    if os.path.isdir(doc_source):
        files_to_process = [
            os.path.join(doc_source, f)
            for f in os.listdir(doc_source)
            if f.lower().endswith(".pdf")
        ]
    elif os.path.isfile(doc_source):
        files_to_process = [doc_source]
    else:
        print(f"Invalid source: {doc_source}")
        return []

    all_chapters = []

    for doc_path in files_to_process:
        doc = fitz.open(doc_path)
        doc_name = os.path.basename(doc_path)

        # ---- IMAGE EXTRACTION (ONCE PER DOC) ----
        image_map = extract_images(doc, doc_name)

        current_chapter = None

        for page_num, page in enumerate(doc):
            page_number = page_num + 1
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        raw_text = span["text"].strip()
                        if not raw_text:
                            continue

                        # ---- CHAPTER TITLE (UNCHANGED) ----
                        if int(span["size"]) == chapter_font_size:

                            if current_chapter is not None:
                                merged_text = "".join(
                                    current_chapter["pages"].values()
                                ).strip()

                                if merged_text == "":
                                    current_chapter["title"] += " " + raw_text
                                    continue
                                else:
                                    all_chapters.append(current_chapter)

                            current_chapter = {
                                "doc_name": doc_name,
                                "title": raw_text,
                                "pages": {},
                                "images": {}
                            }
                            continue

                        # ---- NORMAL CONTENT ----
                        if current_chapter is not None:
                            clean_text = nltk_clean(raw_text)
                            if clean_text:
                                current_chapter["pages"].setdefault(page_number, "")
                                current_chapter["pages"][page_number] += clean_text + " "

                            # ---- MAP IMAGES TO CHAPTER ----
                            if page_number in image_map:
                                current_chapter["images"].setdefault(
                                    page_number, image_map[page_number]
                                )

        if current_chapter is not None:
            all_chapters.append(current_chapter)

        doc.close()

    return all_chapters


In [61]:
# Re-extract and clean with improved logic
print("Extracting chapters with structural preservation...")
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)

print(f"Cleaning {len(chapters)} chapters...")
chapters = [clean_chapter(ch) for ch in chapters]

print(f"After redundancy removal, sample chapter content:")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")

Extracting chapters with structural preservation...
Cleaning 11 chapters...
After redundancy removal, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER edical emergencies can happen every day, in any setting. People are injured in situations li...
  Page 2: First Aid CPR AED Participant s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in making t...
  Page 3: CHAPTER  Clutching the chest or throat A person doubled over in pain Slurred, confused or hesitant s...
  Page 4: First Aid CPR AED Participant s Manual Fear of Being Sued Sometimes people worry that they might be ...
  Page 5: CHAPTER  Getting Permission to Give Care People have a basic right to decide what can and cannot be ...
  Page 6: First Aid CPR AED Participant s Manual FOCUS ON PREPAREDNESS Important Information Keep medical info...
  Page 7: CHAPTER  Avoid handling any of your personal items, such as pens or combs, while giving care or befo...
  Page 

In [62]:
chapters = [clean_chapter(ch) for ch in chapters]

print(f"After cleaning, sample chapter content:")
for i, chapter in enumerate(chapters[:3]):
    print(f"\nChapter {i+1}: {chapter['title']}")
    for page_num, content in chapter["pages"].items():
        print(f"  Page {page_num}: {content[:100]}...")

After cleaning, sample chapter content:

Chapter 1: Before Giving Care and Checking an Injured or Ill Person
  Page 1: CHAPTER edical emergencies can happen every day, in any setting. People are injured in situations li...
  Page 2: First Aid CPR AED Participant s Manual YOUR ROLE IN THE EMS SYSTEM You play a major role in making t...
  Page 3: CHAPTER  Clutching the chest or throat A person doubled over in pain Slurred, confused or hesitant s...
  Page 4: First Aid CPR AED Participant s Manual Fear of Being Sued Sometimes people worry that they might be ...
  Page 5: CHAPTER  Getting Permission to Give Care People have a basic right to decide what can and cannot be ...
  Page 6: First Aid CPR AED Participant s Manual FOCUS ON PREPAREDNESS Important Information Keep medical info...
  Page 7: CHAPTER  Avoid handling any of your personal items, such as pens or combs, while giving care or befo...
  Page 8: First Aid CPR AED Participant s Manual to 1 gallon of fresh water (1 part bleach pe

In [63]:
import json
import os

def save_chapters_to_json(chapters, output_path="chapters.json"):
    # Convert page numbers to strings for JSON compatibility
    serializable = []

    for ch in chapters:
        serializable.append({
            "doc_name": ch["doc_name"],
            "title": ch["title"],
            "pages": {str(k): v for k, v in ch["pages"].items()},
            "images": {
                str(k): v for k, v in ch.get("images", {}).items()
            }
        })

    os.makedirs(os.path.dirname(output_path), exist_ok=True) if os.path.dirname(output_path) else None

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(serializable)} chapters to {output_path}")


In [64]:
chapters = get_chapters(DOC_PATH, CHAPTER_FONT_SIZE)
chapters = [clean_chapter(ch) for ch in chapters]

save_chapters_to_json(chapters, "output/chapters.json")


✅ Saved 11 chapters to output/chapters.json
