In [None]:
import fitz, re, json, os, contractions, camelot
from tqdm.notebook import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import nltk

# Make sure NLTK sentence tokenizer is available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [2]:
RAW_PDF_DIR = r"../Data/raw_pdf"
PROCESSED_TEXT_DIR = r"../Data/processed_text"
IMAGES_DIR = r"../Data/images"
PAGE_IMAGES_DIR = "../data/page_images"
IMAGE_WITH_CAPTIONS_PATH = '../Data/image_with_captions'
os.makedirs(PROCESSED_TEXT_DIR, exist_ok = True)
os.makedirs(IMAGES_DIR, exist_ok = True)
os.makedirs(PAGE_IMAGES_DIR, exist_ok = True)
os.makedirs(IMAGE_WITH_CAPTIONS_PATH, exist_ok = True)

OUTPUT_JSON_PATH = os.path.join(PROCESSED_TEXT_DIR, "chunks_metadata.json").replace("\\", "/")

In [None]:
# Map variations to standard headings
heading_map = {
    "definition": "Definition",
    "description": "Definition",
    "overview": "Definition",
    "causes": "Causes",
    "etiology": "Causes",
    "incidence": "Causes",
    "diagnosis": "Diagnosis",
    "tests": "Diagnosis",
    "identification": "Diagnosis",
    "treatment": "Treatment",
    "therapy": "Treatment",
    "management": "Treatment"
}

# Replace headings with standardized version
def standardize_headings(text):
    # Detect lines starting with uppercase words
    pattern = r"(?<=\n)([A-Z][A-Za-z\s]+)(?=\n)"
    matches = re.findall(pattern, text)
    for match in matches:
        key = match.lower().strip()
        if key in heading_map:
            text = text.replace(match, heading_map[key])
    return text


def clean_text(text):
    """
    Clean extracted text from PDF for RAG:
    - Fix contractions, OCR errors, hyphenations
    - Remove headers, footers, page numbers, noisy sections
    - Normalize punctuation, spaces, and encoding artifacts
    - Return cleaned text + detailed change log
    """
    changes_log = {
        "contractions_expansion": [],
        "non_ascii_removal": [],
        "title_pages_removal": [],
        "headers_footers_removal": [],
        "bibliographic_removal": [],
        "key_terms_removal": [],
        "line_break_fix": [],
        "hyphen_fix": [],
        "bullet_conversion": [],
        "punctuation_normalization": [],
        "unicode_ligature_fix": [],
        "extra_space_clean": []
    }

    # Original backup
    original_text = text

    # 1. Expand contractions (don't → do not)
    expanded_text = contractions.fix(text)
    if expanded_text != text:
        changes_log["contractions_expansion"].append({
            "before": text[:500],
            "after": expanded_text[:500]
        })
    text = expanded_text

    # 2. Remove non-ASCII / corrupted symbols
    cleaned_ascii = re.sub(r'[^\x00-\x7F]+', ' ', text)
    if cleaned_ascii != text:
        changes_log["non_ascii_removal"].append({
            "before": text[:300],
            "after": cleaned_ascii[:300]
        })
    text = cleaned_ascii

    # 3. Remove title pages, contributor lists
    title_patterns = [
        r"The GALE\s+ENCYCLOPEDIA\s+of MEDICINE.*?(?=\n[A-Z])",
        r"STAFF\n.*?(?=\n[A-Z])",
        r"CONTRIBUTORS\n.*?(?=\n[A-Z])",
        r"ADVISORY BOARD\n.*?(?=\n[A-Z])",
        r"Library of Congress Cataloging.*?(?=\n[A-Z])"
    ]
    for pattern in title_patterns:
        matches = re.findall(pattern, text, flags=re.DOTALL | re.IGNORECASE)
        if matches:
            changes_log["title_pages_removal"].append({
                "removed": matches[:3],
                "count": len(matches)
            })
            text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)

    # 4. Remove headers, footers, page numbers
    header_footer_patterns = [
        r"GALE ENCYCLOPEDIA OF MEDICINE.*?\n",
        r"GEM\s*-\s*\d{4}\s*to\s*\d{4}.*?\n",
        r"Page \d+",
        r"\n\d+\n"
    ]
    for pattern in header_footer_patterns:
        matches = re.findall(pattern, text, flags=re.DOTALL | re.IGNORECASE)
        if matches:
            changes_log["headers_footers_removal"].append({
                "removed": matches[:3],
                "count": len(matches)
            })
            text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)

    text = standardize_headings(text)
    
    # 5. Remove bibliographic sections (Resources, Periodicals, Organizations)
    biblio_pattern = r"(Resources|Organizations|Periodicals|Further reading).*"
    biblio_matches = re.findall(biblio_pattern, text, flags=re.IGNORECASE | re.DOTALL)
    if biblio_matches:
        changes_log["bibliographic_removal"].append({
            "removed": biblio_matches[:3],
            "count": len(biblio_matches)
        })
        text = re.sub(biblio_pattern, "", text, flags=re.IGNORECASE | re.DOTALL)

    # 6. Remove "KEY TERMS" and similar metadata
    key_terms_pattern = r"(KEY TERMS|SEE ALSO|Other Names).*"
    key_terms_matches = re.findall(key_terms_pattern, text, flags=re.IGNORECASE | re.DOTALL)
    if key_terms_matches:
        changes_log["key_terms_removal"].append({
            "removed": key_terms_matches[:3],
            "count": len(key_terms_matches)
        })
        text = re.sub(key_terms_pattern, "", text, flags=re.IGNORECASE | re.DOTALL)

    # 7. Fix line breaks inside sentences
    fixed_lines = []
    for line in text.splitlines():
        if line and not re.match(r".*[.:;]$", line):
            fixed_lines.append(line.strip() + " ")
        else:
            fixed_lines.append(line.strip())
    fixed_text = " ".join(fixed_lines)
    if fixed_text != text:
        changes_log["line_break_fix"].append({
            "before": text[:500],
            "after": fixed_text[:500]
        })
    text = fixed_text

    # 8. Fix hyphenated words split across lines
    hyphen_fixed = re.sub(r"(\w+)-\s+(\w+)", r"\1\2", text)
    if hyphen_fixed != text:
        changes_log["hyphen_fix"].append({
            "before": text[:200],
            "after": hyphen_fixed[:200]
        })
    text = hyphen_fixed

    # 9. Convert bullets to markdown (- )
    bullet_converted = re.sub(r"[•·]\s*", "- ", text)
    if bullet_converted != text:
        changes_log["bullet_conversion"].append({
            "before": text[:200],
            "after": bullet_converted[:200]
        })
    text = bullet_converted

    # 10. Normalize multiple punctuations (!!! → .)
    punct_norm = re.sub(r'[!?]{2,}', '.', text)
    punct_norm = re.sub(r'\.{2,}', '.', punct_norm)
    if punct_norm != text:
        changes_log["punctuation_normalization"].append({
            "before": text[:200],
            "after": punct_norm[:200]
        })
    text = punct_norm

    # 11. Fix Unicode ligatures (ﬁ → fi, ﬂ → fl)
    ligature_fixed = text.replace("ﬁ", "fi").replace("ﬂ", "fl")
    if ligature_fixed != text:
        changes_log["unicode_ligature_fix"].append({
            "before": text[:200],
            "after": ligature_fixed[:200]
        })
    text = ligature_fixed

    # 12. Clean extra spaces/newlines
    cleaned_text = re.sub(r"\s{2,}", " ", text).strip()
    if cleaned_text != text:
        changes_log["extra_space_clean"].append({
            "before": text[:200],
            "after": cleaned_text[:200]
        })
    text = cleaned_text

    return text, changes_log


In [None]:
# ---------------- TEXT + TABLE EXTRACTION ----------------
def extract_text_with_tables(pdf_path):
    """
    Extract page text using LangChain (PyPDFLoader) and tables using Camelot.
    Merge tables into text in reading order (tables appended after text of that page).
    """
    # Load text with LangChain
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # Extract tables using Camelot
    tables_by_page = {}
    try:
        tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')  # use 'stream' if borderless
        print(tables)
        for table in tqdm(tables, desc="Extracting Tables"):
            page_num = table.page
            # Convert table to Markdown-like text
            table_text = "\nTable:\n" + "\n".join([" | ".join(row) for row in table.df.values.tolist()])
            tables_by_page.setdefault(page_num, []).append(table_text)
    except Exception as e:
        print(f"No tables detected or error extracting tables: {e}")

    # Combine text + tables per page
    combined_pages = []
    for doc in tqdm(docs, desc="Extracting Text"):
        page_num = doc.metadata['page'] + 1
        text_content = doc.page_content.strip()

        if page_num in tables_by_page:
            for table_text in tables_by_page[page_num]:
                text_content += "\n" + table_text

        print(f'Page No. {page_num}')
        text_content, logs = clean_text(text_content)  # Assume you have clean_text implemented
        print(logs)
        
        combined_pages.append({
            "page_num": page_num,
            "content": text_content
        })

    return combined_pages, logs


# ---------------- CHUNKING ----------------
def chunk_combined_content(pages_data, pdf_path, chunk_size=800, overlap=50, mode="recursive"):
    """
    Chunk combined text (text + tables) into smaller parts.
    
    Args:
        pages_data (list): Extracted pages with combined text/tables.
        pdf_path (str): Path to the PDF file.
        chunk_size (int): Size of each chunk (only for recursive).
        overlap (int): Overlap between chunks (only for recursive).
        mode (str): "recursive" or "sentence".
        
    Returns:
        list: List of chunk metadata dictionaries.
    """
    formatted_chunks = []

    if mode == "recursive":
        # Recursive character splitter
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

        for page in pages_data:
            chunks = splitter.split_text(page["content"])
            for i, chunk in enumerate(chunks):
                formatted_chunks.append({
                    "chunk_id": f"{page['page_num']}_{i}",
                    "page_num": page["page_num"],
                    "content": chunk.strip(),
                    "pdf_file": os.path.basename(pdf_path),
                    "images": []
                })

    elif mode == "sentence":
        # Sentence splitting with merging short sentences
        for page in pages_data:
            sentences = nltk.sent_tokenize(page["content"])
            buffer = ""
            for i, sentence in enumerate(sentences):
                # Merge short sentences (< 50 chars)
                if len(buffer) + len(sentence) < 50:
                    buffer += " " + sentence
                else:
                    if buffer:
                        formatted_chunks.append({
                            "chunk_id": f"{page['page_num']}_{i}",
                            "page_num": page["page_num"],
                            "content": buffer.strip(),
                            "pdf_file": os.path.basename(pdf_path),
                            "images": []
                        })
                    buffer = sentence
            if buffer:
                formatted_chunks.append({
                    "chunk_id": f"{page['page_num']}_{len(sentences)}",
                    "page_num": page["page_num"],
                    "content": buffer.strip(),
                    "pdf_file": os.path.basename(pdf_path),
                    "images": []
                })

    else:
        raise ValueError("Invalid mode. Use 'recursive' or 'sentence'.")

    return formatted_chunks


# ---------------- IMAGE EXTRACTION ----------------
def extract_images_pymupdf(pdf_path, images_output_dir=IMAGES_DIR):
    """
    Extract inline figures/images from PDF using PyMuPDF.
    """
    os.makedirs(images_output_dir, exist_ok=True)
    doc = fitz.open(pdf_path)

    image_map = {}
    for page_num, page in enumerate(doc, start=1):
        images = page.get_images(full=True)
        image_list = []

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            image_filename = f"{os.path.basename(pdf_path).replace('.pdf','')}_page{page_num}_img{img_index+1}.{image_ext}"
            image_path = os.path.join(images_output_dir, image_filename)
            image_path = os.path.relpath(image_path).replace("\\", "/")

            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_list.append(image_path)

        image_map[page_num] = image_list

    return image_map

# ---------------- IMAGE CAPTION EXTRACTION ----------------
def extract_images_with_captions(pdf_path, output_dir=IMAGE_WITH_CAPTIONS_PATH, caption_lines=3):
    """
    Extract images and nearby captions from PDF pages.
    Captions are detected by finding text near image rectangles.
    
    Args:
        pdf_path (str): Path to PDF file.
        output_dir (str): Directory to save extracted images.
        caption_lines (int): Number of text lines near image to consider as caption.

    Returns:
        dict: {page_num: [ {image_path, caption_text} ]}
    """
    os.makedirs(output_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    caption_map = {}
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    for page_index, page in enumerate(doc, start=1):
        # Extract text blocks for caption proximity
        blocks = page.get_text("blocks")  # (x0, y0, x1, y1, text, block_no, block_type)
        text_blocks = [b for b in blocks if b[4].strip()]

        # Extract images on this page
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save image
            image_filename = f"{pdf_name}_page{page_index}_img{img_index+1}.{image_ext}"
            image_path = os.path.join(output_dir, image_filename)
            with open(image_path, "wb") as f:
                f.write(image_bytes)

            # Get image rectangle
            try:
                rect = page.get_image_rects(xref)[0]  # bounding box
            except:
                rect = None

            caption_text = ""
            if rect:
                # Sort text blocks by vertical position
                text_blocks_sorted = sorted(text_blocks, key=lambda b: b[1])  # y0
                caption_candidates = []
                for b in text_blocks_sorted:
                    x0, y0, x1, y1, text, *_ = b
                    # Text above or below image within ~100px
                    if (y0 >= rect.y1 and y0 - rect.y1 < 100) or (rect.y0 - y1 < 100 and y1 <= rect.y0):
                        caption_candidates.append(text)

                caption_text = " ".join(caption_candidates[:caption_lines])

            # Clean caption text
            caption_text = re.sub(r"Figure\s*\d+[:.]?", "", caption_text, flags=re.IGNORECASE).strip()
            caption_text = re.sub(r"\s{2,}", " ", caption_text)

            # Save into map
            caption_map.setdefault(page_index, []).append({
                "image_path": os.path.relpath(image_path).replace("\\", "/"),
                "caption_text": caption_text if caption_text else "No caption detected"
            })

    return caption_map

def extract_full_page_images(pdf_path, page_output_dir=PAGE_IMAGES_DIR):
    """
    Save each full page of the PDF as an image (snapshot).
    """
    os.makedirs(page_output_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    page_snapshot_map = {}
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # High-res snapshot
        image_filename = f"{pdf_name}_page{page_num+1}_snapshot.png"
        image_path = os.path.join(page_output_dir, image_filename)
        pix.save(image_path)

        # Store relative path
        page_snapshot_map[page_num + 1] = os.path.relpath(image_path).replace("\\", "/")

    return page_snapshot_map


# ---------------- MERGE TEXT + IMAGES ----------------
def merge_text_and_images_with_captions(chunks, image_map, page_snapshot_map, caption_map):
    """
    Add extracted images, page snapshots, and captions to chunks.
    """
    for chunk in chunks:
        page_num = chunk["page_num"]

        # Add inline figure images
        chunk["images"] = [path.replace("\\", "/") for path in image_map.get(page_num, [])]

        # Add page snapshot
        chunk["page_snapshot"] = page_snapshot_map.get(page_num)

        # Add captions (list of {image_path, caption_text})
        chunk["captions"] = caption_map.get(page_num, [])

    return chunks

# ---------------- SAVE JSON ----------------
def save_chunks_to_json(final_data, output_path=OUTPUT_JSON_PATH):
    """
    Save the final chunked data with tables + images to JSON.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)
    print(f"Saved chunks metadata to: {output_path}")


# ---------------- USAGE ----------------
if __name__ == "__main__":
    pdf_path = "../Data/raw_pdf/medical_book.pdf"

    # Step 1: Extract text + tables
    combined_pages, logs = extract_text_with_tables(pdf_path)

    # Step 2: Chunk text (choose mode: "recursive" or "sentence")
    text_chunks = chunk_combined_content(combined_pages, pdf_path, chunk_size=600, mode="sentence")

    # Step 3: Extract inline figure images
    image_map = extract_images_pymupdf(pdf_path)

    # Step 4: Extract image captions
    caption_map = extract_images_with_captions(pdf_path)

    # Step 5: Extract full-page snapshots
    page_snapshot_map = extract_full_page_images(pdf_path)

    # Step 6: Merge text, images, and captions
    final_data = merge_text_and_images_with_captions(text_chunks, image_map, page_snapshot_map, caption_map)

    # Step 7: Save chunks to JSON
    save_chunks_to_json(final_data, OUTPUT_JSON_PATH)



<TableList n=9>


Extracting Tables:   0%|          | 0/9 [00:00<?, ?it/s]

Extracting Text:   0%|          | 0/637 [00:00<?, ?it/s]

Page No. 1
{'contractions_expansion': [], 'non_ascii_removal': [], 'title_pages_removal': [], 'headers_footers_removal': [], 'bibliographic_removal': [], 'key_terms_removal': [], 'line_break_fix': [], 'hyphen_fix': [], 'bullet_conversion': [], 'punctuation_normalization': [], 'unicode_ligature_fix': [], 'extra_space_clean': []}
Page No. 2
{'contractions_expansion': [], 'non_ascii_removal': [], 'title_pages_removal': [{'removed': ['The GALE\nENCYCLOPEDIA\nof MEDICINE'], 'count': 1}], 'headers_footers_removal': [], 'bibliographic_removal': [], 'key_terms_removal': [], 'line_break_fix': [{'before': '\nSECOND EDITION', 'after': ' SECOND EDITION '}], 'hyphen_fix': [], 'bullet_conversion': [], 'punctuation_normalization': [], 'unicode_ligature_fix': [], 'extra_space_clean': [{'before': ' SECOND EDITION ', 'after': 'SECOND EDITION'}]}
Page No. 3
{'contractions_expansion': [], 'non_ascii_removal': [], 'title_pages_removal': [{'removed': ['The GALE\nENCYCLOPEDIA\nof MEDICINE'], 'count': 1}], 'h

In [5]:
final_data

[{'chunk_id': '2_1',
  'page_num': 2,
  'content': 'SECOND EDITION',
  'pdf_file': 'medical_book.pdf',
  'images': [],
  'page_snapshot': '../data/page_images/medical_book_page2_snapshot.png',
  'captions': []},
 {'chunk_id': '3_1',
  'page_num': 3,
  'content': 'SECOND EDITION JACQUELINE L. LONGE, EDITOR DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR VOLUME A-B 1',
  'pdf_file': 'medical_book.pdf',
  'images': ['../Data/images/medical_book_page3_img1.jpeg',
   '../Data/images/medical_book_page3_img2.png'],
  'page_snapshot': '../data/page_images/medical_book_page3_snapshot.png',
  'captions': [{'image_path': '../Data/image_with_captions/medical_book_page3_img1.jpeg',
    'caption_text': 'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION V O L U M E A-B'},
   {'image_path': '../Data/image_with_captions/medical_book_page3_img2.png',
    'caption_text': 'No caption detected'}]},
 {'chunk_id': '4_1',
  'page_num': 4,
  'content': 'Deirdre S. Blanchfield, Associate Editor Christine B. Jeryan, 

In [6]:
logs

{'contractions_expansion': [],
 'non_ascii_removal': [{'before': 'ORGANIZATIONS\nAmerican Lung Association. 1740 Broadway, New York, NY\n10019. (800) 586-4872. <http://www.lungusa.org>.\nCenters for Disease Control and Prevention. 1600 Clifton Rd.,\nNE, Atlanta, GA 30333. (800) 311-3435, (404) 639-3311.\n<http://www.cdc.gov>.\nOTHER\n“Occupational Lung Disease.” American',
   'after': 'ORGANIZATIONS\nAmerican Lung Association. 1740 Broadway, New York, NY\n10019. (800) 586-4872. <http://www.lungusa.org>.\nCenters for Disease Control and Prevention. 1600 Clifton Rd.,\nNE, Atlanta, GA 30333. (800) 311-3435, (404) 639-3311.\n<http://www.cdc.gov>.\nOTHER\n Occupational Lung Disease.  American'}],
 'title_pages_removal': [],
 'headers_footers_removal': [{'removed': ['GALE ENCYCLOPEDIA OF MEDICINE 2 623\n'],
   'count': 1},
  {'removed': ['Page 623'], 'count': 1}],
 'bibliographic_removal': [{'removed': ['ORGANIZATIONS'], 'count': 1}],
 'key_terms_removal': [],
 'line_break_fix': [],
 'hyphen