<a href="https://colab.research.google.com/github/Karan-Baid/rag_qa_pdf/blob/main/Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pymupdf
import json
import os
from math import ceil

def clean_bytes(data_obj):
    """
    Return a copy of ``data_obj`` with all binary values replaced by a placeholder.

    Dictionaries, lists and tuples are traversed recursively; any ``bytes`` or
    ``bytearray`` value found at any depth is replaced with the string
    ``"<binary data>"`` so the result can be passed to ``json.dump`` without
    raising a serialization error. Every other value is returned unchanged.

    Args:
        data_obj: Arbitrarily nested structure of dicts, lists, tuples and
            scalar values.

    Returns:
        A structure of the same shape (dicts stay dicts, lists stay lists,
        tuples stay tuples) with binary values replaced by the placeholder.
    """
    if isinstance(data_obj, dict):
        # Only the values are cleaned; keys are kept as they are
        return {key: clean_bytes(value) for key, value in data_obj.items()}
    if isinstance(data_obj, list):
        return [clean_bytes(item) for item in data_obj]
    if isinstance(data_obj, tuple):
        # Tuples can nest binary data too; skipping them would leave
        # non-serializable values behind
        return tuple(clean_bytes(item) for item in data_obj)
    if isinstance(data_obj, (bytes, bytearray)):
        # A placeholder is safer than attempting to decode potentially
        # non-textual data
        return "<binary data>"
    # Any other type (int, str, float, None, ...) is returned as is
    return data_obj

# --- Configuration ---
# Replace with your PDF's file path
pdf_path = "E0H1CM114.pdf"
# Name of the single output JSON file that will contain all page data
output_path = "metadata.json"
# --- End of Configuration ---

# Open the PDF document
try:
    doc = pymupdf.open(pdf_path)
except Exception as e:
    print(f"Error opening PDF {pdf_path}: {e}")
    # `exit()` is an interactive helper injected by the `site` module and is
    # not meant for programs; raising SystemExit stops execution reliably.
    raise SystemExit(1) from e

# This list will hold the metadata for every page
all_pages_data = []
total_pages = len(doc)

# --- Iterate through every page in the document ---
# NOTE: `doc` is intentionally left open in case later cells still use it.
for page_num, page in enumerate(doc):
    print(f"Processing page {page_num + 1}/{total_pages}...")

    # 1. Extract the raw page data as a dictionary
    page_data = page.get_text("dict")

    # 2. Perform a deep clean to handle non-serializable bytes
    cleaned_page_data = clean_bytes(page_data)

    # 3. Add the cleaned page data to our master list (page numbers are 1-based)
    all_pages_data.append({
        "page_number": page_num + 1,
        "content": cleaned_page_data
    })


# 4. Create the final output object
final_output = {
    "document_metadata": {
        "source_pdf": os.path.basename(pdf_path),
        "total_pages": total_pages
    },
    "pages": all_pages_data
}

# 5. Save the consolidated data to a single JSON file
with open(output_path, "w", encoding='utf-8') as f:
    json.dump(final_output, f, indent=4, ensure_ascii=False)

# Read the file back with the same encoding it was written with; because
# ensure_ascii=False stores non-ASCII text verbatim, relying on the platform
# default encoding here could fail or mis-decode the content.
with open(output_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Dictionary to hold best merged text for each key
text_dict = {}

# Smart merge function to handle partial overlaps
def smart_merge(existing_text, new_text):
    """
    Combine two text fragments, collapsing any duplicated content.

    Both inputs are stripped of surrounding whitespace first. If one fragment
    already contains the other, the longer one is returned. Otherwise the
    longest suffix of the existing text that equals a prefix of the new text
    is treated as shared, and the remainder of the new text is appended. When
    nothing is shared the fragments are concatenated directly, with no
    separator inserted.
    """
    base = existing_text.strip()
    addition = new_text.strip()

    # One fragment fully contains the other: keep the containing one
    if addition in base:
        return base
    if base in addition:
        return addition

    # Search from the widest candidate overlap downwards so the first hit is
    # the longest one. A full-length overlap cannot occur here because it
    # would have been caught by the containment checks above.
    overlap = 0
    widest = min(len(base), len(addition)) - 1
    for width in range(widest, 0, -1):
        if base.endswith(addition[:width]):
            overlap = width
            break

    return base + addition[overlap:]

# Step 1: Collect the text of every span, grouped by
# (page number, font size, ascender, descender, font name).
for page in data["pages"]:
    page_num = page["page_number"]

    for block in page["content"]["blocks"]:
        # Blocks without "lines" (e.g. PyMuPDF image blocks) hold no text spans
        if "lines" not in block:
            continue

        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"].strip()
                # Whitespace-only spans carry no heading text; keeping them
                # would let an empty span claim the title size or show up as a
                # blank outline entry.
                if not text:
                    continue

                key = (
                    page_num,
                    span["size"],
                    span["ascender"],
                    span["descender"],
                    span.get("font", "")
                )

                if key in text_dict:
                    text_dict[key] = smart_merge(text_dict[key], text)
                else:
                    text_dict[key] = text


# Step 2: Order entries by font size, largest first. The sort is stable, so
# entries with equal size keep their insertion (document) order.
sorted_items = sorted(text_dict.items(), key=lambda item: item[0][1], reverse=True)

# Step 3: Pick up to four distinct sizes, largest first
unique_sizes = []
seen = set()
for (_, size, _, _, _), _ in sorted_items:
    if size not in seen:
        unique_sizes.append(size)
        seen.add(size)
    if len(unique_sizes) == 4:  # title + H1 + H2 + H3
        break

# Step 4: Compute ranges
# A document may contain no text, or text in a single size only; in those
# cases there is no title and/or no heading size to derive ranges from, so
# fall back to empty values instead of raising IndexError.
title_size = unique_sizes[0] if unique_sizes else None

if len(unique_sizes) > 1:
    # Round the second-largest size up to the next multiple of 5; the three
    # heading levels are consecutive 5-point bands below that value.
    h1_base = ceil(unique_sizes[1] / 5.0) * 5
    heading_ranges = {
        "H1": (h1_base - 4, h1_base),
        "H2": (h1_base - 9, h1_base - 5),
        "H3": (h1_base - 14, h1_base - 10)
    }
else:
    h1_base = None
    heading_ranges = {}

# Step 5: Generate output
output = {
    "title": "",
    "outline": []
}

for (page_no, size, _, _, _), text in sorted_items:
    # The first entry at the largest size becomes the title; any further
    # entries at that size fall through to the heading classification below.
    if size == title_size and output["title"] == "":
        output["title"] = text
        continue

    for level, (low, high) in heading_ranges.items():
        if low <= size <= high:
            output["outline"].append({
                "level": level,
                "text": text,
                "page": page_no
            })
            break

final_op_path = "output.json"
with open(final_op_path, "w", encoding='utf-8') as f:
    json.dump(output, f, indent=4, ensure_ascii=False)


Processing page 1/14...
Processing page 2/14...
Processing page 3/14...
Processing page 4/14...
Processing page 5/14...
Processing page 6/14...
Processing page 7/14...
Processing page 8/14...
Processing page 9/14...
Processing page 10/14...
Processing page 11/14...
Processing page 12/14...
Processing page 13/14...
Processing page 14/14...


In [2]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3
