<a href="https://colab.research.google.com/github/GhazaleZe/Python-Exercises/blob/main/pdf_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [5]:
! pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import fitz  # PyMuPDF
import os
from pathlib import Path

def extract_and_save_images(pdf_path, output_dir):
    """
    Extract every embedded image from a PDF and write each one to ``output_dir``.

    Files are named ``page_<page index>_img_<xref>.<ext>``; the page index is
    zero-based and ``<ext>`` is the extension reported by ``extract_image``.
    Problems are reported with ``print`` instead of being raised: a PDF that
    cannot be opened yields an empty list, and an image that cannot be
    extracted or written is skipped while the remaining images are processed.

    Args:
        pdf_path (str): The path to the PDF file.
        output_dir (str): Directory the images are written to; created
            (including parents) if it does not exist.

    Returns:
        list[str]: Paths of the images that were written successfully.
    """
    # Create the output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return []

    saved_image_paths = []

    # try/finally guarantees the document handle is released even if a page
    # or image lookup raises unexpectedly.
    try:
        for page_num in range(doc.page_count):
            page = doc[page_num]

            for img_info in page.get_images(full=True):
                xref = img_info[0]

                # A single broken image must not abort the whole extraction.
                try:
                    image_object = doc.extract_image(xref)
                except Exception as e:
                    print(f"Error extracting image xref {xref} on page {page_num}: {e}")
                    continue

                # Guard against an empty/None result (no usable image data).
                if not image_object:
                    print(f"No image data for xref {xref} on page {page_num}; skipping")
                    continue

                image_data = image_object["image"]
                image_ext = image_object["ext"]  # e.g. "png", "jpeg"

                # Page index + xref keeps filenames unique within one run
                image_filename = f"page_{page_num}_img_{xref}.{image_ext}"
                image_path = os.path.join(output_dir, image_filename)

                try:
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_data)
                    saved_image_paths.append(image_path)
                    print(f"Saved image: {image_path}")
                except Exception as e:
                    print(f"Error saving image {image_path}: {e}")
    finally:
        doc.close()

    return saved_image_paths

# --- Full Example Usage ---

def extract_and_save_all_content(pdf_path, images_output_dir="extracted_images"):
    """
    Extract the text blocks of every page and save all embedded images.

    Images are written to ``images_output_dir`` as
    ``page_<page index>_img_<xref>.<ext>`` (zero-based page index). Problems
    are reported with ``print``: an image that cannot be extracted or written
    is skipped and processing continues with the next one.

    Args:
        pdf_path (str): The path to the PDF file.
        images_output_dir (str): Directory the images are written to; created
            (including parents) if it does not exist.

    Returns:
        dict | None: ``None`` if the PDF could not be opened, otherwise a dict
        with two keys:

        * ``'pages'``: one dict per page holding ``'page_number'``
          (zero-based), ``'page_size'`` as ``(width, height)`` and
          ``'text_blocks'``, a list of ``{'text', 'bbox'}`` dicts where
          ``bbox`` is ``(x0, y0, x1, y1)``. Whitespace-only blocks are kept
          and appear with an empty ``'text'``.
        * ``'saved_images'``: paths of the images written successfully.
    """
    Path(images_output_dir).mkdir(parents=True, exist_ok=True)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    all_content = {'pages': [], 'saved_images': []}

    # try/finally guarantees the document handle is released even if a page
    # lookup or text extraction raises unexpectedly.
    try:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            page_content = {
                'page_number': page_num,
                'page_size': (page.rect.width, page.rect.height),
                'text_blocks': []
            }

            # --- 1. Extract Text Blocks ---
            for block in page.get_text("blocks"):
                x0, y0, x1, y1, text, _, block_type = block
                # Only blocks whose type field is 0 are recorded as text.
                if block_type == 0:
                    page_content['text_blocks'].append({
                        'text': text.strip(),
                        'bbox': (x0, y0, x1, y1)
                    })

            # --- 2. Extract and Save Images ---
            for img_info in page.get_images(full=True):
                xref = img_info[0]

                # A single broken image must not abort the whole extraction.
                try:
                    image_object = doc.extract_image(xref)
                except Exception as e:
                    print(f"Error extracting image xref {xref} on page {page_num}: {e}")
                    continue

                # Guard against an empty/None result (no usable image data).
                if not image_object:
                    print(f"No image data for xref {xref} on page {page_num}; skipping")
                    continue

                image_data = image_object['image']
                image_ext = image_object['ext']

                image_filename = f"page_{page_num}_img_{xref}.{image_ext}"
                image_path = os.path.join(images_output_dir, image_filename)

                try:
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_data)
                    all_content['saved_images'].append(image_path)
                    print(f"Saved image: {image_path}")
                except Exception as e:
                    print(f"Error saving image {image_path}: {e}")

            all_content['pages'].append(page_content)
    finally:
        doc.close()

    return all_content

# --- Example Usage ---
if __name__ == '__main__':
    # Input PDF: a file named 'test_pdf.pdf' must exist in the working directory.
    pdf_file_path = 'test_pdf.pdf'

    # Extracted images are written below this directory.
    output_directory = "extracted_images"

    # One pass over the PDF collects the text blocks and saves the images.
    extracted_results = extract_and_save_all_content(pdf_file_path, output_directory)

    if extracted_results:
        print(
            "\n--- Extraction Summary ---",
            f"Total images saved: {len(extracted_results['saved_images'])}",
            f"Saved images are in the '{output_directory}' directory.",
            sep="\n",
        )

        # Per-page overview only; the text itself remains in extracted_results.
        for page in extracted_results['pages']:
            print(f"\n--- Page {page['page_number'] + 1} has {len(page['text_blocks'])} text blocks ---")

Saved image: extracted_images/page_0_img_8.png
Saved image: extracted_images/page_0_img_9.png

--- Extraction Summary ---
Total images saved: 2
Saved images are in the 'extracted_images' directory.

--- Page 1 has 9 text blocks ---


In [5]:
# Show the whole result of the extract_and_save_all_content run above.
extracted_results

{'pages': [{'page_number': 0,
   'page_size': (612.0, 792.0),
   'text_blocks': [{'text': '',
     'bbox': (540.0,
      195.8520050048828,
      542.4359741210938,
      210.50401306152344)},
    {'text': '',
     'bbox': (72.0, 217.21200561523438, 74.43599700927734, 231.864013671875)},
    {'text': 'This is test for library',
     'bbox': (72.0,
      242.41201782226562,
      180.12742614746094,
      257.06402587890625)},
    {'text': '',
     'bbox': (540.0, 537.6119995117188, 542.4359741210938, 552.2639770507812)},
    {'text': '',
     'bbox': (72.0, 558.9719848632812, 74.43599700927734, 573.6239624023438)},
    {'text': '',
     'bbox': (72.0, 583.9320068359375, 98.43599700927734, 598.583984375)},
    {'text': '',
     'bbox': (72.0, 608.8920288085938, 74.43599700927734, 623.5440063476562)},
    {'text': 'There is data in the ﬁrst box \nI’ve added',
     'bbox': (109.19994354248047,
      585.6119995117188,
      251.7911834716797,
      617.3040161132812)},
    {'text': 'This 

In [7]:
# Show only the first page's entry (page_number, page_size, text_blocks).
extracted_results['pages'][0]

{'page_number': 0,
 'page_size': (612.0, 792.0),
 'text_blocks': [{'text': '',
   'bbox': (540.0, 195.8520050048828, 542.4359741210938, 210.50401306152344)},
  {'text': '',
   'bbox': (72.0, 217.21200561523438, 74.43599700927734, 231.864013671875)},
  {'text': 'This is test for library',
   'bbox': (72.0, 242.41201782226562, 180.12742614746094, 257.06402587890625)},
  {'text': '',
   'bbox': (540.0, 537.6119995117188, 542.4359741210938, 552.2639770507812)},
  {'text': '',
   'bbox': (72.0, 558.9719848632812, 74.43599700927734, 573.6239624023438)},
  {'text': '',
   'bbox': (72.0, 583.9320068359375, 98.43599700927734, 598.583984375)},
  {'text': '',
   'bbox': (72.0, 608.8920288085938, 74.43599700927734, 623.5440063476562)},
  {'text': 'There is data in the ﬁrst box \nI’ve added',
   'bbox': (109.19994354248047,
    585.6119995117188,
    251.7911834716797,
    617.3040161132812)},
  {'text': 'This is important data in \nsecond box that I’ve added',
   'bbox': (376.55999755859375,
    5

In [8]:
import fitz  # PyMuPDF
import pdfplumber
import os
from pathlib import Path

def is_within_bbox(inner_bbox, outer_bbox):
    """
    Report whether two bounding boxes overlap (fully or partially).

    Despite the name this is a symmetric overlap test, not strict containment:
    boxes that merely share an edge or corner also count as overlapping.

    Args:
        inner_bbox: Four coordinates ``(x0, y0, x1, y1)``.
        outer_bbox: Four coordinates ``(x0, y0, x1, y1)``.

    Returns:
        bool: ``True`` if the boxes overlap or touch. ``False`` if they are
        disjoint or if either argument is malformed (``None``, a string, not
        exactly four values, or values that cannot be compared).
    """
    # A 4-character string would pass the length check below, so reject text
    # explicitly before treating the arguments as coordinate sequences.
    if isinstance(inner_bbox, (str, bytes)) or isinstance(outer_bbox, (str, bytes)):
        return False

    try:
        if len(inner_bbox) != 4 or len(outer_bbox) != 4:
            return False

        x0_inner, y0_inner, x1_inner, y1_inner = inner_bbox
        x0_outer, y0_outer, x1_outer, y1_outer = outer_bbox

        # The boxes are disjoint only if one lies entirely to one side of the other.
        return not (x1_inner < x0_outer or x0_inner > x1_outer or
                    y1_inner < y0_outer or y0_inner > y1_outer)
    except (TypeError, ValueError):
        # Unsized input (None, numbers) or non-comparable coordinates.
        return False

def extract_content_with_table_images(pdf_path, output_dir="extracted_content"):
    """
    Extract non-table text, render tables as PNG images and save other images.

    Tables are located with pdfplumber and rendered with PyMuPDF, one PNG per
    table named ``page_<page index>_table_<n>.png``. Text blocks and embedded
    images whose bounding box overlaps a detected table are treated as part
    of that table and left out. Remaining images are saved as
    ``page_<page index>_img_<xref>.<ext>``. An image whose position on the
    page cannot be determined is saved rather than dropped.

    Problems with individual tables or images are reported with ``print`` and
    skipped.

    Args:
        pdf_path (str): The path to the PDF file.
        output_dir (str): Directory for the extracted files; created
            (including parents) if it does not exist.

    Returns:
        dict | None: ``None`` if the PDF could not be opened, otherwise a dict
        with ``'pages'`` (per page: zero-based ``'page_number'`` and
        ``'text_blocks'`` as ``{'text', 'bbox'}`` dicts), ``'saved_tables'``
        and ``'other_images'`` (lists of written file paths).
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    all_content = {
        'pages': [],
        'saved_tables': [],
        'other_images': []
    }

    doc_fitz = None
    doc_plumber = None
    # The outer try/finally releases whichever handles were opened, including
    # the case where the second open() fails after the first succeeded.
    try:
        try:
            doc_fitz = fitz.open(pdf_path)
            doc_plumber = pdfplumber.open(pdf_path)
        except Exception as e:
            print(f"Error opening PDF: {e}")
            return None

        for page_num, page_plumber in enumerate(doc_plumber.pages):
            page_fitz = doc_fitz[page_num]

            page_content = {
                'page_number': page_num,
                'text_blocks': []
            }

            # --- 1. Detect and Save Tables as Images using pdfplumber ---
            table_bboxes = []
            for i, table in enumerate(page_plumber.find_tables()):
                table_bbox = table.bbox
                if len(table_bbox) != 4:
                    continue
                table_bboxes.append(table_bbox)

                try:
                    pix = page_fitz.get_pixmap(clip=table_bbox)
                    image_path = os.path.join(output_dir, f"page_{page_num}_table_{i}.png")
                    pix.save(image_path)
                    all_content['saved_tables'].append(image_path)
                    print(f"Saved table as image: {image_path}")
                except Exception as e:
                    print(f"Error saving table image on page {page_num}: {e}")

            # --- 2. Extract Text Blocks that are NOT part of a table ---
            for block in page_fitz.get_text("blocks"):
                x0, y0, x1, y1, text, _, block_type = block
                if block_type != 0:
                    continue
                block_bbox = (x0, y0, x1, y1)

                if any(is_within_bbox(block_bbox, table_bbox) for table_bbox in table_bboxes):
                    continue

                page_content['text_blocks'].append({
                    'text': text.strip(),
                    'bbox': block_bbox
                })

            # --- 3. Extract and save other images ---
            for img_info in page_fitz.get_images(full=True):
                xref = img_info[0]

                # The entries of get_images() carry no bounding box (index 7 is
                # the image's name), so the position is looked up on the page.
                try:
                    img_bbox = page_fitz.get_image_bbox(img_info)
                except Exception as e:
                    print(f"Could not locate image xref {xref} on page {page_num}: {e}")
                    img_bbox = None

                # An infinite/empty rectangle carries no position information
                # (presumably the image is not displayed on this page).
                if img_bbox is not None and (img_bbox.is_infinite or img_bbox.is_empty):
                    img_bbox = None

                if img_bbox is not None and any(
                    is_within_bbox(img_bbox, table_bbox) for table_bbox in table_bboxes
                ):
                    continue

                # Reset per image so the error message never reports a stale
                # path left over from a previous table or image.
                image_path = None
                try:
                    image_object = doc_fitz.extract_image(xref)
                    image_data = image_object['image']
                    image_ext = image_object['ext']

                    image_filename = f"page_{page_num}_img_{xref}.{image_ext}"
                    image_path = os.path.join(output_dir, image_filename)

                    with open(image_path, "wb") as img_file:
                        img_file.write(image_data)
                    all_content['other_images'].append(image_path)
                    print(f"Saved other image: {image_path}")
                except Exception as e:
                    target = image_path if image_path is not None else f"xref {xref}"
                    print(f"Error saving other image {target}: {e}")

            all_content['pages'].append(page_content)
    finally:
        for handle in (doc_fitz, doc_plumber):
            if handle is not None:
                handle.close()

    return all_content

# --- Example Usage ---
if __name__ == '__main__':
    # Input PDF for this example; it must exist in the working directory.
    pdf_file_path = 'test_lib.pdf'

    # Uses the function's default output directory.
    extracted_data = extract_content_with_table_images(pdf_file_path)

    if extracted_data:
        print(
            "\n--- Extraction Summary ---",
            f"Total tables saved as images: {len(extracted_data['saved_tables'])}",
            f"Total other images saved: {len(extracted_data['other_images'])}",
            sep="\n",
        )

        # Preview the first 50 characters of every text block kept for each page.
        for page in extracted_data['pages']:
            print(f"\n--- Page {page['page_number'] + 1} Non-Table Text ---")
            for text_block in page['text_blocks']:
                print(f"  - Text: {text_block['text'][:50]}...")

Saved table as image: extracted_content/page_0_table_0.png
Saved other image: extracted_content/page_0_img_8.png
Saved other image: extracted_content/page_0_img_9.png

--- Extraction Summary ---
Total tables saved as images: 1
Total other images saved: 2

--- Page 1 Non-Table Text ---
  - Text: ...
  - Text: ...
  - Text: This is test for library...
  - Text: ...
  - Text: ...
  - Text: ...
  - Text: ...
  - Text: There is data in the ﬁrst box 
I’ve added...
  - Text: This is important data in 
second box that I’ve ad...


In [7]:
import fitz  # PyMuPDF library
import io
import os
from PIL import Image

def _block_text(block):
    """Join the span texts of one text block, emitting one line per block line."""
    block_text = ""
    for line in block.get("lines", []):
        for span in line["spans"]:
            block_text += span["text"] + " "
        block_text += "\n"
    return block_text.strip()


def extract_and_structure_pdf(pdf_path: str, output_dir: str = "output"):
    """
    Extract text, images and tables from a PDF in top-to-bottom reading order.

    For every page the images, detected tables and text blocks are collected
    with their bounding boxes and sorted by ``(y0, x0)``. Images are written
    as ``page<N>_img<M>.<ext>``, tables are rendered at 300 dpi as
    ``page<N>_table<M>.png`` (both one-based), and the text is written to
    ``extracted_ordered_text.txt`` with ``[IMAGE: ...]`` / ``[TABLE: ...]``
    placeholders at the position of each saved file.

    Text blocks whose top-left and bottom-right corners both lie inside an
    image or table region are treated as part of that element and omitted
    from the text output. Images without a usable position on the page are
    skipped with a warning.

    Args:
        pdf_path (str): The file path to the PDF document.
        output_dir (str): The directory where extracted files will be saved;
            created if it does not exist.

    Returns:
        None. Progress and warnings are printed; if the PDF cannot be opened
        an error is printed and nothing is written.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created directory: {output_dir}")

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF file: {e}")
        return

    # try/finally guarantees the document is closed even if page processing
    # or writing the text file raises.
    try:
        print(f"Processing '{pdf_path}' with {len(doc)} pages...")
        final_ordered_content = []

        for page_num, page in enumerate(doc):
            print(f"  -> Processing Page {page_num + 1}...")
            elements = []

            # --- Images ---
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                try:
                    bbox = page.get_image_bbox(img)
                except ValueError as e:
                    print(f"      - Warning: Could not get bbox for image {img_index} on page {page_num + 1}. Skipping. Error: {e}")
                    continue
                if not bbox:
                    continue
                # An infinite/empty rectangle gives the image no position to
                # sort by (presumably it is not displayed on this page), and an
                # infinite one would also swallow every text block below.
                if bbox.is_infinite or bbox.is_empty:
                    print(f"      - Warning: Image {img_index} on page {page_num + 1} has no usable position. Skipping.")
                    continue
                elements.append({"type": "image", "bbox": bbox, "xref": xref, "index": img_index})

            # --- Tables ---
            try:
                table_finder = page.find_tables()
                for table_index, table in enumerate(table_finder.tables):
                    if table.bbox:
                        elements.append({"type": "table", "bbox": table.bbox, "index": table_index})
            except Exception as e:
                print(f"      - Warning: Table detection failed on page {page_num + 1}. Error: {e}")

            # --- Text ---
            # Snapshot the image/table regions first: text must only be tested
            # against those, never against text blocks appended in this loop.
            non_text_rects = [fitz.Rect(el["bbox"]) for el in elements]

            for block in page.get_text("dict").get("blocks", []):
                if block.get('type') != 0 or not block.get('bbox'):
                    continue  # not a text block
                block_bbox = fitz.Rect(block['bbox'])

                is_part_of_other_element = any(
                    region.contains(block_bbox.tl) and region.contains(block_bbox.br)
                    for region in non_text_rects
                )
                if is_part_of_other_element:
                    continue

                elements.append({"type": "text", "bbox": block_bbox, "content": _block_text(block)})

            # Reading order: top to bottom, then left to right.
            elements.sort(key=lambda el: (fitz.Rect(el["bbox"]).y0, fitz.Rect(el["bbox"]).x0))

            page_content = [f"--- Page {page_num + 1} ---"]
            for el in elements:
                if el["type"] == "text":
                    page_content.append(el["content"])

                elif el["type"] == "image":
                    try:
                        base_image = doc.extract_image(el["xref"])
                        image_bytes = base_image["image"]

                        img_filename = f"page{page_num+1}_img{el['index']+1}.{base_image['ext']}"
                        img_filepath = os.path.join(output_dir, img_filename)
                        with open(img_filepath, "wb") as img_file:
                            img_file.write(image_bytes)

                        page_content.append(f"\n[IMAGE: {img_filename}]\n")
                    except Exception as e:
                        print(f"      - Warning: Could not extract image {el['index']} on page {page_num + 1}. Error: {e}")

                elif el["type"] == "table":
                    try:
                        pix = page.get_pixmap(clip=fitz.Rect(el["bbox"]), dpi=300)
                        table_filename = f"page{page_num+1}_table{el['index']+1}.png"
                        table_filepath = os.path.join(output_dir, table_filename)
                        pix.save(table_filepath)

                        page_content.append(f"\n[TABLE: {table_filename}]\n")
                    except Exception as e:
                        print(f"      - Warning: Could not extract table {el['index']} on page {page_num + 1}. Error: {e}")

            final_ordered_content.append("\n\n".join(page_content))

        output_text_path = os.path.join(output_dir, "extracted_ordered_text.txt")
        with open(output_text_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(final_ordered_content))
    finally:
        doc.close()

    print("\nProcessing complete.")
    print(f"Check the '{output_dir}' directory for the extracted images, tables, and the ordered text file.")


if __name__ == '__main__':
    # PDF analysed by this example run; results go to the "pdf_output" directory.
    pdf_file_path = "sample.pdf"

    if os.path.exists(pdf_file_path):
        extract_and_structure_pdf(pdf_file_path, output_dir="pdf_output")
    else:
        # Nothing to process: tell the user how to provide an input file.
        print(f"Error: The file '{pdf_file_path}' was not found.")
        print("Please create a 'sample.pdf' or update the 'pdf_file_path' variable in the script.")


Processing 'sample.pdf' with 2 pages...
  -> Processing Page 1...
  -> Processing Page 2...

Processing complete.
Check the 'pdf_output' directory for the extracted images, tables, and the ordered text file.
