<a href="https://colab.research.google.com/github/GhazaleZe/Python-Exercises/blob/main/pdf_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [4]:
import fitz  # PyMuPDF
import os
from pathlib import Path

def extract_and_save_images(pdf_path, output_dir):
    """
    Extract every image listed on the pages of a PDF and save it to a directory.

    Files are named ``page_<page index>_img_<xref>.<ext>``. The page index is
    0-based and ``xref`` is the first field of each entry returned by
    ``page.get_images(full=True)``. An xref listed on several pages is
    therefore written once per page.

    Problems are reported with ``print`` instead of being raised:

    * if the PDF cannot be opened, an empty list is returned;
    * if a single image cannot be extracted or written, it is skipped and
      the remaining images are still processed.

    Args:
        pdf_path (str): The path to the PDF file.
        output_dir (str): The path to the directory where images will be
            saved. It is created (including parents) if it does not exist.

    Returns:
        list[str]: Paths of the images that were written successfully.
    """
    # Create the output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return []

    saved_image_paths = []

    # try/finally guarantees the document handle is released even when a page
    # or image raises something unexpected part-way through the loop.
    try:
        for page_num in range(doc.page_count):
            page = doc[page_num]

            # Each entry describes one image on the page; field 0 is its xref.
            for img_info in page.get_images(full=True):
                xref = img_info[0]

                # One unreadable image must not abort the whole extraction,
                # so extraction errors (or a result lacking the expected
                # keys) are reported and the image is skipped.
                try:
                    image_object = doc.extract_image(xref)
                    image_data = image_object["image"]
                    # File extension reported for the image (e.g. "png", "jpeg")
                    image_ext = image_object["ext"]
                except Exception as e:
                    print(f"Error extracting image xref {xref} on page {page_num}: {e}")
                    continue

                # Page index + xref keeps filenames unique within one document
                image_filename = f"page_{page_num}_img_{xref}.{image_ext}"
                image_path = os.path.join(output_dir, image_filename)

                # Write the image data to a file
                try:
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_data)
                except Exception as e:
                    print(f"Error saving image {image_path}: {e}")
                else:
                    saved_image_paths.append(image_path)
                    print(f"Saved image: {image_path}")
    finally:
        doc.close()

    return saved_image_paths

# --- Combined text and image extraction ---

def extract_and_save_all_content(pdf_path, images_output_dir="extracted_images", skip_empty_text=False):
    """
    Extract the text blocks of every page and save all page images to disk.

    Images are written as ``page_<page index>_img_<xref>.<ext>`` inside
    ``images_output_dir``. Problems are reported with ``print`` rather than
    raised: an image that cannot be extracted or written is skipped and
    processing continues with the next one.

    Args:
        pdf_path (str): The path to the PDF file.
        images_output_dir (str): The directory to save images to. It is
            created (including parents) if it does not exist.
        skip_empty_text (bool): When True, text blocks whose text is empty
            after stripping whitespace are left out of the result. Defaults
            to False, which keeps every text block.

    Returns:
        dict | None: ``None`` if the PDF could not be opened. Otherwise a
        dictionary with two keys:

        * ``'pages'``: one dict per page with ``'page_number'`` (0-based),
          ``'page_size'`` as ``(width, height)`` taken from ``page.rect``,
          and ``'text_blocks'``, a list of ``{'text', 'bbox'}`` dicts where
          ``bbox`` is ``(x0, y0, x1, y1)``.
        * ``'saved_images'``: paths of the images written successfully.
    """
    Path(images_output_dir).mkdir(parents=True, exist_ok=True)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    all_content = {'pages': [], 'saved_images': []}

    # try/finally guarantees the document handle is released even when a page
    # or image raises something unexpected part-way through the loop.
    try:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            page_content = {
                'page_number': page_num,
                'page_size': (page.rect.width, page.rect.height),
                'text_blocks': []
            }

            # --- 1. Extract Text Blocks ---
            # Each block is a 7-tuple: bbox corners, text, block number, type.
            for block in page.get_text("blocks"):
                x0, y0, x1, y1, text, _, block_type = block
                # Type 0 is PyMuPDF's marker for text blocks; others are skipped.
                if block_type != 0:
                    continue
                stripped_text = text.strip()
                if skip_empty_text and not stripped_text:
                    continue
                page_content['text_blocks'].append({
                    'text': stripped_text,
                    'bbox': (x0, y0, x1, y1)
                })

            # --- 2. Extract and Save Images ---
            for img_info in page.get_images(full=True):
                xref = img_info[0]

                # One unreadable image must not abort the whole extraction,
                # so extraction errors (or a result lacking the expected
                # keys) are reported and the image is skipped.
                try:
                    image_object = doc.extract_image(xref)
                    image_data = image_object['image']
                    image_ext = image_object['ext']
                except Exception as e:
                    print(f"Error extracting image xref {xref} on page {page_num}: {e}")
                    continue

                image_filename = f"page_{page_num}_img_{xref}.{image_ext}"
                image_path = os.path.join(images_output_dir, image_filename)

                try:
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_data)
                except Exception as e:
                    print(f"Error saving image {image_path}: {e}")
                else:
                    all_content['saved_images'].append(image_path)
                    print(f"Saved image: {image_path}")

            all_content['pages'].append(page_content)
    finally:
        doc.close()

    return all_content

# --- Example Usage ---
if __name__ == '__main__':
    # Folder that receives every image pulled out of the PDF.
    output_directory = "extracted_images"

    # Input PDF; a file with this name must exist in the working directory.
    pdf_file_path = 'test_pdf.pdf'

    # Collect the text blocks and write the images in one call.
    extracted_results = extract_and_save_all_content(pdf_file_path, output_directory)

    # Nothing is reported when the call gives back a falsy result
    # (it returns None if the PDF could not be opened).
    if extracted_results:
        print("\n--- Extraction Summary ---")
        print(f"Total images saved: {len(extracted_results['saved_images'])}")
        print(f"Saved images are in the '{output_directory}' directory.")

        # One line per page; the text blocks themselves stay available in
        # extracted_results['pages'] for further processing.
        for page in extracted_results['pages']:
            print(f"\n--- Page {page['page_number'] + 1} has {len(page['text_blocks'])} text blocks ---")

Saved image: extracted_images/page_0_img_8.png
Saved image: extracted_images/page_0_img_9.png

--- Extraction Summary ---
Total images saved: 2
Saved images are in the 'extracted_images' directory.

--- Page 1 has 9 text blocks ---


In [5]:
extracted_results

{'pages': [{'page_number': 0,
   'page_size': (612.0, 792.0),
   'text_blocks': [{'text': '',
     'bbox': (540.0,
      195.8520050048828,
      542.4359741210938,
      210.50401306152344)},
    {'text': '',
     'bbox': (72.0, 217.21200561523438, 74.43599700927734, 231.864013671875)},
    {'text': 'This is test for library',
     'bbox': (72.0,
      242.41201782226562,
      180.12742614746094,
      257.06402587890625)},
    {'text': '',
     'bbox': (540.0, 537.6119995117188, 542.4359741210938, 552.2639770507812)},
    {'text': '',
     'bbox': (72.0, 558.9719848632812, 74.43599700927734, 573.6239624023438)},
    {'text': '',
     'bbox': (72.0, 583.9320068359375, 98.43599700927734, 598.583984375)},
    {'text': '',
     'bbox': (72.0, 608.8920288085938, 74.43599700927734, 623.5440063476562)},
    {'text': 'There is data in the ﬁrst box \nI’ve added',
     'bbox': (109.19994354248047,
      585.6119995117188,
      251.7911834716797,
      617.3040161132812)},
    {'text': 'This 

In [7]:
extracted_results['pages'][0]

{'page_number': 0,
 'page_size': (612.0, 792.0),
 'text_blocks': [{'text': '',
   'bbox': (540.0, 195.8520050048828, 542.4359741210938, 210.50401306152344)},
  {'text': '',
   'bbox': (72.0, 217.21200561523438, 74.43599700927734, 231.864013671875)},
  {'text': 'This is test for library',
   'bbox': (72.0, 242.41201782226562, 180.12742614746094, 257.06402587890625)},
  {'text': '',
   'bbox': (540.0, 537.6119995117188, 542.4359741210938, 552.2639770507812)},
  {'text': '',
   'bbox': (72.0, 558.9719848632812, 74.43599700927734, 573.6239624023438)},
  {'text': '',
   'bbox': (72.0, 583.9320068359375, 98.43599700927734, 598.583984375)},
  {'text': '',
   'bbox': (72.0, 608.8920288085938, 74.43599700927734, 623.5440063476562)},
  {'text': 'There is data in the ﬁrst box \nI’ve added',
   'bbox': (109.19994354248047,
    585.6119995117188,
    251.7911834716797,
    617.3040161132812)},
  {'text': 'This is important data in \nsecond box that I’ve added',
   'bbox': (376.55999755859375,
    5