# In [10]:
import fitz  # PyMuPDF
import os

# Source PDF to process and the directory that receives the extracted files
pdf_file = "Input.pdf"
output_folder = "extracted_images"

# Load the document with PyMuPDF
doc = fitz.open(pdf_file)

# Make sure the destination directory is present (no error if it already is)
os.makedirs(name=output_folder, exist_ok=True)

def is_in_column(rect, column_x_start, column_x_end):
    """Tell whether *rect* lies horizontally inside a column.

    Args:
        rect: Object exposing ``x0`` (left edge) and ``x1`` (right edge).
        column_x_start: Left boundary of the column (inclusive).
        column_x_end: Right boundary of the column (inclusive).

    Returns:
        True when ``rect.x0`` is not left of ``column_x_start`` and
        ``rect.x1`` is not right of ``column_x_end``; False otherwise.
    """
    starts_inside = column_x_start <= rect.x0
    if not starts_inside:
        return False
    return rect.x1 <= column_x_end

# Loop over all pages of the PDF file, saving the embedded images and a
# rendered snapshot of every vector drawing found in the detected column.
for page_num, page in enumerate(doc, start=1):
    drawings = page.get_drawings()

    # Entries returned by get_images(full=True) carry no position (index 3 is
    # the image height in pixels), so ask the page where each image is placed.
    placed_images = []  # (xref, [Rect, ...]) for every image shown on the page
    for img in page.get_images(full=True):
        xref = img[0]
        rects = page.get_image_rects(xref)
        if rects:  # referenced but never displayed -> no position to test
            placed_images.append((xref, rects))

    # Automatically detect the column start and end coordinates from the
    # horizontal extent of all images and drawings on the page.
    # NOTE: derived this way the column spans every item on the page, so the
    # is_in_column() filter below accepts all of them; set narrower bounds here
    # to target one specific column.
    item_rects = [path["rect"] for path in drawings]
    for _, rects in placed_images:
        item_rects.extend(rects)
    column_x_start = min((r.x0 for r in item_rects), default=float("inf"))
    column_x_end = max((r.x1 for r in item_rects), default=float("-inf"))

    # Shared counter: numbers images and drawings consecutively per page
    image_count = 0

    # Extract and save images in the column
    for xref, rects in placed_images:
        if not any(is_in_column(r, column_x_start, column_x_end) for r in rects):
            continue
        # Decode the image only once it is known to be wanted
        base_image = doc.extract_image(xref)
        image_ext = base_image["ext"]
        image_count += 1
        output_image_path = os.path.join(
            output_folder, f"page_{page_num}_image_{image_count}.{image_ext}"
        )
        with open(output_image_path, "wb") as f:
            f.write(base_image["image"])

    # Extract and save drawings in the column
    for path in drawings:
        rect = path["rect"]
        if not is_in_column(rect, column_x_start, column_x_end):
            continue

        # Render just the area of this drawing. The vector graphics are already
        # part of the page, so nothing is re-drawn (which would alter the page
        # and affect later renders). Pad the box so that straight lines, whose
        # rectangle has zero width or height, still yield a non-empty clip.
        margin = 1 + (path.get("width") or 0) / 2
        clip = fitz.Rect(
            rect.x0 - margin, rect.y0 - margin, rect.x1 + margin, rect.y1 + margin
        ) & page.rect
        if clip.is_empty:  # drawing lies completely outside the visible page
            continue

        image_count += 1
        output_image_path = os.path.join(
            output_folder, f"page_{page_num}_drawing_{image_count}.png"
        )
        # Save the drawing area as an image
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=clip)  # Adjust scaling if needed
        pix.save(output_image_path)

# Release the file handle held by the document
doc.close()

print(f"Extracted images and drawings saved in {output_folder}")
