In [1]:
import fitz
from PIL import Image
import os

In [2]:
# Path of the source PDF; later cells also derive their output file names from it
pdf_file = "p.pdf"
# Source document whose pages are read by the following cells
doc = fitz.open(pdf_file)
# Separate document opened without a file name; the re-drawn pages are added to it
outpdf = fitz.open()

In [3]:
# Re-draw the vector graphics of each source page onto a new page of the output PDF.
# Every draw command is a sequence whose first entry names the operation; this table
# maps that name to the matching Shape call, passing exactly the operands it needs.
draw_ops = {
    "l": lambda canvas, cmd: canvas.draw_line(cmd[1], cmd[2]),  # line
    "re": lambda canvas, cmd: canvas.draw_rect(cmd[1]),  # rectangle
    "qu": lambda canvas, cmd: canvas.draw_quad(cmd[1]),  # quad
    "c": lambda canvas, cmd: canvas.draw_bezier(cmd[1], cmd[2], cmd[3], cmd[4]),  # curve
}

for page_num, page in enumerate(doc):
    drawings = page.get_drawings()  # drawings found on the source page

    # New output page with the source page's dimensions, plus a canvas to draw on
    page_rect = page.rect
    outpage = outpdf.new_page(width=page_rect.width, height=page_rect.height)
    shape = outpage.new_shape()

    for drawing in drawings:
        for cmd in drawing["items"]:
            draw = draw_ops.get(cmd[0])
            if draw is None:
                # Fail loudly instead of silently dropping an unknown command
                raise ValueError("unhandled drawing", cmd)
            draw(shape, cmd)
        # Close the current path before starting the next one
        shape.finish()

    # Write everything drawn on the canvas to its page
    shape.commit()

In [4]:
# Write the re-drawn document to "<source path without extension>_extracted.pdf"
source_stem, _source_ext = os.path.splitext(pdf_file)
output_pdf_path = source_stem + "_extracted.pdf"
outpdf.save(output_pdf_path)

In [6]:
# Render every page of the extracted PDF and save each one as
# "<source path without extension>_page_<N>.jpg" (N starts at 1).
zoom = 5.0  # scale factor per axis; larger values give sharper but bigger images
mat = fitz.Matrix(zoom, zoom)  # identical for every page, so build it once
image_stem = os.path.splitext(pdf_file)[0]  # loop-invariant part of the file name

# The context manager closes the reopened document once all pages are rendered,
# so its file handle is not left open for the rest of the kernel session.
with fitz.open(output_pdf_path) as extracted_doc:
    for page_num in range(len(extracted_doc)):
        page = extracted_doc.load_page(page_num)
        pix = page.get_pixmap(matrix=mat)
        output_image_path = f"{image_stem}_page_{page_num + 1}.jpg"
        pix.save(output_image_path)