In [7]:
import fitz
from PIL import Image

In [8]:
# Open the source PDF and take its first page.
doc = fitz.open("table.pdf")
page = doc.load_page(0)
# get_drawings() yields a list of "paths"; each one can be drawn again
# on another page through a Shape object.
paths = page.get_drawings()

In [9]:
# Build an empty output PDF whose single page copies the source page size.
outpdf = fitz.open()
source_rect = page.rect
outpage = outpdf.new_page(
    width=source_rect.width,
    height=source_rect.height,
)
# The Shape object is the drawing canvas bound to the output page.
shape = outpage.new_shape()

In [10]:
# --------------------------------------
# loop through the paths and draw them
# --------------------------------------
def _finish_kwargs(path):
    """Translate one get_drawings() path dict into Shape.finish() keywords.

    Entries that are missing or None (properties that do not apply to the
    path's type) are replaced by neutral defaults so finish() never
    receives None for a numeric argument.
    """

    def pick(key, default):
        value = path.get(key)
        return default if value is None else value

    # lineCap may be reported as a tuple (one value per line end); use the
    # largest one, as Shape.finish() accepts a single integer.
    line_cap = path.get("lineCap")
    if isinstance(line_cap, (tuple, list)):
        line_cap = max(line_cap) if line_cap else None

    return dict(
        fill=path.get("fill"),  # fill color (None: no fill)
        color=path.get("color"),  # stroke color (None: no stroke)
        dashes=path.get("dashes"),  # line dashing
        even_odd=pick("even_odd", True),  # control color of overlaps
        closePath=pick("closePath", False),  # connect last and first point?
        lineJoin=pick("lineJoin", 0),  # how line joins should look
        lineCap=0 if line_cap is None else line_cap,  # how line ends look
        width=pick("width", 1),  # line width
        stroke_opacity=pick("stroke_opacity", 1),
        fill_opacity=pick("fill_opacity", 1),
    )


for path in paths:
    # ------------------------------------
    # draw each entry of the 'items' list
    # ------------------------------------
    for item in path["items"]:  # these are the draw commands
        if item[0] == "l":  # line
            shape.draw_line(item[1], item[2])
        elif item[0] == "re":  # rectangle
            shape.draw_rect(item[1])
        elif item[0] == "qu":  # quad
            shape.draw_quad(item[1])
        elif item[0] == "c":  # curve
            shape.draw_bezier(item[1], item[2], item[3], item[4])
        else:
            raise ValueError("unhandled drawing", item)
    # ------------------------------------------------------
    # all items are drawn, now apply the common properties
    # (colors, width, dashes, opacity, ...) to finish the path;
    # calling finish() without them would redraw every path with
    # Shape's defaults instead of its extracted appearance
    # ------------------------------------------------------
    shape.finish(**_finish_kwargs(path))

In [11]:
# Every path has been drawn: write the Shape's content to its page,
# then store the resulting document on disk.
extracted_pdf_name = "test_file_extracted.pdf"
shape.commit()
outpdf.save(extracted_pdf_name)

In [12]:
import PIL

In [None]:
# Render the first page of the regenerated PDF and write it to an image file.
doc = fitz.open('test_file_extracted.pdf')
page = doc.load_page(0)
pix = page.get_pixmap()
# Pixmap.save() only writes the file; it has no return value worth keeping,
# so the result is not bound to a name.
pix.save('h.jpg')

In [44]:
import fitz
from PIL import Image

# Open the PDF file
doc = fitz.open('test_file_extracted.pdf')

# Load the desired page
page = doc.load_page(0)

# Get the pixel map of the page
pix = page.get_pixmap()

# Get the pixel width and height of the page
width = pix.width
height = pix.height

# Convert the pixel map to an Image object
image = Image.frombytes("RGB", (width, height), pix.samples)

# JPEG quality is an encoder setting and cannot be derived from the PDF:
# PyMuPDF's metadata dict has no 'pdf:PDFVersion' entry, and a PDF version
# such as "1.7" is neither an integer nor a quality level. Use one explicit
# value instead (100 is what the previous fallback branch always applied).
JPEG_QUALITY = 100
image.save('h1.jpg', quality=JPEG_QUALITY)

In [13]:
from pdf2image import convert_from_path

# PDF that will be rasterised page by page
pdf_path = 'test_file_extracted.pdf'

# convert_from_path returns one image per page of the PDF
images = convert_from_path(pdf_path)

# Write the pages out as page_1.jpg, page_2.jpg, ...
for i, image in enumerate(images):
    jpeg_name = f'page_{i+1}.jpg'
    image.save(jpeg_name, format='JPEG')

In [17]:
import re
import cv2
import pytesseract
from PIL import Image
from pytesseract import Output

# Load the scanned invoice; cv2.imread returns None instead of raising
# when the file cannot be read, so fail early with a clear message.
img = cv2.imread('invoice.jpg')
if img is None:
    raise FileNotFoundError("could not read image 'invoice.jpg'")

# Run OCR and get per-word boxes, text and confidence as a dict of lists.
d = pytesseract.image_to_data(img, output_type=Output.DICT)
keys = list(d.keys())

# dd/mm/yyyy with a 19xx or 20xx year (raw string: the pattern contains \d)
date_pattern = r'^(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[012])/(19|20)\d\d$'

n_boxes = len(d['text'])
for i in range(n_boxes):
    # Confidence may arrive as an int, a float or a numeric string such as
    # '96.5' depending on the Tesseract/pytesseract version; float() accepts
    # all of them, whereas int() rejects float-formatted strings.
    if float(d['conf'][i]) > 60:
        if re.match(date_pattern, d['text'][i]):
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            # Outline every confidently recognised date in green.
            img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()  # release the preview window once a key is pressed

# img is a NumPy array (it has no save() method) in OpenCV's BGR channel
# order; convert it to an RGB Pillow image, which can be written as a PDF.
Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).save("sample.pdf")