In [1]:
# One-time setup (run in the kernel's environment): %pip install opencv-contrib-python pymupdf   -> provides cv2 and fitz

In [2]:
import os
import fitz  #pymupdf
from pathlib import Path
import cv2
import numpy as np

In [3]:
def clip_image(i, name, threshold=128):
    """
    Crop an image file, in place, to the bounding box of its dark content.

    i: path of the image file; it is read and then overwritten with the crop
    name: label used only in the progress message
    threshold: grayscale values strictly below this count as content (default 128)

    Raises OSError if the image cannot be read or written back, and
    ValueError if no pixel is darker than `threshold` (nothing to crop to).
    """
    img = cv2.imread(i)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise OSError(f'could not read image {i}')

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Invert: dark (content) pixels become 255, light background becomes 0
    mask = 255*(gray < threshold).astype(np.uint8)

    coords = cv2.findNonZero(mask)  # all non-zero points (content)
    if coords is None or len(coords) == 0:
        # blank page: there is no bounding box to compute
        raise ValueError(f'no content darker than {threshold} found in {i}')

    x, y, w, h = cv2.boundingRect(coords)  # minimum spanning bounding box
    rect = img[y:y+h, x:x+w]  # crop the original (colour) image, not the mask
    print(f'Cropping image {name} x:{x}, y:{y}, w:{w}, h:{h}')

    # cv2.imwrite returns False when the file could not be saved
    if not cv2.imwrite(i, rect):
        raise OSError(f'could not write cropped image {i}')


In [4]:
def pdf2img(p, zoom=2):
    """
    Render the first page of every PDF in a directory to a PNG, then crop it.

    p: path of pdf directory (str or Path); PNGs go to its 'images' subfolder
    zoom: scale factor applied to both axes when rendering (default 2)

    requires 'clip_image' function

    A failed crop is reported and the loop continues with the next PDF; the
    uncropped PNG is left in place.
    """
    path = Path(p)
    # Build the output folder with pathlib so the separator is correct on any OS
    p_out = path / 'images'
    p_out.mkdir(parents=True, exist_ok=True)  # create the 'images' folder if it doesn't exist

    zz = fitz.Matrix(zoom, zoom)  # same scaling for every page, so build it once

    # NOTE(review): loadPage / getPixmap / writeImage / MediaBox / CropBox are the
    # legacy camelCase PyMuPDF names; newer releases appear to use load_page /
    # get_pixmap / save / mediabox / cropbox. Confirm against the installed version.
    for pdf in sorted(path.glob('*.pdf')):  # sorted for a deterministic order
        print(f'Opening: {pdf}')
        doc = fitz.open(pdf)
        try:
            page = doc.loadPage(0)  # only the first page is rendered
            print(pdf.stem, page.MediaBox)
            pix = page.getPixmap(matrix = zz)
            f_out = str(p_out / (pdf.stem + '.png'))
            pix.writeImage(f_out)

            print(f' rect {page.rect}')
            print(f' CB {page.CropBox}')
            print(f' MB {page.MediaBox}')
        finally:
            doc.close()  # release the document even if rendering failed

        try:
            clip_image(f_out, pdf.stem)
        except Exception as e:
            # Best effort: keep the uncropped PNG, say why, and carry on.
            # Exception (not a bare except) lets a kernel interrupt stop the batch.
            print(f'failed clip on {f_out}: {e}')


In [6]:
# Example run. NOTE: both paths below are absolute paths on one specific machine;
# point them at your own PDF directory before running this cell.

p_in = r'C:\bin\data\GeoTools\PDF2PNG\PDF\220920'  # another pdf directory; not used by the call below
pth = r'C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs'  # pdf directory that is converted here
pdf2img(pth)  # writes one PNG per PDF into an 'images' subfolder of pth

Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA001.pdf
SA001 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.92, 595.32)
Cropping image SA001 x:104, y:123, w:1398, h:82
Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA002_1.pdf
SA002_1 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.92, 595.32)
Cropping image SA002_1 x:105, y:110, w:1418, h:640
Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA002_2.pdf
SA002_2 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.92, 595.32)
Cropping image SA002_2 x:122, y:124, w:1438, h:821
Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA003_1.pdf
SA003_1 Rect(0.0, 0.0, 841.92, 595.32)
 rect

SA022_3 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.92, 595.32)
Cropping image SA022_3 x:157, y:152, w:1380, h:368
Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA023_1.pdf
SA023_1 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.92, 595.32)
Cropping image SA023_1 x:191, y:133, w:895, h:932
Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA023_2.pdf
SA023_2 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.92, 595.32)
Cropping image SA023_2 x:131, y:120, w:1415, h:557
Opening: C:\bin\data\GeoTools\EQN\OneDrive_1_9-23-2022\SA\pdfs\SA024.pdf
SA024 Rect(0.0, 0.0, 841.92, 595.32)
 rect Rect(0.0, 0.0, 841.9199829101562, 595.3200073242188)
 CB Rect(0.0, 0.0, 