<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [4]:
import matplotlib.pyplot as plt
import cv2
from pdf2image import convert_from_path
import numpy as np
import pytesseract
import os
import shutil

# Locate the Tesseract executable without assuming one machine's layout:
#   1. an optional TESSERACT_CMD environment variable (explicit override),
#   2. a `tesseract` binary found on PATH,
#   3. the default Windows install location that was previously hard-coded.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get('TESSERACT_CMD')
    or shutil.which('tesseract')
    or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [5]:
class Preprocessing_PDF():
    """Rasterises a PDF and binarises every page.

    ``preprocessing`` returns the binarised pages as a dictionary and
    ``save_preprocessing_images`` writes them as PNG files in the current
    working directory.
    """

    def __init__(self, path, file_name):
        """Store the document location.

        Args:
            path: kept on the instance; no method of this class reads it.
            file_name: path of the PDF handed to ``convert_from_path``.
        """
        self.path = path
        self.file_name = file_name

    def preprocessing(self):
        """Convert each page of the PDF into a binarised image.

        Every page is rendered at 600 dpi, converted to grayscale, smoothed
        with a 5x5 Gaussian blur and thresholded with Otsu's method.

        Returns:
            dict: ``'page_1'``, ``'page_2'``, ... mapped to the thresholded
            image of the corresponding page, in page order.
        """
        pages = convert_from_path(self.file_name, dpi=600)
        processed_images = {}
        for page_number, page in enumerate(pages, start=1):
            # pdf2image yields PIL images, whose channel order is RGB (not
            # OpenCV's native BGR), so the RGB conversion code is required
            # to weight the red and blue channels correctly.
            grey = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2GRAY)
            # Blur before thresholding to reduce noise
            blur = cv2.GaussianBlur(grey, (5, 5), 0)
            # With THRESH_OTSU the threshold is chosen automatically; the
            # value that was picked is returned first and is not needed here.
            _, binarized = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            processed_images[f'page_{page_number}'] = binarized
        return processed_images

    def save_preprocessing_images(self):
        """Run ``preprocessing`` and write each page to ``page_<n>.png``.

        The files are written to the current working directory. Note that
        this re-runs the whole conversion; it does not reuse the result of an
        earlier ``preprocessing`` call.
        """
        # The dictionary keys are already 'page_<n>', so they double as file stems.
        for page_name, image in self.preprocessing().items():
            cv2.imwrite(f'{page_name}.png', image)
        print("images have been successfully saved")

In [21]:
class Zonification():
    """Finds text blocks on the preprocessed pages, crops them and OCRs them.

    ``zone_identification`` returns the cropped blocks,
    ``block_image_text_comparison`` plots each block with its recognised text
    and ``save_extracted_text`` writes the recognised text to
    ``extracted_text.txt``. Each of these methods re-runs the full
    preprocessing and zone detection when called.
    """

    def __init__(self, path, file_name):
        """Build the preprocessing helper for the given document.

        Args:
            path: forwarded to ``Preprocessing_PDF`` as its ``path``.
            file_name: forwarded to ``Preprocessing_PDF`` as its ``file_name``.
        """
        self.path = path
        self.file_name = file_name
        self.preprocessed = Preprocessing_PDF(self.path, self.file_name)

    def zone_identification(self):
        """Detect zones on every page and return their cropped images.

        For each page the Canny edges are dilated with a 25x25 rectangular
        kernel (4 iterations), and the bounding box of every external contour
        of the dilated image becomes one block. As a side effect an annotated
        copy of each page is written to ``page_with_contours_<k>.png`` in the
        current working directory.

        Returns:
            dict: ``'bloc_1'``, ``'bloc_2'``, ... mapped to the cropped image
            of each zone; numbering continues across pages.
        """
        image_blocks = {}
        block_count = 0
        # The structuring element does not depend on the page, so build it once.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25))
        pages = self.preprocessed.preprocessing().values()
        for page_number, page in enumerate(pages, start=1):
            edged = cv2.Canny(page, 30, 200)
            dilate = cv2.dilate(edged, kernel, iterations=4)
            contours = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            # findContours returns 2 or 3 values depending on the OpenCV
            # version; pick the element holding the contours in either case.
            contours = contours[0] if len(contours) == 2 else contours[1]
            # Draw on a copy: the crops below are slices of ``page``, so
            # drawing on ``page`` itself would put the rectangle borders into
            # the images that are later sent to OCR.
            annotated = page.copy()
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                cv2.rectangle(img=annotated, pt1=(x, y), pt2=(x + w, y + h), color=(0, 42, 93), thickness=3)
                block_count += 1
                image_blocks["bloc_" + str(block_count)] = page[y:y + h, x:x + w]
            # Saves the page with the detected zones outlined
            cv2.imwrite("page_with_contours_" + str(page_number) + ".png", annotated)
        return image_blocks

    def block_image_text_comparison(self):
        """Plot every detected block with its OCR text as the figure title.

        Blocks are numbered from 1 in the titles, matching the ``bloc_<n>``
        keys produced by ``zone_identification``.
        """
        blocks = self.zone_identification().values()
        for block_number, block in enumerate(blocks, start=1):
            plt.figure()
            plt.imshow(block, cmap='Greys_r')
            text = pytesseract.image_to_string(block, config='--psm 6')
            plt.title("Texte extrait " + "block_" + str(block_number) + " :" + text + "\n")

    def save_extracted_text(self):
        """OCR every detected block and write the text to ``extracted_text.txt``.

        The file is created (or overwritten) in the current working directory,
        UTF-8 encoded, with the text of each block followed by a newline.
        """
        # Detect the zones before opening the file so that a failure in the
        # image pipeline does not truncate an existing output file.
        blocks = self.zone_identification().values()
        # The context manager guarantees the file is flushed and closed.
        with open("extracted_text.txt", "w", encoding='utf8') as extracted_text_file:
            for block in blocks:
                extracted_text_file.write(pytesseract.image_to_string(block, config='--psm 6') + "\n")
        print("extracted text has successfully been saved")
    

In [38]:
# The same PDF is passed for both arguments; Preprocessing_PDF only reads file_name.
file_path='invoice.pdf'
file_name='invoice.pdf'
preprocessed_pdf_file = Preprocessing_PDF(file_path,file_name)
# save_preprocessing_images() calls preprocessing() itself, so a separate
# preprocessing() call here would only repeat the 600-dpi conversion and
# throw the result away.
preprocessed_pdf_file.save_preprocessing_images()

images have been successfully saved


In [22]:
# Detects the text zones of the invoice, OCRs each zone and writes the result
# to extracted_text.txt (page_with_contours_<k>.png files are written as a side effect).
# Relies on file_path and file_name defined in the preprocessing cell.
zonified = Zonification(file_path,file_name)
zonified.save_extracted_text()