# IMPORTED LIBRARIES

In [None]:
import PySimpleGUI as sg
import sys
import fitz
from sys import exit
from collections import defaultdict

In [None]:
import aspose.words as aw

In [None]:
import PyPDF2 as pdf
import pdf2image
from pdf2image import convert_from_path
import numpy as np
import pandas as pd
import os
import io
import matplotlib.pyplot as plt
import pdfplumber

In [None]:
from sklearn.cluster import AgglomerativeClustering
from tabula import read_pdf
from tabulate import tabulate
import argparse
import imutils
from scipy.ndimage import interpolation as inter

In [None]:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [None]:
import cv2
from PIL import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\Guest_jouw\.conda\envs\tesseract\Library\bin\tesseract.exe'
from pytesseract import Output

# FUNCTION FOR DISPLAYING IMAGES IN JUPYTER NOTEBOOK

In [None]:
 #Matplotlib was used to display full size of the image
    
def display(im_path):
    """Show the image stored at ``im_path`` in a borderless matplotlib figure.

    The figure size is derived from the image's pixel dimensions at an assumed
    80 dots per inch, and the axes fill the whole figure with ticks and spines
    hidden.

    Parameters
    ----------
    im_path : path of the image file, handed to ``plt.imread``.
    """
    dots_per_inch = 80
    pixels = plt.imread(im_path)
    rows, cols = pixels.shape[:2]

    # Figure size in inches for the image's width/height at the assumed dpi.
    canvas = plt.figure(
        figsize=(cols / float(dots_per_inch), rows / float(dots_per_inch))
    )

    # A single axes spanning the full figure, without any decorations.
    axes = canvas.add_axes([0, 0, 1, 1])
    axes.axis('off')

    axes.imshow(pixels, cmap='gray')
    plt.show()
        
  

# IMAGE PRE-PROCESSING FUNCTIONS

# CORRECTED SKEW ANGLE

In [None]:
def correct_skew(file, delta=1, limit=10):
    """Estimate the skew of an image and return a rotated (de-skewed) copy.

    Every angle from ``-limit`` to ``limit`` in steps of ``delta`` is applied to
    an inverted Otsu-binarised version of the image. For each candidate the rows
    are summed and the sum of squared differences between adjacent row sums is
    used as the score; the highest-scoring angle (first one on ties) is chosen.

    Parameters
    ----------
    file : image array (not a path) - it is passed directly to
        ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
    delta : step between candidate angles.
    limit : largest absolute candidate angle.

    Returns
    -------
    (best_angle, corrected) : the chosen angle and ``file`` rotated by it about
        its centre, with replicated borders.
    """

    def determine_score(arr, angle):
        # Row-sum profile of the rotated image and how sharply it changes
        # from one row to the next.
        rotated = inter.rotate(arr, angle, reshape=False, order=0)
        profile = np.sum(rotated, axis=1, dtype=float)
        sharpness = np.sum((profile[1:] - profile[:-1]) ** 2, dtype=float)
        return profile, sharpness

    binary = cv2.threshold(
        cv2.cvtColor(file, cv2.COLOR_BGR2GRAY),
        0,
        255,
        cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
    )[1]

    angles = np.arange(-limit, limit + delta, delta)
    scores = [determine_score(binary, candidate)[1] for candidate in angles]
    best_angle = angles[scores.index(max(scores))]

    height, width = file.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), best_angle, 1.0)
    corrected = cv2.warpAffine(
        file,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

    return best_angle, corrected

# WATER_MARK REMOVAL

In [None]:
def water_mark_removal(file):
    """Write a black-and-white version of the image at ``file`` to
    'without_watermark_before.jpg'.

    A background estimate is built by repeated morphological close/open with
    growing elliptical kernels. The difference between that background and the
    grey image is Otsu-thresholded, and the pixels that fall inside the dark
    part of the background are thresholded separately and pasted back in.

    Parameters
    ----------
    file : path of the image, read with ``cv2.imread``.
    """
    grey = cv2.cvtColor(cv2.imread(file), cv2.COLOR_BGR2GRAY)

    # Background estimate: close then open with ellipses of size 1, 3, 5, 7, 9.
    background = grey.copy()
    for step in range(5):
        size = 2 * step + 1
        ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size))
        background = cv2.morphologyEx(background, cv2.MORPH_CLOSE, ellipse)
        background = cv2.morphologyEx(background, cv2.MORPH_OPEN, ellipse)

    inverted_otsu = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    result = cv2.threshold(
        cv2.subtract(background, grey), 0, 255, inverted_otsu
    )[1]
    dark_region = cv2.threshold(background, 0, 255, inverted_otsu)[1]

    # Grey values lying inside the dark region get their own Otsu threshold ...
    dark_index = np.where(dark_region > 0)
    dark_values = cv2.threshold(
        grey[dark_index], 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )[1]

    # ... and are written back at the same positions.
    result[dark_index] = dark_values.T

    cv2.imwrite('without_watermark_before.jpg', result)

# GRAY_SCALING

In [None]:
def grayscale(file):
    """Convert the image at ``file`` from BGR to grey and save it.

    The result is always written to "grey_image.jpg"; nothing is returned.
    """
    grey = cv2.cvtColor(cv2.imread(file), cv2.COLOR_BGR2GRAY)
    cv2.imwrite("grey_image.jpg", grey)

In [None]:
# Preview "grey_image.jpg", the file that grayscale() writes.
display("grey_image.jpg")

# SHADOW_REMOVAL

In [None]:
def shadow_removal(file):
    """Flatten uneven lighting in the image at ``file`` and save the result.

    Each colour plane is dilated, median-blurred to estimate the background,
    subtracted from the plane (inverted so the background becomes bright), and
    min-max normalised to the 0-255 range. The normalised planes are merged
    and written to 'shadows_out.png'; nothing is returned.

    Parameters
    ----------
    file : path of the image, read with flag ``-1`` (``cv2.IMREAD_UNCHANGED``).
    """
    img = cv2.imread(file, -1)

    normalised_planes = []
    for plane in cv2.split(img):
        dilated = cv2.dilate(plane, np.ones((7, 7), np.uint8))
        background = cv2.medianBlur(dilated, 21)
        difference = 255 - cv2.absdiff(plane, background)
        normalised_planes.append(
            cv2.normalize(
                difference,
                None,
                alpha=0,
                beta=255,
                norm_type=cv2.NORM_MINMAX,
                dtype=cv2.CV_8UC1,
            )
        )

    # Only the normalised planes are saved; the un-normalised merge that used
    # to be computed here was never used and has been dropped.
    cv2.imwrite('shadows_out.png', cv2.merge(normalised_planes))

# THRESHOLDING

In [None]:
def thresholding(file):
    """Binarise the image at ``file`` with Otsu's threshold and save it.

    The image is read as greyscale and written to "otsuthreshold.jpg".
    Pixels above the Otsu threshold become 255, all others 0.
    """
    grey = cv2.imread(file, cv2.IMREAD_GRAYSCALE)

    # An inverted binary threshold followed by a bitwise NOT is the same as a
    # plain binary threshold, so the two steps are expressed as one call.
    binary = cv2.threshold(grey, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    cv2.imwrite("otsuthreshold.jpg", binary)

# INVERTED_IMAGE

In [None]:
def invert_image(file):
    """Write the bitwise negative of the image at ``file`` to "inverted.jpg"."""
    negative = cv2.bitwise_not(cv2.imread(file))
    cv2.imwrite("inverted.jpg", negative)
    

In [None]:
# Preview "inverted.jpg", the file that invert_image() writes.
display("inverted.jpg")

# DILATION

In [None]:
def dilation(file):
    """Dilate the image at ``file``, invert it and save it as "dilated.jpg".

    NOTE(review): the structuring element is 1x1, which presumably makes the
    dilation itself a no-op - confirm whether a larger kernel was intended.
    """
    source = cv2.imread(file)
    rect = cv2.getStructuringElement(cv2.MORPH_RECT, (1,1))
    thickened = cv2.dilate(source, rect, iterations=2)

    cv2.imwrite("dilated.jpg", cv2.bitwise_not(thickened))

In [None]:
# Preview "dilated.jpg", the file that dilation() writes.
display("dilated.jpg")

# NOISE_REMOVAL

In [None]:
def noise_removal(file):
    """Apply dilate / erode / close / median-blur to the image at ``file``.

    The cleaned image is written to 'noise_removal1.jpg' and also returned.
    (The write used to sit after the ``return`` statement and therefore never
    ran.)

    Parameters
    ----------
    file : path of the image, read with ``cv2.imread``.

    Returns
    -------
    The processed image array.
    """
    img = cv2.imread(file)

    # NOTE(review): a 1x1 kernel presumably leaves the morphological steps
    # without effect - confirm whether a larger kernel was intended.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)
    img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
    img = cv2.medianBlur(img, 3)

    cv2.imwrite('noise_removal1.jpg', img)
    return img

# NOISE REMOVAL_CONNECTIVITY

In [None]:
def noise_removal_connectivity(file):
    """Remove small connected components from the image at ``file``.

    Components whose pixel area is below ``min_size`` are dropped; the
    remaining ones are drawn in white on black and written to
    "noise_removal_connectivity1.jpg". Nothing is returned.

    Parameters
    ----------
    file : path of the image. It is read as a single-channel image because
        ``cv2.connectedComponentsWithStats`` only accepts 8-bit single-channel
        input (the default 3-channel read is rejected).
    """
    inverted_image = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    connectivity = 4
    min_size = 50  # threshold (in pixels) below which a component is noise

    # connectivity/ltype are passed by keyword: in the Python binding the
    # positional slots right after the image are the output arrays.
    nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(
        inverted_image, connectivity=connectivity, ltype=cv2.CV_32S
    )

    # Row 0 of ``stats`` is the background label; the last column is the area.
    sizes = stats[1:, -1]

    # Labels (1-based, because the background row was skipped) worth keeping.
    keep_labels = np.flatnonzero(sizes >= min_size) + 1

    cleaned = np.zeros(output.shape, np.uint8)
    cleaned[np.isin(output, keep_labels)] = 255

    cv2.imwrite("noise_removal_connectivity1.jpg", cleaned)
    

# NOISE REMOVAL3

In [None]:
def noise_median_blur(file):
    
    img = cv2.imread(file)
    inverted_image = np.array(img)
    median = cv2.medianBlur(inverted_image, 3)
    cv2.imwrite("medianBlur.jpg",median)

# OCR_FOR_IMAGES

In [None]:
def image_to_text(file):
    """Run Tesseract OCR on the image stored at ``file`` and return the text.

    Example::

        print(image_to_text("medianBlur.jpg"))

    Parameters
    ----------
    file : path of the image, read with ``cv2.imread``.

    Returns
    -------
    The string produced by ``pytesseract.image_to_string``.

    Raises
    ------
    FileNotFoundError : if ``cv2.imread`` could not load the image.
    """
    img = cv2.imread(file)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise FileNotFoundError(f"Could not read image: {file}")
    return pytesseract.image_to_string(img)

# CLEAN FUNCTION

In [None]:
def main_clean_function(file):
    """Run the image clean-up pipeline on ``file``.

    Every stage reads the file written by the previous one:

    original_file.jpg -> corrected.jpg -> grey_image.jpg -> shadows_out.png
    -> otsuthreshold.jpg -> inverted.jpg -> dilated.jpg

    Watermark removal and median-blur noise removal are currently not part of
    the pipeline. Nothing is returned; the final result is "dilated.jpg".
    """
    # Keep a JPEG copy of the input; the pipeline works from this copy.
    cv2.imwrite('original_file.jpg', cv2.imread(file))

    # 1. Skew correction (the detected angle itself is not needed here).
    _, deskewed = correct_skew(cv2.imread('original_file.jpg'))
    cv2.imwrite('corrected.jpg', deskewed)

    # 2. Grey scaling
    grayscale('corrected.jpg')

    # 3. Shadow removal
    shadow_removal("grey_image.jpg")

    # 4. Otsu thresholding
    thresholding('shadows_out.png')

    # 5. Inversion
    invert_image("otsuthreshold.jpg")

    # 6. Dilation
    dilation("inverted.jpg")

# MAIN_FUNCTION_IMAGE_PROCESSING_OCR

In [None]:
def main_function(file):
    """Clean the image at ``file`` and print the text Tesseract finds in it.

    The clean-up is delegated to ``main_clean_function`` so both entry points
    run the same pipeline. This function used to carry its own copy of the
    pipeline, which grey-scaled 'original_file.jpg' instead of 'corrected.jpg'
    and so silently discarded the skew correction.

    Parameters
    ----------
    file : path of the image to process.

    Returns
    -------
    None - the recognised text is printed.
    """
    # Produces "dilated.jpg" (and the intermediate files of every stage).
    main_clean_function(file)

    # OCR on the final stage of the pipeline.
    cleaned = cv2.imread("dilated.jpg")
    print(pytesseract.image_to_string(cleaned))

# MAIN_FUNCTION_PDF_PROCESSING

In [None]:
#1. Ocr_pdfs
def main_function2(file):
    """Convert the document at ``file`` to PDF and print its first page's text.

    The conversion result is written to a temporary directory. Previously it
    was saved back onto ``file`` itself, which overwrote the user's original
    document (e.g. a .docx) with PDF data.

    The extracted text is also stored in the module-level name ``text``.

    Parameters
    ----------
    file : path of a document that ``aspose.words`` can load.

    Returns
    -------
    None - the text of page 0 is printed.
    """
    import tempfile

    global text

    with tempfile.TemporaryDirectory() as work_dir:
        pdf_path = os.path.join(work_dir, 'converted.pdf')
        aw.Document(file).save(pdf_path, aw.SaveFormat.PDF)

        # The reader needs the file to stay open while the page is extracted;
        # the ``with`` block closes it afterwards (it used to be leaked).
        with open(pdf_path, 'rb') as pdf_file:
            # NOTE(review): PdfFileReader/getPage/extractText are the legacy
            # PyPDF2 names - kept because the installed version is unknown.
            reader = pdf.PdfFileReader(pdf_file)
            text = reader.getPage(0).extractText()

    print(text)

# PDF VIEWING

In [None]:
 def get_page(pno):
     """Return page ``pno`` of the loaded document as PNG bytes.

     Reads the module-level ``doc`` and ``dlist_tab``. The display list of a
     page is created on first use and stored in ``dlist_tab[pno]`` so later
     calls for the same page reuse it.
     """
     if not dlist_tab[pno]:
         # Not cached yet: build the display list for this page and remember it.
         dlist_tab[pno] = doc[pno].getDisplayList()

     return dlist_tab[pno].getPixmap(alpha = False).getPNGData()

In [None]:
def image_viewing(file):
    """Extract every embedded image of the document at ``file`` to PNG files.

    Files are named ``image_Page<page>_<image>.png`` (both numbers 1-based).
    The page number is part of the name because the image counter restarts on
    every page; with the image number alone, images from later pages overwrote
    those of earlier pages.

    Parameters
    ----------
    file : path of the document, opened with ``fitz.open``.
    """
    # Named ``document`` (not ``pdf``) so the PyPDF2 alias ``pdf`` is not shadowed.
    document = fitz.open(file)
    try:
        for pageNumber, page in enumerate(document.pages(), start=1):
            for imgNumber, img in enumerate(page.getImageList(), start=1):
                xref = img[0]
                pix = fitz.Pixmap(document, xref)

                # More than 3 colour components (alpha excluded) means CMYK,
                # which PNG cannot store - convert to RGB first. The former
                # ``pix.n > 4`` test missed CMYK images without alpha.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                pix.writePNG(f'image_Page{pageNumber}_{imgNumber}.png')
    finally:
        document.close()

# PDF VIEWING2

In [None]:
#Store Pdf with convert_from_path function
def pdf_convert(file, dpi=500,
                poppler_path=r"C:\Users\Guest_jouw\poppler\poppler-22.04.0\Library\bin"):
    """Render the PDF at ``file`` and show its pages in the "-IMAGE-" element.

    Each page is rendered by ``convert_from_path`` to a PIL image, encoded as
    PNG in memory and handed to the element through ``data=``. The PIL image
    used to be passed positionally, which is not a form the element accepts.
    Pages are shown one after another, so the last page is the one left
    visible.

    Parameters
    ----------
    file : path of the PDF.
    dpi : rendering resolution passed to ``convert_from_path``.
    poppler_path : directory containing the poppler binaries.
    """
    images = convert_from_path(file, dpi, poppler_path=poppler_path)

    for page_image in images:
        buffer = io.BytesIO()
        page_image.save(buffer, format="PNG")
        window["-IMAGE-"].update(data=buffer.getvalue())
        

# SCAN

In [None]:
def scan(cmd, timeout=None, window=None):
    """Dispatch the chosen file to the image or document text extractor.

    Parameters
    ----------
    cmd : path of the file to scan (the content of the '-IN-' input field).
        The empty string and the placeholder "Choose File" mean that nothing
        was selected.
    timeout : unused; kept for interface compatibility.
    window : unused; kept for interface compatibility.

    Returns
    -------
    None - results and diagnostics are printed. ``main_function`` and
    ``main_function2`` print the extracted text themselves, so their return
    values are not printed again.
    """
    image_file = ('.png', '.jpg', '.bmp', '.gif', '.ico', '.jpeg', '.ps',
                  '.psd', '.svg', '.tif', '.tiff')
    text_file = ('.doc', '.docx', '.odt', '.pdf', '.rtf', '.tex', '.txt',
                 '.wpd')

    file = cmd
    if not file or file == "Choose File":
        print('No file chosen')
        return

    # Compare case-insensitively so "SCAN.JPG" is treated like "scan.jpg".
    extension = os.path.splitext(file)[1].lower()

    # Exactly one branch runs; the former loop over both lists executed the
    # whole dispatch twice and reported images as unrecognised as well.
    if extension in image_file:
        main_function(file)
    elif extension in text_file:
        main_function2(file)
    else:
        print('This is not a recognized file format')

# INTERFACE

In [None]:
# Custom colour theme for the application.
sg.LOOK_AND_FEEL_TABLE['EYtheme'] = {
    'BACKGROUND': '#333333',
    'TEXT': '#ffffff',
    'INPUT': '#ffffff',
    'TEXT_INPUT': '#333333',
    'SCROLL': '#cccccc',
    'BUTTON': ('#333333', '#ffe600'),
    'PROGRESS': ('#D1826B', '#CC8019'),
    'BORDER': 1,
    'SLIDER_DEPTH': 0,
    'PROGRESS_DEPTH': 0,
}
sg.theme('EYtheme')

# NOTE: menu_layout is defined but not placed in ``layout`` below.
menu_layout = [
    ['File', ['Open', 'Save', '---', 'Exit']],
]

file_types = [
    ("JPEG (*.jpg)", "*.jpg"),
    ("PNG (*.png)", "*.png"),
    ("PDF (*.pdf)", "*.pdf"),
    ("All files (*.*)", "*.*"),
]

# Left column: page navigation, zoom buttons and the page preview.
image_elem = sg.Image(key='-IMAGE-')
goto = sg.InputText(key='-TEXT-', size=(5, 1))
Buttons = sg.Column([
    [
        sg.Button('Prev'),
        sg.Button('Next'),
        sg.Text('Page:'),
        goto,
    ],
    [
        sg.Text("Zoom:"),
        sg.Button('Top-L'),
        sg.Button('Top-R'),
        sg.Button('Bot-L'),
        sg.Button('Bot-R'),
    ],
    [image_elem],
])

# Events after which the page-number field is refreshed by the event loop.
my_keys = ("Next", "Next:34", "Prev", "Prior:33", "Top-L", "Top-R",
           "Bot-L", "Bot-R", "MouseWheel:Down", "MouseWheel:Up")
zoom_buttons = ("Top-L", "Top-R", "Bot-L", "Bot-R")

# Middle column: the cleaned image.
clean_img_col = sg.Column([[sg.Image(key='-CLEAN_IMAGE-')]])

# Right column: file selection, action buttons and the text output pane.
control_col = sg.Column([
    [sg.Text('File Preview')],
    [
        sg.Input('Choose File', size=(30, 1), enable_events=True, key='-IN-',
                 justification='left'),
        sg.FileBrowse(file_types=file_types),
        sg.Button('Upload New File', key='-BUTTON2-'),
    ],
    [sg.Text('Upload Your File', justification='left')],
    [
        sg.Button("Load File", key='-LOAD-'),
        sg.Button("Load Clean File", key='-CLEAN-'),
        sg.Button('Clear Output', key='-BUTTON3-'),
        sg.Button('Scan', key='-BUTTON1-'),
    ],
    [sg.Output(size=(100, 75), key='_output_')],
])

layout = [[Buttons, clean_img_col, control_col]]

# ``resizable`` is a real boolean now; the former string 'True' only worked
# because any non-empty string is truthy.
window = sg.Window(
    'Document Scanner',
    layout,
    icon='EY-logo.ico',
    margins=(50, 50),
    resizable=True,
    return_keyboard_events=True,
    use_default_focus=False,
    finalize=True,
)

# Viewer state. It is initialised once, outside the loop, so that it survives
# from one event to the next. ``cur_page`` used to be reset to 0 on every
# event (Next/Prev could never move past the neighbouring page), and
# ``page_count`` / ``old_page`` were undefined until a file had been loaded.
doc = None        # document opened by "Load File"; read by get_page()
page_count = 0    # number of pages in ``doc``; 0 means nothing is loaded
dlist_tab = []    # per-page display-list cache; read and filled by get_page()
cur_page = 0      # zero-based index of the page that should be shown
old_page = 0      # zero-based index of the page currently shown

while True:

    event, values = window.read()
    if event == sg.WIN_CLOSED:
        break

    # --- buttons of the control column ----------------------------------

    if event == '-BUTTON2-':
        window['-IN-'].update("Choose File")

    if event == '-BUTTON1-':
        scan(cmd=values['-IN-'], window=window)

    if event == '-BUTTON3-':
        window['_output_'].update('')

    if event == '-CLEAN-':
        filename = values["-IN-"]
        if not os.path.isfile(filename):
            print('No file chosen')
        else:
            main_clean_function(filename)
            with Image.open("dilated.jpg") as image:
                image.thumbnail((1000, 800))
                # Encode as PNG in memory; the Image element takes raw bytes.
                bio = io.BytesIO()
                image.save(bio, format="PNG")
            window["-CLEAN_IMAGE-"].update(data=bio.getvalue())

    if event == '-LOAD-':
        filename = values['-IN-']
        if not os.path.isfile(filename):
            print('No file chosen')
        else:
            try:
                new_doc = fitz.open(filename)
            except RuntimeError as err:
                # NOTE(review): assumes PyMuPDF reports unreadable files with
                # RuntimeError (or a subclass) - confirm for the installed
                # version.
                print(f'Could not open file: {err}')
            else:
                if doc is not None:
                    doc.close()
                doc = new_doc
                page_count = len(doc)
                dlist_tab = [None] * page_count  # fresh cache for the new file
                cur_page = old_page = 0
                if page_count:
                    # show page 1 for start
                    window["-IMAGE-"].update(data=get_page(cur_page))
                    window["-TEXT-"].update(str(cur_page + 1))

    # --- page navigation (only meaningful once a document is loaded) -----

    if page_count == 0:
        continue

    # With return_keyboard_events=True the Enter key arrives as chr(13).
    enter_pressed = isinstance(event, str) and event.startswith(chr(13))

    if enter_pressed:
        try:
            # The page field has the key '-TEXT-' (it was read as values[0],
            # which does not exist, so every entry fell back to page 1).
            cur_page = int(values['-TEXT-']) - 1
        except (TypeError, ValueError):
            cur_page = 0  # not a number: fall back to the first page
    elif event in ("Next", "Next:34", "MouseWheel:Down"):
        cur_page += 1
    elif event in ("Prev", "Prior:33", "MouseWheel:Up"):
        cur_page -= 1

    # Sanitize the page number: past the end wraps to the first page,
    # negative numbers count backwards from the last page.
    if cur_page >= page_count:
        cur_page = 0
    elif cur_page < 0:
        cur_page %= page_count

    # Only render when the page actually changed.
    if cur_page != old_page:
        window['-IMAGE-'].update(data=get_page(cur_page))
        old_page = cur_page

    if enter_pressed or event in my_keys:
        window["-TEXT-"].update(str(cur_page + 1))

    # The zoom buttons (Top-L, Top-R, Bot-L, Bot-R) are meant to work in
    # on/off mode; zooming is not implemented yet.

window.close()
    

#####################################################################################################################  
            


# TODO: For the PDF file viewer, use PyPDF2, or convert the PDF to PNG files and open the separate images together.
# TODO: Fix the output being printed twice.
# TODO: Account for image files that contain no text ("No text recognized").
# TODO: Add error handling to the OCR functions.
# TODO: Account for image-based PDF files: if pdf_to_text throws an error, run ocrmypdf on those.
# TODO: Find a way to avoid the two duplicate functions (main_function and main_clean_function).
          