## Imports

In [2]:
import os
import io
from io import BytesIO
from google.cloud import vision
from PIL import Image, ImageDraw, ImageFont
import cv2
from pdf2image import convert_from_path
import numpy as np
from operator import contains
import pandas as pd

## Preprocessing

#### Convert PDF to JPG

In [None]:
def to_jpg(pdf_name):
    """Render every page of ``images/<pdf_name>.pdf`` to a JPEG in ``temp/``.

    Each page is saved as ``temp/<pdf_name>_<page index>.jpg``; the page
    index starts at 0.

    Args:
        pdf_name: Base name of the PDF inside ``images/``, without the
            ``.pdf`` extension.
    """
    source_pdf = "images/{name}.pdf".format(name=pdf_name)
    pages = convert_from_path(source_pdf, fmt="jpeg")

    for page_index, page_image in enumerate(pages):
        target_jpg = "temp/{name}_{pg_num}.jpg".format(name=pdf_name,
                                                       pg_num=page_index)
        page_image.save(target_jpg)

#### Image Preprocessing

In [None]:
#Specify Image
to_jpg('table')

#load the first rendered page from the temp folder
img = cv2.imread("temp/table_0.jpg", cv2.IMREAD_COLOR)
if img is None:
    #cv2.imread returns None instead of raising when the file cannot be read;
    #fail here with a clear message rather than inside cvtColor below
    raise FileNotFoundError("Could not read image: temp/table_0.jpg")

#grey scale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
cv2.imwrite("temp/index_gray.png", gray)

#blur (5x5 Gaussian kernel)
blur = cv2.GaussianBlur(gray, (5,5), 0)
cv2.imwrite("temp/index_blur.png", blur)

#thresh (inverted binary, Gaussian-weighted adaptive threshold, block size 21, constant 4)
thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 21, 4)
cv2.imwrite("temp/thresh.png", thresh)

#kernel (6x6 rectangular structuring element used by the dilation below)
kernal = cv2.getStructuringElement(cv2.MORPH_RECT, (6, 6))
cv2.imwrite("temp/kernal.png", kernal)

#dilate
dilate = cv2.dilate(thresh, kernal, iterations = 1)
cv2.imwrite("temp/dilate.png", dilate)

#contours
cnts = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
#findContours returns either 2 or 3 values; the contour list is the first
#element of a 2-tuple and the second element otherwise
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
#order contours left to right by the x coordinate of their bounding box
cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[0])

#bounding boxes
#minimum size (in pixels) a contour's bounding box must exceed to be kept;
#these values may have to be adjusted depending on the input image
MIN_BOX_HEIGHT = 45
MIN_BOX_WIDTH = 90

boxes_area = []      #(x pixel coordinates, y pixel coordinates) covered by each box
boxes_centroid = []  #(cx, cy) centre of each box
boxes_roi = []       #pixels of each box cut out of img
boxes_data = []      #(area, centroid, roi) triple for each box

for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    if h > MIN_BOX_HEIGHT and w > MIN_BOX_WIDTH:
        box_h = tuple(range(y, y+h))
        box_w = tuple(range(x, x+w))

        box_area = (box_w, box_h)
        box_centroid = (x+w/2, y+h/2)
        #copy the pixels: a plain slice is a view into img, so the green
        #outlines drawn on img below (for this box and every later one)
        #would otherwise be painted into the stored region as well
        box_roi = img[y:y+h, x:x+w].copy()
        box_data = (box_area, box_centroid, box_roi)

        boxes_area.append(box_area)
        boxes_centroid.append(box_centroid)
        boxes_roi.append(box_roi)
        boxes_data.append(box_data)

        #outline the accepted box on img (BGR colour, 2 px line)
        cv2.rectangle(img,(x,y),(x+w,y+h),(36,255,12),2)

cv2.imwrite("temp/bbox.png", img)


#sort tables
def sort_tables(area, boxes_centroid):
    """Return the centroids that lie inside ``area``, sorted ascending.

    Args:
        area: Pair ``(xs, ys)`` of ascending, contiguous integer pixel
            coordinates covered by a box along each axis (the shape
            produced by ``tuple(range(x, x+w))`` / ``tuple(range(y, y+h))``).
        boxes_centroid: Iterable of ``(cx, cy)`` centroid tuples; the
            coordinates may be fractional.

    Returns:
        A sorted list of the centroids that fall within ``area``.
    """
    def covers(span, coord):
        # Pixel i covers the interval [i, i + 1), so the whole span covers
        # [first, last + 1). A plain membership test on the integer pixels
        # misses half-pixel centroids (x + w/2 with an odd w), which would
        # make a box fail to contain even its own centre.
        if len(span) == 0:
            return False
        return span[0] <= coord < span[-1] + 1

    temp_table = [
        centroid
        for centroid in boxes_centroid
        if all(covers(span, coord) for span, coord in zip(area, centroid))
    ]
    # Progress output: the matching centroids and how many there are.
    print(temp_table)
    print(len(temp_table))
    return sorted(temp_table)
    
#a box counts as a table when it contains more than one centroid
#(i.e. at least one centroid besides its own)
tables = []
for area in boxes_area:
    #call sort_tables once per area and reuse the result
    table = sort_tables(area, boxes_centroid)
    if len(table) > 1:
        tables.append(table)

#index the box records by centroid so each lookup below is a dict access;
#boxes sharing a centroid are kept together in their original order
boxes_by_centroid = {}
for box_data in boxes_data:
    boxes_by_centroid.setdefault(box_data[1], []).append(box_data)

#for every table, collect the full (area, centroid, roi) record of each
#centroid it contains, in the table's sorted centroid order
tables_data = []
for table in tables:
    temp_table = []
    for centroid in table:
        temp_table.extend(boxes_by_centroid.get(centroid, []))
    tables_data.append(temp_table)
    
#draw points
def draw_circle(points):
    """Mark each point on the module-level ``img`` and save the result.

    A filled circle of radius 10 is drawn at every point, then the image is
    written to ``temp/points.png``.

    Args:
        points: Iterable of indexable ``(x, y)`` pairs; each coordinate is
            converted with ``int()`` before drawing.

    Returns:
        Whatever ``cv2.imwrite`` returns for ``temp/points.png``.
    """
    red = (0, 0, 255)  # BGR order, matching how img was loaded
    for point in points:
        center = (int(point[0]), int(point[1]))
        cv2.circle(img, center, radius=10, color=red, thickness=-1)
    saved = cv2.imwrite("temp/points.png", img)
    return saved

#threshold every cell region
thresh_boxes = []
for box in boxes_roi:
    #work on a copy so the stored region is left untouched
    no_green = box.copy()
    #zero the green channel (index 1 in BGR order)
    no_green[:, :, 1] = 0
    #collapse to a single greyscale channel
    gray_box = cv2.cvtColor(no_green, cv2.COLOR_BGR2GRAY)
    #binary adaptive threshold (Gaussian-weighted, block size 21, constant 4)
    temp_box = cv2.adaptiveThreshold(
        gray_box,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        21,
        4,
    )
    thresh_boxes.append(temp_box)

## OCR

In [None]:
#service account key
#the key file name is relative, so it is resolved against the current working directory
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'ServiceAccountToken.json'

#definitions
#build the folder path with os.path.join so the separator is correct on every
#platform (a hard-coded backslash only works on Windows and "\i" is an
#invalid string escape)
image_folder_dir = os.path.join(os.getcwd(), "images")
image_filenames = os.listdir(image_folder_dir)

def image_paths(image_filenames, image_folder_dir):
    """Join each file name onto the folder path.

    Args:
        image_filenames: Iterable of file names.
        image_folder_dir: Directory the file names belong to.

    Returns:
        A list with one ``os.path.join(image_folder_dir, name)`` entry per
        file name, in the same order as ``image_filenames``.
    """
    return [
        os.path.join(image_folder_dir, filename)
        for filename in image_filenames
    ]

# Full path of every entry that os.listdir found in the images folder.
paths = image_paths(image_filenames, image_folder_dir)

#functions
def detect_document(path):
    """Run Google Cloud Vision document text detection on an image file.

    Args:
        path: Path of the image file whose bytes are sent to the API.

    Returns:
        The response object returned by
        ``ImageAnnotatorClient.document_text_detection``.
    """
    # The client is created before the file is opened, as in the original flow.
    vision_client = vision.ImageAnnotatorClient()

    with open(path, 'rb') as source:
        payload = source.read()

    request_image = vision.Image(content=payload)
    return vision_client.document_text_detection(image=request_image)


def document_text(document):
    """Return the full text held in ``document.full_text_annotation.text``."""
    return document.full_text_annotation.text


def document_conf(document):
    """Print the confidence of every block, paragraph, word and symbol.

    Walks ``document.full_text_annotation.pages`` from the outside in and
    prints one line per element; nothing is returned.

    Args:
        document: Object exposing ``full_text_annotation.pages``, where
            pages hold ``blocks``, blocks hold ``paragraphs``, paragraphs
            hold ``words`` and words hold ``symbols``.
    """
    block_line = '\nBlock confidence: {}\n'
    paragraph_line = 'Paragraph confidence: {}'
    word_line = 'Word text: {} (confidence: {})'
    symbol_line = '\tSymbol: {} (confidence: {})'

    annotation = document.full_text_annotation
    for page in annotation.pages:
        for block in page.blocks:
            print(block_line.format(block.confidence))

            for paragraph in block.paragraphs:
                print(paragraph_line.format(paragraph.confidence))

                for word in paragraph.words:
                    # A word's text is the concatenation of its symbols.
                    word_text = ''.join(symbol.text for symbol in word.symbols)
                    print(word_line.format(word_text, word.confidence))

                    for symbol in word.symbols:
                        print(symbol_line.format(symbol.text, symbol.confidence))

                            
def drawVertices(image_path, word_attributes):
    """Outline every word on the image, colour-coded by confidence, and show it.

    Outline colours: green for confidence >= 0.9, yellow for >= 0.8 and red
    otherwise. The annotated image is displayed with ``Image.show()``; the
    file on disk is not modified.

    Args:
        image_path: Path (``str``) of the image to annotate.
        word_attributes: Mapping of word text to a ``(confidence, vertices)``
            sequence, where each vertex exposes ``.x`` and ``.y``.
    """
    # Image.open takes a str path directly, so no bytes conversion is needed.
    pillow_img = Image.open(image_path)
    draw = ImageDraw.Draw(pillow_img)

    for attributes in word_attributes.values():
        word_conf = attributes[0]
        vertices = attributes[1]

        if word_conf >= 0.9:
            color = '#00ff00' #green
        elif word_conf >= 0.8:
            color = '#ffff00' #yellow
        else:
            color = '#ff0000' #red

        # Join each vertex to the next one; the modulo wraps the last vertex
        # back to the first to close the outline. An empty vertex list simply
        # draws nothing.
        vertex_count = len(vertices)
        for i, start in enumerate(vertices):
            end = vertices[(i + 1) % vertex_count]
            draw.line(((start.x, start.y), (end.x, end.y)),
                      fill=color,
                      width=3)

    pillow_img.show()

def word_attributes(document):
    """Map each word's text to its confidence and bounding-box vertices.

    Args:
        document: Object exposing ``full_text_annotation.pages``, where
            pages hold ``blocks``, blocks hold ``paragraphs`` and paragraphs
            hold ``words``.

    Returns:
        A dict of ``word text -> (word.confidence,
        word.bounding_box.vertices)``. The dict is keyed by text, so when the
        same word occurs more than once only the last occurrence is kept.
    """
    attributes = {}

    # Flatten pages -> blocks -> paragraphs -> words, in document order.
    all_words = (
        word
        for page in document.full_text_annotation.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
    )

    for word in all_words:
        # A word's text is the concatenation of its symbols.
        text = ''.join(symbol.text for symbol in word.symbols)
        attributes[text] = (word.confidence, word.bounding_box.vertices)

    return attributes