In [1]:
import pandas as pd 
import os
from enum import Enum
from google.cloud import vision
from google.cloud.vision import types
from PIL import Image, ImageDraw
import numpy as np
import io
# Tell the Google Cloud client libraries where the service-account key lives.
# setdefault keeps a value that is already exported in the environment, so the
# developer-specific path below is only a fallback and does not override
# credentials configured outside the notebook.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/runze/OneDrive/Desktop/textdetector-f1de5e1e531e.json",
)

In [2]:
class FeatureType(Enum):
    """Granularity selector accepted by ``get_document_bounds`` (``feature``).

    The members carry the values 1 through 5 in declaration order:
    PAGE=1, BLOCK=2, PARA=3, WORD=4, SYMBOL=5.
    """

    PAGE, BLOCK, PARA, WORD, SYMBOL = range(1, 6)

class blockContainer:
    """A group of text blocks that are merged into one output chunk.

    ``block_list`` holds one ``(bounds, centroid, string)`` tuple per block:
    ``bounds`` is the result of ``getBox``, ``centroid`` the result of
    ``find_centroid`` on those bounds, and ``string`` the result of
    ``parse_block``.
    """

    def __init__(self, block):
        """Create a container that initially holds only ``block``."""
        self.block_list = [self._describe(block)]

    @staticmethod
    def _describe(block):
        """Build the ``(bounds, centroid, string)`` entry for ``block``."""
        bounds = getBox(block.bounding_box)
        return (bounds, find_centroid(bounds), parse_block(block))

    def add(self, block):
        """Insert ``block`` into ``block_list`` based on centroid comparison.

        Existing entries are scanned in order. An entry is skipped when its
        centroid y is greater than the new centroid's y, or otherwise when its
        centroid x is smaller than the new centroid's x. The new entry is
        placed directly before the first entry that is not skipped, or
        appended when every entry is skipped.

        NOTE(review): image coordinates usually grow downward, in which case
        the y test skips entries that sit lower on the page -- confirm that
        this is the intended reading order.
        """
        entry = self._describe(block)
        centroid = entry[1]
        for index, (_, existing_centroid, _) in enumerate(self.block_list):
            if existing_centroid[1] > centroid[1]:
                continue
            if existing_centroid[0] < centroid[0]:
                continue
            # Insert at `index` itself. Using `index - 1` would put the entry
            # in front of an entry that was just skipped and, for index 0,
            # wrap around to -1 (i.e. before the *last* element).
            self.block_list.insert(index, entry)
            return
        self.block_list.append(entry)

    def rectThresh(self, new_block, thresh=90):
        """Return the distance from ``new_block`` to the nearest stored block.

        Args:
            new_block: Block whose ``bounding_box`` is compared against every
                stored entry using ``rect_distance``.
            thresh: Exclusive upper limit for a distance to count as near.

        Returns:
            The smallest distance when it is below ``thresh``, otherwise
            ``False``. A distance of 0 is a valid "near" result but is falsy,
            so callers should test the result with ``is not False`` rather
            than by truthiness.
        """
        # The new block's box does not change between iterations; compute once.
        new_bounds = getBox(new_block.bounding_box)
        mindist = min(
            (rect_distance(entry[0], new_bounds) for entry in self.block_list),
            default=float("inf"),
        )
        if mindist < thresh:
            return mindist
        return False

In [4]:
def get_document_bounds(image_file, feature=FeatureType.BLOCK):
    """Run document text detection on an image and return grouped block text.

    Every detected block is merged into the nearest existing
    ``blockContainer`` for which ``rectThresh`` reports a distance; a block
    with no such container starts a new one.

    Args:
        image_file: Path of the image file whose bytes are sent for detection.
        feature: Granularity to collect. Only ``FeatureType.BLOCK`` is
            handled; any other value produces an empty list.

    Returns:
        A list with one string per container: the non-empty texts of its
        blocks, in the container's stored order, joined with newlines.
    """
    client = vision.ImageAnnotatorClient()

    # Use a distinct name for the handle so the path argument is not shadowed.
    with io.open(image_file, 'rb') as image_handle:
        content = image_handle.read()

    image = types.Image(content=content)

    response = client.document_text_detection(image=image)
    document = response.full_text_annotation

    consblocks = []
    if feature == FeatureType.BLOCK:
        for page in document.pages:
            for block in page.blocks:
                nearest = None
                nearest_dist = None
                for consblock in consblocks:
                    dist = consblock.rectThresh(block)
                    # rectThresh returns False for "too far". A distance of 0
                    # (touching/overlapping boxes) is a valid match, so test
                    # identity with False instead of truthiness.
                    if dist is False:
                        continue
                    if nearest is None or dist < nearest_dist:
                        nearest, nearest_dist = consblock, dist
                if nearest is None:
                    consblocks.append(blockContainer(block))
                else:
                    nearest.add(block)

    # Separate the blocks of a container with a newline so the last line of
    # one block does not run into the first line of the next. Empty block
    # texts are skipped so a container without text still yields "".
    return [
        "\n".join(entry[2] for entry in consblock.block_list if entry[2])
        for consblock in consblocks
    ]

def getBox(bound):
    """Return the axis-aligned extent of a bounding polygon.

    Reads the ``x`` and ``y`` of the first four entries of ``bound.vertices``
    and returns ``(minX, minY, maxX, maxY)`` as numpy scalars.
    """
    corners = [bound.vertices[k] for k in range(4)]
    xs = np.array([corner.x for corner in corners])
    ys = np.array([corner.y for corner in corners])
    return (xs.min(), ys.min(), xs.max(), ys.max())

def find_centroid(parsed_bounds):
    """Return the centre ``(x, y)`` of a ``(minX, minY, maxX, maxY)`` box."""
    left, top, right, bottom = parsed_bounds
    center_x = (right + left) / 2
    center_y = (bottom + top) / 2
    return (center_x, center_y)

def rect_distance(bbox1, bbox2):
    """Return the gap between two ``(minX, minY, maxX, maxY)`` boxes.

    The gap is computed separately along x and y as the distance between the
    centroids minus half the combined extent on that axis; the larger of the
    two values is returned. When that value is not positive, 0 is returned.
    """
    half_widths = ((bbox1[2] - bbox1[0]) + (bbox2[2] - bbox2[0])) / 2
    half_heights = ((bbox1[3] - bbox1[1]) + (bbox2[3] - bbox2[1])) / 2
    cx1, cy1 = find_centroid(bbox1)
    cx2, cy2 = find_centroid(bbox2)
    gap_x = abs(cx1 - cx2) - half_widths
    gap_y = abs(cy2 - cy1) - half_heights
    gap = max(gap_x, gap_y)
    return gap if gap > 0 else 0

def parse_block(block):
    """Rebuild the text of a block, one detected line per output line.

    Symbols are concatenated in order. The ``detected_break`` type attached
    to a symbol decides what follows it:

    * ``SPACE`` / ``SURE_SPACE``: a space is appended to the current line.
    * ``EOL_SURE_SPACE``: a space is appended and the line is finished.
    * ``LINE_BREAK``: the line is finished.

    Any other break type leaves the current line open. Text still pending at
    the end of a paragraph is emitted as a final line instead of being lost.

    Args:
        block: Object exposing ``paragraphs`` -> ``words`` -> ``symbols``,
            where each symbol has ``text`` and
            ``property.detected_break.type``.

    Returns:
        The lines of all paragraphs joined with ``"\\n"`` (empty string when
        the block has no text).
    """
    breaks = vision.enums.TextAnnotation.DetectedBreak.BreakType
    lines = []
    for paragraph in block.paragraphs:
        line = ""
        for word in paragraph.words:
            for symbol in word.symbols:
                line += symbol.text
                break_type = symbol.property.detected_break.type
                if break_type in (breaks.SPACE, breaks.SURE_SPACE):
                    line += ' '
                elif break_type == breaks.EOL_SURE_SPACE:
                    lines.append(line + ' ')
                    line = ''
                elif break_type == breaks.LINE_BREAK:
                    lines.append(line)
                    line = ''
        # Flush text that was not terminated by a line-ending break; the
        # buffer is reset per paragraph, so it would otherwise be dropped.
        if line:
            lines.append(line)

    return "\n".join(lines)


In [3]:
def list_flyer_names(image_dir, extension=".jpg"):
    """Return the images found under ``image_dir`` without their extension.

    Walks ``image_dir`` recursively and keeps only files whose name ends with
    ``extension``. Each result is the file's path relative to ``image_dir``
    with the extension removed, so ``image_dir + "/" + name + extension``
    points back at the file. The list is sorted for a reproducible order.
    """
    names = []
    for subdir, _dirs, files in os.walk(image_dir):
        rel_dir = os.path.relpath(subdir, image_dir)
        for filename in files:
            # Skip anything that is not an image with the expected suffix;
            # the suffix is re-appended later when the file is opened.
            if not filename.endswith(extension):
                continue
            stem = filename[:-len(extension)]
            names.append(stem if rel_dir == "." else os.path.join(rel_dir, stem))
    names.sort()
    return names


text_df = pd.DataFrame(columns=["flyer_name", "text"])
images2process = list_flyer_names("flyer_images")

In [5]:
# Collect one (flyer_name, text) record per non-empty chunk, then build the
# frame in a single step instead of growing it row by row with .loc.
records = []
for flyer_name in images2process:
    chunks = get_document_bounds("flyer_images/" + flyer_name + ".jpg")
    records.extend((flyer_name, chunk) for chunk in chunks if chunk)

text_df = pd.DataFrame(records, columns=["flyer_name", "text"])
text_df.to_csv("text.csv")
