In [1]:
!pip install google-cloud-vision



In [2]:
from google.cloud import vision
import io
import numpy as np
import pandas as pd
import json

In [3]:
def corners(element):
    """Return the bounding edges of a single word record.

    Args:
    element: word record laid out as (word, y, x, height, width), where
        (x, y) is the top-left corner of the word's bounding box

    Returns:
    (top, bottom, left, right) where
    top: y-coordinate of the upper edge
    bottom: y-coordinate of the lower edge (top + height)
    left: x-coordinate of the left edge
    right: x-coordinate of the right edge (left + width)
    """
    y, x = element[1], element[2]
    height, width = element[3], element[4]
    return y, y + height, x, x + width

def distance(word_a, word_b):
    """Measure the horizontal and vertical offset between two words.

    Args:
    word_a: first word record (word, y, x, height, width)
    word_b: second word record (word, y, x, height, width)

    Returns:
    (x, y) where
    x: left edge of word_b minus right edge of word_a
    y: bottom edge of word_a minus top edge of word_b
    """
    # Only word_a's bottom/right and word_b's top/left take part in the result.
    _, a_bottom, _, a_right = corners(word_a)
    b_top, _, b_left, _ = corners(word_b)
    return b_left - a_right, a_bottom - b_top


def sort_words(input_list, buffer):
    """Sort words top-to-bottom, then left-to-right within a visual line.

    The records are first sorted by ('y', 'x'). Because words on the same
    printed line rarely share the exact same y value, a second pass then
    reorders words whose vertical extents overlap so that they read
    left-to-right.

    Args:
    input_list: structured array of word records with fields
        (word, y, x, height, width), in original (Vision API output) order
    buffer: tolerance added to a candidate word's y and x before comparing it
        with the current word. A larger buffer makes fewer words count as
        overlapping / lying to the left, which helps when the image is slightly
        rotated or the fields are not perfectly aligned

    Returns:
    A new structured array holding the same records in reading order.
    """
    ordered = np.sort(input_list, axis=0, order=['y', 'x'])  # sort by y-axis and then x-axis
    count = len(ordered)
    # a[p] is the index into `ordered` of the record that ends up at position p.
    a = np.arange(count)
    for i in range(count):
        current = ordered[a[i]]
        bottom = current['y'] + current['height']
        left = current['x']
        k = i + 1
        # Scan following words whose top edge lies above the current word's bottom edge,
        # i.e. words that share the current visual line.
        while k < count and (ordered[a[k]]['y'] + buffer) < bottom:
            candidate = ordered[a[k]]
            if (candidate['x'] + buffer) < left:
                # The candidate sits to the left of the word at position i: swap them,
                # adopt the candidate as the new reference, and rescan from i + 1.
                a[i], a[k] = a[k], a[i]
                bottom = candidate['y'] + candidate['height']
                left = candidate['x']
                k = i + 1
            else:
                k += 1
    return ordered[a]

def line_by_line(list, gap_width=50):
    '''Group sorted words into text segments.

    A new segment starts whenever a word begins a new line (its x is smaller
    than the previous word's x) or is separated from the previous word by a
    horizontal gap wider than gap_width.

    Args:
    list: word records (word, y, x, height, width) in reading order, e.g. the
        output of sort_words. Records must support access by field name
        ('x', 'width'). The parameter name shadows the builtin and is kept
        only for backward compatibility.
    gap_width: horizontal distance between two consecutive words above which
        they are put in separate segments

    Returns:
    List of [text, leftmost, rightmost] entries, one per segment, where
    leftmost is the x-coordinate of the segment's first word and rightmost is
    the right edge of its last word. Segments whose text is empty are omitted.
    '''
    output = []
    sentence = ""
    leftmost = None  # left edge of the segment being built; None until the first word is seen
    prev_left = 0    # x of the previous word
    prev_right = 0   # right edge of the previous word
    for o in list:
        left = o['x']
        if leftmost is None:
            # The first word opens the first segment.
            leftmost = left
        elif prev_left > left or (left - prev_right) > gap_width:
            # New line or wide gap: close the current segment at the previous word's right edge.
            text = sentence.strip()
            if text:
                output.append([text, leftmost, prev_right])
            sentence = ""
            leftmost = left
        sentence = sentence + " " + str(np.char.decode(o[0]))
        prev_left = left
        prev_right = left + o['width']
    # Flush the segment that is still open when the input ends.
    text = sentence.strip()
    if text:
        output.append([text, leftmost, prev_right])
    return output

def create_word_vertex_pairs(texts):
    """Convert Vision API text annotations into a structured array of word boxes.

    Args:
    texts: iterable of annotations, each exposing `description` (the text) and
        `bounding_poly.vertices` (points with `x` and `y`); vertices 0 and 2
        are used as opposite corners of the box

    Returns:
    A one-element list holding a structured array with fields
    (word, y, x, height, width). Words are lower-cased, non-ASCII characters
    are dropped, and the 'S25' field keeps at most 25 bytes per word.
    Empty descriptions and stand-alone punctuation tokens are skipped.
    """
    # Vision API reports each punctuation character as its own word; dropping
    # these tokens improves searching/parsing.
    skipped_tokens = [':', "|", "'", "?", ";", ",", "(", ")", "#", "."]
    record_dtype = [('word', 'S25'), ('y', 'int64'), ('x', 'int64'),
                    ('height', 'int64'), ('width', 'int64')]

    records = []
    for annotation in texts:
        vertices = annotation.bounding_poly.vertices
        first_corner, opposite_corner = vertices[0], vertices[2]
        x_first, y_first = first_corner.x, first_corner.y
        x_opposite, y_opposite = opposite_corner.x, opposite_corner.y
        token = annotation.description
        if len(token) == 0 or token in skipped_tokens:
            continue
        ascii_token = token.encode('ascii', errors='ignore')
        # Record layout: word, topmost y, leftmost x, height, width
        records.append((
            ascii_token.lower(),
            min(y_first, y_opposite),
            min(x_first, x_opposite),
            abs(y_opposite - y_first),
            abs(x_opposite - x_first),
        ))

    page_words = np.asarray(records, dtype=record_dtype)
    # The result is wrapped in a list (one entry per page; only a single page is built here).
    return [page_words]

def format_print(list):
    '''Print sorted words as text, approximating the page layout.

    A line break is emitted when a word's x is smaller than the previous
    word's x; otherwise one double-tab indent is printed for every full
    50 units of horizontal gap between the previous word's right edge and
    this word's left edge. Each word is followed by a single space.

    Args:
    list: word records (word, y, x, height, width) in reading order
    '''
    prev_x = 0         # x of the previously printed word
    prev_right = 2000  # right edge of the previously printed word (large start value: no indent before the first word)
    for record in list:
        _, _, left, right_edge = corners(record)
        if record['x'] < prev_x:  # new line
            print("")
        else:
            tab_stops = int((left - prev_right) / 50)
            if tab_stops > 0:  # indent
                print("\t\t" * tab_stops, end = "")
        print(np.char.decode(record[0]), end = " ")
        prev_x = record['x']
        prev_right = right_edge

### Get response from Vision API OCR

In [25]:
# Run document OCR on a local image file.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere without editing it.
path = '/usr/local/google/home/fprost/Downloads/us_047.jpg'
client = vision.ImageAnnotatorClient()

# Read the raw image bytes; they are sent inline in the request.
with io.open(path, 'rb') as image_file:
    content = image_file.read()

image = vision.types.Image(content=content)

# `response` is consumed by later cells via response.text_annotations.
response = client.document_text_detection(image=image)



### Sort words (Within a column)


In [30]:
# Inspect the word arrays (up to the first 10 list entries).
# NOTE(review): words_and_vertices is assigned in a cell that appears further down (In [5]); on a fresh
# kernel this cell fails if run in file order.
words_and_vertices[:10]

[array([(b'10martin\nde noel om w mit',   44,  216, 1983, 1250),
        (b'10',   44,  828,   64,   24),
        (b'martin',   44,  884,   64,   82),
        (b'de',   44,  950,   64,  117), (b'noel',   44, 1057,   64,   78),
        (b'om',   44, 1141,   64,   47), (b'w',   44, 1198,   64,   18),
        (b'mit',   44, 1231,   64,   37),
        (b'untut',   44, 1276,   64,   83),
        (b'us010143012b2',  108, 1055,   23,  196),
        (b'12',  190,  225,   44,   24),
        (b'united',  190,  270,   44,  134),
        (b'states',  190,  438,   44,  121),
        (b'patent',  190,  580,   44,  139),
        (b'stattin',  242,  274,   32,   86),
        (b'et',  242,  380,   32,   24), (b'al',  242,  416,   32,   28),
        (b'10',  200,  910,   34,   19),
        (b'patent',  199,  957,   34,   90),
        (b'no',  199, 1055,   34,   44), (b'us',  198, 1169,   34,   50),
        (b'10',  197, 1237,   34,   31), (b'143',  197, 1286,   34,   53),
        (b'012',  196, 1350,   

In [5]:
# Build (word, y, x, height, width) records from the OCR annotations and put them in reading order.
# Judging by the displayed outputs, the first annotation holds the full page text, so it shows up
# as the first record (truncated to 25 bytes).
texts = response.text_annotations
words_and_vertices = create_word_vertex_pairs(texts)
# create_word_vertex_pairs returns a list; index 0 is the array for this page.
output = sort_words(words_and_vertices[0], buffer = 0)

In [6]:
# Print the sorted words with line breaks and tab indents approximating the page layout.
format_print(output)

10martin
de noel om w mit 10 martin de noel om w mit untut 
us010143012b2 
12 united states patent 						10 patent no 		us 10 143 012 b2 
stattin et al 																		45 date of patent 				nov 27 2018 
54 random access procedure in 						52 
wireless device 																s ci 
radio base station 				cpc u 		h04w 74 / 0833 2013 01 h04l 41 / 0654 
and methods therein 																				2013 01 h04w 74 / 08 2013 01 
58 field of classification search 
71 applicant telefonaktiebolaget l m ericsson 						none 
publ stockholm se 												see application file for complete search history 
72 inventors magnus stattin upplands vsby se 		56 						references cited 
gunnar bergquist kista se tao 
cui upplands vsby se mats folke 								u s patent documents 
vllingby se gunnar mildh 
sollentuna se elena myhre jrflla 		2009 / 0186624 al * 7 / 2009 cave 				ho4l 1 / 1887 
455 / 450 
se mikael wittberg uppsala se 		2010 / 0202288 a1 * 8 / 2010 park 				h04w 48 / 08 
370 / 230 
73 assignee telef

### Separate Columns (Column left/right)

In [7]:
# Split the page into two columns using each segment's leftmost x-coordinate (line[1]).
# NOTE(review): 500 is a hard-coded column boundary, presumably chosen for this particular page — confirm
# before reusing on other documents.
l = line_by_line(output)
# First pass: segments starting left of the boundary.
for line in l:
    if line[1] < 500:
        print(line[0])
print("\nSecond Column")
# Second pass: segments starting at or right of the boundary.
for line in l:
    if line[1] >= 500:
        print(line[0])

10martin
de noel om w mit 10 martin de noel om w mit untut
12 united states patent
stattin et al
54 random access procedure in
wireless device
and methods therein
71 applicant telefonaktiebolaget l m ericsson
publ stockholm se
72 inventors magnus stattin upplands vsby se
gunnar bergquist kista se tao
cui upplands vsby se mats folke
vllingby se gunnar mildh
sollentuna se elena myhre jrflla
se mikael wittberg uppsala se
73 assignee telefonaktiebolaget lm ericsson
publ stockholm se
* notice
subject to any disclaimer the term of this
patent is extended or adjusted under 35 gb
u s c 154 b by 231 days
21 appl no
14 / 892 690
22 pct filed
may 21 2014
86 pct no
pct / se2014 / 050621
$ 2 371 date
c 1
nov 20 2015
87 pct pub no w02014 / 189453
pct pub date nov 27 2014
65
prior publication data
us 2016 / 0105912 a1 apr 14 2016
related u s application data
60 provisional application no 61 / 825 593 filed on may
21 2013
51 int cl
h04w 74 / 08
h04l 12 / 24

Second Column
us010143012b2
10 patent no
us

In [76]:
# Display the segments held in `l`.
# NOTE(review): the execution count (In [76]) is out of sequence with the cell that assigns `l`, so the
# saved output may not match a fresh top-to-bottom run.
l

[]

In [8]:
# Same request as the image cell above, but pointed at the PDF version of the document.
# NOTE(review): this overwrites `path`, `client`, `content`, `image` and `response` from the image cell.
# NOTE(review): PDF bytes are passed as an inline image here; the asynchronous file request in the next
# cell is presumably the intended route for PDFs — confirm whether this cell is still needed.
path = '/usr/local/google/home/fprost/Downloads/us_047.pdf'
client = vision.ImageAnnotatorClient()

with io.open(path, 'rb') as image_file:
    content = image_file.read()

image = vision.types.Image(content=content)

response = client.document_text_detection(image=image)



In [22]:
# Asynchronous document OCR of a PDF stored in Cloud Storage; results are written back to Cloud Storage.
# NOTE(review): `image` is rebound here to a GCS URI string (earlier cells bind it to a vision Image).
image = 'gs://pdf-processing-219114/patents_test_fprost/valid_pdf/us_047.pdf'

batch_size = 1
mime_type = 'application/pdf'
client = vision.ImageAnnotatorClient()
feature = vision.types.Feature(
  type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
# Input: the PDF at the GCS URI above.
gcs_source = vision.types.GcsSource(uri=image)
input_config = vision.types.InputConfig(gcs_source=gcs_source,
                                        mime_type=mime_type)

# Output: destination URI and batch size for the JSON results.
gcs_destination = vision.types.GcsDestination(uri='gs://pdf-processing-219114/test_files/test.json')
output_config = vision.types.OutputConfig(
    gcs_destination=gcs_destination, batch_size=batch_size)

async_request = vision.types.AsyncAnnotateFileRequest(
  features=[feature],
  input_config=input_config,
  output_config=output_config)

operation = client.async_batch_annotate_files(requests=[async_request])
# todo(michaelsherman) catch and log the operation ID
# Block until the operation finishes, waiting at most 150 seconds.
response_mike = operation.result(timeout=150)



In [23]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
import tensorflow as tf
# Open the JSON result in Cloud Storage. The object name is the destination URI from the request cell
# ('.../test.json') directly followed by 'output-1-to-1.json' — presumably the service treats the
# destination URI as a prefix; confirm.
# NOTE(review): rebinds `response_mike` from the operation result to a file handle.
response_mike = tf.gfile.GFile('gs://pdf-processing-219114/test_files/test.jsonoutput-1-to-1.json', 'r')

In [24]:
# Read the file contents, replacing the file handle bound to `response_mike`.
# NOTE(review): not idempotent — running this cell a second time calls .read() on the previous result.
response_mike = response_mike.read()

In [26]:
# Preview the first 1000 characters of the JSON result.
response_mike[:1000]

'{"inputConfig":{"gcsSource":{"uri":"gs://pdf-processing-219114/patents_test_fprost/valid_pdf/us_047.pdf"},"mimeType":"application/pdf"},"responses":[{"fullTextAnnotation":{"pages":[{"property":{"detectedLanguages":[{"languageCode":"en","confidence":0.79},{"languageCode":"sv","confidence":0.12},{"languageCode":"it","confidence":0.04},{"languageCode":"vi","confidence":0.01},{"languageCode":"gd","confidence":0.01}]},"width":612,"height":792,"blocks":[{"boundingBox":{"normalizedVertices":[{"x":0.62091506,"y":0.0479798},{"x":0.7352941,"y":0.0479798},{"x":0.7352941,"y":0.060606062},{"x":0.62091506,"y":0.060606062}]},"paragraphs":[{"boundingBox":{"normalizedVertices":[{"x":0.62091506,"y":0.0479798},{"x":0.7352941,"y":0.0479798},{"x":0.7352941,"y":0.060606062},{"x":0.62091506,"y":0.060606062}]},"words":[{"property":{"detectedLanguages":[{"languageCode":"en"}]},"boundingBox":{"normalizedVertices":[{"x":0.62091506,"y":0.0479798},{"x":0.7352941,"y":0.0479798},{"x":0.7352941,"y":0.060606062},{"x"

In [29]:
# Display the raw annotations of the synchronous OCR response.
# NOTE(review): this dumps every annotation into the notebook output; consider showing a slice instead.
response.text_annotations

[locale: "en"
description: "10MARTIN\nDE NOEL OM W MIT UNTUT\nUS010143012B2\n(12) United States Patent\nStattin et al.\n(10) Patent No.: US 10,143,012 B2\n(45) Date of Patent: Nov. 27, 2018\n(54) RANDOM ACCESS PROCEDURE IN\nWIRELESS DEVICE, RADIO BASE STATION\nAND METHODS THEREIN\n(52) U.S. CI.\nCPC ..... H04W 74/0833 (2013.01); H04L 41/0654\n(2013.01); H04W 74/08 (2013.01)\n(58) Field of Classification Search\nNone\nSee application file for complete search history.\n(71)\nApplicant: Telefonaktiebolaget L M Ericsson\n(publ), Stockholm (SE)\n(56)\nReferences Cited\nU.S. PATENT DOCUMENTS\n(72) Inventors: Magnus Stattin, Upplands V\303\244sby (SE);\nGunnar Bergquist, Kista (SE); Tao\nCui, Upplands V\303\244sby (SE); Mats Folke,\nV\303\244llingby (SE); Gunnar Mildh,\nSollentuna (SE); Elena Myhre, J\303\244rf\303\244lla\n(SE); Mikael Wittberg, Uppsala (SE)\n2009/0186624 Al*\n7/2009 Cave ..........\n...\n2010/0202288 A1*\n8/2010 Park ........\nHO4L 1/1887\n455/450\nH04W 48/08\n370/230\n(73) 