# Document AI OCR (sync)

This notebook shows you how to perform OCR on documents synchronously using the Google Cloud Document AI API. For a synchronous request, the document content is sent as bytes and the program blocks until it receives the response. The response is then visualized by showing the preprocessed (e.g. rotated) image together with bounding boxes for blocks, paragraphs, lines and tokens.

## Set your Processor Variables 

In [None]:
# Project, location and processor are combined into the processor resource name
# used by process(); LOCATION also selects the regional API endpoint.
PROJECT_ID = "YOUR_GCP_PROJECT_ID"
LOCATION = "eu"  # Format is 'us' or 'eu'
PROCESSOR_ID = "YOUR_DOCAI_PROCESSOR_ID"  # Create OCR processor in Cloud Console

# File types accepted by the sample-processing cells (compared case-insensitively against file names).
# check supported file types at https://cloud.google.com/document-ai/docs/processors-list#processor_doc-ocr
SUPPORTED_FILE_TYPES = ["PDF", "TIF", "TIFF", "GIF", "JPG", "JPEG", "PNG", "BMP", "WEBP"]

# Bucket and object-name prefix listed by process_gcs_samples().
# Sample invoices are stored in gs://cloud-samples-data/documentai/async_invoices/
GCS_INPUT_BUCKET = 'cloud-samples-data'
GCS_INPUT_PREFIX = 'documentai'

# Local directory whose entries are iterated by process_local_samples().
LOCAL_INPUT_PATH = '../resources/general'

# NOTE(review): TIMEOUT is not referenced by any code visible in this notebook;
# presumably a request timeout in seconds - confirm or remove.
TIMEOUT = 300

## Setup

In [None]:
# Install necessary Python libraries and restart your kernel after.
!pip install --quiet -r ../requirements.txt

In [None]:
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

from PIL import Image, ImageDraw
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import resolve1

import mimetypes
from pathlib import Path
import io
import os
import re
import numpy as np
from enum import Enum
import pandas as pd

In [None]:
class FeatureType(Enum):
    """Layout level of an OCR page for which bounding boxes can be requested."""

    PAGE = 1   # whole page; get_page_bounds() collects nothing for this level
    BLOCK = 2  # entries of page.blocks
    PARA = 3   # entries of page.paragraphs
    LINE = 4   # entries of page.lines
    TOKEN = 5  # entries of page.tokens

In [None]:
def process(content: bytes, mime_type: str, skip_human_review: bool = False) -> "documentai.ProcessResponse":
    """Process a document synchronously (online) with the Document AI client.

    Validates the content against the size and page limits enforced below,
    sends it to the processor configured through PROJECT_ID, LOCATION and
    PROCESSOR_ID, and blocks until the result is returned.
    Optionally allows to skip human review if enabled for the processor.
    See details at
    https://cloud.google.com/document-ai/docs/reference/rest/v1/projects.locations.processors/process

    Args:
        content: Document content as byte string.
        mime_type: An IANA MIME type (RFC6838).
        skip_human_review: Optional; Whether Human Review feature should be skipped for this request. Default to false.

    Returns:
        The response returned by the client's process_document call; callers
        read the processed document from its ``document`` attribute.
        See details at
        https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessResponse

    Raises:
        ValueError: If the content is larger than 20 MiB or has more than
            10 pages (TIFF frames or PDF pages).
    """
    max_content_bytes = 20 * 1024 * 1024
    max_page_count = 10

    content_size = len(content)
    if content_size > max_content_bytes:
        raise ValueError(f"Content of size {content_size} Bytes is larger than the 20971520 Bytes (20MiB) limit of synchronous processing, please use batch processing.")

    # Every format other than TIFF and PDF is treated as a single page.
    page_count = 1

    if mime_type == "image/tiff":
        # Close the decoder once the frame count has been read.
        with Image.open(io.BytesIO(content)) as tiff_image:
            page_count = tiff_image.n_frames

    if mime_type == "application/pdf":
        parser = PDFParser(io.BytesIO(content))
        document = PDFDocument(parser)

        # The page tree root's /Count is the number of pages; it may itself be
        # stored as an indirect reference, so resolve it as well.
        page_count = resolve1(resolve1(document.catalog['Pages'])['Count'])

    if page_count > max_page_count:
        raise ValueError(f"Page count of {page_count} is larger than 10 page limit of synchronous processing, please use batch processing.")

    # Instantiate a Document AI client against the regional endpoint
    client_options = {"api_endpoint": f"{LOCATION}-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"

    # Create raw document from the document content
    raw_document = documentai.RawDocument(
        content=content,
        mime_type=mime_type,
    )

    # Process document
    process_request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        skip_human_review=skip_human_review,
    )

    return client.process_document(request=process_request)

In [None]:
def _fill_pixel_vertices(bounding_poly, image_width, image_height):
    """Populate bounding_poly.vertices from its normalized vertices when empty."""
    if bounding_poly.vertices:
        return
    for normalized_vertex in bounding_poly.normalized_vertices:
        bounding_poly.vertices.append(
            documentai.Vertex(
                x=int(normalized_vertex.x * image_width),
                y=int(normalized_vertex.y * image_height),
            )
        )


def get_page_bounds(page, feature):
    """Returns the bounding polygons of one layout level of an OCR output page.

    Args:
        page: OCR output page; its blocks, paragraphs, lines or tokens
            collection is read depending on ``feature``, together with
            ``page.image.width`` and ``page.image.height``.
        feature: The FeatureType whose bounds should be collected.

    Returns:
        A list with the ``layout.bounding_poly`` of every element of the
        requested level. Polygons that have no pixel vertices get them filled
        in place, scaled from their normalized vertices by the page image
        size. The list is empty for any feature other than BLOCK, PARA, LINE
        or TOKEN.
    """
    # Page attribute that holds the elements of each supported layout level.
    collection_names = {
        FeatureType.BLOCK: "blocks",
        FeatureType.PARA: "paragraphs",
        FeatureType.LINE: "lines",
        FeatureType.TOKEN: "tokens",
    }

    collection_name = collection_names.get(feature)
    if collection_name is None:
        return []

    bounds = []
    for element in getattr(page, collection_name):
        bounding_poly = element.layout.bounding_poly
        _fill_pixel_vertices(bounding_poly, page.image.width, page.image.height)
        bounds.append(bounding_poly)

    # The list `bounds` contains the coordinates of the bounding boxes.
    return bounds

In [None]:
def draw_boxes(image, bounds, color, width):
    """Outline every bounding polygon in ``bounds`` on ``image``.

    Args:
        image: PIL image that is drawn on in place.
        bounds: Iterable of polygons exposing ``vertices`` with ``x``/``y``
            attributes.
        color: Line color passed to ImageDraw as ``fill``.
        width: Line width passed to ImageDraw.

    Returns:
        The same ``image`` object, after drawing.
    """
    draw = ImageDraw.Draw(image)

    for bound in bounds:
        points = [(vertex.x, vertex.y) for vertex in bound.vertices]
        if not points:
            # Nothing to outline; indexing the first vertex would fail here.
            continue
        # Repeat the first vertex so the outline is closed.
        points.append(points[0])
        draw.line(points, fill=color, width=width, joint='curve')
    return image

In [None]:
def render_doc_text(page):
    """Show the page image with block, paragraph, line and token outlines.

    Args:
        page: OCR output page whose ``image.content`` holds the encoded image
            bytes and whose layout elements are passed to get_page_bounds().
    """
    page_image = Image.open(io.BytesIO(page.image.content))

    # Drawn from the coarsest to the finest level, with decreasing line width.
    overlay_styles = (
        (FeatureType.BLOCK, 'blue', 8),
        (FeatureType.PARA, 'red', 6),
        (FeatureType.LINE, 'yellow', 4),
        (FeatureType.TOKEN, 'green', 2),
    )
    for feature, stroke_color, stroke_width in overlay_styles:
        feature_bounds = get_page_bounds(page, feature)
        draw_boxes(page_image, feature_bounds, color=stroke_color, width=stroke_width)

    page_image.show()

    # To keep a copy of the annotated image, call page_image.save(<path>) here.

### Process documents synchronously

In [None]:
def process_gcs_samples():
    """Run OCR on every supported blob under the configured GCS prefix.

    Lists the blobs in GCS_INPUT_BUCKET whose names start with
    GCS_INPUT_PREFIX, downloads each one whose file extension is listed in
    SUPPORTED_FILE_TYPES, processes it synchronously and renders every page.
    Errors raised while processing or rendering a blob are printed in red
    and the loop continues with the next blob.
    """
    # Instantiate a Google Cloud Storage Client
    storage_client = storage.Client()

    supported_suffixes = {file_type.casefold() for file_type in SUPPORTED_FILE_TYPES}

    # Sample invoices are stored in gs://cloud-samples-data/documentai/async_invoices/
    blobs = storage_client.list_blobs(GCS_INPUT_BUCKET, prefix=GCS_INPUT_PREFIX)
    for blob in blobs:
        # Compare the extension only: a substring test against the whole name
        # matches ".tiff" for both "TIF" and "TIFF" (processing it twice) and
        # also matches type strings that merely occur in the object path.
        if Path(blob.name).suffix[1:].casefold() not in supported_suffixes:
            continue

        gcs_input_uri = f"gs://{GCS_INPUT_BUCKET}/{blob.name}"
        print(f"Processing {gcs_input_uri}...")

        mime_type = mimetypes.guess_type(blob.name)[0]
        image_content = blob.download_as_bytes()

        try:
            process_response = process(content=image_content, mime_type=mime_type)

            pages = process_response.document.pages
            for page in pages:
                print(f"Rendering file {blob.name} - Page {page.page_number}/{len(pages)}")
                render_doc_text(page=page)
        except Exception as e:
            # Best effort: report the failure in red and go on with the next blob.
            print("\x1b[31m" + str(e) + "\x1b[0m")

In [None]:
def process_local_samples():
    """Run OCR on every supported file directly inside LOCAL_INPUT_PATH.

    Prints the resolved path of every directory entry, then processes each
    regular file whose extension is listed in SUPPORTED_FILE_TYPES
    synchronously and renders every page. Errors raised while processing or
    rendering a file are printed in red and the loop continues with the next
    entry.
    """
    supported_suffixes = {file_type.casefold() for file_type in SUPPORTED_FILE_TYPES}

    for path in Path(LOCAL_INPUT_PATH).iterdir():
        print(path.resolve())
        if not path.is_file() or path.suffix[1:].casefold() not in supported_suffixes:
            continue

        print(f"Processing {path.name}...")

        mime_type = mimetypes.guess_type(path.name)[0]
        # read_bytes() closes the file handle, unlike a bare open(...).read().
        image_content = path.read_bytes()

        try:
            process_response = process(content=image_content, mime_type=mime_type)
            pages = process_response.document.pages
            for page in pages:
                print(f"Rendering file {path.name} - Page {page.page_number}/{len(pages)}")
                render_doc_text(page=page)
        except Exception as e:
            # Best effort: report the failure in red and go on with the next file.
            print("\x1b[31m" + str(e) + "\x1b[0m")

In [None]:
# Process and render the supported sample files found under gs://GCS_INPUT_BUCKET/GCS_INPUT_PREFIX.
process_gcs_samples()

In [None]:
# Process and render the supported sample files found in LOCAL_INPUT_PATH.
process_local_samples()