# Amazon Textract


## 2. Asynchronous operations

Asynchronous operations can also process documents that are in PDF format. Using PDF format files enables you to process multipage documents. 

For asynchronous operations, you need to supply input documents in an **Amazon S3 bucket**.

In [None]:
import sys

In [None]:
!{sys.executable} -m pip install --upgrade pip
!conda install -y -c conda-forge poppler
!{sys.executable} -m pip install pdf2image

In [None]:
#Detects text in a document stored in an S3 bucket. Display polygon box around text and angled text 

import io
import boto3
import json
import time
import sys
import textract.util as tu
from io import BytesIO
import psutil

import copy
import math
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import numpy as np
from pdf2image import convert_from_path, convert_from_bytes


%matplotlib inline

In [None]:
# AWS handles shared by the cells below; they are created without explicit
# region or credential arguments.
textract = boto3.client('textract')  # starts and polls the asynchronous Textract jobs
s3_client = boto3.client('s3')  # downloads the source document for page rendering
s3 = boto3.resource('s3')  # not referenced by the cells visible in this section

In [None]:
class ProcessType:
    """Selector for the asynchronous Textract operation to run.

    DETECTION picks the text-detection calls and ANALYSIS picks the
    document-analysis calls. The values are plain integers because the
    other cells compare the selected type against the literals 1 and 2.
    """

    DETECTION, ANALYSIS = 1, 2

<h2><span style="color:red">아래 Setting을 완성해 주시기 바랍니다!!!</span></h2>

In [None]:
## S3 data bucket: the place where the files to analyze (jpg, png, pdf) are uploaded.
## Enter the name of the bucket created by CloudFormation.
bucket='amazon-textract-demo-xxx'  

## Enter the names (S3 object keys) of the documents to analyze.
test_document = [
    'xxxxxxxxxxxxx.pdf'
]

## Choose the processing type: ProcessType.DETECTION or ProcessType.ANALYSIS.
types=ProcessType.ANALYSIS

## 2-1. Performing ProcessDocument

In [None]:
def process_textract(types, bucket, document):
    """Start an asynchronous Textract job for a document stored in S3.

    Args:
        types: Processing type. 1 (ProcessType.DETECTION) starts a text
            detection job; 2 (ProcessType.ANALYSIS) starts a document
            analysis job with the TABLES and FORMS features.
        bucket: Name of the S3 bucket that holds the document.
        document: Object key of the document inside ``bucket``.

    Returns:
        The response of the Textract ``start_*`` call. The caller reads the
        ``JobId`` entry from it.

    Raises:
        ValueError: If ``types`` is neither 1 nor 2.
    """
    document_location = {'S3Object': {'Bucket': bucket, 'Name': document}}

    # Determine which type of processing to perform
    if types == 1:
        response = textract.start_document_text_detection(
            DocumentLocation=document_location,
        )
        print('Processing type: Detection')
    elif types == 2:
        response = textract.start_document_analysis(
            DocumentLocation=document_location,
            FeatureTypes=["TABLES", "FORMS"],
        )
        print('Processing type: Analysis')
    else:
        # An unsupported type used to fall through to unbound locals and die
        # with an UnboundLocalError; report the real problem instead.
        raise ValueError("Invalid processing type. Choose Detection or Analysis.")
    return response


In [None]:
%%time
# Submit one asynchronous Textract job per document.
#
# jobids stays index-aligned with test_document: a document whose submission
# failed is recorded as None. Recording it in a `finally` clause, as before,
# either raised NameError (first document failed) or silently re-appended the
# job id of the previous document.
analyzers = []  # not used in this section; kept in case other cells rely on it
jobids = []
for document in test_document:
    print(document)
    try:
        response = process_textract(types, bucket, document)
        jobid = response['JobId']
    except Exception as e:
        # Best effort: report the failure and carry on with the next document.
        print("Exception : {}".format(e))
        jobid = None
    print(jobid)
    jobids.append(jobid)

In [None]:
import time

# Poll every submitted job until all of them reach a final status or the
# three-hour budget runs out.

# Without this check an unsupported type would never produce a status and the
# loop would spin until the timeout.
if types not in (1, 2):
    raise ValueError("Invalid processing type. Choose Detection or Analysis.")

max_time = time.time() + 3 * 60 * 60 # 3 hours
while time.time() < max_time:
    cnt = 0  # number of jobs that need no further polling
    for i, jobid in enumerate(jobids):
        if jobid is None:
            # No job id means there is nothing to poll for this entry;
            # count it as finished so it cannot block the loop.
            cnt += 1
            continue

        if types==1:
            response = textract.get_document_text_detection(JobId=jobid)
            status = response["JobStatus"]
            print("Detecting Textract for {} job {} : {}".format(i+1, jobid, status))
        else:
            response = textract.get_document_analysis(JobId=jobid)
            status = response["JobStatus"]
            print("Analyzing Textract for {} job {} : {}".format(i+1, jobid, status))

        # PARTIAL_SUCCESS is also a final status; treating it as "still
        # running" would keep the loop alive until the timeout.
        if status in ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS"):
            cnt += 1

    if cnt == len(jobids):
        break

    time.sleep(10)
else:
    # Only reached when the time budget expired without a `break`.
    print("Timed out after 3 hours: not every Textract job reached a final status.")

## 2-2. Analyzing Results for Asynchronous operations

<h2><span style="color:red">결과를 확인할 문서의 번호 (리스트의 순서대로 1,2,~ ) 를 넣어주세요. </span></h2>

### Input a document number among test_documents :

In [None]:
# 1-based position of the document to inspect; the cells below use
# document_id-1 to index both test_document and jobids.
document_id = 1

In [None]:
# Load the results of the selected job. Later cells index doc_block by
# page_num-1, so it is presumably one entry per page - confirm against
# textract.util.get_multipages_block_result.
doc_block = tu.get_multipages_block_result(jobids[document_id-1], types)

## 2-3. Convert PDF to Image

In [None]:
# Download the selected document from S3 and render its pages as images.
# `image` is reset first so that a failed run can never leave the pages of a
# previously processed document (or no variable at all) behind.
image = []
try:
    pdf_object = s3_client.get_object(Bucket=bucket, Key=test_document[document_id-1])
    pdf_byte_string = pdf_object['Body'].read()
    image = convert_from_bytes(pdf_byte_string)
except Exception as e:
    # Best effort: rendering is only needed for display, so report the reason
    # instead of aborting. A bare `except:` would also hide KeyboardInterrupt.
    print("To show images is available only in pdf documents.")
    print("Reason: {}".format(e))

## 2-4. Checking the number of images and doc_blocks for multipage documents

In [None]:
# The display cells index `image` and `doc_block` with the same page number,
# so both must contain the same number of entries.
num_img = len(image)
num_dblock = len(doc_block)
if num_img != num_dblock:
    # `assert "<non-empty string>"` always passes, so the mismatch used to go
    # unnoticed; fail explicitly and show both counts.
    raise ValueError(
        "The numbers of documents and blocks is different. "
        "(documents: {}, blocks: {})".format(num_img, num_dblock)
    )
print("Numbers of documents: {}, and of blocks: {}".format(num_img, num_dblock))

<h2><span style="color:red">PDF는 여러 페이지가 가능합니다. 확인할 페이지 번호 (1,2,~ ) 를 넣어주세요. </span></h2>

Display 목적으로 page를 나눈 것이며, 모든 page의 결과는 json으로 받을 수 있습니다.

### Input a page number in the selected document:

In [None]:
# 1-based page number inside the selected document; the next cell uses
# page_num-1 to index both image and doc_block.
page_num = 1

In [None]:
# Render the Textract results for the selected page.

# page_num is 1-based. Without this check page_num = 0 would index with -1
# and silently show the last page instead of reporting the mistake.
if not 1 <= page_num <= min(len(image), len(doc_block)):
    raise ValueError(
        "page_num {} is out of range: {} page image(s) and {} block page(s) are available.".format(
            page_num, len(image), len(doc_block)
        )
    )

if types==1:
    result_image, result_blocks = tu.get_pdf_detect_document_text(image[page_num-1], doc_block[page_num-1])
elif types==2:
    result_image, result_blocks = tu.get_pdf_analyze_document(image[page_num-1], doc_block[page_num-1])
else:
    # Otherwise result_image / result_blocks would stay unbound (or stale from
    # an earlier run) and the following cells would fail in a confusing way.
    raise ValueError("Invalid processing type. Choose Detection or Analysis.")

In [None]:
# Width and height handed to plt.figure(figsize=...).
fig_x, fig_y = 20, 15
plt.figure(figsize = (fig_x,fig_y))
print("Displays the results of a text analysis on page {} of the document.".format(page_num))
# result_image comes from the previous cell; it is converted to an array for imshow.
plt.imshow(np.array(result_image))

In [None]:
# Presumably the text content of the selected page (it is later sliced and
# sent to Comprehend) - confirm against textract.util.get_page.
page = tu.get_page(result_blocks)

## 3. Amazon Comprehend
### 3-1. Detecting Entities

In [None]:
## Input text size exceeds limit. Max length of request text allowed is 5000 bytes
# The limit is expressed in bytes, not characters: slicing 5000 characters of
# multibyte text (e.g. Korean) can still exceed it. Truncate the UTF-8
# encoding instead; errors="ignore" drops a character that the cut split in
# half. Assumes `page` is a str - TODO confirm against textract.util.get_page.
tu.detect_entities_for_comprehend(
    page.encode("utf-8")[:5000].decode("utf-8", errors="ignore")
)


### 3-2. Extracting Key-Value Pairs

In [None]:
# Build the key, value and block lookups from the blocks of the selected page
# (the helper semantics live in textract.util).
key_map, value_map, block_map = tu.get_kv_map(result_blocks)

# Get Key Value relationship
kvs = tu.get_kv_relationship(key_map, value_map, block_map)
print("\n\n== FOUND KEY : VALUE pairs ===\n")
tu.print_kvs(kvs)

# Start searching a key value
# Interactive loop: any answer other than exactly "n" prompts for a key and
# prints whatever tu.search_value returns for it.
while input('\n Do you want to search a value for a key? (enter "n" for exit) ') != 'n':
    search_key = input('\n Enter a search key:')
    print('The value is:', tu.search_value(kvs, search_key))