In [None]:
import sagemaker
import boto3
import time
import codecs
import os
import random
from datetime import datetime
from multiprocessing import Pool

In [None]:
# S3 locations used by this notebook. Provide plain bucket names here, not
# "s3://..." URIs. The input prefix selects which object keys are listed for
# processing; the output prefix is prepended to each uploaded result key.
input_bucket = "<Enter your s3 data bucket name>"
input_prefix = "how-many-athenas-with-athena/"
output_bucket= "<Enter your s3 output bucket name>"
output_prefix = "output/"

In [None]:
# Number of seconds to sleep between successive status checks of an
# asynchronous Textract job (passed to time.sleep by the polling helper).
sleep_time = 5

# Kick off an asynchronous Textract Document Text Detection job.
# Multi-page documents cannot be processed synchronously, hence the async API.
def StartDocumentTextDetection(s3BucketName, objectName):
    """Start a Textract text-detection job for one S3 object.

    Args:
        s3BucketName: Name of the bucket that holds the document.
        objectName: Key of the document inside that bucket.

    Returns:
        The "JobId" value from the start_document_text_detection response.
    """
    textract = boto3.client('textract')
    s3_object = {'Bucket': s3BucketName, 'Name': objectName}
    started_job = textract.start_document_text_detection(
        DocumentLocation={'S3Object': s3_object}
    )
    return started_job["JobId"]

# Block until a Textract asynchronous job is no longer in progress.
def isJobComplete(jobId):
    """Poll a Textract text-detection job and return its final status.

    Sleeps `sleep_time` seconds before the first check and between every
    subsequent check, and keeps polling for as long as the reported
    "JobStatus" is "IN_PROGRESS". There is no upper bound on the wait.

    Args:
        jobId: Identifier returned when the job was started.

    Returns:
        The first "JobStatus" value that is not "IN_PROGRESS".
    """
    # For production use cases, use SNS based notification instead of polling.
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(sleep_time)
    textract = boto3.client('textract')
    while True:
        job_status = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        if job_status != "IN_PROGRESS":
            return job_status
        time.sleep(sleep_time)

# Gather every result page that Textract produced for a finished job.
def getDocumentTextDetectionResults(jobId):
    """Fetch all paginated responses of a Textract text-detection job.

    The first response is requested with the job id only; while a response
    carries a truthy 'NextToken', the next page is requested with it.

    Args:
        jobId: Identifier of the Textract job whose results are wanted.

    Returns:
        A list of the raw get_document_text_detection responses, in the
        order they were received.
    """
    textract = boto3.client('textract')
    result_pages = [textract.get_document_text_detection(JobId=jobId)]
    continuation = result_pages[-1].get('NextToken')
    while continuation:
        result_pages.append(
            textract.get_document_text_detection(JobId=jobId, NextToken=continuation)
        )
        continuation = result_pages[-1].get('NextToken')
    return result_pages

In [None]:
# Create an S3 client (also used later for uploading results).
s3_client = boto3.client('s3')

# Collect the keys of all non-empty objects under the input prefix.
# A single list call returns at most one page of keys, so a paginator is used
# to walk every page. A page has no 'Contents' entry when nothing matches the
# prefix, so that case yields an empty list instead of raising KeyError.
pdf_object_list = []
object_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket=input_bucket,
    Prefix=input_prefix
)
for object_page in object_pages:
    for obj in object_page.get('Contents', []):
        # Zero-byte objects are skipped (e.g. "folder" placeholder keys).
        if obj['Size'] != 0:
            pdf_object_list.append(obj['Key'])

# Show a small sample of the keys that will be processed.
pdf_object_list[:5]

In [None]:
print(" Submitting file to Textract for Processing")
def f(file_obj):
    """Run one S3 object through Textract and upload the detected words.

    Steps: start an asynchronous text-detection job for `file_obj` in
    `input_bucket`, wait for it to finish, write every WORD block's text (one
    per line) to a local file in the current working directory, then upload
    that file to `output_bucket` under `output_prefix`. The local file is
    left in place after the upload.

    Args:
        file_obj: S3 object key of the document to process.

    Returns:
        The S3 key of the uploaded result, or None when the Textract job did
        not succeed or the upload failed (a message is printed in both cases).
    """
    job_id = StartDocumentTextDetection(input_bucket, file_obj)
    status = isJobComplete(job_id)
    if status != 'SUCCEEDED':
        # Without results there is nothing to write; bail out instead of
        # falling through to code that needs the (never assigned) results.
        print("Textract job {} for {} finished with status {}; skipping".format(
            job_id, file_obj, status))
        return None
    result_pages = getDocumentTextDetectionResults(job_id)

    # Build the local output name: key basename, with whatever extension it
    # had (.pdf, .jpg, ...) replaced by a random suffix plus ".csv". The random
    # suffix keeps same-named files from different prefixes from colliding.
    random_string = str(random.random())
    base_name = file_obj.rsplit('/', 1)[-1]
    stem, _extension = os.path.splitext(base_name)
    text_output_name = f"{stem}{random_string}.csv"

    # Write Textract output to a text file, one detected word per line.
    with codecs.open(text_output_name, "w", "utf-8") as output_file:
        for result_page in result_pages:
            for block in result_page.get("Blocks", []):
                if block["BlockType"] == "WORD":
                    output_file.write(block["Text"] + '\n')

    output_key = output_prefix + text_output_name
    try:
        # This function runs inside multiprocessing workers; boto3 advises
        # against sharing a client across processes, so each call makes its own.
        boto3.client('s3').upload_file(text_output_name, output_bucket, output_key)
    except Exception as e:
        # Best effort: report the failure and let the other files continue.
        print("Exception message from S3 buckets upload: {}".format(str(e)))
        return None
    return output_key

 
# Fan the per-file pipeline out over five worker processes; map() blocks
# until every listed object key has been handled.
with Pool(processes=5) as worker_pool:
    worker_pool.map(f, pdf_object_list)

print("Textract Processing Completed")

