In [1]:
% pip install -r requirements.txt

UsageError: Line magic function `%` not found.


In [21]:
# -*- coding: utf-8 -*-

import typing as T
import uuid
import os
import json

import attr
from pathlib_mate import Path
from rich import print as rprint
from PIL import Image, ImageDraw
from s3pathlib import S3Path, context
from pdf2image import convert_from_path
from boto_session_manager import BotoSesManager, AwsServiceEnum


class Config:
    """Static settings of this notebook: which AWS profile and which S3 bucket to use."""

    # Named AWS CLI profile handed to ``BotoSesManager``.
    aws_profile: str = "aws_data_lab_sanhe_us_east_2"
    # Bucket under which the project's S3 directory is built.
    bucket: str = "aws-data-lab-sanhe-for-everything-us-east-2"


# Build a boto session manager from the named profile and hand its session to
# s3pathlib's shared ``context`` (presumably so that every S3Path call below
# uses this session -- confirm against the s3pathlib docs).
bsm = BotoSesManager(profile_name=Config.aws_profile)
context.attach_boto_session(bsm.boto_ses)

# Service clients. NOTE(review): ``ch_client`` is not referenced anywhere else
# in the visible cells.
tt_client = bsm.get_client(AwsServiceEnum.Textract)
ch_client = bsm.get_client(AwsServiceEnum.Comprehend)

# Local working directory = the process's current working directory.
dir_here = Path(os.getcwd()).absolute()

# S3 directory of this project, plus the local input file and the two local
# output folders (plain page images / page images with annotations drawn on).
s3dir_here = S3Path(Config.bucket, "projects", "2022-10-10-textract-with-pdf-and-image").to_dir()
path_raw_pdf = dir_here / "apartment-lease.pdf"
dir_images = dir_here / "images"
dir_annotated_images = dir_here / "annotated_images"

# Create the output folders up front so the later steps can write into them.
dir_images.mkdir_if_not_exists()
dir_annotated_images.mkdir_if_not_exists()


def get_nth_page_filename(ith: int) -> str:
    """
    Return the image file name used for page ``ith`` of the document.

    :param ith: page number (the callers in this notebook count from 1).
    :return: the file name, e.g. ``"3.jpg"`` for page 3.
    """
    return "{}.jpg".format(ith)


@attr.s
class BinaryFile:
    """
    A local binary document (here: a PDF) together with the S3 locations of
    every artifact this notebook derives from it.

    The ``step_XX_...`` methods are intended to be called in numeric order.

    :param path: local path of the raw document.
    :param s3dir: S3 directory that holds the raw file and all outputs.
    """
    path: Path = attr.ib()
    s3dir: S3Path = attr.ib()

    @property
    def s3path_raw(self) -> S3Path:
        """S3 object the raw document is uploaded to (``01-raw/<file name>``)."""
        return self.s3dir / "01-raw" / self.path.basename

    @property
    def s3dir_textract_output(self) -> S3Path:
        """S3 directory passed to Textract as its output prefix."""
        return (self.s3dir / "02-textract_output" / self.path.basename).to_dir()

    @property
    def s3dir_textract_job_run(self) -> S3Path:
        """
        First entry found under :attr:`s3dir_textract_output`.

        In the recorded run of this notebook that entry is the folder named
        after the Textract ``JobId``. Every access lists S3 again, so callers
        that need it more than once should keep the result in a local variable.

        :raises IndexError: when nothing exists under the output prefix yet
            (presumably the job was not started or has not finished).
        """
        job_runs = list(self.s3dir_textract_output.iterdir())
        if not job_runs:
            raise IndexError(
                "no textract output found at "
                f"{self.s3dir_textract_output.console_url}; "
                "run step_01_textract_analyze_document() and wait for the job to finish"
            )
        return job_runs[0]

    @property
    def s3path_merged_json(self) -> S3Path:
        """S3 object holding the merged Textract blocks of the job run."""
        return self.s3dir_textract_job_run / "merged.json"

    @property
    def s3path_merged_txt(self) -> S3Path:
        """S3 object holding the merged plain text (one ``LINE`` block per line)."""
        return self.s3dir_textract_job_run / "merged.txt"

    @property
    def s3dir_comprehend_output(self) -> S3Path:
        """S3 directory reserved for Comprehend output of this document."""
        return (self.s3dir / "03-comprehend_output" / self.path.basename).to_dir()

    @property
    def s3path_entity_json(self) -> S3Path:
        """S3 object reserved for the detected entities."""
        return self.s3dir_comprehend_output / "entities.json"

    @property
    def path_merged_json(self) -> Path:
        """
        Local copy of the merged Textract blocks: ``<dir_here>/<file stem>.json``.

        Written by :meth:`step_02_merge_textract_output` and read back by
        :meth:`step_04_annotate_image_for_keyword`; both go through this
        property so they cannot drift apart.
        """
        return Path(dir_here, self.path.fname + ".json")

    @property
    def path_merged_txt(self) -> Path:
        """Local copy of the merged plain text: ``<dir_here>/<file stem>.txt``."""
        return Path(dir_here, self.path.fname + ".txt")

    @staticmethod
    def _textract_part_sort_key(s3path: S3Path) -> tuple:
        """
        Sort key that orders purely numeric file names by their integer value
        (so ``2`` comes before ``10``) and puts every other name after them.
        """
        name = s3path.basename
        if name.isdigit():
            return (0, int(name), name)
        return (1, 0, name)

    def step_00_upload_raw_file(self):
        """Upload the local document to :attr:`s3path_raw`, replacing any existing object."""
        self.s3path_raw.upload_file(self.path.abspath, overwrite=True)

    def step_01_textract_analyze_document(self) -> dict:
        """
        Start a Textract document analysis (tables + forms) on the uploaded
        raw file, with the result written under :attr:`s3dir_textract_output`.

        The call only starts the job; the response carries the ``JobId``.

        Ref:

        - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.start_document_analysis

        :return: the raw ``start_document_analysis`` response.
        """
        print("start document analysis ...")
        print(f"  preview raw file at: {self.s3path_raw.console_url}")
        print(f"  preview textract output file at: {self.s3dir_textract_output.console_url}")
        response = tt_client.start_document_analysis(
            DocumentLocation=dict(
                S3Object=dict(
                    Bucket=self.s3path_raw.bucket,
                    Name=self.s3path_raw.key,
                ),
            ),
            # a fresh token per call, so every call is treated as a new request
            ClientRequestToken=uuid.uuid4().hex,
            FeatureTypes=["TABLES", "FORMS"],
            OutputConfig=dict(
                S3Bucket=self.s3dir_textract_output.bucket,
                # ``to_file()`` is applied before taking the key; presumably this
                # drops the trailing "/" of the directory key -- TODO confirm
                S3Prefix=self.s3dir_textract_output.to_file().key,
            )
        )
        print("  done")
        return response

    def step_02_merge_textract_output(self):
        """
        Merge every result file of the Textract job run into one JSON document
        (all blocks) and one text document (text of the ``LINE`` blocks).

        Both documents are written next to the result files on S3 and to the
        local working directory.

        :raises IndexError: when no Textract output exists yet.
        """
        print("merge textract output")
        # Resolve the job-run folder once: each property access lists S3 again.
        s3dir_job_run = self.s3dir_textract_job_run
        s3path_merged_json = s3dir_job_run / "merged.json"
        s3path_merged_txt = s3dir_job_run / "merged.txt"
        print(f"  preview merged json at: {s3path_merged_json.console_url}")
        print(f"  preview merged text at: {s3path_merged_txt.console_url}")

        # Skip the access-check marker and the outputs of a previous merge.
        not_a_part = (".s3_access_check", "merged.json", "merged.txt")
        # Numeric names are merged in numeric order; a plain listing would put
        # "10" before "2" and scramble the merged text of large documents.
        parts = sorted(
            (
                s3path
                for s3path in s3dir_job_run.iter_objects()
                if s3path.basename not in not_a_part
            ),
            key=self._textract_part_sort_key,
        )

        merged_data = {"Blocks": []}
        lines = list()
        for s3path in parts:
            blocks = json.loads(s3path.read_text())["Blocks"]
            merged_data["Blocks"].extend(blocks)
            lines.extend(
                block["Text"] for block in blocks if block["BlockType"] == "LINE"
            )

        # Serialize once, write four times.
        merged_json = json.dumps(merged_data, indent=4)
        merged_txt = "\n".join(lines)
        s3path_merged_json.write_text(merged_json)
        s3path_merged_txt.write_text(merged_txt)
        self.path_merged_json.write_text(merged_json)
        self.path_merged_txt.write_text(merged_txt)
        print("  done")

    def step_03_pdf_to_image(self):
        """
        Render every page of the local document to ``<dir_images>/<page>.jpg``,
        with pages numbered from 1.
        """
        print("Convert pdf to images")
        print(f"  preview at {dir_images}")
        # Convert this instance's own file rather than a module-level path.
        images = convert_from_path(self.path.abspath)
        for page, image in enumerate(images, start=1):
            image.save(dir_images.append_parts(get_nth_page_filename(page)).abspath)
        print("  done")

    def step_04_annotate_image_for_keyword(self, keyword: str):
        """
        Find the first Textract block whose text contains ``keyword``
        (case-insensitive) and draw the outline of its polygon on the image of
        the page it belongs to. The result is saved under the same file name
        in ``dir_annotated_images``.

        Each call starts from the un-annotated page image, so annotating a
        second keyword on the same page replaces the earlier annotated file.

        Requires the local merged JSON (step 02) and the page images (step 03).

        :param keyword: text to look for.
        :raises Exception: when no block contains the keyword.
        """
        print(f"Annotate images for keyword {keyword}")

        def locate_block(keyword: str) -> dict:
            needle = keyword.lower()
            blocks = json.loads(self.path_merged_json.read_text())["Blocks"]
            for block in blocks:
                # Not every block carries text: the key may be absent or null.
                text = block.get("Text")
                if isinstance(text, str) and needle in text.lower():
                    return block
            raise Exception("Not found!")

        block = locate_block(keyword)
        print("  preview block:")
        rprint(block)
        page = block["Page"]
        polygon: T.List[T.Dict[str, float]] = block["Geometry"]["Polygon"]
        file = get_nth_page_filename(page)
        with Image.open(
            dir_images.append_parts(file).abspath
        ) as im:
            width, height = im.size
            draw = ImageDraw.Draw(im)
            # Polygon coordinates are multiplied by the image size to get pixels.
            # Pair each vertex with its successor, wrapping around to close the
            # outline; slicing keeps an empty polygon from raising.
            for dot1, dot2 in zip(polygon, polygon[1:] + polygon[:1]):
                draw.line(
                    (
                        width * dot1["X"],
                        height * dot1["Y"],
                        width * dot2["X"],
                        height * dot2["Y"],
                    ),
                    fill=128,
                )
            im.save(dir_annotated_images.append_parts(file).abspath)
        print("  done")


# The document processed by every step below: the local lease PDF, bound to
# the project's S3 directory.
bfile = BinaryFile(path=path_raw_pdf, s3dir=s3dir_here)

In [15]:
# Start from a clean slate, then print the console link for inspection.
# NOTE(review): destructive -- presumably removes everything stored under the
# project's S3 directory; confirm against the s3pathlib docs before re-running.
s3dir_here.delete_if_exists()
print(s3dir_here.console_url)

https://console.aws.amazon.com/s3/buckets/aws-data-lab-sanhe-for-everything-us-east-2?prefix=projects/2022-10-10-textract-with-pdf-and-image/


In [16]:
# Upload the local PDF to the "01-raw" folder on S3; step 01 points Textract at that object.
bfile.step_00_upload_raw_file()

In [17]:
# Start the Textract analysis job. The call returns right away with a JobId;
# the results are written under the "02-textract_output" prefix, so give the
# job time to finish before running the merge step.
bfile.step_01_textract_analyze_document()

start document analysis ...
  preview raw file at: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=projects/2022-10-10-textract-with-pdf-and-image/01-raw/apartment-lease.pdf
  preview textract output file at: https://console.aws.amazon.com/s3/buckets/aws-data-lab-sanhe-for-everything-us-east-2?prefix=projects/2022-10-10-textract-with-pdf-and-image/02-textract_output/apartment-lease.pdf/
  done


{'JobId': '44b5803a8840898ccd276251fb1db1c253ec42fd0964bb0c17d7985cd087f7d3',
 'ResponseMetadata': {'RequestId': '13e7bc72-0cb2-4bd2-ad7b-529eabeba32b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '13e7bc72-0cb2-4bd2-ad7b-529eabeba32b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '76',
   'date': 'Tue, 11 Oct 2022 19:02:50 GMT'},
  'RetryAttempts': 0}}

In [18]:
# Merge the Textract result files of the job run into merged.json / merged.txt
# (written to S3 and to the local working directory).
bfile.step_02_merge_textract_output()

merge textract output
  preview merged json at: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=projects/2022-10-10-textract-with-pdf-and-image/02-textract_output/apartment-lease.pdf/44b5803a8840898ccd276251fb1db1c253ec42fd0964bb0c17d7985cd087f7d3/merged.json
  preview merged text at: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=projects/2022-10-10-textract-with-pdf-and-image/02-textract_output/apartment-lease.pdf/44b5803a8840898ccd276251fb1db1c253ec42fd0964bb0c17d7985cd087f7d3/merged.txt
  done


In [22]:
# Render each page of the PDF to ``images/<page>.jpg``.
# The ``images`` and ``annotated_images`` folders themselves are created in the setup cell.
bfile.step_03_pdf_to_image()

Convert pdf to images
  preview at /Users/sanhehu/Documents/GitHub/Dev-Exp-Share/docs/source/01-AWS/14-Machine-Learning/03-AWS-Textract-Root/04-Use-Textract-with-PDF-and-Image/images
  done


In [23]:
# Outline the first text block that contains each keyword on its page image.
for keyword in ("Alice", "1350"):
    bfile.step_04_annotate_image_for_keyword(keyword)

Annotate images for keyword Alice
  preview block:


  done
Annotate images for keyword 1350
  preview block:


  done
