In [1]:
# -*- coding: utf-8 -*-

import typing as T
import base64
import os
import json

import attr
from pathlib_mate import Path
from rich import print as rprint
from s3pathlib import S3Path, context
from boto_session_manager import BotoSesManager, AwsServiceEnum


class Config:
    """
    Static settings for this script: which AWS profile to use and which
    S3 bucket holds the project files.
    """
    # Named AWS profile handed to ``BotoSesManager(profile_name=...)``.
    aws_profile = "aws_data_lab_sanhe_us_east_2"
    # Bucket used as the first part of the project's ``S3Path``.
    bucket = "aws-data-lab-sanhe-for-everything-us-east-2"


# Build the boto session from the configured profile and register it with
# s3pathlib's ``context`` before any S3Path is used for S3 access.
bsm = BotoSesManager(profile_name=Config.aws_profile)
context.attach_boto_session(bsm.boto_ses)

# Service clients shared by the step methods below
# (tt = Textract, ch = Comprehend).
tt_client = bsm.get_client(AwsServiceEnum.Textract)
ch_client = bsm.get_client(AwsServiceEnum.Comprehend)

# Local working directory; local output files are written here.
# NOTE: this is the process's current working directory, not necessarily the
# directory that contains this script.
dir_here = Path(os.getcwd()).absolute()
# S3 folder (``.to_dir()``) that acts as the root for this project's files.
s3dir_here = S3Path(Config.bucket, "projects", "2022-10-10-textract-with-pdf-and-image").to_dir()
# The sample document that is processed when the script is run directly.
path_raw_pdf = dir_here / "apartment-lease.pdf"


@attr.s
class BinaryFile:
    """
    A local document paired with the S3 folder that stores its raw copy and
    the outputs derived from it.

    S3 layout under ``s3dir`` (as built by the properties below)::

        01-raw/<basename>
        02-textract_output/<basename>/<job run folder>/merged.json
        02-textract_output/<basename>/<job run folder>/merged.txt
        03-comprehend_output/<basename>/entities.json

    :param path: local path of the document.
    :param s3dir: S3 folder used as the root of the layout above.
    """
    path: Path = attr.ib()
    s3dir: S3Path = attr.ib()

    @property
    def s3path_raw(self) -> S3Path:
        """S3 location of the uploaded copy of the local file."""
        return self.s3dir / "01-raw" / self.path.basename

    @property
    def s3dir_textract_output(self) -> S3Path:
        """S3 folder passed to Textract as the output location for this file."""
        return (self.s3dir / "02-textract_output" / self.path.basename).to_dir()

    @property
    def s3dir_textract_job_run(self) -> S3Path:
        """
        First entry listed under :attr:`s3dir_textract_output`.

        Not cached: every access lists the output folder again.

        :raises IndexError: if nothing exists under the output folder yet.
        """
        # Stop at the first entry instead of materializing the whole listing.
        for s3path in self.s3dir_textract_output.iterdir():
            return s3path
        # Same exception type the previous ``list(...)[0]`` raised, but with a
        # message that says what is actually missing.
        raise IndexError(
            f"no textract job run found under {self.s3dir_textract_output}"
        )

    @property
    def s3path_merged_json(self) -> S3Path:
        """S3 location of the merged JSON inside the job run folder."""
        return self.s3dir_textract_job_run / "merged.json"

    @property
    def s3path_merged_txt(self) -> S3Path:
        """S3 location of the merged plain text inside the job run folder."""
        return self.s3dir_textract_job_run / "merged.txt"

    @property
    def s3dir_comprehend_output(self) -> S3Path:
        """S3 folder holding the Comprehend output for this file."""
        return (self.s3dir / "03-comprehend_output" / self.path.basename).to_dir()

    @property
    def s3path_entity_json(self) -> S3Path:
        """S3 location of the detected-entities JSON."""
        return self.s3dir_comprehend_output / "entities.json"

    def step_00_upload_raw_file(self) -> None:
        """Upload the local file to :attr:`s3path_raw`, overwriting any existing object."""
        self.s3path_raw.upload_file(self.path.abspath, overwrite=True)

    def step_01_textract_analyze_document(self) -> None:
        """
        Currently only prints the API reference URL; the actual
        ``start_document_analysis`` call below is commented out.
        ``step_11_analyze_document`` contains a live version of that call.

        Ref:

        - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.start_document_analysis
        """
        print("asdf: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.start_document_analysis")
        # print("start document analysis ...")
        # print(f"  preview raw file at: {self.s3path_raw.console_url}")
        # print(f"  preview textract output file at: {self.s3dir_textract_output.console_url}")
        # response = tt_client.start_document_analysis(
        #     DocumentLocation=dict(
        #         S3Object=dict(
        #             Bucket=self.s3path_raw.bucket,
        #             Name=self.s3path_raw.key,
        #         ),
        #     ),
        #     # ClientRequestToken=self.path.md5,
        #     FeatureTypes=["TABLES", "FORMS"],
        #     OutputConfig=dict(
        #         S3Bucket=self.s3dir_textract_output.bucket,
        #         S3Prefix=self.s3dir_textract_output.to_file().key,
        #     )
        # )
        # rprint(response)
        # print("  done")

    def step_02_merge_textract_output(self) -> None:
        """
        Merge the JSON part files in the Textract job run folder.

        All ``Blocks`` of every part file are concatenated into one document,
        and the ``Text`` of every ``LINE`` block is collected into a plain
        text file (one line per block). Both results are written to S3
        (``merged.json`` / ``merged.txt`` in the job run folder) and to
        ``dir_here`` as ``<fname>.json`` / ``<fname>.txt``.

        :raises IndexError: if no job run folder exists yet.
        """
        # Bind once: each of these properties lists the output folder on
        # every access.
        s3path_merged_json = self.s3path_merged_json
        s3path_merged_txt = self.s3path_merged_txt
        print(f"preview merged json at: {s3path_merged_json.console_url}")
        print(f"preview merged text at: {s3path_merged_txt.console_url}")

        # Objects in the job run folder that are not Textract part files:
        # the access-check marker and this method's own previous outputs.
        skipped_basenames = {".s3_access_check", "merged.json", "merged.txt"}
        merged_data = {"Blocks": []}
        lines = []
        # NOTE(review): parts are merged in the order ``iter_objects()`` yields
        # them. If that is lexicographic key order, numerically named parts
        # beyond 9 ("10" before "2") would be merged out of order - confirm.
        for s3path in self.s3dir_textract_job_run.iter_objects():
            if s3path.basename in skipped_basenames:
                continue
            blocks = json.loads(s3path.read_text())["Blocks"]
            merged_data["Blocks"].extend(blocks)
            lines.extend(
                block["Text"] for block in blocks if block["BlockType"] == "LINE"
            )

        # Serialize once and reuse for both the S3 and the local copies.
        merged_json = json.dumps(merged_data, indent=4)
        merged_txt = "\n".join(lines)
        s3path_merged_json.write_text(merged_json)
        s3path_merged_txt.write_text(merged_txt)
        Path(dir_here, self.path.fname + ".json").write_text(merged_json)
        Path(dir_here, self.path.fname + ".txt").write_text(merged_txt)

    # def step_03_inspect_merge_textract_output(self):

    # data = json.loads(s3path_merged_json.read_text())
    # for block in data["Blocks"]:
    #     print(block)

    def step_04_detect_entity(self) -> None:
        """
        Run Comprehend ``detect_entities`` (English) on the merged text and
        store the raw response as JSON, both at :attr:`s3path_entity_json`
        and as ``<fname>-entity.json`` under ``dir_here``.
        """
        print(f"preview detected entities at: {self.s3path_entity_json.console_url}")
        # NOTE(review): the whole merged text is sent in a single request;
        # confirm long documents stay within the API's input size limit.
        res = ch_client.detect_entities(
            Text=self.s3path_merged_txt.read_text(),
            LanguageCode="en",
        )
        entity_json_content = json.dumps(res, indent=4)
        self.s3path_entity_json.write_text(entity_json_content)
        Path(dir_here, self.path.fname + "-entity.json").write_text(entity_json_content)

    def step_11_analyze_document(self) -> None:
        """
        Start a Textract document analysis (``TABLES`` and ``FORMS``) on the
        uploaded raw file, with output directed to
        :attr:`s3dir_textract_output`.

        The API response is pretty-printed and also saved to
        ``analyze_document_output.json`` under ``dir_here``.
        """
        res = tt_client.start_document_analysis(
            DocumentLocation=dict(
                S3Object=dict(
                    Bucket=self.s3path_raw.bucket,
                    Name=self.s3path_raw.key,
                ),
            ),
            FeatureTypes=["TABLES", "FORMS"],
            OutputConfig=dict(
                S3Bucket=self.s3dir_textract_output.bucket,
                # ``to_file()`` is applied before taking the key, so the
                # prefix is the file-style form of the folder path.
                S3Prefix=self.s3dir_textract_output.to_file().key,
            )
        )
        print(self.s3dir_textract_output.console_url)
        rprint(res)
        Path(dir_here, "analyze_document_output.json").write_text(
            json.dumps(res)
        )


# def s2_inspect_textract_result():
#     s3path_textract_output

if __name__ == "__main__":
    # Pair the local sample PDF with the project's S3 folder.
    bfile = BinaryFile(path=path_raw_pdf, s3dir=s3dir_here)

    # Step toggles: only the uncommented call below runs.
    # bfile.step_00_upload_raw_file()
    bfile.step_01_textract_analyze_document()
    # bfile.step_02_merge_textract_output()
    # bfile.step_04_detect_entity()
    # bfile.step_03_inspect_merge_textract_output()

    # bfile.step_11_analyze_document()

    # Disabled scratch code: merge the part files of one specific job run
    # into a local ``analyze_document_output.json``.
    # s3dir = S3Path.from_s3_uri("s3://aws-data-lab-sanhe-for-everything/poc/2022-10-10-textract-example/02-textract_output/ADR-RAC-PB-Cotiviti.pdf/3ec6b60f98e26cf1f1007396f71364ae37e7192fd8efec85b6c56e08c3f33099/")
    # blocks: T.List[dict] = list()
    # for s3path in s3dir.iter_objects():
    #     if s3path.basename != ".s3_access_check":
    #         print(s3path)
    #         data = json.loads(s3path.read_text())
    #         blocks.extend(data["Blocks"])
    # dir_here.append_parts("analyze_document_output.json").write_text(
    #     json.dumps({
    #         "Blocks": blocks
    #     })
    # )

asdf: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.start_document_analysis
