In [None]:
# ===============================================================================================================#
# Copyright 2023 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

# Use case 01 - Extract Chunks From Image

<hr/>

## Sample Task: Document preprocessing (segmentation, chunking) using a sample Annual Report

<hr/>

<img src="./data/sample/input/annual_report1.jpg" style="border-style: dotted;" width=1000 height=700 />

#### Import libraries

In [None]:
import json
import os
import shutil
import infy_dpp_sdk
import infy_fs_utils
from _internal_utils.pipeline_helper import PipelineHelper

#### Define configuration file path

In [None]:
# Root folders used by the pipeline. The defaults are the original local Windows
# locations; set the DPP_STORAGE_ROOT_PATH / DPP_CONTAINER_ROOT_PATH environment
# variables to run the notebook on another machine without editing this cell.
# Use forward slashes and no trailing slash: STORAGE_ROOT_PATH is embedded in a
# file:// URI and concatenated with '/'-prefixed paths further down the notebook.
STORAGE_ROOT_PATH = os.environ.get(
    'DPP_STORAGE_ROOT_PATH', 'C:/DPP/infy_libraries_client/STORAGE')
CONTAINER_ROOT_PATH = os.environ.get(
    'DPP_CONTAINER_ROOT_PATH', 'C:/DPP/infy_libraries_client/CONTAINER')
# Pipeline input config file. Presumably resolved relative to STORAGE_ROOT_PATH
# (the sample config is copied to {STORAGE_ROOT_PATH}/data/config below) - TODO confirm.
PIPELINE_INPUT_CONFIG_FILE_PATH = '/data/config/dpp_pipeline1_input_config.json'

#### Copying files
<div style="line-height: 1;">
<span style="color:Red"><b>NOTE: </b>In this notebook, the cell below is used to copy the sample files to folders in <i>STORAGE_ROOT_PATH</i>.<br>
In production, the data and config files should be kept under their respective folders in <i>STORAGE_ROOT_PATH</i>.<br>
</span>
</div>

In [None]:
# Stage the sample input image and the sample config files under STORAGE_ROOT_PATH.
curr_data = os.path.abspath('./data')
# makedirs also creates the intermediate 'data' folder; exist_ok=True keeps the cell
# re-runnable and avoids the check-then-create race of a separate exists() test.
os.makedirs(f'{STORAGE_ROOT_PATH}/data/input', exist_ok=True)
shutil.copy(f'{curr_data}/sample/input/annual_report1.jpg',
            f'{STORAGE_ROOT_PATH}/data/input/annual_report1.jpg')
# dirs_exist_ok=True lets the copy merge into an existing config folder on re-runs.
shutil.copytree(f'{curr_data}/sample/config', f'{STORAGE_ROOT_PATH}/data/config',
                dirs_exist_ok=True)

In [None]:
# Build the helper for the configured pipeline, then render its pipeline card.
pipeline_helper = PipelineHelper(
    PIPELINE_INPUT_CONFIG_FILE_PATH,
    STORAGE_ROOT_PATH,
    CONTAINER_ROOT_PATH,
)
pipeline_helper.show_pipeline_card()

#### Initialize Client Config

In [None]:
# Storage settings: a file:// root under STORAGE_ROOT_PATH; the server URL and
# access/secret keys are left empty.
storage_config_data = infy_fs_utils.data.StorageConfigData(
    storage_root_uri=f"file://{STORAGE_ROOT_PATH}",
    storage_server_url="",
    storage_access_key="",
    storage_secret_key="",
)

# Client settings: only the container root path is supplied.
client_config_data = infy_dpp_sdk.ClientConfigData(
    container_data={
        "container_root_path": f"{CONTAINER_ROOT_PATH}",
    },
)

file_sys_handler = infy_fs_utils.provider.FileSystemHandler(storage_config_data)

# Register the handler under the FSH_DPP key only if that key has no handler yet,
# so re-running this cell does not try to add it a second time.
if not infy_fs_utils.manager.FileSystemManager().has_fs_handler(
        infy_dpp_sdk.common.Constants.FSH_DPP):
    fs_manager = infy_fs_utils.manager.FileSystemManager()
    fs_manager.add_fs_handler(
        file_sys_handler, infy_dpp_sdk.common.Constants.FSH_DPP)

infy_dpp_sdk.ClientConfigManager().load(client_config_data)

#### Initialize Logging

In [None]:
# Logging settings consumed by the file-system logging handler created further down.
logging_config_data = infy_fs_utils.data.LoggingConfigData(
    # logger_group_name="my_group_1",
    logging_level=10,  # same number as the standard library's logging.DEBUG
    logging_format="",
    logging_timestamp_format="",
    log_file_data={
        "log_file_dir_path": "/logs",
        "log_file_name_prefix": "index",
        # "log_file_name_suffix": "1",
        "log_file_extension": ".log",
    },
)

### Index pipeline

In [None]:
# Register a file-system logging handler under the FSLH_DPP key, unless that key
# already has one (keeps the cell safe to re-run).
if not infy_fs_utils.manager.FileSystemLoggingManager().has_fs_logging_handler(
        infy_dpp_sdk.common.Constants.FSLH_DPP):
    logging_manager = infy_fs_utils.manager.FileSystemLoggingManager()
    logging_manager.add_fs_logging_handler(
        infy_fs_utils.provider.FileSystemLoggingHandler(
            logging_config_data, file_sys_handler),
        infy_dpp_sdk.common.Constants.FSLH_DPP)

In [None]:
# Create the native orchestrator from the pipeline input config file path.
dpp_orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNative(
        input_config_file_path=PIPELINE_INPUT_CONFIG_FILE_PATH)

In [None]:
# Run the pipeline; the returned responses are inspected in the cells below.
processor_response_list = dpp_orchestrator.run_batch()

### Verify results

<div  style="line-height: 1;">
    <span style="color:Green"><b>NOTE: </b> The results of the pipeline will be available in the <i>processor_response_data.json</i> file at <i>work_folder_path</i>.</span></div>

In [None]:
# Display the 'request_closer' entry from the first response's context data.
processor_response_list[0].context_data.get('request_closer')

In [None]:
# Pretty-print the 'request_creator' entry of the first response as indented JSON.
print("Please note down the 'group_request_file', this will be needed in other usecases: ")
request_creator_data = processor_response_list[0].context_data.get("request_creator")
print(json.dumps(request_creator_data, indent=4))

<div  style="line-height: 1;">
    <span style="color:Green"><b>NOTE:</b> Progression through each stage of the segmentation process can be seen below.</span></div>

In [None]:
import base64
from IPython.display import Image, display, HTML

# Debug images of the run are read from '<work_file_path>_files/debug' under the storage root.
debug_path = STORAGE_ROOT_PATH + (processor_response_list[0].context_data.get("request_creator").get('work_file_path'))+'_files/debug'

# Debug sub-folder of each segmentation stage -> 'title: description' shown above its image.
directories = {
    '1.segment_generator': '1.Segment Generation: Segmentation of the input document into segments based on selected techniques.',
    '2.segment_consolidator': '2.Segment Consolidation: Consolidation of the results of various segmentation techniques to into one.',
    '3.segment_classifier': '3. Segment Classification: Classification of the segments into headers, footers, content.',
    '4.column_detector': '4.Column Detection: Detection of columns in the document.',
    '5.segment_merger': '5.Segment Merging: Merging of the smaller segments into larger ones.',
    '6.segment_sequencer': '6.Segment Sequencing: Sequencing of the segments based on the order of appearance in the document.',
}

# MIME type to declare in the data URI for each supported image extension, so that
# JPEG files are no longer labelled as PNG.
image_mime_types = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
}

# 'stage_dir' / 'file_name' avoid shadowing the builtin dir() in the kernel namespace.
for stage_dir, text in directories.items():
    dir_path = os.path.join(debug_path, stage_dir)
    if not os.path.isdir(dir_path):
        # Stage produced no debug folder; nothing to show for it.
        continue
    # Split once per stage: title before the first colon, description after it.
    text_before_colon, text_after_colon = text.split(':', 1)
    # os.listdir returns entries in arbitrary order; sorting makes the displayed
    # image deterministic across runs and machines.
    for file_name in sorted(os.listdir(dir_path)):
        extension = os.path.splitext(file_name)[1].lower()
        mime_type = image_mime_types.get(extension)
        if mime_type is None:
            continue
        img_path = os.path.join(dir_path, file_name)
        with open(img_path, "rb") as img_file:
            b64_string = base64.b64encode(img_file.read()).decode()
        html = f"""
        <div style="border:2px solid black; padding:10px; margin:5px;">
            <p><b>{text_before_colon}:</b>{text_after_colon}</p>
            <img src="data:{mime_type};base64,{b64_string}" alt="Image">
        </div>
        """
        display(HTML(html))
        # Only the first image of each stage is displayed.
        break