In [None]:
# ===============================================================================================================#
# Copyright 2023 Infosys Ltd.                                                                                    #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

# Use case 02 - Extract Chunks From PDF

<hr/>

## Sample Task: Document preprocessing (segmentation, chunking) using a sample Annual Report

<hr/>

In [None]:
from IPython.display import IFrame

# Embed the sample input PDF so it can be inspected inline before processing.
IFrame(
    "./data/sample/input/AR_2022-23_page-14-17.pdf",
    width=800,
    height=400,
)

#### Import libraries

In [None]:
import json
import os
import shutil
import infy_dpp_sdk
import infy_fs_utils

#### Define storage, container and configuration file paths

In [None]:
# Root locations used by the rest of the notebook. The defaults are the original
# hard-coded layout; set DPP_STORAGE_ROOT_PATH / DPP_CONTAINER_ROOT_PATH in the
# environment before starting the kernel to run on a machine with a different
# layout without editing this cell. Unset or empty variables fall back to the defaults.
STORAGE_ROOT_PATH = (os.environ.get('DPP_STORAGE_ROOT_PATH')
                     or 'C:/DPP/infy_libraries_client/STORAGE')
CONTAINER_ROOT_PATH = (os.environ.get('DPP_CONTAINER_ROOT_PATH')
                       or 'C:/DPP/infy_libraries_client/CONTAINER')
# Pipeline input config passed to the orchestrator. The sample config is copied to
# f'{STORAGE_ROOT_PATH}/data/config', so this path is presumably resolved relative
# to the storage root -- TODO confirm against the SDK.
INDEX_INPUT_CONFIG_FILE_PATH = '/data/config/dpp_pipeline2_input_config.json'

In [None]:
# Stage the sample PDF and the sample pipeline configuration under the storage root.
curr_data = os.path.abspath('./data')
storage_input_dir = f'{STORAGE_ROOT_PATH}/data/input'
# makedirs also creates the intermediate 'data' directory, and exist_ok=True makes
# the cell safe to re-run without a separate (race-prone) existence check.
os.makedirs(storage_input_dir, exist_ok=True)
shutil.copy(f'{curr_data}/sample/input/AR_2022-23_page-14-17.pdf',
            f'{storage_input_dir}/AR_2022-23_page-14-17.pdf')
# dirs_exist_ok=True lets a re-run overwrite the previously copied config files.
shutil.copytree(f'{curr_data}/sample/config', f'{STORAGE_ROOT_PATH}/data/config',
                dirs_exist_ok=True)

#### Initialize Client Config

In [None]:
# Storage configuration: a file:// URI rooted at STORAGE_ROOT_PATH; the server URL
# and access/secret keys are left empty.
storage_config_data = infy_fs_utils.data.StorageConfigData(
    storage_root_uri=f"file://{STORAGE_ROOT_PATH}",
    storage_server_url="",
    storage_access_key="",
    storage_secret_key="",
)

# Client configuration: only the container root path is supplied.
client_config_data = infy_dpp_sdk.ClientConfigData(
    container_data={"container_root_path": f"{CONTAINER_ROOT_PATH}"},
)

# Register the file-system handler under the DPP key, then load the client config.
file_sys_handler = infy_fs_utils.provider.FileSystemHandler(storage_config_data)
infy_fs_utils.manager.FileSystemManager().add_fs_handler(
    file_sys_handler, infy_dpp_sdk.common.Constants.FSH_DPP)
infy_dpp_sdk.ClientConfigManager().load(client_config_data)

#### Initialize Logging

In [None]:
# Logging settings; these are passed to the file-system logging handler in a later cell.
logging_config_data = infy_fs_utils.data.LoggingConfigData(
    # logger_group_name="my_group_1",
    logging_level=10,  # numerically equal to logging.DEBUG in the standard library
    logging_format="",
    logging_timestamp_format="",
    log_file_data={
        "log_file_dir_path": "/logs",
        "log_file_name_prefix": "index",
        # "log_file_name_suffix": "1",
        "log_file_extension": ".log",
    },
)

### Index pipeline

In [None]:
# Build a logging handler from the logging config and the file-system handler,
# then register it with the logging manager under the DPP logging key.
fs_logging_manager = infy_fs_utils.manager.FileSystemLoggingManager()
fs_logging_handler = infy_fs_utils.provider.FileSystemLoggingHandler(
    logging_config_data, file_sys_handler)
fs_logging_manager.add_fs_logging_handler(
    fs_logging_handler, infy_dpp_sdk.common.Constants.FSLH_DPP)

In [None]:
# Create the orchestrator for the pipeline described by the input config file.
dpp_orchestrator = infy_dpp_sdk.orchestrator.OrchestratorNativeBasic(
        input_config_file_path=INDEX_INPUT_CONFIG_FILE_PATH)

In [None]:
# Run the pipeline in batch mode. The result is indexed like a list in the cells
# below (presumably one response per processed document -- TODO confirm).
processor_response_list = dpp_orchestrator.run_batch()

In [None]:
# Display the 'request_closer' entry of the first response's context data.
processor_response_list[0].context_data.get('request_closer')

In [None]:
# Optional: uncomment the two lines below to print the 'request_creator' entry
# of the first response's context data.
# print("Please note down the 'group_request_file', this will be needed in other usecases: ")
# print(json.dumps(processor_response_list[0].context_data.get("request_creator"),indent=4))

### Review Output

In [None]:
# Fetch the 'page_segment_data' stored under 'chunk_data_parser' in the first
# response's context data.
first_context_data = processor_response_list[0].context_data
page_segment_data = first_context_data.get('chunk_data_parser').get('page_segment_data')
# Round-trip through JSON text: the result contains only plain JSON types, and
# object keys always come back as strings, hence the "2" lookup below.
response_data_json = json.loads(json.dumps(page_segment_data))
# print(response_data_json)
print(response_data_json.get("2"))