In [None]:
# ===============================================================================================================#
# Copyright 2022 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

## 1. Preparation

In [None]:
from _internal_utils.demo_helper import DemoHelper
import pandas as pd
from IPython.display import Image

#### <span style='color:blue'>Enter image and OCR file paths</span>

In [None]:
# Input images shown in the "Visualize" step.
IMAGE_PATHS = [
    "./data/coi_1.jpg",
    "./data/coi_2.jpg",
]
# OCR (.hocr) files handed to the OCR parser, listed in the same order as the images.
IMAGE_OCR_PATHS = [
    "./data/coi_1.jpg.hocr",
    "./data/coi_2.jpg.hocr",
]

#### Visualize

In [None]:
# Create the widget returned by DemoHelper.create_tab_toolbar for the image
# paths and show it in the cell output.
tab_widget = DemoHelper.create_tab_toolbar(IMAGE_PATHS)
display(tab_widget)
# Child `idx` of the widget is entered as a context manager while image `idx`
# is displayed; presumably this routes the image into the matching tab
# (TODO confirm against DemoHelper.create_tab_toolbar).
for idx, image_path in enumerate(IMAGE_PATHS):
    with tab_widget.children[idx]:
        display(Image(filename=image_path, width=1000, height=50))

## 2. Initialization

#### <span style='color:blue'>Create new instance with desired OCR service provider</span>

<!-- ### 1.1. Example -  Create Instance - Without logging -->

In [None]:
import json
import logging
import os

from infy_ocr_parser import ocr_parser
from infy_ocr_parser.providers.tesseract_ocr_data_service_provider import TesseractOcrDataServiceProvider

# OCR files the parser is constructed with (configured in the preparation step).
ocr_file_list = IMAGE_OCR_PATHS

# Create the log directory if needed. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists().
os.makedirs("./logs", exist_ok=True)
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so in a long-lived kernel only the first configuration takes effect.
logging.basicConfig(
    filename=("./logs" + "/app_log.log"),
    format="%(asctime)s- %(levelname)s- %(message)s",
    level=logging.INFO,
    datefmt="%d-%b-%y %H:%M:%S",
)
# getLogger() with no name returns the root logger configured above.
logger = logging.getLogger()
data_service_provider_obj = TesseractOcrDataServiceProvider()

# Parser instance used by every save_tokens_as_json() call in the cells below.
ocr_parse_obj = ocr_parser.OcrParser(ocr_file_list=ocr_file_list,
                                     data_service_provider=data_service_provider_obj,
                                     logger=logger)

## 3. API - save_tokens_as_json()

In [None]:
# Values passed as `token_type_value` to save_tokens_as_json().
TOKEN_TYPE_WORD = 1
TOKEN_TYPE_LINE = 2
TOKEN_TYPE_PHRASE = 3

### 3.1 - Save `word` tokens to file

In [None]:
# Save the word tokens to a JSON file; `result` holds the call's return value.
output_file_path = "./data/word_token.json"
result = ocr_parse_obj.save_tokens_as_json(output_file_path, token_type_value=TOKEN_TYPE_WORD)

#### Output

In [None]:
# Show the value returned by save_tokens_as_json() as JSON indented by 4 spaces.
print(json.dumps(result, indent=4))

#### Visualize

In [None]:
# Read back the file written to `output_file_path` and print the text returned
# by DemoHelper.get_shortened_text (presumably truncated to max_line_count
# lines; confirm against DemoHelper).
file_content = DemoHelper.read_file(output_file_path)
print(DemoHelper.get_shortened_text(file_content, max_line_count=30))

### 3.2 - Save `phrase` tokens to file

In [None]:
# Save the phrase tokens to a JSON file; `result` holds the call's return value.
output_file_path = "./data/phrase_token.json"
result = ocr_parse_obj.save_tokens_as_json(output_file_path, token_type_value=TOKEN_TYPE_PHRASE)

#### Output

In [None]:
# Show the value returned by save_tokens_as_json() as JSON indented by 4 spaces.
print(json.dumps(result, indent=4))

#### Visualize

In [None]:
# Read back the file written to `output_file_path` and print the text returned
# by DemoHelper.get_shortened_text (presumably truncated to max_line_count
# lines; confirm against DemoHelper).
file_content = DemoHelper.read_file(output_file_path)
print(DemoHelper.get_shortened_text(file_content, max_line_count=35))

### 3.3 - Save `line` tokens to file

In [None]:
# Save the line tokens to a JSON file; `result` holds the call's return value.
output_file_path = "./data/line_token.json"
result = ocr_parse_obj.save_tokens_as_json(output_file_path, token_type_value=TOKEN_TYPE_LINE)

#### Output

In [None]:
# Show the value returned by save_tokens_as_json() as JSON indented by 4 spaces.
print(json.dumps(result, indent=4))

#### Visualize

In [None]:
# Read back the file written to `output_file_path` and print the text returned
# by DemoHelper.get_shortened_text (presumably truncated to max_line_count
# lines; confirm against DemoHelper).
file_content = DemoHelper.read_file(output_file_path)
print(DemoHelper.get_shortened_text(file_content, max_line_count=35))