In [None]:
# ===============================================================================================================#
# Copyright 2022 Infosys Ltd.                                                                          #
# Use of this source code is governed by Apache License Version 2.0 that can be found in the LICENSE file or at  #
# http://www.apache.org/licenses/                                                                                #
# ===============================================================================================================#

## 1. Preparation

In [None]:
import json
import time
from _internal_utils.demo_helper import DemoHelper
import pandas as pd
from IPython.display import Image

#### <span style='color:blue'>Validate external tools</span>

In [None]:
# Verify that the Tesseract command-line tool can be invoked.
tesseract_version_cmd = ['tesseract', '--version']
DemoHelper.check_command(tesseract_version_cmd)

#### <span style='color:blue'>Enter image paths</span>

In [None]:
# Image file that is passed to the extractor in the examples below.
IMAGE_PATH = './data/sample_01.jpg'
# Folder handed to the extractor as its `temp_folderpath`.
TEMP_FOLDER_PATH = './data/temp'

#### Visualize

In [None]:
# Show each input image inside its own tab of a toolbar widget.
IMAGE_PATHS = [IMAGE_PATH]
tab_widget = DemoHelper.create_tab_toolbar(IMAGE_PATHS)
display(tab_widget)
for idx, image_path in enumerate(IMAGE_PATHS):
    tab_output = tab_widget.children[idx]
    # Anything displayed inside this context is routed to the matching tab.
    with tab_output:
        preview = Image(filename=image_path, width=1000, height=50)
        display(preview)

## 2. Initialization

#### <span style='color:blue'>Create new instance</span>

In [None]:
import logging
import os
import infy_table_extractor as ite

# Make sure the working folders exist. `exist_ok=True` keeps the cell safe to
# re-run and avoids the check-then-create race of a separate `os.path.exists` test.
os.makedirs(TEMP_FOLDER_PATH, exist_ok=True)
os.makedirs('./logs', exist_ok=True)

# Root logger configured at CRITICAL level so the notebook output stays quiet.
logging.basicConfig(level=logging.CRITICAL, format='%(asctime)s.%(msecs)03d %(levelname)s'
                                ' ainauto-tabula - %(module)s - %(funcName)s: %(message)s')
logger = logging.getLogger()

# A single Tesseract-backed provider is used for both table detection and
# cell extraction.
provider = ite.bordered_table_extractor.providers.TesseractDataServiceProvider(
    None, logger=logger, log_level=logging.CRITICAL)

# Extractor instance used by all the examples that follow.
obj = ite.bordered_table_extractor.BorderedTableExtractor(
    table_detection_provider=provider,
    cell_extraction_provider=provider,
    temp_folderpath=TEMP_FOLDER_PATH,
    logger=logger)

#### <span style='color:blue'>Enter Within Bounding Box Coordinates</span>

In [None]:
# Bounding box passed to the extractor as `within_bbox`; format is [x, y, w, h].
my_within_bbox = [
    40,    # x
    200,   # y
    2000,  # w
    400,   # h
]

#### Visualize

In [None]:
# Read the image, outline the within-bbox in blue, reduce the image around
# that box and display the result.
img = DemoHelper.reduce_image_based_on_context(
    DemoHelper.draw_bboxes_on_image(
        DemoHelper.read_image(IMAGE_PATH),
        [my_within_bbox],
        border_thickness=4,
        border_color=DemoHelper.Constants.COLOR_BLUE,
    ),
    [my_within_bbox],
)

DemoHelper.show_image(img)

## 3. API - extract_all_fields()

### 3.1 Example - To extract all cells using `RGB_LINE_DETECT` method and auto-detect header names
The default line detection method is `RGB_LINE_DETECT`.

Custom names for the header row can be provided through the `values` key; otherwise the columns are
automatically named 'col_1', 'col_2', ...

In [None]:
# Extraction settings for this example: header options under 'col_header'
# (with an empty 'values' list) and the RGB_LINE_DETECT line detection method.
config_param_dict = {
    'col_header': {
        'use_first_row': True,
        'values': []
    },
    'line_detection_method': [ite.interface.LineDetectionMethod.RGB_LINE_DETECT]
}

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
result_1 = obj.extract_all_fields(
    image_file_path=IMAGE_PATH,
    within_bbox=my_within_bbox,
    config_param_dict=config_param_dict)
elapsed_secs = time.perf_counter() - start_time
print("--- %s sec(s) ---" % round(elapsed_secs, 2))

#### Output

In [None]:
# To inspect the raw response instead, run: print(json.dumps(result_1, indent=4))
raw_table = pd.json_normalize(result_1['fields'][0]['table_value'])

# Postprocessing: strip newline characters from the column names and the values.
clean_columns = raw_table.columns.str.replace(r'\n','', regex=True)
df = raw_table.set_axis(clean_columns, axis=1).replace(r'\n','', regex=True)

print('No. of records found =', len(df))
# Shift the index so that displayed row numbering starts at 1.
df.index += 1
df

### 3.2 Example - To extract all cells using `OPENCV_LINE_DETECT` method

In [None]:
# Extraction settings for this example: header options under 'col_header'
# and the OPENCV_LINE_DETECT line detection method.
config_param_dict = {
    'col_header': {
        'use_first_row': True,
    },
    'line_detection_method': [ite.interface.LineDetectionMethod.OPENCV_LINE_DETECT]
}

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
result_2 = obj.extract_all_fields(
    image_file_path=IMAGE_PATH,
    within_bbox=my_within_bbox,
    config_param_dict=config_param_dict)
elapsed_secs = time.perf_counter() - start_time
print("--- %s sec(s) ---" % round(elapsed_secs, 2))

***Note: The tables extracted using the two methods, `RGB_LINE_DETECT` and `OPENCV_LINE_DETECT`, can differ depending on the type of image. Hence, choosing the right line detection method plays a major role here.***

#### Output

In [None]:
# To inspect the raw response instead, run: print(json.dumps(result_2, indent=4))
raw_table = pd.json_normalize(result_2['fields'][0]['table_value'])

# Postprocessing: strip newline characters from the column names and the values.
clean_columns = raw_table.columns.str.replace(r'\n','', regex=True)
df = raw_table.set_axis(clean_columns, axis=1).replace(r'\n','', regex=True)

print('No. of records found =', len(df))
# Shift the index so that displayed row numbering starts at 1.
df.index += 1
df

### 3.3 Example - To extract custom cells

In [None]:
# Two 'custom_cells' selections, each pairing a 'rows' list with a 'columns' list:
# rows "2:4" with column 1, and row "5" with column 2.
config_param_dict = {
    "custom_cells": [
        {
            "rows": ["2:4"],
            "columns": [1]
        },
        {
            "rows": ["5"],
            "columns": [2]
        }
    ]
}

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
result_3 = obj.extract_all_fields(
    image_file_path=IMAGE_PATH,
    within_bbox=my_within_bbox,
    config_param_dict=config_param_dict)
elapsed_secs = time.perf_counter() - start_time
print("--- %s sec(s) ---" % round(elapsed_secs, 2))

#### Output

In [None]:
# To inspect the raw response instead, run: print(json.dumps(result_3, indent=4))
raw_table = pd.json_normalize(result_3['fields'][0]['table_value'])

# Postprocessing: strip newline characters from the column names and the values.
clean_columns = raw_table.columns.str.replace(r'\n','', regex=True)
df = raw_table.set_axis(clean_columns, axis=1).replace(r'\n','', regex=True)

print('No. of records found =', len(df))
# Shift the index so that displayed row numbering starts at 1.
df.index += 1
df