### Imports

In [1]:
import os
import cv2
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd

from skimage import io, color, filters, morphology
from skimage.util import invert
from skimage.filters import threshold_otsu
from skimage.morphology import convex_hull_image
from skimage.feature import canny
from scipy import ndimage as ndi

import warnings
warnings.filterwarnings("ignore")

### Functions

In [4]:
def pure_binarize(image):
    """Threshold an image with Otsu's method, without any colour conversion.

    Args:
        image: Array handed directly to ``threshold_otsu``.

    Returns:
        The element-wise result of ``image > otsu_threshold``.
    """
    return image > threshold_otsu(image)


def binarize_image(image):
    """Convert an image to grayscale and threshold it with Otsu's method.

    Args:
        image: Colour image accepted by ``color.rgb2gray``.

    Returns:
        The element-wise result of ``grayscale > otsu_threshold``.
    """
    grayscale = color.rgb2gray(image)
    return grayscale > threshold_otsu(grayscale)


def get_bboxes(img, file, input_dir, output_dir, output_txt_dir, width_threshold, height_threshold):
    """Turn a mask image into bounding boxes and save an overlay plus a text file.

    The mask is inverted, Otsu-thresholded and contoured. Every contour whose
    area exceeds 30 yields one box whose width and height are divided by
    ``width_threshold`` and ``height_threshold``. The boxes are drawn on the
    page re-read from ``input_dir + file`` and saved as
    ``output_dir + <name> + '_pred.jpg'``; the rows
    ``label confidence X Y W H`` (label ``'text'``, confidence ``1``) are
    written space separated, without header, to
    ``output_txt_dir + <name> + '.txt'``.

    Args:
        img: Image that ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)`` accepts once
            it has been passed through ``invert``.
        file: File name of the source page inside ``input_dir``.
        input_dir: Directory prefix of the source page (concatenated as is).
        output_dir: Directory prefix for the overlay image.
        output_txt_dir: Directory prefix for the text file.
        width_threshold: Divisor applied to every box width.
        height_threshold: Divisor applied to every box height.

    Raises:
        OSError: If the source page cannot be read or the overlay cannot be
            written.
    """
    gray = cv2.cvtColor(invert(img), cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU OpenCV picks the threshold itself; the 100 is only a placeholder.
    _, binary = cv2.threshold(gray, 100, 255, cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(~binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    bboxes = []
    for contour in contours:
        if cv2.contourArea(contour) <= 30:
            continue
        x, y, w, h = cv2.boundingRect(contour)
        # Scale with the arguments, not with the notebook-level
        # WIDTH_THRESHOLD / HEIGHT_THRESHOLD globals, so the function does not
        # depend on which cell was run last.
        w = int(w * (1 / width_threshold))
        h = int(h * (1 / height_threshold))
        bboxes.append(['text', 1, x, y, w, h])

    source_path = input_dir + file
    overlay = cv2.imread(source_path)
    if overlay is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"could not read image: {source_path}")
    for _, _, x, y, w, h in bboxes:
        cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 255, 0), 1)

    df = pd.DataFrame(bboxes, columns=['label', 'confidence', 'X', 'Y', 'W', 'H'])
    # splitext copes with extensions that are not exactly four characters long.
    name = os.path.splitext(file)[0]
    overlay_path = output_dir + name + '_pred.jpg'
    # The overlay was loaded by OpenCV (BGR), so it is written by OpenCV as
    # well; saving it through skimage would swap the red and blue channels.
    if not cv2.imwrite(overlay_path, overlay):
        raise OSError(f"could not write image: {overlay_path}")
    df.to_csv(output_txt_dir + name + '.txt', sep=' ', index=False, header=False)


def get_boxes(image, width_threshold, height_threshold, thickness=4, type="double"):
    """Build a 0/1 mask in which the bounding boxes of the detected contours are 0.

    The image is Otsu-thresholded and contoured. In ``"double"`` mode the
    contours are first drawn onto a copy of the image with the given
    ``thickness`` and the contours of that drawing are used instead. Contours
    are kept when their area exceeds ``width * height / 100000``, their box is
    less than a quarter of the image height tall and less than half the image
    width wide. Each kept box is scaled by ``width_threshold`` /
    ``height_threshold`` and filled into the mask.

    Args:
        image: Image accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
            It is not modified.
        width_threshold: Factor applied to every box width before drawing.
        height_threshold: Factor applied to every box height before drawing.
        thickness: Line thickness used when drawing contours in ``"double"`` mode.
        type: ``"double"`` runs the second contour pass; any other value skips it.

    Returns:
        2-D integer array with the image's height and width: 0 inside the
        drawn boxes, 1 elsewhere.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU OpenCV picks the threshold itself; the 100 is only a placeholder.
    _, binary = cv2.threshold(gray, 100, 255, cv2.THRESH_OTSU)
    inverted_binary = ~binary
    # ndarray.shape is (rows, cols), i.e. (height, width). Unpacking it the
    # other way round made the size filter below compare box height against
    # image width and box width against image height.
    height, width = inverted_binary.shape
    contours, _ = cv2.findContours(inverted_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    if type == "double":
        # Draw on a copy: cv2.drawContours paints into the array it is given,
        # which would otherwise alter the caller's image.
        outlined = cv2.drawContours(image.copy(), contours, -1, (0, 0, 255), thickness)
        gray_outlined = cv2.cvtColor(outlined, cv2.COLOR_BGR2GRAY)
        _, binary_outlined = cv2.threshold(gray_outlined, 100, 255, cv2.THRESH_OTSU)
        contours, _ = cv2.findContours(~binary_outlined, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    min_area = (width * height) / 100000
    bboxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # Multilingual setting. For English DocBank the author's note was to
        # use only the `w < width / 2` test.
        if cv2.contourArea(contour) > min_area and h < height / 4 and w < width / 2:
            bboxes.append((x, y, w, h))

    if not bboxes:
        # Nothing to draw: return the all-background mask directly rather than
        # running Otsu on a constant image, which has no two classes to split.
        return np.ones(image.shape[:2], dtype=int)

    canvas = np.zeros(image.shape, dtype=np.uint8)
    for x, y, w, h in bboxes:
        far_corner = (x + int(w * width_threshold), y + int(h * height_threshold))
        cv2.rectangle(canvas, (x, y), far_corner, (255, 255, 255), -1)
    # Invert so boxes are dark, binarize, then turn the boolean mask into integers.
    return binarize_image(~canvas) * 1


def get_image_edges(image, width_threshold, height_threshold, thickness=4, value="double"):
    """Box mask derived from the Sobel edges of the binarized, inverted image.

    The edge mask is written to ``temp.jpg`` in the working directory and read
    back with ``cv2.imread`` before it is handed to ``get_boxes``.

    Args:
        image: Colour image accepted by ``binarize_image``.
        width_threshold: Forwarded to ``get_boxes``.
        height_threshold: Forwarded to ``get_boxes``.
        thickness: Forwarded to ``get_boxes`` as ``thickness``.
        value: Forwarded to ``get_boxes`` as ``type``.

    Returns:
        Whatever ``get_boxes`` returns for the reloaded edge image.
    """
    inverted_mask = invert(binarize_image(image))
    edge_mask = pure_binarize(filters.sobel(inverted_mask))
    io.imsave("temp.jpg", edge_mask)
    edge_image = cv2.imread("temp.jpg")
    return get_boxes(edge_image, width_threshold, height_threshold, thickness=thickness, type=value)


def get_pillow_image_edges(image, width_threshold, height_threshold):
    """Box mask derived from Pillow's FIND_EDGES filter.

    The image is converted to mode ``"L"``, edge-filtered, Otsu-binarized,
    written to ``temp.jpg`` in the working directory, read back with
    ``cv2.imread`` and passed to ``get_boxes`` in ``"single"`` mode.

    Args:
        image: Object providing ``convert`` and ``filter`` the way a Pillow
            image does.
        width_threshold: Forwarded to ``get_boxes``.
        height_threshold: Forwarded to ``get_boxes``.

    Returns:
        Whatever ``get_boxes`` returns for the reloaded edge image.
    """
    # Imported here because the notebook's import cell does not bring
    # ImageFilter into scope.
    from PIL import ImageFilter

    gray = image.convert("L")    # Converting Image to Gray
    edges = np.array(gray.filter(ImageFilter.FIND_EDGES))
    edges = pure_binarize(edges)
    io.imsave("temp.jpg", edges)
    edge_image = cv2.imread("temp.jpg")
    # "single" has to be passed as `type`; given positionally it lands in
    # `thickness` and the mode silently stays "double".
    return get_boxes(edge_image, width_threshold, height_threshold, type="single")


def get_segmentation_labels(image, width_threshold, height_threshold, thickness=4, value="double"):
    """Box mask derived from hole-filled Canny edges of the binarized image.

    The filled mask is written to ``temp.jpg`` in the working directory and
    read back with ``cv2.imread`` before it is handed to ``get_boxes``.

    Args:
        image: Colour image accepted by ``binarize_image``.
        width_threshold: Forwarded to ``get_boxes``.
        height_threshold: Forwarded to ``get_boxes``.
        thickness: Forwarded to ``get_boxes`` as ``thickness``.
        value: Forwarded to ``get_boxes`` as ``type``.

    Returns:
        Whatever ``get_boxes`` returns for the reloaded mask image.
    """
    outline = canny(binarize_image(image))
    filled = pure_binarize(ndi.binary_fill_holes(outline))
    io.imsave("temp.jpg", filled)
    reloaded = cv2.imread("temp.jpg")
    return get_boxes(reloaded, width_threshold, height_threshold, thickness=thickness, type=value)


def get_contour_labels(image, width_threshold, height_threshold, thickness=4, value="single"):
    """Run ``get_boxes`` directly on ``image``; ``value`` is passed on as its ``type``.

    Args:
        image: Image forwarded unchanged to ``get_boxes``.
        width_threshold: Forwarded to ``get_boxes``.
        height_threshold: Forwarded to ``get_boxes``.
        thickness: Forwarded to ``get_boxes`` as ``thickness``.
        value: Forwarded to ``get_boxes`` as ``type``.

    Returns:
        Whatever ``get_boxes`` returns.
    """
    return get_boxes(
        image,
        width_threshold,
        height_threshold,
        thickness=thickness,
        type=value,
    )

### Contour Based LF

In [20]:
# Contour-based labelling function: build a box mask straight from the page
# contours, then turn that mask into per-page box files and overlay images.
input_img_dir = './../../processed/docbank_100/images/'
output_txt_dir = './../../results/LF_latest/contour/txt/'
output_img_dir = './../../results/LF_latest/contour/images/'
output_pix_dir = './../../results/LFs/contour/pixel/'

# Create the output folders up front so a fresh checkout does not fail on the first save.
os.makedirs(output_txt_dir, exist_ok=True)
os.makedirs(output_img_dir, exist_ok=True)

# Sorted so the processing order is the same on every run.
dir_list = sorted(os.listdir(input_img_dir))

WIDTH_THRESHOLD = 1
HEIGHT_THRESHOLD = 1

for file in tqdm(dir_list):
    ori_img = cv2.imread(input_img_dir + file)
    if ori_img is None:
        # cv2.imread returns None for anything it cannot decode
        # (sub-folders such as .ipynb_checkpoints, stray non-image files).
        tqdm.write(f"skipping unreadable entry: {file}")
        continue
    image = get_contour_labels(ori_img, WIDTH_THRESHOLD, HEIGHT_THRESHOLD, thickness=4, value="double")
    # The mask goes through temp.jpg so get_bboxes receives it in the form cv2.imread produces.
    io.imsave("temp.jpg", image)
    image = cv2.imread("temp.jpg")
    get_bboxes(image, file, input_img_dir, output_img_dir, output_txt_dir, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)









100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:41<00:00,  2.43it/s]


### Edges Based LF

In [13]:
# Edge-based labelling function: build a box mask from Sobel edges, then turn
# that mask into per-page box files and overlay images.
input_img_dir = './../../processed/docbank_100/images/'
output_txt_dir = './../../results/LF_latest/edges/txt/'
output_img_dir = './../../results/LF_latest/edges/images/'

# Create the output folders up front so a fresh checkout does not fail on the first save.
os.makedirs(output_txt_dir, exist_ok=True)
os.makedirs(output_img_dir, exist_ok=True)

# Sorted so the processing order is the same on every run.
dir_list = sorted(os.listdir(input_img_dir))

WIDTH_THRESHOLD = 1
HEIGHT_THRESHOLD = 1

# Set to one file name to process only that page while debugging; None
# processes every file. The cell previously carried an `if` on
# '275.tar_1809.08252.gz_PapierFluctuations3_0_pro.jpg' with no indented body,
# which made the whole cell a syntax error.
DEBUG_ONLY_FILE = None

for file in tqdm(dir_list):
    if DEBUG_ONLY_FILE is not None and file != DEBUG_ONLY_FILE:
        continue
    ori_img = cv2.imread(input_img_dir + file)
    if ori_img is None:
        # cv2.imread returns None for anything it cannot decode
        # (sub-folders such as .ipynb_checkpoints, stray non-image files).
        tqdm.write(f"skipping unreadable entry: {file}")
        continue
    image = get_image_edges(ori_img, WIDTH_THRESHOLD, HEIGHT_THRESHOLD, thickness=4, value="single")
    # The mask goes through temp.jpg so get_bboxes receives it in the form cv2.imread produces.
    io.imsave("temp.jpg", image)
    image = cv2.imread("temp.jpg")
    get_bboxes(image, file, input_img_dir, output_img_dir, output_txt_dir, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)









100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:12<00:00,  1.38it/s]


### Segment Based LF

In [5]:
# Segmentation-based labelling function: build a box mask from hole-filled
# Canny edges, then turn that mask into per-page box files and overlay images.
input_img_dir = './../../processed/docbank_100/images/'
output_txt_dir = './../../results/LF_latest/segment/txt/'
output_img_dir = './../../results/LF_latest/segment/images/'

# Create the output folders up front so a fresh checkout does not fail on the first save.
os.makedirs(output_txt_dir, exist_ok=True)
os.makedirs(output_img_dir, exist_ok=True)

# Sorted so the processing order is the same on every run.
dir_list = sorted(os.listdir(input_img_dir))

WIDTH_THRESHOLD = 1
HEIGHT_THRESHOLD = 1

for file in tqdm(dir_list):
    ori_img = cv2.imread(input_img_dir + file)
    if ori_img is None:
        # cv2.imread returns None for anything it cannot decode
        # (sub-folders such as .ipynb_checkpoints, stray non-image files).
        tqdm.write(f"skipping unreadable entry: {file}")
        continue
    image = get_segmentation_labels(ori_img, WIDTH_THRESHOLD, HEIGHT_THRESHOLD, thickness=4, value="single")
    # The mask goes through temp.jpg so get_bboxes receives it in the form cv2.imread produces.
    io.imsave("temp.jpg", image)
    image = cv2.imread("temp.jpg")
    get_bboxes(image, file, input_img_dir, output_img_dir, output_txt_dir, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)









100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:51<00:00,  1.12s/it]


### DocTR LF

In [None]:
# NOTE(review): this cell has no execution count, and the next cell repeats
# these imports and the predictor construction verbatim.
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# Build the docTR OCR predictor with pretrained=True.
model = ocr_predictor(pretrained=True)

In [3]:
# docTR labelling function: run OCR on every page, draw the word boxes, and
# save per-page box files plus the raw results.
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)

# Dataset selection. The FUNSD pair is the active one; the DocBank pair used
# to be assigned first and then overwritten, so it is kept as a commented
# alternative instead.
# input_dir   = './../../processed/docbank_100/images/'
# output_dir  = './../../data/00_docbank_100/doctr/'
input_dir   = './../../funsd/testing_data/images/'
output_dir  = './../../data/06_funsd/doctr/'

# Create the output folders up front so a fresh checkout does not fail on the first save.
os.makedirs(output_dir + 'images/', exist_ok=True)
os.makedirs(output_dir + 'txt/', exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

# Pass 1: run OCR on every file and keep the raw docTR result per file name.
for image in tqdm(dir_list):
    doc = DocumentFile.from_images(input_dir+image)
    result = model(doc)
    results[image] = result
    images.append(image)

# To reuse an earlier run instead of repeating pass 1, `results` can be loaded
# from output_dir + 'results.pkl'. Only unpickle files you produced yourself:
# pickle.load can execute arbitrary code.

# Pass 2: convert the word geometries to pixel boxes and draw them on the page.
dimensions = {}
total = {}
for image,result in tqdm(results.items()):
    page = result.pages[0]
    # Reversed so that it pairs element by element with each geometry point
    # (presumably (height, width) -> (width, height) against (x, y); confirm
    # against the docTR documentation).
    dim = tuple(reversed(page.dimensions))
    predictions = []
    img = io.imread(os.path.join(input_dir,image))
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                geo = word.geometry
                # Scale the first two geometry points by the page size.
                top_left = [int(rel * size) for rel, size in zip(geo[0], dim)]
                bottom_right = [int(rel * size) for rel, size in zip(geo[1], dim)]
                predictions.append([
                    "text",
                    word.confidence,
                    top_left[0],
                    top_left[1],
                    bottom_right[0] - top_left[0],
                    bottom_right[1] - top_left[1],
                    word.value,
                ])
                cv2.rectangle(img, (top_left[0], top_left[1]), (bottom_right[0], bottom_right[1]), (0, 255, 0))
    io.imsave(output_dir + 'images/' + image,img)
    total[image] = predictions


# Pass 3: one space-separated text file per page, without header.
for image,result in tqdm(total.items()):
    # splitext copes with extensions that are not exactly four characters long.
    name = os.path.splitext(image)[0]
    df = pd.DataFrame(result, columns = ['label', 'confidence', 'X', 'Y', 'W', 'H', 'prediction'])
    df.to_csv(output_dir + 'txt/' + name + '.txt', sep=' ',index=False, header=False)

with open(output_dir+'results.pkl', 'wb') as outp:  # Overwrites any existing file.
    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:42<00:00,  2.05s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 13.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 791.63it/s]


### Tesseract LF

In [5]:
# Tesseract labelling function: run OCR on every page, draw the level-5 boxes,
# and save per-page box files plus the raw results.
from pytesseract import Output, image_to_data

input_dir   = './../../processed/docbank_100/images/'
output_dir  = './../../data/00_docbank_100/tesseract/'

# Create the output folders up front so a fresh checkout does not fail on the first save.
os.makedirs(output_dir + 'images/', exist_ok=True)
os.makedirs(output_dir + 'txt/', exist_ok=True)

dir_list = os.listdir(input_dir)

dimensions = {}
results = {}
total = {}

for image in tqdm(dir_list):
    img = cv2.imread(input_dir + image)
    if img is None:
        # cv2.imread returns None for anything it cannot decode
        # (sub-folders such as .ipynb_checkpoints, stray non-image files).
        tqdm.write(f"skipping unreadable entry: {image}")
        continue
    d = image_to_data(img, output_type=Output.DICT)
    predictions = []
    for i in range(len(d['level'])):
        # Only level-5 rows are kept (presumably Tesseract's word level; confirm
        # against the pytesseract documentation).
        if d['level'][i] != 5:
            continue
        x, y, w, h = (int(d[key][i]) for key in ('left', 'top', 'width', 'height'))
        # Rows anchored at the image origin are dropped, as before.
        if x == 0 and y == 0:
            continue
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255))
        predictions.append(['text', d['conf'][i], x, y, w, h, d['text'][i]])
    total[image] = predictions
    results[image] = d
    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(output_dir + 'images/' + image, img):
        tqdm.write(f"could not write overlay for: {image}")

for image,result in total.items():
    # splitext copes with extensions that are not exactly four characters long.
    name = os.path.splitext(image)[0]
    df = pd.DataFrame(result, columns = ['label', 'confidence', 'X', 'Y', 'W', 'H', 'prediction'])
    df.to_csv(output_dir + 'txt/' + name + '.txt', sep=' ',index=False, header=False)

with open(output_dir+'results.pkl', 'wb') as outp:  # Overwrites any existing file.
    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:47<00:00,  1.05it/s]
