### Imports

In [2]:
import os
import cv2
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd

from skimage import io, color, filters, morphology
from skimage.util import invert
from skimage.filters import threshold_otsu
from skimage.morphology import convex_hull_image
from skimage.feature import canny
from scipy import ndimage as ndi

import warnings
warnings.filterwarnings("ignore")

### Functions

In [34]:
def pure_binarize(image):
    """Threshold ``image`` at its Otsu level.

    No colour conversion is applied first; the array is compared directly
    against ``threshold_otsu(image)``.

    Returns:
        The element-wise result of ``image > threshold``.
    """
    return image > threshold_otsu(image)


def binarize_image(image):
    """Convert ``image`` to grayscale with ``color.rgb2gray`` and threshold it at its Otsu level.

    Returns:
        The element-wise result of ``grayscale > threshold``.
    """
    grayscale = color.rgb2gray(image)
    otsu_level = threshold_otsu(grayscale)
    return grayscale > otsu_level


def get_bboxes(img, file, input_dir, output_dir, output_txt_dir, width_threshold, height_threshold):
    """Find contour bounding boxes in ``img`` and save them as an overlay image and a text table.

    The input is inverted, Otsu-thresholded and contoured. Those contours are
    painted (2 px, colour ``(0, 0, 255)``) onto a copy of the input, and the
    painted image is thresholded and contoured a second time. One box is
    recorded per second-pass contour; no area filter is applied.

    Args:
        img: 3-channel image array accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
            It is not modified.
        file: File name (not a full path) of the source page inside ``input_dir``;
            the overlay image is written under the same name.
        input_dir: Directory the source page is re-read from for the overlay.
        output_dir: Directory that receives the page with the boxes drawn on it.
        output_txt_dir: Directory that receives ``<file stem>.txt``, a
            space-separated table with the header ``label confidence x0 y0 w h``.
        width_threshold: Each box width is multiplied by ``1 / width_threshold``.
            Must be non-zero.
        height_threshold: Each box height is multiplied by ``1 / height_threshold``.
            Must be non-zero.

    Returns:
        None. The results are the two files written to disk.

    Raises:
        FileNotFoundError: If the source page cannot be loaded from ``input_dir``.
    """
    # Paint the first-pass contours on a private copy of the (non-inverted)
    # input. This keeps the function independent of any notebook-level variable
    # and leaves the caller's array untouched.
    canvas = img.copy()

    gray = cv2.cvtColor(invert(img), cv2.COLOR_BGR2GRAY)

    # Convert the grayscale image to binary.
    _, binary = cv2.threshold(gray, 100, 255, cv2.THRESH_OTSU)

    # Contours are traced around white blobs, so flip the binary map
    # (i.e. 255 - pixel value) before searching.
    inverted_binary = ~binary

    # [-2] selects the contour list for both the (contours, hierarchy) and the
    # older (image, contours, hierarchy) return forms of findContours.
    contours = cv2.findContours(inverted_binary,
                                cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Intermediate image with the first-pass contours painted on it.
    with_contours_int = cv2.drawContours(canvas, contours, -1, (0, 0, 255), 2)

    # Binarize the painted image and search for contours a second time; the
    # 2 px strokes are what the second pass picks up.
    gray_contour = cv2.cvtColor(with_contours_int, cv2.COLOR_BGR2GRAY)
    _, binary_contour = cv2.threshold(gray_contour, 100, 255, cv2.THRESH_OTSU)
    contours = cv2.findContours(~binary_contour,
                                cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    # One row per contour: label, confidence, x0, y0, w, h.
    bboxes = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        w = int(w * (1 / width_threshold))
        h = int(h * (1 / height_threshold))
        bboxes.append(['text', 1, x, y, w, h])

    source_path = os.path.join(input_dir, file)
    final_img = cv2.imread(source_path)
    if final_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("could not read image: " + source_path)

    for _, _, x, y, w, h in bboxes:
        cv2.rectangle(final_img, (x, y), (x + w, y + h), (0, 255, 128), 1)

    df = pd.DataFrame(bboxes, columns=['label', 'confidence', 'x0', 'y0', 'w', 'h'])
    name = os.path.splitext(file)[0]

    # cv2 works in BGR while io.imsave expects RGB; convert so the saved page
    # does not come out with red and blue swapped.
    io.imsave(os.path.join(output_dir, file), cv2.cvtColor(final_img, cv2.COLOR_BGR2RGB))
    df.to_csv(os.path.join(output_txt_dir, name + '.txt'), sep=' ', index=False)


def get_boxes(image, width_threshold, height_threshold, type="single"):
    """Build a 0/1 mask of the contour bounding boxes found in ``image``.

    The image is Otsu-thresholded, flipped and contoured. With
    ``type="double"`` the contours are painted (2 px) onto a copy of the image
    and the painted image is thresholded and contoured a second time. Contours
    with an area above 20 whose bounding box is shorter than 1/30 of the image
    height are kept, and their (scaled) bounding boxes are filled into the mask.

    Args:
        image: 3-channel image array accepted by
            ``cv2.cvtColor(..., COLOR_BGR2GRAY)``. It is not modified.
        width_threshold: Factor applied to each kept box width before filling.
        height_threshold: Factor applied to each kept box height before filling.
        type: ``"double"`` enables the second contour pass; any other value
            runs a single pass.

    Returns:
        A 2-D integer array with the image's height and width: 0 inside the
        filled boxes and 1 everywhere else. When no contour is kept the mask
        is all ones.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Convert the grayscale image to binary.
    _, binary = cv2.threshold(gray, 100, 255, cv2.THRESH_OTSU)

    # Contours are traced around white blobs, so flip the binary map
    # (i.e. 255 - pixel value) before searching.
    inverted_binary = ~binary

    # NumPy shapes are (rows, columns), so the first entry is the image height.
    image_height = inverted_binary.shape[0]

    # [-2] selects the contour list for both the (contours, hierarchy) and the
    # older (image, contours, hierarchy) return forms of findContours.
    contours = cv2.findContours(inverted_binary,
                                cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    if type == "double":
        # Paint the first-pass contours on a copy so the caller's array is
        # left untouched, then binarize the painted image and search again.
        with_contours_int = cv2.drawContours(image.copy(), contours, -1, (0, 0, 255), 2)
        gray_contour = cv2.cvtColor(with_contours_int, cv2.COLOR_BGR2GRAY)
        _, binary_contour = cv2.threshold(gray_contour, 100, 255, cv2.THRESH_OTSU)
        contours = cv2.findContours(~binary_contour,
                                    cv2.RETR_TREE,
                                    cv2.CHAIN_APPROX_SIMPLE)[-2]

    max_box_height = image_height / 30

    # Fill every kept box into a single-channel mask (255 = covered).
    covered = np.zeros(image.shape[:2], dtype=np.uint8)
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Keep contours that are large enough in area but not taller than a
        # thirtieth of the page.
        if cv2.contourArea(c) > 20 and h < max_box_height:
            scaled_w = int(w * width_threshold)
            scaled_h = int(h * height_threshold)
            cv2.rectangle(covered, (x, y), (x + scaled_w, y + scaled_h), 255, -1)

    # 1 where no box covers the pixel, 0 inside boxes. Computing this directly
    # avoids thresholding a two-level image, which is degenerate when the mask
    # is uniform (no box kept).
    return (covered == 0) * 1


def get_image_edges(image, width_threshold, height_threshold):
    """Run the double-pass box detector on a Sobel edge map of ``image``.

    The image is binarized with ``binarize_image``, inverted, passed through
    ``filters.sobel`` and binarized again with ``pure_binarize``. The edge map
    is written to ``temp.jpg`` in the working directory and read back with
    ``cv2.imread`` before being handed to ``get_boxes``.

    Returns:
        Whatever ``get_boxes(..., "double")`` returns for the reloaded edge map.
    """
    inverted_mask = invert(binarize_image(image))
    sobel_edges = pure_binarize(filters.sobel(inverted_mask))

    # Round-trip through a file so get_boxes receives the array cv2.imread produces.
    scratch_path = "temp.jpg"
    io.imsave(scratch_path, sobel_edges)
    reloaded = cv2.imread(scratch_path)

    return get_boxes(reloaded, width_threshold, height_threshold, type="double")


def get_pillow_image_edges(image, width_threshold, height_threshold):
    """Run the single-pass box detector on a Pillow ``FIND_EDGES`` map of ``image``.

    Args:
        image: Pillow image; it must provide ``convert`` and ``filter``.
        width_threshold: Forwarded to ``get_boxes``.
        height_threshold: Forwarded to ``get_boxes``.

    Returns:
        Whatever ``get_boxes(..., "single")`` returns for the reloaded edge map.
    """
    # ImageFilter is not among the notebook's top-level imports; importing it
    # here keeps the function from failing with a NameError.
    from PIL import ImageFilter

    image = image.convert("L")    # Converting Image to Gray
    edges = image.filter(ImageFilter.FIND_EDGES)
    edges = np.array(edges)
    edges = pure_binarize(edges)

    # Round-trip through a file so get_boxes receives the array cv2.imread produces.
    io.imsave("temp.jpg", edges)
    image = cv2.imread("temp.jpg")
    return get_boxes(image, width_threshold, height_threshold, "single")


def get_segmentation_labels(image, width_threshold, height_threshold):
    """Run the double-pass box detector on a hole-filled Canny edge map of ``image``.

    The image is binarized with ``binarize_image``, passed through ``canny``,
    filled with ``ndi.binary_fill_holes`` and binarized again with
    ``pure_binarize``. The result is written to ``temp.jpg`` in the working
    directory and read back with ``cv2.imread`` before being handed to
    ``get_boxes``.

    Returns:
        Whatever ``get_boxes(..., "double")`` returns for the reloaded map.
    """
    canny_edges = canny(binarize_image(image))
    filled_regions = pure_binarize(ndi.binary_fill_holes(canny_edges))

    # Round-trip through a file so get_boxes receives the array cv2.imread produces.
    scratch_path = "temp.jpg"
    io.imsave(scratch_path, filled_regions)
    reloaded = cv2.imread(scratch_path)

    return get_boxes(reloaded, width_threshold, height_threshold, type="double")


def get_contour_labels(image, width_threshold, height_threshold):
    """Run ``get_boxes`` on ``image`` in ``"double"`` mode and return its result."""
    mode = "double"
    return get_boxes(image, width_threshold, height_threshold, type=mode)

### Contour Based LF

In [36]:
input_img_dir = './../../processed/docbank_100/images/' 
output_txt_dir = './../../results/LFs/contour/txt/'
output_img_dir = './../../results/LFs/contour/images/'
output_pix_dir = './../../results/LFs/contour/pixel/'

# Make sure every output folder exists before the loop starts writing into it.
for out_dir in (output_txt_dir, output_img_dir, output_pix_dir):
    os.makedirs(out_dir, exist_ok=True)

# os.listdir gives no ordering guarantee; sort so reruns visit pages in the same order.
dir_list = sorted(os.listdir(input_img_dir))

WIDTH_THRESHOLD = 1
HEIGHT_THRESHOLD = 1

for file in tqdm(dir_list):
    ori_img = cv2.imread(input_img_dir + file)
    if ori_img is None:
        # cv2.imread returns None for entries it cannot decode
        # (sub-folders, hidden files, ...); skip them instead of crashing.
        continue

    # Box mask for the page, saved and then re-read as an image for get_bboxes.
    image = get_contour_labels(ori_img, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)
    io.imsave(output_pix_dir + file, image)
    image = cv2.imread(output_pix_dir + file)
    get_bboxes(image, file, input_img_dir, output_img_dir, output_txt_dir, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)





100%|█████████████████████████████████████████| 100/100 [01:00<00:00,  1.66it/s]


### Edges Based LF

In [23]:
input_img_dir = './../../processed/docbank_100/images/' 
output_txt_dir = './../../results/LFs/edges/txt/'
output_img_dir = './../../results/LFs/edges/images/'

# Make sure every output folder exists before the loop starts writing into it.
for out_dir in (output_txt_dir, output_img_dir):
    os.makedirs(out_dir, exist_ok=True)

# os.listdir gives no ordering guarantee; sort so reruns visit pages in the same order.
dir_list = sorted(os.listdir(input_img_dir))

WIDTH_THRESHOLD = 1
HEIGHT_THRESHOLD = 1

for file in tqdm(dir_list):
    ori_img = cv2.imread(input_img_dir + file)
    if ori_img is None:
        # cv2.imread returns None for entries it cannot decode
        # (sub-folders, hidden files, ...); skip them instead of crashing.
        continue

    # Box mask for the page, saved and then re-read as an image for get_bboxes.
    image = get_image_edges(ori_img, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)
    io.imsave("temp.jpg", image)
    image = cv2.imread("temp.jpg")
    get_bboxes(image, file, input_img_dir, output_img_dir, output_txt_dir, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)

  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)


  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)


  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)


  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave(output_dir + file, final_img)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edge

  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)


  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)


  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
  io.imsave("temp.jpg", edges)
  io.imsave("temp.jpg", image)
100%|█████████████████████████████████████████| 100/100 [01:12<00:00,  1.38it/s]


### Segment Based LF

In [14]:
input_img_dir = './../../processed/docbank_100/images/' 
output_txt_dir = './../../results/LFs/segment/txt/'
output_img_dir = './../../results/LFs/segment/images/'

# Make sure every output folder exists before the loop starts writing into it.
for out_dir in (output_txt_dir, output_img_dir):
    os.makedirs(out_dir, exist_ok=True)

# os.listdir gives no ordering guarantee; sort so reruns visit pages in the same order.
dir_list = sorted(os.listdir(input_img_dir))

WIDTH_THRESHOLD = 1
HEIGHT_THRESHOLD = 1

# This cell only processes one page; every other file in the folder is skipped.
TARGET_FILE = '171.tar_1510.07771.gz_manuscript_v1_5_pro.jpg'

for file in tqdm(dir_list):
    if file != TARGET_FILE:
        continue

    ori_img = cv2.imread(input_img_dir + file)
    if ori_img is None:
        # cv2.imread returns None when it cannot decode the file.
        continue

    # Box mask for the page, saved and then re-read as an image for get_bboxes.
    image = get_segmentation_labels(ori_img, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)
    io.imsave("temp.jpg", image)
    image = cv2.imread("temp.jpg")
    get_bboxes(image, file, input_img_dir, output_img_dir, output_txt_dir, WIDTH_THRESHOLD, HEIGHT_THRESHOLD)

  io.imsave("temp.jpg", image)


1700
2200


  io.imsave("temp.jpg", image)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 79.34it/s]


### DocTR LF

In [None]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)

input_dir   = './../../processed/docbank_100/images_resized/'
output_dir  = './../../results/LFs/doctr/' 

dir_list = os.listdir(input_dir)
results = {}
images = []

# Pass 1: run the OCR model on every file and keep the raw result per file name.
for image in tqdm(dir_list):
    doc = DocumentFile.from_images(input_dir+image)
    result = model(doc)
    results[image] = result
    images.append(image)

dimensions = {}
total = {}

# Pass 2: turn the first page of every result into pixel-space word boxes and
# draw them on the page image.
for image, result in results.items():
    page = result.pages[0]
    # The page dimensions are reversed before being zipped with each geometry point below.
    dim = tuple(reversed(page.dimensions))
    img = io.imread(os.path.join(input_dir, image))

    predictions = []
    page_words = (word
                  for block in page.blocks
                  for line in block.lines
                  for word in line.words)
    for word in page_words:
        geo = word.geometry
        # geo[0] and geo[1] are scaled by `dim` and used as opposite corners of the box.
        corner_a = [int(rel * size) for rel, size in zip(geo[0], dim)]
        corner_b = [int(rel * size) for rel, size in zip(geo[1], dim)]
        predictions.append([
            "text",
            word.confidence,
            corner_a[0],
            corner_a[1],
            corner_b[0] - corner_a[0],
            corner_b[1] - corner_a[1],
            word.value,
        ])
        cv2.rectangle(img, (corner_a[0], corner_a[1]), (corner_b[0], corner_b[1]), (0, 255, 0))

    io.imsave(output_dir + 'images_resized/' + image, img)
    total[image] = predictions

# Pass 3: one header-less, space-separated text file per page.
for image, rows in total.items():
    # Drops the last 8 characters of the file name.
    # NOTE(review): presumably a fixed file-name suffix; confirm against the data.
    name = image[:len(image) - 8]
    df = pd.DataFrame(rows, columns = ['label', 'confidence', 'X', 'Y', 'W', 'H', 'prediction'])
    df.to_csv(output_dir + 'txt_resized/' + name + '.txt', sep=' ',index=False, header=False)

# Keep the raw model results as well; this overwrites any existing file.
with open(output_dir+'results_resized.pkl', 'wb') as outp:
    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

 57%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 57/100 [05:37<03:37,  5.06s/it]

### Tesseract LF

In [6]:
from pytesseract import Output, image_to_data

input_dir   = './../../processed/docbank_100/images/'
output_dir  = './../../results/LFs/tesseract/'  
dir_list = os.listdir(input_dir)

dimensions = {}
results = {}
total = {}

# Pass 1: run Tesseract on every page, collect the level-5 entries and draw
# their boxes on the page image.
for image in tqdm(dir_list):
    img = cv2.imread(input_dir + image)
    d = image_to_data(img, output_type=Output.DICT)

    predictions = []
    for i, level in enumerate(d['level']):
        if level != 5:
            continue
        x, y, w, h = (int(d[key][i]) for key in ('left', 'top', 'width', 'height'))
        # Entries anchored at the page origin are neither drawn nor recorded.
        if x == 0 and y == 0:
            continue
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255))
        predictions.append(['text', d['conf'][i], x, y, w, h, d['text'][i]])

    total[image] = predictions
    results[image] = d
    cv2.imwrite(output_dir + 'images/' + image, img)

# Pass 2: one header-less, space-separated text file per page.
for image, rows in total.items():
    # Drops the last 12 characters of the file name.
    # NOTE(review): presumably a fixed file-name suffix; confirm against the data.
    name = image[:len(image) - 12]
    df = pd.DataFrame(rows, columns = ['label', 'confidence', 'X', 'Y', 'W', 'H', 'prediction'])
    df.to_csv(output_dir + 'txt/' + name + '.txt', sep=' ',index=False, header=False)

# Keep the raw Tesseract dictionaries as well; this overwrites any existing file.
with open(output_dir+'results.pkl', 'wb') as outp:
    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [05:26<00:00,  3.27s/it]
