### Imports

In [1]:
import os
import cv2
import pickle
from tqdm import tqdm
import pandas as pd

from skimage import io, color
from skimage.filters import threshold_otsu

In [2]:
def binarize_image(image):
    """Convert an RGB image into a boolean mask using Otsu's threshold.

    The image is first converted to grayscale with ``color.rgb2gray``; the
    returned array is True wherever the gray value is strictly greater than
    the threshold computed by ``threshold_otsu``.
    """
    gray = color.rgb2gray(image)
    return gray > threshold_otsu(gray)

## Remove Picture/Figure Information in Images  ( Applies to DocBank data )

In [9]:
# Paint every 'figure' region of each page image white and drop the tokens that
# lie completely inside a figure box from the page's annotation file.
input_txt_dir = './../../data/docbank_100/txt/'
input_img_dir = './../../data/docbank_100/ori_black/'
output_img_dir = './../../processed/docbank_100/images/'
output_txt_dir = './../../processed/docbank_100/txt/'

# cv2.imwrite reports a missing target folder only through its return value,
# so make sure both output folders exist before the loop starts.
os.makedirs(output_img_dir, exist_ok=True)
os.makedirs(output_txt_dir, exist_ok=True)

dir_list = os.listdir(input_txt_dir)

for file in tqdm(dir_list):
    name = file[:len(file) - 4]
    out_img_file = output_img_dir + name + '_pro.jpg'
    ori_img = cv2.imread(input_img_dir + name + '_ori.jpg')

    # cv2.imread returns None (it does not raise) when the file cannot be read.
    if ori_img is None:
        tqdm.write('Skipping ' + file + ': could not read ' + input_img_dir + name + '_ori.jpg')
        continue

    try:
        df = pd.read_csv(input_txt_dir + file, delimiter='\t',
                         names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
        is_figure = df['label'] == 'figure'
        figures_df = df[is_figure].reset_index(drop=True)
        tokens_df = df[~is_figure].reset_index(drop=True)

        # Box coordinates are rescaled from a 0-1000 range to pixels below.
        height, width, _ = ori_img.shape
        for x0, y0, x1, y1 in zip(figures_df['x0'], figures_df['y0'], figures_df['x1'], figures_df['y1']):
            # Drop the tokens whose box is fully contained in this figure box.
            inside = (
                (tokens_df['x0'] >= x0) & (tokens_df['x1'] <= x1)
                & (tokens_df['y0'] >= y0) & (tokens_df['y1'] <= y1)
            )
            tokens_df = tokens_df[~inside].reset_index(drop=True)

            # White out the figure region in the image.
            top_left = (int(x0*width/1000), int(y0*height/1000))
            bottom_right = (int(x1*width/1000), int(y1*height/1000))
            cv2.rectangle(ori_img, top_left, bottom_right, (255, 255, 255), cv2.FILLED)

        if not cv2.imwrite(out_img_file, ori_img):
            tqdm.write('Could not write ' + out_img_file)
        # Tab-separated, like the input files: the cell that draws the bounding
        # boxes reads this folder back with delimiter='\t'.
        tokens_df.to_csv(output_txt_dir + file, sep='\t', index=False, header=False)
    except Exception as err:
        # Best effort: keep processing the remaining pages, but say what failed
        # instead of hiding it.
        tqdm.write('Skipping ' + file + ': ' + repr(err))
        continue

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.82it/s]


### Draw bounding boxes with Ground truth

In [8]:
# For every processed page, write two annotated images:
#   ann_bb/ - the page with a 1-pixel green outline around every annotated box
#   ann/    - the page with every annotated box filled in black
input_img_dir = './../../processed/docbank_100/images/'
input_txt_dir = './../../processed/docbank_100/txt/'
output_ann_dir1 = './../../processed/docbank_100/ann_bb/'
output_ann_dir2 = './../../processed/docbank_100/ann/'

# cv2.imwrite reports a missing target folder only through its return value.
os.makedirs(output_ann_dir1, exist_ok=True)
os.makedirs(output_ann_dir2, exist_ok=True)

dir_list = os.listdir(input_txt_dir)

for file in tqdm(dir_list):
    name = file[:len(file) - 4]
    out_img_file1 = output_ann_dir1 + name + '_ann.jpg'
    out_img_file2 = output_ann_dir2 + name + '_ann.jpg'
    ori_img1 = cv2.imread(input_img_dir + name + '_pro.jpg')
    # cv2.imread returns None (it does not raise) when the file cannot be read.
    if ori_img1 is None:
        raise FileNotFoundError('Could not read ' + input_img_dir + name + '_pro.jpg')
    # Independent copy: a plain assignment would alias the same array, so the
    # outlines and the filled boxes would both land in one image and the two
    # output files would be identical.
    ori_img2 = ori_img1.copy()
    df = pd.read_csv(input_txt_dir+file, delimiter='\t',
                     names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])

    # Box coordinates are rescaled from a 0-1000 range to pixels.
    height, width, _ = ori_img1.shape
    for x0, y0, x1, y1 in zip(df['x0'], df['y0'], df['x1'], df['y1']):
        top_left = (int(x0*width/1000), int(y0*height/1000))
        bottom_right = (int(x1*width/1000), int(y1*height/1000))
        cv2.rectangle(ori_img1, top_left, bottom_right, (0, 255, 0), 1)
        cv2.rectangle(ori_img2, top_left, bottom_right, (0, 0, 0), cv2.FILLED)

    cv2.imwrite(out_img_file1, ori_img1)
    cv2.imwrite(out_img_file2, ori_img2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.44it/s]


### Resize Images to Annotation size ( As Needed )

In [5]:
# Shrink every processed page image by a fixed factor and save the result
# under the same file name in a separate folder.
input_img_dir  = './../../processed/docbank_100/images/'
output_img_dir = './../../processed/docbank_100/images_resized/'

# Both image dimensions are divided by this factor.
value = 1.5

os.makedirs(output_img_dir, exist_ok=True)

dir_list = os.listdir(input_img_dir)

for file in tqdm(dir_list):
    img = io.imread(input_img_dir + file)
    # Only the first two axes are needed; slicing also accepts 2-D (grayscale)
    # arrays, where unpacking three values would fail.
    height, width = img.shape[:2]
    pro_img = cv2.resize(img, (int(width/value), int(height/value)))
    io.imsave(output_img_dir + file, pro_img)

  io.imsave(output_img_dir + file, pro_img)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:18<00:00,  5.38it/s]


# EQUATIONS ONLY

### Retain Only EQUATION Information

In [None]:
# Keep only the equation regions of each original page image: every annotated
# box whose label is not 'equation' is painted white.
input_dir   = './../../docbank_processed/original_data/txt/'
ori_img_dir = './../../docbank_processed/original_data/ori_black/'
output_dir  = './../../docbank_processed/processed_data/only_eqn/ori_black/'

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in dir_list:
    name = file[:len(file) - 4]
    ori_img = cv2.imread(ori_img_dir + name + '_ori.jpg')
    df = pd.read_csv(
        input_dir + file,
        delimiter='\t',
        names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"],
    )
    not_equation = df['label'] != 'equation'
    df = df[not_equation].reset_index()

    # Box coordinates are rescaled from a 0-1000 range to pixels.
    height, width, _ = ori_img.shape
    for bx0, by0, bx1, by1 in zip(df['x0'], df['y0'], df['x1'], df['y1']):
        left, top = int(bx0*width/1000), int(by0*height/1000)
        right, bottom = int(bx1*width/1000), int(by1*height/1000)
        cv2.rectangle(ori_img, (left, top), (right, bottom), (255, 255, 255), cv2.FILLED)

    cv2.imwrite(output_dir + name + '_ori_pro.jpg', ori_img)

### Retain only EQUATION Information in Annotations

In [None]:
# Same masking as for the original pages, applied to the annotation images:
# every box whose label is not 'equation' is painted white.
input_dir   = './../../docbank_processed/original_data/txt/'
ann_img_dir = './../../docbank_processed/original_data/ann/'
output_dir  = './../../docbank_processed/processed_data/only_eqn/ann/'

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in dir_list:
    name = file[:len(file) - 4]
    ann_img = cv2.imread(ann_img_dir + name + '_ann.jpg')
    df = pd.read_csv(
        input_dir + file,
        delimiter='\t',
        names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"],
    )
    df = df[df['label'] != 'equation'].reset_index()

    # Box coordinates are rescaled from a 0-1000 range to pixels.
    height, width, _ = ann_img.shape
    for box in df[['x0', 'y0', 'x1', 'y1']].itertuples(index=False):
        left, top = int(box.x0*width/1000), int(box.y0*height/1000)
        right, bottom = int(box.x1*width/1000), int(box.y1*height/1000)
        cv2.rectangle(ann_img, (left, top), (right, bottom), (255, 255, 255), cv2.FILLED)

    cv2.imwrite(output_dir + name + '_ann_pro.jpg', ann_img)

In [None]:
# Write, for every annotation file, a copy that contains only the rows
# labelled 'equation'.
input_dir   = './../../docbank_processed/original_data/txt/'
ann_img_dir = './../../docbank_processed/original_data/ann/'
output_dir  = './../../docbank_processed/processed_data/only_eqn/txt/'

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in dir_list:
    name = file[:len(file) - 4]
    # No image is needed here; only the annotation table is filtered, so the
    # per-file image read that was never used has been dropped.
    df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
    # NOTE(review): reset_index() without drop=True keeps the old row numbers as
    # an extra 'index' column, and to_csv writes a header row by default, so the
    # output layout differs from the header-less input files - confirm that the
    # consumers of these files expect that before changing it.
    df = df[df['label']=='equation'].reset_index()
    df.to_csv(output_dir + file,sep='\t',index=False)

### Remove files with No Equations

In [None]:
# Delete, from every sub-folder of `parent`, the files whose name does not
# contain the base name of any file in `ori_txt_dir`.
# Only file NAMES are compared; file contents are never inspected here.
input_dir   = './../../../docbank_processed/processed_data/only_eqn/ori_txt/'
ori_txt_dir = './../../../docbank_processed/processed_data/only_eqn/ori_txt/'
parent = './../../../docbank_processed/processed_data/only_eqn/'

dir_list = os.listdir(input_dir)

# The reference names are read once: files in ori_txt_dir always match their
# own base name, so this listing cannot change while the loop below runs.
ori_names = [file[:len(file) - 4] for file in os.listdir(ori_txt_dir)]

for dir1 in os.listdir(parent):
    dir1_path = parent + dir1
    # Skip stray plain files directly under `parent`; os.listdir would fail on them.
    if not os.path.isdir(dir1_path):
        continue
    for file2 in os.listdir(dir1_path):
        # Substring match: a file is kept when any reference base name occurs
        # anywhere inside its file name.
        found = any(name in file2 for name in ori_names)
        if not found:
            os.remove(dir1_path + '/' + file2)