### Imports

In [None]:
import os

import cv2
import pandas as pd
from skimage import io, color
from skimage.filters import threshold_otsu
from tqdm import tqdm

In [None]:
def process_image(image):
    """Binarize an image with Otsu's threshold.

    Args:
        image: Image array. A 2-D array is treated as already grayscale;
            any other input is converted with ``color.rgb2gray``.

    Returns:
        Boolean array that is True where the grayscale intensity is
        strictly greater than the Otsu threshold.
    """
    # Recent scikit-image releases reject 2-D input in rgb2gray, so the
    # conversion is only applied when the input is not already 2-D.
    if getattr(image, "ndim", None) != 2:
        image = color.rgb2gray(image)
    return image > threshold_otsu(image)

### Resize Images to Annotation size

In [None]:
# Resize every processed page image to the pixel size of its annotation image.
input_dir1  = './../../docbank_processed/processed_data/ori_black/'
input_dir2  = './../../docbank_processed/processed_data/ann/'
output_dir1 = './../../docbank_processed/processed_data/spear_ori_black/'

# Alternative configuration (equation-only subset):
# input_dir1  = './../../docbank_processed/processed_data/only_eqn/ori_black/'
# input_dir2  = './../../docbank_processed/processed_data/only_eqn/ann/'
# output_dir1 = './../../docbank_processed/processed_data/only_eqn/spear_ori_black/'

# io.imsave does not create missing directories.
os.makedirs(output_dir1, exist_ok=True)

dir_list = os.listdir(input_dir2)

for img in dir_list:
    # Drop the last 11 characters of the annotation file name (the length of
    # 'ann_pro.jpg') to get the prefix shared with the page image.
    name = img[:len(img) - 11]

    img1 = io.imread(input_dir2 + img)
    # Only the first two axes (rows, cols) are needed; slicing also works when
    # the annotation was loaded without a channel axis.
    rows, cols = img1.shape[:2]

    img2 = io.imread(input_dir1 + name + 'ori_pro.jpg')
    # cv2.resize expects the target size as (width, height).
    pro_img = cv2.resize(img2, (cols, rows))
    io.imsave(output_dir1 + name + 'ori_pro.jpg', pro_img)

### Save DocTR Predictions

In [None]:
import os
import pickle

from skimage import io
import pandas as pd
import cv2

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# Run docTR OCR on every page image, then save: an image with the word boxes
# drawn on it, a tab-separated table of the words, and the raw results pickle.
model = ocr_predictor(pretrained=True)

input_dir   = './docbank_processed/processed_data/ori_black/'
output_dir  = './docbank_processed/resultant_data/doctr/'

# Neither io.imsave nor DataFrame.to_csv creates missing directories.
os.makedirs(output_dir + 'images/', exist_ok=True)
os.makedirs(output_dir + 'txt/', exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

# Pass 1: OCR each file and keep the raw result object keyed by file name.
for image in dir_list:
    doc = DocumentFile.from_images(input_dir + image)
    results[image] = model(doc)
    images.append(image)

dimensions = {}
total = {}

# Pass 2: convert the relative word boxes of the first page to pixels.
for image, result in results.items():
    page = result.pages[0]
    # Reversed so the order lines up with the coordinate order of the geometry
    # points -- presumably (height, width) -> (width, height); confirm against
    # the docTR documentation.
    dim = tuple(reversed(page.dimensions))
    predictions = []
    img = io.imread(os.path.join(input_dir, image))
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                geo = word.geometry
                # geo[0] and geo[1] are the two box corners in relative units;
                # scale each coordinate by the matching page dimension.
                x0, y0 = (int(rel * size) for rel, size in zip(geo[0], dim))
                x1, y1 = (int(rel * size) for rel, size in zip(geo[1], dim))
                predictions.append(
                    [x0, y0, x1, y1, word.value, word.confidence])
                cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255))
    io.imsave(output_dir + 'images/' + image, img)
    total[image] = predictions

# One tab-separated table per page.
for image, rows in total.items():
    # Drop the last 12 characters (the length of '_ori_pro.jpg').
    name = image[:len(image) - 12]
    df = pd.DataFrame(
        rows, columns=['x0', 'y0', 'x1', 'y1', 'prediction', 'confidence'])
    df.to_csv(output_dir + 'txt/' + name + '.txt', sep='\t', index=False)

with open(output_dir + 'results.pkl', 'wb') as outp:  # Overwrites any existing file.
    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

### Save Tesseract Predictions

In [None]:
import os
import cv2
import pickle
from pytesseract import Output, image_to_data

# Run Tesseract on every page image, then save: an image with the detected
# boxes drawn on it, a tab-separated table of the boxes, and the raw pickle.
input_dir   = './docbank_processed/processed_data/ori_black/'
output_dir  = './docbank_processed/resultant_data/tesseract/'

# cv2.imwrite returns False instead of raising when the target directory is
# missing, so create the output folders up front.
os.makedirs(output_dir + 'images/', exist_ok=True)
os.makedirs(output_dir + 'txt/', exist_ok=True)

dir_list = os.listdir(input_dir)

dimensions = {}
results = {}
total = {}

for image in dir_list:
    img = cv2.imread(input_dir + image)
    if img is None:
        # cv2.imread signals an unreadable or non-image file with None.
        print(f'Skipping unreadable image: {input_dir + image}')
        continue

    d = image_to_data(img, output_type=Output.DICT)
    predictions = []
    boxes = zip(d['left'], d['top'], d['width'], d['height'],
                d['conf'], d['text'])
    for left, top, width, height, conf, text in boxes:
        x, y, w, h = int(left), int(top), int(width), int(height)
        # Boxes anchored at the origin are dropped -- presumably the
        # page-level entries Tesseract reports; confirm before relying on it.
        if x == 0 and y == 0:
            continue
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255))
        predictions.append(['text', conf, x, y, w, h, text])

    total[image] = predictions
    results[image] = d
    cv2.imwrite(output_dir + 'images/' + image, img)

# One tab-separated table per page.
for image, rows in total.items():
    # Drop the last 12 characters (the length of '_ori_pro.jpg').
    name = image[:len(image) - 12]
    df = pd.DataFrame(
        rows,
        columns=['class', 'confidence', 'X', 'Y', 'W', 'H', 'prediction'])
    df.to_csv(output_dir + 'txt/' + name + '.txt', sep='\t', index=False)

with open(output_dir + 'results.pkl', 'wb') as outp:  # Overwrites any existing file.
    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

# REMOVE IMAGES

In [None]:
# Paint every 'figure' region of each page image white and save the result.
input_dir   = './../../dataset/original/txt/'
ori_img_dir = './../../dataset/original/images/'
output_dir  = './../../dataset/processed/images/'


ann_img_dir    = './../../dataset/original/ann/'
ann_output_dir = './../../dataset/processed/ann/'
txt_output_dir = './../../dataset/processed/txt/'

# cv2.imwrite returns False instead of raising when the directory is missing.
os.makedirs(output_dir, exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in tqdm(dir_list):
    name = file[:len(file) - 4]  # strip the 4-character extension
    value = output_dir + name + '_ori_pro.jpg'
    if os.path.exists(value):
        continue  # output already present; nothing to redo

    ori_img = cv2.imread(ori_img_dir + name + '_ori.jpg')
    if ori_img is None:
        # cv2.imread signals a missing or unreadable file with None.
        print(f'Skipping {file}: cannot read {ori_img_dir + name}_ori.jpg')
        continue

    try:
        df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
        df = df[df['label']=='figure'].reset_index()

        height, width, _ = ori_img.shape
        # Box coordinates are scaled by width/1000 and height/1000, i.e. the
        # txt files presumably store them on a 0-1000 grid.
        for x0, y0, x1, y1 in zip(df['x0'], df['y0'], df['x1'], df['y1']):
            top_left = (int(x0*width/1000), int(y0*height/1000))
            bottom_right = (int(x1*width/1000), int(y1*height/1000))
            cv2.rectangle(ori_img, top_left, bottom_right,
                          (255, 255, 255), cv2.FILLED)

        cv2.imwrite(value, ori_img)
    except Exception as exc:
        # Best effort, as before: one bad file must not stop the batch. Unlike
        # a bare `except`, this lets KeyboardInterrupt through and reports why
        # the file was skipped.
        print(f'Skipping {file}: {exc!r}')
        continue

# Write a copy of every txt file with the 'figure' rows removed.
# Without the directory every to_csv call fails and would be skipped.
os.makedirs(txt_output_dir, exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in tqdm(dir_list):
    value = txt_output_dir + file
    if os.path.exists(value):
        continue  # output already present; nothing to redo

    try:
        df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
        # NOTE(review): reset_index() without drop=True adds an 'index' column
        # to the saved file, and to_csv also writes a header row the input did
        # not have. Left unchanged because readers of these files may rely on
        # that layout -- confirm before altering.
        df = df[df['label']!='figure'].reset_index()
        df.to_csv(value, sep='\t', index=False)
    except Exception as exc:
        # Best effort, as before: one bad file must not stop the batch. Unlike
        # a bare `except`, this lets KeyboardInterrupt through and reports why
        # the file was skipped.
        print(f'Skipping {file}: {exc!r}')
        continue
     
# Remove Images from Annotations
# Paint every 'figure' region of each annotation image white and save it.
# cv2.imwrite returns False instead of raising when the directory is missing.
os.makedirs(ann_output_dir, exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in tqdm(dir_list):
    name = file[:len(file) - 4]  # strip the 4-character extension
    ann_img = cv2.imread(ann_img_dir + name + '_ann.jpg')
    if ann_img is None:
        # cv2.imread signals a missing or unreadable file with None; skip it
        # rather than fail on `.shape` below.
        print(f'Skipping {file}: cannot read {ann_img_dir + name}_ann.jpg')
        continue

    df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
    df = df[df['label']=='figure'].reset_index()

    height, width, _ = ann_img.shape
    # Box coordinates are scaled by width/1000 and height/1000, i.e. the txt
    # files presumably store them on a 0-1000 grid.
    for x0, y0, x1, y1 in zip(df['x0'], df['y0'], df['x1'], df['y1']):
        top_left = (int(x0*width/1000), int(y0*height/1000))
        bottom_right = (int(x1*width/1000), int(y1*height/1000))
        cv2.rectangle(ann_img, top_left, bottom_right,
                      (255, 255, 255), cv2.FILLED)

    cv2.imwrite(ann_output_dir + name + '_ann_pro.jpg', ann_img)

# EQUATIONS ONLY

### Retain Only EQUATION Information

In [None]:
# Keep only equations in the page images: every region whose label is not
# 'equation' is painted white.
input_dir   = './../../docbank_processed/original_data/txt/'
ori_img_dir = './../../docbank_processed/original_data/ori_black/'
output_dir  = './../../docbank_processed/processed_data/only_eqn/ori_black/'

# cv2.imwrite returns False instead of raising when the directory is missing.
os.makedirs(output_dir, exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in dir_list:
    name = file[:len(file) - 4]  # strip the 4-character extension
    ori_img = cv2.imread(ori_img_dir + name + '_ori.jpg')
    if ori_img is None:
        # cv2.imread signals a missing or unreadable file with None; skip it
        # rather than fail on `.shape` below.
        print(f'Skipping {file}: cannot read {ori_img_dir + name}_ori.jpg')
        continue

    df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
    df = df[df['label']!='equation'].reset_index()

    height, width, _ = ori_img.shape
    # Box coordinates are scaled by width/1000 and height/1000, i.e. the txt
    # files presumably store them on a 0-1000 grid.
    for x0, y0, x1, y1 in zip(df['x0'], df['y0'], df['x1'], df['y1']):
        top_left = (int(x0*width/1000), int(y0*height/1000))
        bottom_right = (int(x1*width/1000), int(y1*height/1000))
        cv2.rectangle(ori_img, top_left, bottom_right,
                      (255, 255, 255), cv2.FILLED)

    cv2.imwrite(output_dir + name + '_ori_pro.jpg', ori_img)

### Retain Only EQUATION Information in Annotations

In [None]:
# Keep only equations in the annotation images: every region whose label is
# not 'equation' is painted white.
input_dir   = './../../docbank_processed/original_data/txt/'
ann_img_dir = './../../docbank_processed/original_data/ann/'
output_dir  = './../../docbank_processed/processed_data/only_eqn/ann/'

# cv2.imwrite returns False instead of raising when the directory is missing.
os.makedirs(output_dir, exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in dir_list:
    name = file[:len(file) - 4]  # strip the 4-character extension
    ann_img = cv2.imread(ann_img_dir + name + '_ann.jpg')
    if ann_img is None:
        # cv2.imread signals a missing or unreadable file with None; skip it
        # rather than fail on `.shape` below.
        print(f'Skipping {file}: cannot read {ann_img_dir + name}_ann.jpg')
        continue

    df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
    df = df[df['label']!='equation'].reset_index()

    height, width, _ = ann_img.shape
    # Box coordinates are scaled by width/1000 and height/1000, i.e. the txt
    # files presumably store them on a 0-1000 grid.
    for x0, y0, x1, y1 in zip(df['x0'], df['y0'], df['x1'], df['y1']):
        top_left = (int(x0*width/1000), int(y0*height/1000))
        bottom_right = (int(x1*width/1000), int(y1*height/1000))
        cv2.rectangle(ann_img, top_left, bottom_right,
                      (255, 255, 255), cv2.FILLED)

    cv2.imwrite(output_dir + name + '_ann_pro.jpg', ann_img)

In [None]:
# Write a copy of every txt file that keeps only the 'equation' rows.
input_dir   = './../../docbank_processed/original_data/txt/'
ann_img_dir = './../../docbank_processed/original_data/ann/'  # not read by this cell
output_dir  = './../../docbank_processed/processed_data/only_eqn/txt/'

# DataFrame.to_csv does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

dir_list = os.listdir(input_dir)
results = {}
images = []

for file in dir_list:
    # Only the table is needed here, so no image is loaded.
    df = pd.read_csv(input_dir+file, delimiter='\t',
                names=["token", "x0", "y0", "x1", "y1", "R", "G", "B", "font name", "label"])
    # NOTE(review): reset_index() without drop=True adds an 'index' column to
    # the saved file, and to_csv also writes a header row the input did not
    # have. Left unchanged because readers of these files may rely on that
    # layout -- confirm before altering.
    df = df[df['label']=='equation'].reset_index()
    df.to_csv(output_dir + file, sep='\t', index=False)

### Remove files with No Equations

In [None]:
# Delete, from every sub-folder of `parent`, the files whose name does not
# contain the base name of any file in `ori_txt_dir`.
input_dir   = './../../../docbank_processed/processed_data/only_eqn/ori_txt/'
ori_txt_dir = './../../../docbank_processed/processed_data/only_eqn/ori_txt/'
parent = './../../../docbank_processed/processed_data/only_eqn/'

dir_list = os.listdir(input_dir)

# Base names to keep (4-character extension stripped). Listed once instead of
# once per candidate file.
keep_names = [file[:len(file) - 4] for file in os.listdir(ori_txt_dir)]

for dir1 in os.listdir(parent):
    if not os.path.isdir(parent + dir1):
        continue  # a plain file inside `parent` cannot be listed
    for file2 in os.listdir(parent+dir1):
        # NOTE(review): this is a substring test, so a short base name also
        # matches longer names that contain it (e.g. '1' matches '10.txt').
        # That only ever keeps extra files; matching is left unchanged so
        # this destructive step deletes nothing it did not delete before.
        found = any(name in file2 for name in keep_names)
        if not found:
            os.remove(parent+dir1+'/'+file2)