### Imports

In [1]:
import os
import pickle
import cv2

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

from skimage import io, color
from skimage.filters import threshold_otsu

### Functions

In [2]:
def process_image(image):
    """Binarize an image with a global Otsu threshold.

    Parameters
    ----------
    image : numpy.ndarray
        Pixel data as a NumPy array. A 2-D array is treated as an image
        that is already grayscale; arrays with more dimensions are expected
        to carry RGB (3) or RGBA (4) channels on the last axis.

    Returns
    -------
    numpy.ndarray
        Boolean array with the shape of the grayscale image; ``True`` where
        a pixel is strictly greater than the Otsu threshold.
    """
    if image.ndim > 2:
        if image.shape[-1] == 4:
            # rgb2gray only accepts three channels in current scikit-image,
            # so fold the alpha channel away before converting.
            image = color.rgba2rgb(image)
        image = color.rgb2gray(image)
    # A 2-D input is already single-channel: rgb2gray would reject it, so it
    # goes straight to thresholding.

    thresh = threshold_otsu(image)
    return image > thresh

### Paths and DocTR model loading

In [3]:
# Locations used by the rest of the notebook, relative to the working directory:
#   input_dir  - folder whose entries are listed and read as input images
#   output_dir - folder the pickled results are written to
# Both end with a path separator, so they work with plain string
# concatenation as well as with os.path.join.
input_dir  = './../../temp_data/images/'
output_dir = './../../temp_data/results/doctr/'

In [4]:
# Build the docTR OCR predictor with pretrained weights; it is called on each
# loaded document in the results cell below.
# NOTE(review): presumably fetches the weights on first use when they are not
# cached locally - confirm before running offline.
model = ocr_predictor(pretrained=True)



### Get Results

In [5]:
# Run the OCR model on every image file found in input_dir.
#   results[name] -> prediction returned by the model for that file
#   images        -> file names, in the order they were processed
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make runs reproducible. Hidden entries and sub-directories (for example
# .ipynb_checkpoints or .DS_Store) are not images and are skipped.
dir_list = sorted(
    name for name in os.listdir(input_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(input_dir, name))
)
results = {}
images = []

for filename in dir_list:
    # Each file is loaded as its own single-image document.
    doc = DocumentFile.from_images(os.path.join(input_dir, filename))
    result = model(doc)
    results[filename] = result
    images.append(filename)

In [6]:
# Build two per-image lookups, both keyed by file name:
#   dimensions[name] -> one [x0, y0, x1, y1] box (in pixels) per detected word
#   pixels[name]     -> binarized image in which every word box is blacked out
dimensions = {}
pixels = {}
for file_name, ocr_result in results.items():
    page = ocr_result.pages[0]
    # Reversed so the two entries pair up with the two coordinates of each
    # geometry point.
    # NOTE(review): presumably (height, width) -> (width, height) - confirm.
    page_size = tuple(reversed(page.dimensions))
    canvas = io.imread(os.path.join(input_dir, file_name))

    word_boxes = []
    page_words = (
        word
        for block in page.blocks
        for line in block.lines
        for word in line.words
    )
    for word in page_words:
        # Only the first two geometry points are used; they are scaled by the
        # page size, so they are presumably relative coordinates of two
        # opposite corners of a straight box - TODO confirm.
        first_corner = [rel * size for rel, size in zip(word.geometry[0], page_size)]
        second_corner = [rel * size for rel, size in zip(word.geometry[1], page_size)]
        word_boxes.append(first_corner + second_corner)

        # Thickness -1 draws a filled rectangle, i.e. paints the word black.
        cv2.rectangle(
            canvas,
            (int(first_corner[0]), int(first_corner[1])),
            (int(second_corner[0]), int(second_corner[1])),
            (0, 0, 0),
            -1,
        )

    dimensions[file_name] = word_boxes
    binary_mask = process_image(canvas)
    # Multiplying by 1 turns the boolean mask into an integer array.
    pixels[file_name] = binary_mask * 1
    # Writing the masks to output_dir + 'images/' with io.imsave is left
    # disabled, as in the original cell.

### Store Results

In [7]:
# Persist the three result dictionaries as pickle files inside output_dir.
# NOTE(review): results holds the objects returned by the OCR model, so
# unpickling results.pkl presumably needs docTR to be importable - confirm.
# Only unpickle these files from trusted sources.

# Create the target folder first; open(..., 'wb') does not create parents.
os.makedirs(output_dir, exist_ok=True)

for pkl_name, payload in (
    ('results.pkl', results),
    ('pixels.pkl', pixels),
    ('dimensions.pkl', dimensions),
):
    # 'wb' overwrites any existing file.
    with open(os.path.join(output_dir, pkl_name), 'wb') as outp:
        pickle.dump(payload, outp, pickle.HIGHEST_PROTOCOL)