In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local ocr_ensemble package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")
from ocr_ensemble.data import load_dataset
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from functools import partial
from collections import defaultdict
from copy import deepcopy
import torch

from paddleocr import draw_ocr
from PIL import Image

In [3]:
from ocr_ensemble.classifiers import ClipEmbedding, ClipMulticlass, ClipPresence
from ocr_ensemble.proposers import PaddleOCRProposalGenerator
from ocr_ensemble.proposers import rotatedCrop
from ocr_ensemble.experts import HandwrittenExpert, Stage1Expert, PaddleOCRExpert
from ocr_ensemble.data import identity

In [4]:
# Run the CLIP models on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Size tag of the TrOCR experts; it is embedded in the expert names and in the
# name of the result file below.
expert_size = 'large'
# Metadata parquet of the subset being processed. Only one assignment is kept:
# a second assignment directly below would silently override the first.
# (A 10K variant lives at '../data/laion2b-en-10K.parquet'; switching to it
# presumably also requires changing the two paths below -- TODO confirm.)
parquet_fname = '../data/laion2b-en-1K.parquet'
# Where the parquet augmented with the OCR columns is written.
parquet_result_fname = f'../data/laion2b-en-1K-experts-{expert_size}.parquet'
# Location handed to `load_dataset` together with `parquet_fname`.
dataset_path = '../data/laion2b-en-1K-large'


In [5]:
# Load the image dataset described by the metadata parquet. Later cells iterate
# it as (img, caption, key) samples and take deep copies of it before attaching
# their own processing stages.
# NOTE(review): the exact meaning of `image_size` and `number_sample_per_shard`
# is defined by ocr_ensemble.data.load_dataset, which is not visible here.
dataset = load_dataset(dataset_path, parquet_fname, image_size=512, number_sample_per_shard=100)

# compute subset of images that contain text

In [6]:
# One CLIP embedding object, shared by the text-presence classifier below and
# by the multi-class expert classifier created later in the notebook.
clip_emb = ClipEmbedding()
# Classifier whose predictions are compared against 1 further down to decide
# whether an image is processed at all.
presence = ClipPresence(clip_emb=clip_emb)

In [7]:
# Work on a deep copy so the stages added here never leak into `dataset`.
# `map_tuple` returns the dataset with the new stage attached; keep that return
# value instead of relying on the call modifying the object in place.
# The image is run through the presence classifier's transform; caption and key
# pass through unchanged.
dataset_clip = deepcopy(dataset).map_tuple(presence.get_transform(), identity, identity)

<webdataset.compat.WebDataset at 0x21f5047fbe0>

In [8]:
# Batched loader over the presence-transformed dataset; each batch is unpacked
# below as (imgs, captions, keys).
loader = DataLoader(dataset_clip, batch_size=200, num_workers=4)

In [9]:
# Run the text-presence classifier over every batch.
#   label_dict : sample key -> prediction for that sample
#   labels     : all per-batch predictions concatenated along axis 0
# `features` is initialised here but not filled anywhere in the visible cells.
features = []
label_dict = {}
per_batch_preds = []
for imgs, captions, keys in tqdm(loader):
    preds = presence.predict(imgs.to(device))
    per_batch_preds.append(preds)
    # keys and predictions are aligned position by position within a batch
    label_dict.update(zip(keys, preds))

labels = np.concatenate(per_batch_preds, axis=0)

6it [00:17,  2.95s/it]


# compute & label bounding boxes

In [10]:
from torch.utils.data import default_collate

def collate(batch):
    """Collate a batch of 3-tuples, stacking only the first field.

    Args:
        batch: iterable of ``(img, bbox, label)`` triples.

    Returns:
        ``(default_collate(imgs), bboxes, labels)`` -- the images merged by
        torch's ``default_collate``; the other two fields are returned as plain
        Python lists in batch order, so they may be ragged or non-tensor.
    """
    columns = ([], [], [])
    for sample in batch:
        img, bbox, label = sample
        for column, value in zip(columns, (img, bbox, label)):
            column.append(value)
    imgs, bboxes, labels = columns
    return default_collate(imgs), bboxes, labels

def get_crops(src, label_dict, proposer):
    """Yield one ``(crop, bbox, key)`` triple per proposed box of each selected image.

    Args:
        src: iterable of ``(img, caption, key)`` samples.
        label_dict: mapping from sample key to a label; only samples whose label
            equals 1 are processed (a missing key raises ``KeyError``).
        proposer: callable returning ``(crops, bboxes)`` for an image.

    Yields:
        ``(crop, bbox, key)`` for every pair produced by zipping the proposer's
        crops with its bounding boxes; the caption is dropped.
    """
    for sample in tqdm(src):
        img, caption, key = sample
        if label_dict[key] != 1:
            continue
        crops, bboxes = proposer(img)
        yield from ((crop, bbox, key) for crop, bbox in zip(crops, bboxes))

def get_imgs_containing_text(src, label_dict):
    """Pass through only the samples whose entry in ``label_dict`` equals 1.

    Args:
        src: iterable of ``(img, caption, key)`` samples.
        label_dict: mapping from sample key to a label (a missing key raises
            ``KeyError``).

    Yields:
        The ``(img, caption, key)`` tuple of every selected sample, in order.
    """
    for img, caption, key in tqdm(src):
        selected = label_dict[key] == 1
        if not selected:
            continue
        yield img, caption, key

# Bounding-box proposer, explicitly placed on the CPU.
proposer = PaddleOCRProposalGenerator(device='cpu')
# Deep-copy the base dataset and attach the crop generator as an extra stage.
# `compose` returns the dataset with the stage attached; keep that return value
# instead of relying on the call modifying the object in place.
dataset_crops = deepcopy(dataset).compose(
    partial(get_crops, label_dict=label_dict, proposer=proposer)
)

<webdataset.compat.WebDataset at 0x21f52a49cf0>

In [11]:
# Expert name -> text prompt given to the multi-class CLIP classifier.
expert_text_dict = {f"trocr-{expert_size}-handwritten": "handwritten text, handwriting, black on white",
                    "paddleocr": "text in a document, website, or presentation",
                    f"trocr-{expert_size}-stage1": "text in a scene"}
# Expert name -> OCR expert instance, built in the same order as the prompts.
expert_dict = {f"trocr-{expert_size}-handwritten": HandwrittenExpert(expert_size),
               "paddleocr": PaddleOCRExpert(device='cpu'),
               f"trocr-{expert_size}-stage1": Stage1Expert(expert_size)}

# Later cells treat a predicted label as a position in `expert_dict`; the
# classifier only sees the prompt list, so both dicts must list the same
# experts in the same order. Fail loudly if they ever drift apart.
assert list(expert_text_dict) == list(expert_dict), \
    "expert_text_dict and expert_dict must have identical keys in identical order"

clf = ClipMulticlass(list(expert_text_dict.values()),
                     clip_emb=clip_emb)

# `map_tuple` returns the dataset with the new stage attached; keep that return
# value instead of relying on the call modifying the object in place.
# The crop goes through the classifier's transform; bbox and key pass unchanged.
dataset_crops = dataset_crops.map_tuple(clf.get_transform(), identity, identity)
loader_crops = DataLoader(dataset_crops,
                          batch_size=200,
                          collate_fn=collate)  # here num_worker breaks things in non-trivial ways

microsoft/trocr-large-handwritten


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


microsoft/trocr-large-stage1


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


only supported targets ['handwritten', 'printed', 'scene']


In [14]:
# Per image key, collected in the order the crops arrive:
#   bbox_dict       -> bounding boxes
#   bbox_label_dict -> the classifier's label for each of those boxes
# `bbox_scores_dict` is created but not filled anywhere in the visible cells.
bbox_dict = defaultdict(list)
bbox_label_dict = defaultdict(list)
bbox_scores_dict = defaultdict(list)

for crops, bboxes, keys in tqdm(loader_crops):
    labels = clf.predict(crops.to(device))

    # labels, boxes and keys are aligned position by position within a batch
    for key, bbox, label in zip(keys, bboxes, labels):
        bbox_dict[key].append(bbox)
        bbox_label_dict[key].append(label)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
4it [00:00,  6.93it/s]
6it [00:00,  6.28it/s]
7it [00:01,  4.36it/s]
12it [00:01,  7.10it/s]
13it [00:02,  5.34it/s]
14it [00:02,  3.96it/s]
17it [00:03,  5.12it/s]
18it [00:03,  4.83it/s]
19it [00:03,  4.13it/s]
22it [00:04,  5.31it/s]
26it [00:04,  7.29it/s]
27it [00:05,  4.66it/s]
34it [00:05,  8.42it/s]
39it [00:06,  9.36it/s]
43it [00:06,  9.55it/s]
45it [00:08,  4.32it/s]
46it [00:08,  3.82it/s]
48it [00:08,  4.16it/s]
50it [00:09,  4.05it/s]
51it [00:09,  3.73it/s]
56it [00:10,  5.69it/s]
58it [00:10,  5.52it/s]
59it [00:11,  4.78it/s]
63it [00:11,  6.45it/s]
65it [00:11,  6.18it/s]
67it [00:12,  6.14it/s]
1it [00:14, 14.34s/it]]
70it [00:14,  2.16it/s]
71it [00:14,  2.31it/s]
75it [00:15,  3.84it/s]
77it [00:15,  3.93it/s]
78it [00:16,  3.23it/s]
79it [00:16,  3.24it/s]
80it [00:16,  3.13it/s]
81it [00:17,  2.59it/s]
88it [00:17,  6.56it/s]
90it [00:18,  5.80it/s]
92it [00:18,  5.65it/s]
96it [00:19,  7.44it/s]
97it [00:19,  6.02it/s]
98it 

753it [02:22,  3.67it/s]
755it [02:22,  4.14it/s]
756it [02:23,  3.71it/s]
760it [02:24,  4.06it/s]
761it [02:24,  3.80it/s]
764it [02:24,  4.44it/s]
765it [02:25,  4.05it/s]
766it [02:25,  4.07it/s]
771it [02:25,  7.03it/s]
773it [02:26,  6.80it/s]
776it [02:26,  7.22it/s]
780it [02:27,  7.00it/s]
781it [02:27,  6.08it/s]
785it [02:27,  7.16it/s]
786it [02:28,  6.08it/s]
787it [02:28,  4.66it/s]
789it [02:29,  4.30it/s]
790it [02:29,  3.96it/s]
792it [02:30,  4.06it/s]
796it [02:30,  5.90it/s]
797it [02:30,  4.60it/s]
802it [02:31,  7.31it/s]
803it [02:31,  4.92it/s]
806it [02:32,  5.13it/s]
807it [02:32,  4.32it/s]
811it [02:33,  6.01it/s]
812it [02:33,  5.56it/s]
813it [02:33,  5.06it/s]
814it [02:34,  4.34it/s]
817it [02:34,  5.52it/s]
821it [02:34,  7.31it/s]
824it [02:35,  7.64it/s]
825it [02:35,  6.01it/s]
826it [02:36,  4.75it/s]
831it [02:36,  7.30it/s]
832it [02:36,  6.24it/s]
833it [02:37,  5.41it/s]
11it [02:38, 15.07s/it]]
838it [02:38,  4.11it/s]
853it [02:39,  5.36it/s]


# perform ocr with the experts

In [15]:
def get_efficient_and_filtered_crops(src, expert_idx, bbox_dict, bbox_label_dict):
    """Yield the crops of every image whose box label equals ``expert_idx``.

    Args:
        src: iterable of ``(img, caption, key)`` samples.
        expert_idx: label value selecting which boxes to keep.
        bbox_dict: mapping from sample key to that image's bounding boxes.
        bbox_label_dict: mapping from sample key to the labels of those boxes,
            aligned position by position with ``bbox_dict[key]``.

    Yields:
        ``(crop, bbox_idx, key)`` where ``crop = rotatedCrop(img, bbox)`` and
        ``bbox_idx`` is the position of the box in ``bbox_dict[key]``.

    Note:
        Both mappings are read with ``[]``; when they are ``defaultdict``s, a
        key seen here for the first time is inserted with its default value.
    """
    for img, caption, key in tqdm(src):
        labelled_boxes = zip(bbox_dict[key], bbox_label_dict[key])
        for bbox_idx, (bbox, label) in enumerate(labelled_boxes):
            if label != expert_idx:
                continue
            yield rotatedCrop(img, bbox), bbox_idx, key

def collate_crops(batch, img_collate):
    """Collate a batch of 3-tuples, merging the crops with a caller-supplied function.

    Args:
        batch: iterable of ``(crop, bbox, key)`` triples.
        img_collate: callable applied to the list of crops.

    Returns:
        ``(img_collate(crops), bboxes, keys)`` -- the second and third fields
        are returned as plain lists in batch order.
    """
    samples = [(crop, bbox, key) for crop, bbox, key in batch]
    crops = [sample[0] for sample in samples]
    bboxes = [sample[1] for sample in samples]
    keys = [sample[2] for sample in samples]
    return img_collate(crops), bboxes, keys

In [16]:
# Image key -> list of (bbox_idx, text) pairs, filled one expert at a time.
ocr_dict = defaultdict(list)
# `idx` is the expert's position in `expert_dict`; only boxes whose stored label
# equals that position are sent to this expert.
for idx, (expert_name, expert) in enumerate(expert_dict.items()):
    print(f'Expert {expert_name} ...')
    # Fresh deep copy of the base dataset per expert. `compose` / `map_tuple`
    # return the dataset with the stage attached; keep that return value
    # instead of relying on the calls modifying the object in place.
    dataset_filtered = deepcopy(dataset).compose(
        partial(get_efficient_and_filtered_crops,
                expert_idx=idx,
                bbox_dict=bbox_dict,
                bbox_label_dict=bbox_label_dict)
    )
    # The crop goes through the expert's transform; bbox index and key pass unchanged.
    dataset_filtered = dataset_filtered.map_tuple(expert.get_transform(), identity, identity)
    # Compare by value: `is` tests object identity and only matches equal
    # strings when the interpreter happens to share the object.
    if expert_name == "paddleocr":
        collate_fn = partial(collate_crops, img_collate=expert.get_collate())
    else:
        collate_fn = collate
    loader_expert = DataLoader(dataset_filtered,
                               batch_size=200,
                               collate_fn=collate_fn)

    for crops, bbox_ids, keys in tqdm(loader_expert):
        texts = expert.process_batch(crops)
        # texts, box indices and image keys are aligned within a batch
        for text, bbox_idx, key in zip(texts, bbox_ids, keys):
            ocr_dict[key] += [(bbox_idx, text)]

Expert trocr-large-handwritten ...


0it [00:00, ?it/s]
0it [00:00, ?it/s]
14it [00:00, 140.00it/s]
28it [00:00, 116.66it/s]
42it [00:00, 123.73it/s]
55it [00:00, 122.44it/s]
68it [00:00, 123.73it/s]
81it [00:00, 121.83it/s]
94it [00:00, 118.50it/s]
106it [00:00, 110.75it/s]
121it [00:01, 121.83it/s]
134it [00:01, 120.71it/s]
147it [00:01, 120.61it/s]
161it [00:01, 125.77it/s]
174it [00:01, 122.06it/s]
187it [00:01, 118.59it/s]
199it [00:01, 114.21it/s]
215it [00:01, 125.15it/s]
228it [00:01, 123.73it/s]
243it [00:01, 129.31it/s]
256it [00:02, 108.03it/s]
268it [00:02, 101.79it/s]
283it [00:02, 112.38it/s]
297it [00:02, 117.72it/s]
312it [00:02, 123.39it/s]
327it [00:02, 130.27it/s]
341it [00:02, 112.29it/s]
353it [00:02, 112.54it/s]
367it [00:03, 116.37it/s]
379it [00:03, 112.83it/s]
391it [00:03, 112.64it/s]
403it [00:03, 114.65it/s]
415it [00:03, 105.49it/s]
427it [00:03, 108.50it/s]
441it [00:03, 115.56it/s]
454it [00:03, 118.59it/s]
469it [00:03, 125.36it/s]
482it [00:04, 123.17it/s]
495it [00:04, 123.71it/s]
510it [

Expert paddleocr ...


0it [00:00, ?it/s]
0it [00:00, ?it/s]
7it [00:00, 55.55it/s]
14it [00:00, 49.34it/s]
27it [00:00, 71.38it/s]
40it [00:00, 88.90it/s]
50it [00:00, 79.24it/s]
61it [00:00, 85.17it/s]
70it [00:00, 65.72it/s]
80it [00:01, 71.83it/s]
90it [00:01, 76.82it/s]
99it [00:01, 69.42it/s]
107it [00:01, 70.76it/s]
115it [00:01, 65.66it/s]
122it [00:01, 65.92it/s]
129it [00:01, 62.62it/s]
136it [00:01, 62.29it/s]
145it [00:02, 57.03it/s]
151it [00:02, 52.57it/s]
157it [00:02, 46.78it/s]
164it [00:02, 50.16it/s]
170it [00:02, 52.11it/s]
181it [00:02, 66.10it/s]
188it [00:02, 64.10it/s]
195it [00:03, 52.85it/s]
204it [00:03, 61.48it/s]
211it [00:03, 45.15it/s]
217it [00:03, 42.87it/s]
222it [00:03, 43.35it/s]
229it [00:03, 48.52it/s]
238it [00:03, 58.25it/s]
245it [00:04, 47.88it/s]
253it [00:04, 54.84it/s]
260it [00:04, 35.95it/s]
265it [00:05, 23.60it/s]
271it [00:05, 25.69it/s]
281it [00:05, 36.27it/s]
294it [00:05, 51.99it/s]
303it [00:05, 52.00it/s]
310it [00:06, 36.66it/s]
320it [00:06, 46.27it/s

Expert trocr-large-stage1 ...


0it [00:00, ?it/s]
0it [00:00, ?it/s]
4it [00:00, 12.95it/s]
7it [00:00, 10.70it/s]
12it [00:00, 15.14it/s]
14it [00:01, 10.71it/s]
17it [00:01, 12.77it/s]
19it [00:01, 12.12it/s]
26it [00:01, 20.34it/s]
29it [00:02, 13.20it/s]
39it [00:02, 21.51it/s]
43it [00:02, 22.43it/s]
46it [00:04,  7.14it/s]
50it [00:04,  8.14it/s]
52it [00:04,  8.34it/s]
56it [00:04, 10.27it/s]
58it [00:05, 10.17it/s]
60it [00:05, 10.81it/s]
65it [00:05, 14.01it/s]
1it [30:07, 1807.92s/it]
70it [30:08, 137.13s/it]
75it [30:08, 88.88s/it] 
77it [30:08, 74.01s/it]
79it [30:08, 59.75s/it]
81it [30:09, 46.89s/it]
88it [30:09, 22.55s/it]
90it [30:09, 18.63s/it]
92it [30:09, 14.92s/it]
98it [30:10,  8.08s/it]
100it [30:10,  6.65s/it]
102it [30:11,  5.33s/it]
108it [30:11,  2.85s/it]
110it [30:11,  2.35s/it]
113it [30:11,  1.70s/it]
115it [30:12,  1.39s/it]
121it [30:12,  1.35it/s]
124it [30:12,  1.76it/s]
127it [30:13,  2.15it/s]
130it [30:13,  2.81it/s]
134it [30:13,  3.99it/s]
139it [30:13,  6.04it/s]
142it [30:13,

In [17]:
# The experts ran one after another, so each image's (bbox_idx, text) pairs are
# grouped by expert. Re-order them by bbox index and keep only the texts.
ocr_dict_sorted = defaultdict(list)
for key, indexed_texts in ocr_dict.items():
    by_bbox_idx = sorted(indexed_texts, key=lambda pair: pair[0])
    ocr_dict_sorted[key] = [text for _, text in by_bbox_idx]

In [18]:
# Translate each stored box label into the name of the expert at that position
# in `expert_dict`.
expert_names = list(expert_dict)
expert_name_dict = defaultdict(list)
for key, val in bbox_label_dict.items():
    names_for_image = []
    for idx in val:
        names_for_image.append(expert_names[idx])
    expert_name_dict[key] = names_for_image

# serialize result

In [19]:
# Re-read the input metadata parquet; the OCR columns are attached to this frame below.
df = pd.read_parquet(parquet_fname, engine='pyarrow')

In [20]:
# One placeholder per row of `df`; rows without an entry in `bbox_dict` keep it.
ocr_col = [''] * len(df)
# Build a separate empty list per row. `[[]] * n` would put the *same* list
# object in every slot, so mutating one row's placeholder would change all rows.
bbox_col = [[] for _ in range(len(df))]
exp_col = [''] * len(df)

# NOTE(review): the sample key is used as the row position in `df` -- this
# assumes keys are the stringified row indices of the parquet; confirm.
for key in bbox_dict.keys():
    idx = int(key)
    # `ocr_dict_sorted` / `expert_name_dict` are defaultdicts, so an image
    # without any entry contributes an empty list here.
    ocr_col[idx] = ocr_dict_sorted[key]
    bbox_col[idx] = bbox_dict[key]
    exp_col[idx] = expert_name_dict[key]
df['OCR_BBOXES'] = bbox_col
df['OCR_EXPERTS'] = exp_col
df['OCR_TEXT'] = ocr_col


In [21]:
# Store the three OCR columns as their string representation before writing.
for ocr_column in ('OCR_BBOXES', 'OCR_EXPERTS', 'OCR_TEXT'):
    df[ocr_column] = df[ocr_column].astype(str)

In [22]:
# NOTE: here `pq` is fastparquet (the input above was read with engine='pyarrow').
import fastparquet as pq
import os
# Write the augmented frame to `parquet_result_fname`, resolved to an absolute path.
pq.write(os.path.abspath(parquet_result_fname), df)

# eyeball result

In [23]:
# NOTE: this rebinds the name `Image`, which the import cell at the top of the
# notebook took from PIL; from here on `Image` is IPython.display.Image.
from IPython.display import Image, display
from IPython.core.display import HTML 

In [24]:
# Show every row for which at least one box was routed to the handwritten
# expert: the image (by URL), its OCR texts and the experts used.
# The loop variable must stay named `idx`: the inspection cells below read it.
handwritten_idcs = []
for idx, exps in enumerate(exp_col):
    if f'trocr-{expert_size}-handwritten' not in exps:
        continue
    handwritten_idcs.append(idx)
    row = df.iloc[idx]
    display(Image(url=row['URL']))
    print(row['OCR_TEXT'])
    print(row['OCR_EXPERTS'])
    print()
print(handwritten_idcs)

['2', 'cm c.', 'alamy', 'alamy', 'alamy', 'mige ID: CPSRII', 'alamy', 'www.alamy.com']
['trocr-large-stage1', 'trocr-large-handwritten', 'trocr-large-stage1', 'trocr-large-stage1', 'trocr-large-stage1', 'trocr-large-stage1', 'trocr-large-stage1', 'trocr-large-stage1']



['DOS E', 'OF', 'COLORS']
['trocr-large-handwritten', 'paddleocr', 'paddleocr']



['GREATE', 'IN ME A', 'Clean heart.', 'O GOD, AND', 'renew A', 'RIGHT SPIRIT', 'WITHIN ME.', 'PSALM 51:10 ESV', 'incorragen.']
['trocr-large-stage1', 'paddleocr', 'trocr-large-handwritten', 'trocr-large-stage1', 'trocr-large-stage1', 'trocr-large-stage1', 'trocr-large-stage1', 'paddleocr', 'trocr-large-stage1']



['Jun', 'fn.', 'the sun.']
['trocr-large-stage1', 'trocr-large-handwritten', 'trocr-large-handwritten']

[272, 484, 646, 688]


In [25]:
# NOTE: `idx` is not set in this cell; it holds whatever the last executed cell
# left in it (the final value of the `enumerate(exp_col)` loop when the
# notebook is run top to bottom, i.e. the last row).
Image(url=df.iloc[idx]['URL'])

In [26]:
# NOTE: `idx` is a leftover loop variable from an earlier cell, not a value chosen here.
df.iloc[idx]['OCR_TEXT']

'[]'

In [27]:
# NOTE: `idx` is a leftover loop variable from an earlier cell, not a value chosen here.
df.iloc[idx]['OCR_EXPERTS']

'[]'

In [28]:
# NOTE: `idx` is a leftover loop variable from an earlier cell, not a value chosen here.
df.iloc[idx]['OCR_BBOXES']

'[]'