In [32]:
import ast
import html
import re
import urllib.request
from io import BytesIO

import cv2
import ipywidgets as widgets
import numpy as np
import pandas as pd
import requests
from IPython.display import display, Image, clear_output
from PIL import Image as PILImage


# Location of the parquet file to browse, relative to the notebook's working
# directory.
parquet_fname = '../data/laion2b-en-1K-experts-large.parquet'
# Load the whole file into a DataFrame (pyarrow backend). Later cells read the
# 'URL', 'OCR_TEXT', 'OCR_EXPERTS' and 'OCR_BBOXES' columns from it.
df = pd.read_parquet(parquet_fname, engine='pyarrow')

In [93]:
def parse_list_string(list_string):
    """Turn the textual form of a list (e.g. ``"['a', 'b']"``) into a list of str.

    The string is first parsed as a Python literal, so quoted items keep any
    commas and apostrophes they contain and carry no stray whitespace. If the
    text is not a valid literal (for example unquoted items such as
    ``"[a, b]"``), it falls back to splitting on commas after removing the
    outer brackets and single quotes, trimming whitespace around each item.

    Args:
        list_string: String representation of a list.

    Returns:
        A list of strings; empty when the input describes an empty list.
    """
    text = list_string.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Fallback for input that is not a well-formed Python list literal.
    body = text.strip('[]').replace("'", "")
    if not body.strip():
        return []
    return [item.strip() for item in body.split(',')]

def parse_bounding_boxes(input_string):
    """Extract four-cornered boxes from their string representation.

    All space characters are dropped, then every occurrence of
    ``[[..],[..],[..],[..]]`` whose bracket contents consist only of digits,
    dots, commas and minus signs is read as one box.

    Args:
        input_string: Text containing zero or more serialized boxes.

    Returns:
        A list of boxes; each box is a list of four corners and each corner a
        list of floats. Empty when nothing matches.
    """
    compact = input_string.replace(' ', '')
    pattern = r'\[\[([\d.,-]+)\],\s?\[([\d.,-]+)\],\s?\[([\d.,-]+)\],\s?\[([\d.,-]+)\]\]'
    return [
        [[float(value) for value in corner.split(',')] for corner in found.groups()]
        for found in re.finditer(pattern, compact)
    ]


def add_bounding_boxes(image_data, bounding_boxes, canvas_size=512):
    """Draw bounding boxes on an image and return the result as a PIL image.

    Box coordinates are treated as positions on a square canvas of side
    ``canvas_size`` on which the image's longer side spans the full canvas and
    the shorter side is centred. Each point is shifted by the padding on its
    axis and multiplied by one uniform scale factor to land on the original
    pixel grid.
    NOTE(review): presumably the boxes were produced on a letterboxed
    512x512 copy of the image -- confirm against the OCR pipeline.

    Args:
        image_data: Image array as used by OpenCV (rows x columns x channels);
            the outlines are drawn directly onto this array.
        bounding_boxes: Iterable of boxes, each an iterable of points whose
            first two entries are the x and y canvas coordinates.
        canvas_size: Side length of the square canvas the boxes refer to.

    Returns:
        A ``PIL.Image`` built from the annotated array after a BGR-to-RGB
        channel swap.

    Raises:
        ValueError: If ``canvas_size`` is not positive.
    """
    if canvas_size <= 0:
        raise ValueError("canvas_size must be positive")

    height, width = image_data.shape[:2]
    long_side = max(height, width)

    # The aspect ratio is preserved, so one factor maps canvas units back to
    # original pixels on both axes.
    scale = long_side / canvas_size

    # Padding on each axis: half of whatever part of the canvas the fitted
    # image does not cover (exactly zero along the longer side).
    pad_x = (canvas_size - canvas_size * width / long_side) / 2
    pad_y = (canvas_size - canvas_size * height / long_side) / 2

    for box in bounding_boxes:
        corners = np.array(
            [
                (int((point[0] - pad_x) * scale), int((point[1] - pad_y) * scale))
                for point in box
            ],
            dtype=np.int32,
        )
        # Closed outline, green, 2 px thick.
        cv2.polylines(image_data, [corners], True, (0, 255, 0), 2)

    # Convert the image back to PIL format
    return PILImage.fromarray(cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB))

def read_image_from_url(url):
    """Download an image and decode it with OpenCV.

    Args:
        url: Address of the image to fetch.

    Returns:
        The decoded image array as produced by ``cv2.imdecode`` with
        ``cv2.IMREAD_COLOR``.

    Raises:
        ValueError: If the response body is empty or cannot be decoded as an
            image.
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    with urllib.request.urlopen(url) as response:
        image_data = response.read()

    if not image_data:
        raise ValueError(f"No data received from {url!r}")

    image_array = np.asarray(bytearray(image_data), dtype=np.uint8)
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    # imdecode signals failure by returning None instead of raising; fail here
    # with a clear message rather than later on an attribute access.
    if image is None:
        raise ValueError(f"Could not decode image data from {url!r}")
    return image

In [94]:
# Column holding, per row, the OCR experts recorded for that image; scanned
# below to pick the rows to show.
exp_col = df['OCR_EXPERTS']

In [95]:
# Positional indices (usable with df.iloc) of the rows whose OCR_EXPERTS entry
# contains 'trocr-large-handwritten'.
handwritten_idcs = [
    idx
    for idx, exps in enumerate(exp_col)
    if 'trocr-large-handwritten' in exps
]

In [96]:
def on_click(idx):
    """Render the sample at positional row ``idx`` into the ``output`` widget.

    Clears the widget, downloads the row's image, draws its OCR bounding boxes
    on it, displays the result, and then prints one line per (text, expert)
    pair, each padded to 40 characters. Pairs are formed with ``zip``, so
    extra entries in the longer of the two lists are not printed.

    Args:
        idx: Positional index into ``df`` (as used with ``df.iloc``).
    """
    with output:
        clear_output(wait=True)
        # Look the row up once instead of once per column.
        row = df.iloc[idx]
        image = read_image_from_url(row['URL'])
        boxes = parse_bounding_boxes(row['OCR_BBOXES'])
        display(add_bounding_boxes(image, boxes))
        ocr_texts = parse_list_string(row['OCR_TEXT'])
        ocr_experts = parse_list_string(row['OCR_EXPERTS'])
        # Distinct loop-variable name so the list being zipped is not shadowed.
        for ocr_text, ocr_expert in zip(ocr_texts, ocr_experts):
            print("{0: <40} {1: <40}".format(ocr_text, ocr_expert))
    
def url_to_memory_view(url, timeout=10):
    """Fetch ``url`` and return the response body as a ``memoryview``.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the kernel indefinitely.

    Returns:
        A ``memoryview`` over the raw bytes of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never mistaken for image data.
        requests.RequestException: For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # response.content is already a bytes object; no intermediate buffer needed.
    return memoryview(response.content)

def on_thumbnail_click(idx, button):
    """Button callback adapter: forward the clicked sample's index to ``on_click``.

    Args:
        idx: Positional row index of the sample to show.
        button: The widget that was clicked; accepted but not used here.
    """
    on_click(idx)

In [97]:
# Build one grid cell per selected sample: a 100x100 thumbnail with a button
# underneath that opens the annotated full-size view.
buttons = []
thumbnails = []
for idx in handwritten_idcs:
    button = widgets.Button(
        description='',
        icon='check',
        layout=widgets.Layout(width='100px', height='100px'),
    )
    button.style.button_color = 'transparent'
    # Bind idx as a default argument so every button keeps its own index.
    button.on_click(lambda btn, idx=idx: on_thumbnail_click(idx, btn))
    buttons.append(button)

    # The URL comes from the dataset, so escape it before placing it inside
    # an HTML attribute.
    thumbnail_url = html.escape(str(df.iloc[idx]["URL"]), quote=True)
    thumbnails.append(
        widgets.HTML(f'<img src="{thumbnail_url}" width="100" height="100"/>')
    )

thumbnails_grid = widgets.GridBox(
    [widgets.VBox([thumbnail, button]) for thumbnail, button in zip(thumbnails, buttons)],
    layout=widgets.Layout(
        grid_template_columns="repeat(auto-fill, minmax(100px, 1fr))",
        grid_gap="10px",
    ),
)

# Area that on_click() renders the selected sample into.
output = widgets.Output()

main_layout = widgets.VBox([thumbnails_grid, output])
display(main_layout)

VBox(children=(GridBox(children=(VBox(children=(HTML(value='<img src="https://c8.alamy.com/comp/CP9R1H/theory-…