In [None]:
import json
import hashlib
from datasets import load_from_disk, load_dataset, concatenate_datasets
import io
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
from PIL import Image
import numpy as np
import cv2
import os
import base64
# Directory handed to `load_from_disk` below to load the source dataset.
# NOTE(review): absolute NAS mount path — only resolves on machines with this mount.
src = "/mnt/AI_NAS/datalake/source/provider=huggingface/synth_invoices_en"

In [None]:
# Load the dataset stored at `src`; later cells iterate it and index it by position.
ds = load_from_disk(src)

In [None]:
# NER tag id -> region class name. Ids missing from this mapping are treated
# as "text" by the lookups that use `class_dict.get(tag, "text")`.
class_dict = {10: "table", 11: "logo"}

In [None]:
# Export every dataset sample:
#   * the image is re-encoded as JPEG and written to
#     images/<first hex char of sha256>/<sha256>.jpg (content-addressed),
#   * one metadata row (relative image path, size, JSON-encoded region labels)
#     is collected into `records` and finally into the DataFrame `df`.
records = []
image_root = "images"
os.makedirs(image_root, exist_ok=True)
for data in tqdm(ds):
    img = Image.open(io.BytesIO(base64.b64decode(data["image"])))
    # JPEG cannot store alpha or palette data; convert such images up front
    # instead of failing inside `img.save`.
    if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")

    annots = data["annotations"]
    bboxes = annots['bboxes']
    ner_tags = annots['ner_tags']
    words = annots['words']

    # Group annotation indices that share the exact same bounding box; dicts
    # keep insertion order, so groups follow first appearance in `bboxes`.
    unique_bboxes = {}
    for idx, bbox in enumerate(bboxes):
        unique_bboxes.setdefault(tuple(bbox), []).append(idx)

    label = []
    for bbox_tuple, indices in unique_bboxes.items():
        x1, y1, x2, y2 = bbox_tuple
        tags = [class_dict.get(ner_tags[i], "text") for i in indices]
        # Only "text" entries contribute words; empty pieces are dropped so a
        # non-text region gets "" rather than a run of separator spaces.
        pieces = [f"{words[i]}" for i, tag in zip(indices, tags) if tag == "text"]
        text = " ".join(piece for piece in pieces if piece)
        # The first tag seen for the box decides its class. Going through a
        # set made the pick depend on per-process string hash ordering.
        class_ = tags[0]
        label.append({
            "text": text,
            "class": class_,
            "bbox": [x1, y1, x2, y2],
        })

    # Encode once, hash the encoded bytes, and write exactly those bytes so
    # the file content always matches the hash in its name.
    buf = io.BytesIO()
    img.save(buf, format='JPEG')
    byte_data = buf.getvalue()
    hash_val = hashlib.sha256(byte_data).hexdigest()
    sha256_dir = hash_val[:1]
    # Path relative to `image_root`; this is the value stored in the table.
    img_path = os.path.join(sha256_dir, hash_val + ".jpg")
    output_path = os.path.join(image_root, img_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "wb") as fh:
        fh.write(byte_data)

    records.append({
        "image_path": img_path,
        "width": img.width,
        "height": img.height,
        "label": json.dumps(label, ensure_ascii=False),
    })

# Explicit columns keep the schema stable even if the dataset is empty.
df = pd.DataFrame(records, columns=["image_path", "width", "height", "label"])
    

In [None]:
# Persist the per-image metadata table (built from `records`) without the index column.
df.to_parquet("synth_invoices_en.parquet", index=False)

In [None]:
# Integer slider covering every valid position in `ds` (0 .. len(ds) - 1).
slider = widgets.IntSlider(min=0, max=len(ds)-1, step=1, value=0)

In [None]:
def display_sample(idx):
    """Show dataset sample ``idx`` with its grouped boxes and text drawn on top.

    Annotations that share an identical bounding box are merged into one
    region. Words whose tag maps to a class in ``class_dict`` are left out of
    the caption, so such regions are drawn with an empty caption.

    Reads the module-level ``ds`` and ``class_dict``. Prints the number of
    distinct boxes and displays the annotated image; returns ``None``.

    Args:
        idx: Position of the sample in ``ds``.
    """
    data = ds[idx]
    img = Image.open(io.BytesIO(base64.b64decode(data["image"])))
    # Draw on a 3-channel copy so the green overlay is valid for any source mode.
    img_np = np.array(img.convert("RGB"))
    annots = data["annotations"]

    bboxes = annots['bboxes']
    ner_tags = annots['ner_tags']
    words = annots['words']

    # Group annotation indices by identical bounding box. The loop variable is
    # deliberately not called `idx`, which would shadow the parameter.
    unique_bboxes = {}
    for ann_idx, bbox in enumerate(bboxes):
        unique_bboxes.setdefault(tuple(bbox), []).append(ann_idx)
    print(f"Unique BBoxes: {len(unique_bboxes)}")

    for bbox_tuple, indices in unique_bboxes.items():
        # OpenCV drawing functions need integer pixel coordinates.
        x1, y1, x2, y2 = (int(round(v)) for v in bbox_tuple)
        pieces = [
            f"{words[i]}"
            for i in indices
            if class_dict.get(ner_tags[i], "text") == "text"
        ]
        # Drop empty pieces so the caption has no stray separator spaces.
        text = " ".join(piece for piece in pieces if piece)
        cv2.rectangle(img_np, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(img_np, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    display(Image.fromarray(img_np))

In [None]:
# Re-run `display_sample` with the slider's current value whenever it changes.
widgets.interact(display_sample, idx=slider)