In [None]:
import ast
import hashlib
import io
import json
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_from_disk, load_dataset, concatenate_datasets, Dataset
from PIL import Image
from tqdm import tqdm

# Directory passed to `load_from_disk` below. This is an absolute path on a
# mounted share, so the notebook only runs where that mount exists.
src = "/mnt/AI_NAS/datalake/source/provider=huggingface/fatura2_invoices"

In [None]:
# Load the dataset stored at `src`. The conversion loop below iterates it and
# reads each row's 'doc_bytes' (encoded image) and 'target_data' (label string).
ds = load_from_disk(src)

In [None]:
# Modes Pillow's JPEG writer accepts; anything else (RGBA, P, LA, ...) makes
# `Image.save(..., format='JPEG')` raise, so those are converted to RGB first.
_JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _bbox_to_xyxy(points, img_height):
    """Reduce a list of box corner coordinates to ``[left, top, right, bottom]``.

    ``points`` is reshaped to ``(-1, 2)`` (x, y) pairs after truncation to
    int32. Every y is mirrored as ``img_height - y`` before the min / max
    corners are taken.
    NOTE(review): the mirroring presumably converts a bottom-left-origin
    annotation to image (top-left) coordinates -- confirm against the dataset.
    """
    pts = np.array(points).astype(np.int32).reshape(-1, 2)
    pts[:, 1] = img_height - pts[:, 1]
    left, top = pts.min(axis=0).tolist()
    right, bottom = pts.max(axis=0).tolist()
    return [left, top, right, bottom]


# Convert every dataset row into (a) a JPEG under images/ named by the SHA-256
# of its encoded bytes and (b) one record holding that file name, the image
# size and the per-field labels serialised as JSON.
records = []
os.makedirs("images", exist_ok=True)
for data in tqdm(ds):
    img = Image.open(io.BytesIO(data['doc_bytes']))
    if img.mode not in _JPEG_WRITABLE_MODES:
        img = img.convert("RGB")

    # SECURITY: this used to be eval(), which executes whatever code the
    # dataset string contains. literal_eval only accepts Python literals
    # (dict / list / str / numbers / None / bool).
    label_ = ast.literal_eval(data['target_data'])
    label_.pop("INVOICE_INFO", None)  # not exported; tolerate rows without it
    # Only the bbox of the first entry of the first TABLE row is kept.
    table = label_.pop('TABLE')[0][0]
    label_['TABLE'] = {'bbox': table['bbox']}

    label = {}
    for key, item in label_.items():
        entry = {}
        text = item.get('text', None)
        if text is not None:
            entry['text'] = text
        bbox = item.get('bbox', None)
        if bbox is not None:
            entry['bbox'] = _bbox_to_xyxy(bbox, img.height)
        label[key.lower()] = entry

    # Encode once: the same bytes are hashed for the file name and written to
    # disk, so the file content always matches its name.
    buf = io.BytesIO()
    img.save(buf, format='JPEG')
    byte_data = buf.getvalue()
    hash_val = hashlib.sha256(byte_data).hexdigest()
    img_path = hash_val + ".jpg"
    output_path = "images/" + img_path
    with open(output_path, "wb") as fh:
        fh.write(byte_data)

    records.append({
        "image_path": img_path,  # file name only, relative to images/
        "width": img.width,
        "height": img.height,
        "label": json.dumps(label, ensure_ascii=False),
    })

df = pd.DataFrame(records)

In [None]:
# Persist the record table (image_path, width, height, JSON label) next to the
# notebook; the pandas index is not written.
df.to_parquet("fatura2_invoices.parquet", index=False)

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
def display_sample(idx):
    """Show the image for row ``idx`` of ``df`` with its labelled boxes drawn.

    Parameters
    ----------
    idx : int
        Positional row index into ``df`` (the table built by the conversion
        cell, with 'image_path' and 'label' columns).

    Each label entry that has a 'bbox' (``[left, top, right, bottom]``) is
    drawn as a rectangle with the field name written just above it; entries
    without a bbox are skipped. The annotated image is rendered inline.
    """
    # `ds` is the raw Hugging Face dataset: it has no `.loc` and no
    # 'image_path' / 'label' columns. The converted rows live in `df`.
    row = df.iloc[idx]
    with Image.open("images/" + row['image_path']) as img:
        # Force 3 channels so the coloured overlays also work on grayscale files.
        img_np = np.array(img.convert("RGB"))

    # 'label' was written with json.dumps, so json.loads is its exact inverse
    # (eval would choke on null / true / false and can execute code).
    label_ = json.loads(row['label'])
    for key, item in label_.items():
        bbox = item.get('bbox', None)
        if bbox is None:
            continue
        x0, y0, x1, y1 = (int(v) for v in bbox)
        cv2.rectangle(img_np, (x0, y0), (x1, y1), (255, 0, 0), 2)
        cv2.putText(img_np, key,
                    (x0, y0 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 0, 255), 2)
    display(Image.fromarray(img_np))


# The slider range follows `df`, the table `display_sample` indexes.
slider = widgets.IntSlider(min=0, max=len(df) - 1, step=1, value=0)
widgets.interact(display_sample, idx=slider)