In [233]:
import ast
import hashlib
import io
import json
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_from_disk, load_dataset, concatenate_datasets
from PIL import Image
from tqdm import tqdm

# Location of the on-disk HuggingFace dataset. The NAS path is only the
# default: set the FATURA2_SRC environment variable to point the notebook at
# another copy without editing this cell.
src = os.environ.get(
    "FATURA2_SRC",
    "/mnt/AI_NAS/datalake/source/provider=huggingface/fatura2_invoices_original_strat",
)

In [234]:
# Open the dataset previously saved to disk at `src`.
ds = load_from_disk(dataset_path=src)

In [242]:
# Convert every dataset sample into
#   (a) a JPEG under images/, named after the SHA-256 of its encoded bytes, and
#   (b) one metadata record: file name, image size, and a JSON label holding
#       each field's text and/or axis-aligned box as [x_min, y_min, x_max, y_max].
records = []
os.makedirs("images", exist_ok=True)
for data in tqdm(ds):
    img = Image.open(io.BytesIO(data['doc_bytes']))

    # `target_data` is a Python-literal string. literal_eval parses it without
    # executing arbitrary code, which eval() on dataset content would allow.
    label_ = ast.literal_eval(data['target_data'])
    # INVOICE_INFO is dropped from the exported label; tolerate its absence.
    label_.pop("INVOICE_INFO", None)
    # TABLE is a nested list; only the bbox of its first entry is kept so it
    # can be handled like every other field below.
    table = label_.pop('TABLE')[0][0]
    label_['TABLE'] = {
        'bbox': table['bbox'],
    }

    label = {}
    for key, item in label_.items():
        key = key.lower()
        bbox = item.get('bbox', None)
        text = item.get('text', None)
        if bbox is not None:
            # Corner points -> integer (x, y) pairs. Note the cast truncates
            # before the flip, exactly as the original conversion did.
            corners = np.array(bbox).astype(np.int32).reshape(-1, 2)
            # Flip the y axis (height - y); presumably the source annotations
            # use a bottom-left origin -- TODO confirm against the dataset card.
            corners[:, 1] = img.height - corners[:, 1]
            x_min, y_min = np.min(corners, axis=0).tolist()
            x_max, y_max = np.max(corners, axis=0).tolist()
            bbox = [x_min, y_min, x_max, y_max]
        label[key] = {}
        if text is not None:
            label[key]['text'] = text
        if bbox is not None:
            label[key]['bbox'] = bbox

    # Encode once, then hash and store those exact bytes so the file name is
    # guaranteed to be the digest of the file's content.
    buf = io.BytesIO()
    img.save(buf, format='JPEG')
    byte_data = buf.getvalue()
    hash_val = hashlib.sha256(byte_data).hexdigest()
    img_path = hash_val + ".jpg"
    output_path = "images/" + img_path
    with open(output_path, "wb") as out_file:
        out_file.write(byte_data)

    records.append({
        "image_path": img_path,
        "width": img.width,
        "height": img.height,
        "label": json.dumps(label, ensure_ascii=False),
    })

df = pd.DataFrame(records)

  0%|          | 0/5250 [00:00<?, ?it/s]

100%|██████████| 5250/5250 [01:59<00:00, 43.85it/s]


In [245]:
# Persist the metadata table next to the notebook; the positional index is
# not written because it carries no information.
df.to_parquet(
    "fatura2_invoices.parquet",
    index=False,
)

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [228]:
def display_sample(idx):
    """Render dataset sample ``idx`` with its labelled boxes drawn on top.

    Every field that carries a ``bbox`` gets a red rectangle, its key name at
    the box's top-left corner, and its text stacked line by line above the key.
    Fields without a ``bbox`` are not drawn.

    Args:
        idx: Integer position of the sample in the global ``ds``.

    Returns:
        None. The annotated image is shown through ``display``.
    """
    data = ds[idx]
    img = Image.open(io.BytesIO(data['doc_bytes']))
    # Normalise to 3-channel RGB so the colour tuples below mean the same
    # thing for greyscale, palette and RGBA inputs.
    img_np = np.array(img.convert("RGB"))

    # literal_eval parses the Python-literal annotation string without
    # executing arbitrary code, which eval() on dataset content would allow.
    label_ = ast.literal_eval(data['target_data'])
    label_.pop("INVOICE_INFO", None)
    # Keep only the bbox of the first TABLE entry so TABLE looks like any
    # other field.
    table = label_.pop('TABLE')[0][0]
    label_['TABLE'] = {
        'bbox': table['bbox'],
    }

    font = cv2.FONT_HERSHEY_SIMPLEX
    green = (0, 255, 0)
    line_height = 15  # vertical pixel spacing between stacked text lines

    for key, item in label_.items():
        bbox = item.get('bbox', None)
        if bbox is None:
            continue
        text = item.get('text') or ""

        corners = np.array(bbox).astype(np.int32).reshape(-1, 2)
        # Flip the y axis (height - y) so the box is in image row coordinates.
        corners[:, 1] = img.height - corners[:, 1]
        tl = np.min(corners, axis=0).tolist()
        br = np.max(corners, axis=0).tolist()

        cv2.rectangle(img_np, tuple(tl), tuple(br), (255, 0, 0), 2)
        cv2.putText(img_np, key, tuple(tl), font, 0.5, green, 2)

        # cv2.putText does not break on '\n', so each line is drawn on its
        # own. The last line sits 10 px above the box top (where single-line
        # text was drawn before) and earlier lines stack upward from there.
        lines = text.splitlines()
        for i, line in enumerate(lines):
            y = tl[1] - 10 - line_height * (len(lines) - 1 - i)
            cv2.putText(img_np, line, (tl[0], y), font, 0.5, green, 2)

    display(Image.fromarray(img_np))
# Interactive browser: an integer slider over every valid dataset index,
# re-rendering the annotated sample whenever the slider moves.
slider = widgets.IntSlider(
    value=0,
    min=0,
    max=len(ds) - 1,
    step=1,
)
widgets.interact(display_sample, idx=slider)

interactive(children=(IntSlider(value=0, description='idx', max=5249), Output()), _dom_classes=('widget-intera…

<function __main__.display_sample(idx)>

In [154]:
# Inspect the `label` dict currently bound in the kernel.
# NOTE(review): the recorded output below still has upper-case keys, the
# nested TABLE list and an INVOICE_INFO entry, while the conversion loop in
# this notebook lower-cases keys and drops INVOICE_INFO before filling
# `label`. The output therefore looks stale (execution count 154 predates the
# loop's 242) -- re-run after the conversion cell to refresh it.
label

{'TABLE': [[{'bbox': [[31.0, 443.8898], [581.0, 389.8898]]}]],
 'INVOICE_INFO': [],
 'BILL_TO': {'bbox': [[10.0, 592.4127999999997],
   [159.78700000000006, 682.6128]],
  'text': 'BILL_TO:\nMark Cook\n3118 Elizabeth Flat Suite 356\nHeatherside, NJ 61519 US\nTel:+(171)767-0447\nEmail:hbaker@example.net\nSite:http://massey.com/'},
 'DATE': {'bbox': [[396.0, 610.4058], [497.37600000000003, 622.4058]],
  'text': 'Date: 28-Dec-2000'},
 'DISCOUNT': {'bbox': [[373.0, 313.4058], [530.356, 325.4058]],
  'text': 'DISCOUNT(1.32%): (-)  8.73'},
 'DUE_DATE': {'bbox': [[396.0, 645.4058], [528.0360000000001, 657.4058]],
  'text': 'Due Date : 04-Feb-2012'},
 'NOTE': {'bbox': [[259.0, 201.0058], [507.03999999999996, 227.4058]],
  'text': 'Note: All payments to be made in cash.\nContact us for queries on these quotations.'},
 'SELLER_ADDRESS': {'bbox': [[157.0, 772.0058], [376.408, 798.4058]],
  'text': 'Address:2180 Michael Ridges Apt. 385\nPort Lindsey, MP 98258 US'},
 'SELLER_NAME': {'bbox': [[157, 8