In [None]:
import ast
import hashlib
import io
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_from_disk, load_dataset, concatenate_datasets
from PIL import Image
from tqdm import tqdm

# Location of the source dataset on the NAS; a second dataset is read from
# the "add" subdirectory directly beneath it.
src = "/mnt/AI_NAS/datalake/source/provider=huggingface/invoice_kie"
src_add = f"{src}/add"

In [None]:
# Load both on-disk datasets: the main one and the one stored under "add".
ds = load_from_disk(dataset_path=src)
ds_add = load_from_disk(dataset_path=src_add)

In [None]:
# Convert every row of `ds` into an image file plus one metadata record.
# Each image is downscaled so neither side exceeds MAX_SIZE, JPEG-encoded
# once, and written to images/<sha256 of the encoded bytes>.jpg. The matching
# record stores the file name, the final dimensions and the label as JSON.
records = []
MAX_SIZE = 2048
os.makedirs("images", exist_ok=True)
for data in tqdm(ds, desc="Processing dataset"):
    img = data['file']
    # NOTE(review): this used to be eval(), which executes whatever expression
    # the dataset row contains. literal_eval only accepts Python literals
    # (dict/list/str/numbers/...), which is presumably all a label holds.
    # Confirm that no row relies on non-literal syntax.
    label = ast.literal_eval(data['data'])

    # Shrink (never enlarge) while keeping the aspect ratio. Clamp to 1 px so
    # an extreme aspect ratio cannot truncate a side to 0, which resize rejects.
    scale = min(MAX_SIZE / img.width, MAX_SIZE / img.height)
    if scale < 1:
        new_size = (max(1, int(img.width * scale)), max(1, int(img.height * scale)))
        img = img.resize(new_size, resample=Image.Resampling.LANCZOS)

    # Pillow's JPEG writer only handles these modes; anything else (RGBA, LA,
    # P, ...) would raise on save. Converting to RGB discards any alpha channel.
    if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")

    # Encode once in memory so the hash and the file on disk are guaranteed to
    # describe the very same bytes (and the image is not compressed twice).
    buf = io.BytesIO()
    img.save(buf, format='JPEG')
    byte_data = buf.getvalue()
    hash_val = hashlib.sha256(byte_data).hexdigest()
    img_path = hash_val + ".jpg"
    output_path = "images/" + img_path
    with open(output_path, "wb") as out_file:
        out_file.write(byte_data)

    records.append({
        "image_path": img_path,
        "width": img.width,
        "height": img.height,
        "label": json.dumps(label, ensure_ascii=False),
    })

df = pd.DataFrame(records)

In [None]:
# Convert every row of `ds_add` into an image file plus one metadata record.
# The row's 'json' annotation is remapped to an {'items': [...], 'total': ...}
# label. Images are downscaled so neither side exceeds MAX_SIZE, JPEG-encoded
# once, and written to images/<sha256 of the encoded bytes>.jpg.
records = []
MAX_SIZE = 2048
# Loop-invariant: the output directory only needs to be created once.
os.makedirs("images", exist_ok=True)
for data in tqdm(ds_add, desc="Processing dataset"):
    img = data['image']
    label_ = data['json']

    # Keep only amount/date/ref of each transaction and expose the closing
    # balance as the label's total.
    label = {
        'items': [
            {
                'amount': item['Amount'],
                'date': item['Date'],
                'ref': item['Ref'],
            }
            for item in label_['Transactions']
        ],
        'total': label_['Closing Balance']
    }

    # Shrink (never enlarge) while keeping the aspect ratio. Clamp to 1 px so
    # an extreme aspect ratio cannot truncate a side to 0, which resize rejects.
    scale = min(MAX_SIZE / img.width, MAX_SIZE / img.height)
    if scale < 1:
        new_size = (max(1, int(img.width * scale)), max(1, int(img.height * scale)))
        img = img.resize(new_size, resample=Image.Resampling.LANCZOS)

    # Pillow's JPEG writer only handles these modes; anything else (RGBA, LA,
    # P, ...) would raise on save. Converting to RGB discards any alpha channel.
    if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")

    # Encode once in memory so the hash and the file on disk are guaranteed to
    # describe the very same bytes (and the image is not compressed twice).
    buf = io.BytesIO()
    img.save(buf, format='JPEG')
    byte_data = buf.getvalue()
    hash_val = hashlib.sha256(byte_data).hexdigest()
    img_path = hash_val + ".jpg"
    output_path = "images/" + img_path
    with open(output_path, "wb") as out_file:
        out_file.write(byte_data)

    records.append({
        "image_path": img_path,
        "width": img.width,
        "height": img.height,
        "label": json.dumps(label, ensure_ascii=False),
    })

df_add = pd.DataFrame(records)

In [None]:
# Show the parquet file already present in the catalog (displayed as the cell output).
pd.read_parquet(
    path="/mnt/AI_NAS/datalake/catalog/provider=huggingface/dataset=invoice_kie/task=kie/variant=kie_struct/lang=en/src=real/data.parquet"
)

In [None]:
# Stack the records of both datasets and write them to a single parquet file,
# leaving the DataFrame index out of the output.
(
    pd.concat(objs=[df, df_add])
    .to_parquet("invoice_kie.parquet", index=False)
)