In [None]:
pip install transformers xmltodict accelerate

In [None]:
# Load model directly
# Quick check that the checkpoint can be fetched and instantiated through the
# generic Auto* classes.
# NOTE(review): within the visible notebook, `tokenizer` is not referenced again
# (later cells use `processor.tokenizer`), and `model` is rebound by the later
# cell that loads the checkpoint with VisionEncoderDecoderModel.
from transformers import AutoTokenizer, AutoModelForImageTextToText

tokenizer = AutoTokenizer.from_pretrained("mychen76/invoice-and-receipts_donut_v1")
model = AutoModelForImageTextToText.from_pretrained("mychen76/invoice-and-receipts_donut_v1")

In [None]:
# Report the library and interpreter versions this notebook is running against.
import torch
import transformers
import accelerate
import platform

print(f"torch {torch.__version__}")
print(f"transformers {transformers.__version__}")
print(f"accelerate {accelerate.__version__}")
print(f"python {platform.python_version()}")


In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch

model_id = "mychen76/invoice-and-receipts_donut_v1"
processor = DonutProcessor.from_pretrained(model_id)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights eagerly (no meta tensors) and then move the whole model to
# `device`. `device_map` is intentionally not passed here: combining it with
# low_cpu_mem_usage=False is contradictory, and transformers releases that
# validate the pair reject it with a ValueError.
model = VisionEncoderDecoderModel.from_pretrained(
    model_id,
    low_cpu_mem_usage=False,      # full, eager load
    torch_dtype=torch.float32,    # or float16 if VRAM is tight
)
model.to(device)

print(next(model.parameters()).device)    # cuda:0  (or cpu)

In [None]:
from PIL import Image
import json

def generateTextInImage(processor,model,input_image,task_prompt="<s_receipt>"):
    """Run the model's generation step on a single image.

    Args:
        processor: Object that is callable on the image (returning something
            with a ``pixel_values`` attribute) and that exposes ``tokenizer``.
        model: Model exposing ``to``, ``generate`` and
            ``decoder.config.max_position_embeddings``.
        input_image: Image handed to ``processor`` unchanged (the notebook
            passes a PIL image).
        task_prompt: Prompt text that is tokenized and used as the decoder's
            starting input ids.

    Returns:
        Whatever ``model.generate`` returns when called with
        ``return_dict_in_generate=True`` and ``output_scores=True``; callers in
        this notebook read its ``sequences`` attribute.
    """
    pixel_values = processor(input_image, return_tensors="pt").pixel_values
    print ("input pixel_values: ",pixel_values.shape)

    # Tokenize the prompt supplied by the caller. It must not be overwritten
    # here, otherwise the `task_prompt` argument would be silently ignored.
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

    # Inputs and model have to live on the same device before generating.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # `early_stopping` is not passed: it only affects beam search, and this
    # call decodes with a single beam (num_beams=1).
    outputs = model.generate(pixel_values.to(device),
                               decoder_input_ids=decoder_input_ids.to(device),
                               max_length=model.decoder.config.max_position_embeddings,
                               pad_token_id=processor.tokenizer.pad_token_id,
                               eos_token_id=processor.tokenizer.eos_token_id,
                               use_cache=True,
                               num_beams=1,
                               # forbid the unknown token in the generated output
                               bad_words_ids=[[processor.tokenizer.unk_token_id]],
                               return_dict_in_generate=True,
                               output_scores=True,)
    return outputs

def generateOutputXML(processor,model, input_image, task_start="<s_receipt>",task_end="</s_receipt>"):
    """Generate text for an image and return the cleaned tag sequence.

    The image is run through ``generateTextInImage`` with ``task_start`` as the
    prompt. The first decoded sequence is printed, then stripped of the
    tokenizer's EOS and PAD tokens and of its first ``<...>`` tag (the task
    start token). ``task_end`` is accepted but not used by this function.

    Returns:
        The cleaned sequence as a string with surrounding whitespace removed.
    """
    import re

    generation = generateTextInImage(processor, model, input_image, task_prompt=task_start)
    decoded = processor.batch_decode(generation.sequences)[0]
    print(decoded)

    # Drop the end-of-sequence and padding markers, in that order.
    tok = processor.tokenizer
    for special_token in (tok.eos_token, tok.pad_token):
        decoded = decoded.replace(special_token, "")

    # Remove only the first tag, i.e. the task start token.
    return re.sub(r"<.*?>", "", decoded, count=1).strip()

def convertOutputToJson(processor, xml):
    """Convert an already generated tag sequence into a JSON-like structure.

    Args:
        processor: Object exposing ``token2json``.
        xml: The tag sequence to convert, e.g. the string returned by
            ``generateOutputXML``.

    Returns:
        The value returned by ``processor.token2json(xml)``; it is also
        printed before being returned.
    """
    # Convert the sequence passed in by the caller. The sequence is not
    # regenerated here: this function has no model or image to generate from.
    result=processor.token2json(xml)
    print(":vampire:",result)
    return result

def generateOutputJson(processor,model, input_image, task_start="<s_receipt>",task_end="</s_receipt>"):
    """Generate the tag sequence for an image and convert it with ``token2json``.

    ``task_start`` and ``task_end`` are forwarded unchanged to
    ``generateOutputXML``. The converted result is printed and returned.
    """
    parsed = processor.token2json(
        generateOutputXML(processor, model, input_image, task_start=task_start, task_end=task_end)
    )
    print(":vampire:", parsed)
    return parsed

import os

IMAGE_PATH = "./cropped_receipt/cropped_image.png"
OUTPUT_PATH = "./outputs/hf_donut.json"

# Read the image inside a context manager so the file handle is released, and
# normalize it to RGB: a PNG may be stored with an alpha channel, a palette or
# as grayscale. NOTE(review): assumes the processor expects 3-channel input.
with Image.open(IMAGE_PATH) as source_image:
    input_image = source_image.convert("RGB")

## generate json
invoice1_json=generateOutputJson(processor,model,input_image)
print(invoice1_json)

# Create the output directory first so that writing does not fail on a fresh
# checkout where ./outputs does not exist yet.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
with open(OUTPUT_PATH, 'w', encoding="utf-8") as file:
    json.dump(invoice1_json, file, indent=4)


# ## generate xml
# xml=generateOutputXML(processor,model,input_image)
# print(xml)