In [None]:
# Install required packages
!pip install -q transformers datasets sentencepiece
!pip install -q pytorch-lightning wandb
!pip install -q donut-python

# !huggingface-cli login  # this should be done from the terminal

In [25]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Hub repository and exact commit to load. Naming them once guarantees that the
# processor and the model weights are taken from the same snapshot.
MODEL_REPO = "Jac-Zac/thesis_test_donut"
MODEL_REVISION = "c13aef46a13c2646b315bf37bb6bfa38033a48db"

processor = DonutProcessor.from_pretrained(MODEL_REPO, revision=MODEL_REVISION)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_REPO, revision=MODEL_REVISION)

Downloading pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]

In [26]:
import re
import json
import torch
from tqdm.auto import tqdm
import numpy as np
import random
from PIL import Image

from donut import JSONParseEvaluator
from datasets import load_dataset

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Put the model in evaluation mode and move it to the selected device.
model.eval()
model.to(device)

output_list = []  # parsed prediction for every test sample, in dataset order
accs = []  # per-sample accuracy, aligned with output_list

# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
image_path = "/Users/jaczac/Github/Thesis/donut_example/small_copy/img_resized"

dataset = load_dataset(image_path, split="test")

# Loop-invariant setup, done once instead of once per sample: the decoder
# prompt is identical for every image and a single evaluator is reused.
task_prompt = "<s_herbarium>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
evaluator = JSONParseEvaluator()

for sample in tqdm(dataset, total=len(dataset)):
    # prepare encoder inputs
    pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # autoregressively generate sequence
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # turn into JSON: drop EOS/PAD tokens, then the leading task start token
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
    seq = processor.token2json(seq)

    # Parse the ground truth once, treating a single-space value (" ") as
    # empty ("") so that such fields still count as correct predictions.
    ground_truth = json.loads(sample["ground_truth"].replace('" "', '""'))

    score = evaluator.cal_acc(seq, ground_truth)

    accs.append(score)
    output_list.append(seq)

scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")

Resolving data files:   0%|          | 0/1553 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/138 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/139 [00:00<?, ?it/s]

Found cached dataset imagefolder (/Users/jaczac/.cache/huggingface/datasets/imagefolder/img_resized-7f5590504a871c24/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/138 [00:00<?, ?it/s]

{'accuracies': [0.7297297297297297, 0.6060606060606061, 0.23728813559322037, 0.6965517241379311, 0.7857142857142857, 0.8253968253968254, 0.05208333333333337, 1.0, 0.8, 0.696969696969697, 0.8717948717948718, 0.7, 0.5686274509803921, 0.6019417475728155, 0.44285714285714284, 0.3538461538461538, 0.608, 0.375, 0.5593220338983051, 0.6551724137931034, 0.23076923076923073, 0.589041095890411, 0, 0.8961038961038961, 0.14432989690721654, 0.3893805309734514, 0.6194690265486726, 0.7604166666666666, 0.908256880733945, 0.851063829787234, 0.8613861386138614, 1.0, 0.8571428571428572, 0, 0.5421686746987953, 0.3362068965517241, 0.5941176470588235, 0.33684210526315794, 1.0, 0.8272727272727273, 0.9125, 0.8818181818181818, 0.53125, 0.9743589743589743, 0.8048780487804879, 1.0, 1.0, 0.9159663865546218, 0.7571428571428571, 0.8676470588235294, 0.8117647058823529, 0.8674698795180723, 0.9745222929936306, 0.9739130434782609, 0.868421052631579, 0.8840579710144928, 1.0, 0.6634615384615384, 0.7972972972972973, 0.8089

In [27]:
# Average of the per-sample accuracies collected by the evaluation loop.
mean_accuracy = np.mean(accs)
print("Mean accuracy:", mean_accuracy)

Mean accuracy: 0.7103449067622407


In [None]:
# Median of the per-sample accuracies; the label names the statistic that is
# actually computed.
print("Median accuracy:", np.median(accs))

In [None]:
# Indices of the five lowest-scoring samples, lowest accuracy first.
worst_idxs = np.argsort(accs)[:5].tolist()

# The decoder prompt does not depend on the image, so it is tokenized once.
task_prompt = "<s_herbarium>"
prompt_encoding = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")
decoder_input_ids = prompt_encoding.input_ids.to(device)

# Generation settings shared by every sample inspected below.
generation_kwargs = dict(
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    early_stopping=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)

for idx in worst_idxs:
    sample = dataset[idx]

    # Encoder input: the image converted to RGB, preprocessed, on the model's device.
    pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values.to(device)

    # Autoregressive decoding starting from the task prompt.
    outputs = model.generate(pixel_values, **generation_kwargs)

    # Decode the first sequence, strip EOS/PAD tokens and the leading task
    # start token, then convert the remaining markup to JSON.
    decoded = processor.batch_decode(outputs.sequences)[0]
    for special_token in (processor.tokenizer.eos_token, processor.tokenizer.pad_token):
        decoded = decoded.replace(special_token, "")
    decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()
    seq = processor.token2json(decoded)

    print(f"Ground Truth: {sample['ground_truth']}\n")
    print(f"Prediction: {seq}\n")
    print(f"Score: {accs[idx]}\n")
    display(sample["image"])