In [None]:
# Install required packages
!pip install -q transformers datasets sentencepiece
!pip install -q pytorch-lightning wandb
!pip install -q donut-python

# Note: `huggingface-cli login` should be run from the terminal, not from this notebook.

In [57]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Processor and model are loaded from the same Hub repository at the same
# pinned revision.
hub_repo = "Jac-Zac/thesis_test_donut"
hub_revision = '8c5467cb66685e801ec6ff8de7e7fdd247274ed0'

processor = DonutProcessor.from_pretrained(hub_repo, revision=hub_revision)
model = VisionEncoderDecoderModel.from_pretrained(hub_repo, revision=hub_revision)

Downloading pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]

In [58]:
import re
import json
import torch
from tqdm.auto import tqdm
import numpy as np
import random
from PIL import Image

from donut import JSONParseEvaluator
from datasets import load_dataset

# Evaluate the model on the test split: for every image, generate a
# prediction, convert it to JSON and score it against the ground truth.
device = "cuda" if torch.cuda.is_available() else "cpu"

model.eval()
model.to(device)

output_list = []  # parsed prediction per sample, in dataset order
accs = []         # score per sample, index-aligned with output_list

# NOTE: absolute, machine-specific path -- adjust to where the data lives locally.
image_path = "/Users/jaczac/Github/Thesis/donut_example/copy/img_resized"

dataset = load_dataset(image_path, split="test")

# The decoder prompt and the evaluator do not depend on the sample,
# so they are built once instead of on every iteration.
task_prompt = "<s_herbarium>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)

evaluator = JSONParseEvaluator()

for sample in tqdm(dataset, total=len(dataset)):
    # prepare encoder inputs
    pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # autoregressively generate sequence
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
    seq = processor.token2json(seq)

    # Parse the ground truth once, replacing " " with "" first, since a
    # single-space value should still count as a correct (empty) prediction.
    ground_truth = json.loads(sample["ground_truth"].replace('" "', '""'))

    score = evaluator.cal_acc(seq, ground_truth)

    accs.append(score)
    output_list.append(seq)

scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")

Resolving data files:   0%|          | 0/1553 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/138 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/139 [00:00<?, ?it/s]

Found cached dataset imagefolder (/Users/jaczac/.cache/huggingface/datasets/imagefolder/img_resized-ab7fa470c2235037/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/138 [00:00<?, ?it/s]

{'accuracies': [0.75, 0.6616161616161615, 0.30508474576271183, 0.7517241379310344, 0.8714285714285714, 0.7698412698412699, 0.7395833333333333, 1.0, 0.8105263157894737, 0.6515151515151515, 0.7564102564102564, 0.7133333333333334, 0.607843137254902, 0.22330097087378642, 0, 0.4, 0.688, 0.9166666666666666, 0.5932203389830508, 0.5977011494252873, 0.7564102564102564, 0.6712328767123288, 0.6056338028169015, 0.8961038961038961, 0.4742268041237113, 0.9380530973451328, 0.8407079646017699, 0.16666666666666663, 0.8256880733944953, 0.9893617021276596, 0.8118811881188119, 1.0, 0.9010989010989011, 0.07594936708860756, 0.6867469879518072, 0.6896551724137931, 0.5705882352941176, 0.6210526315789473, 1.0, 0.7636363636363637, 0.8875, 0.8727272727272728, 0.84375, 0.9230769230769231, 0.7560975609756098, 0.9894736842105263, 1.0, 0.8907563025210083, 0.6, 0.8823529411764706, 0.7529411764705882, 0.7710843373493976, 0.9617834394904459, 1.0, 0.868421052631579, 0.9855072463768116, 0.96, 0.6634615384615384, 0.864864

In [74]:
# Arithmetic mean of the per-sample scores collected in `accs`.
mean_accuracy = np.mean(accs)
print("Mean accuracy:", mean_accuracy)

Mean accuracy: 0.7638958523977475


In [75]:
# Median of the per-sample scores; the label now matches the statistic
# being printed (it previously said "Mean").
print("Median accuracy:", np.median(accs))

Mean accuracy: 0.830369809893124


In [83]:
# Sort the scores in ascending order, drop the first ten (the lowest ones)
# and average what is left.
accs_ascending = np.sort(accs)
accs_trimmed = accs_ascending[10:]
mean_without_worst = np.mean(accs_trimmed)
print("Mean accuracy (excluding worst 10):", mean_without_worst)

Mean accuracy (excluding worst 10): 0.8158658554664913


In [None]:
# Inspect the lowest-scoring test samples: print ground truth, prediction
# and score, then display the image.
n_worst = 10  # number of lowest-scoring samples to show

# indices of the n_worst lowest scores, worst first
worst_idxs = np.argsort(accs)[:n_worst].tolist()

for idx in worst_idxs:
    sample = dataset[idx]

    # `output_list` is filled by the evaluation loop in the same order as
    # `accs`, so the prediction that produced accs[idx] is already available.
    # Reusing it avoids re-running generation and guarantees the text shown
    # is exactly what was scored.
    seq = output_list[idx]

    print(f"Ground Truth: {sample['ground_truth']}\n")
    print(f"Prediction: {seq}\n")
    print(f"Score: {accs[idx]}\n")
    display(sample["image"])