# Notebook to run the model on unseen images

### Install necessary dependencies

In [1]:
# Install required packages
!pip install -q transformers datasets sentencepiece
!pip install -q pytorch-lightning wandb
!pip install -q donut-python

# !huggingface-cli login this shouldh be done from the terminal

## Resize the images
> Image 005294.jpg was wierd

I want to have the images in the correct size and flip them on the correct side

In [20]:
from PIL import Image, ImageOps
import shutil
import os

# Define the paths for the input and output directories
input_dir = "../donut_example/Immagini_Esposito"
output_dir = "img_resized/"
size = (1600,1200)

# Create the output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Loop through all the image files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith(".jpg"):
        # Open the image and resize it
        with Image.open(os.path.join(input_dir, filename)) as img:
            
            # Resize the image to a specific size
            img = img.resize(size)
            
            # Automatically rotate the image based on its EXIF orientation metadata
            img = ImageOps.exif_transpose(img)
            
            # Check if the image is in landscape orientation
            if img.width > img.height:
                print(filename)
                
                # Rotate the image 90 degrees clockwise
                img = img.rotate(-90, expand=True)
            
            # Save the cropped and resized image to the output directory
            img.save(os.path.join(output_dir, filename))

004452.jpg
004334.jpg
004320.jpg
004446.jpg
004491.jpg
004485.jpg
004678.jpg
004136.jpg
004888.jpg
004650.jpg
004644.jpg
004122.jpg
004877.jpg
004863.jpg
004687.jpg
004718.jpg
004730.jpg
004903.jpg
004917.jpg
004095.jpg
004268.jpg
004526.jpg
004240.jpg
004254.jpg
004532.jpg
004283.jpg
004297.jpg
096093.jpg
004296.jpg
004282.jpg
004255.jpg
004533.jpg
004527.jpg
004241.jpg
004269.jpg
003506.jpg
004094.jpg
003089.jpg
004916.jpg
004902.jpg
004731.jpg
004725.jpg
004719.jpg
004686.jpg
004692.jpg
004862.jpg
004876.jpg
004123.jpg
004137.jpg
004651.jpg
004679.jpg
004484.jpg
004490.jpg
004321.jpg
004447.jpg
004453.jpg
004335.jpg
004479.jpg
004445.jpg
004323.jpg
004337.jpg
004451.jpg
004486.jpg
004492.jpg
004109.jpg
004121.jpg
004647.jpg
004653.jpg
004135.jpg
004860.jpg
004874.jpg
004684.jpg
004690.jpg
004733.jpg
004727.jpg
004914.jpg
004096.jpg
004519.jpg
004531.jpg
004257.jpg
004243.jpg
004525.jpg
004294.jpg
004280.jpg
004281.jpg
004295.jpg
004242.jpg
004524.jpg
004530.jpg
004256.jpg
004518.jpg

In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Using the model that I think works the best and generalize which is epoch 9 of the last run (very similar to epoch 10)
processor = DonutProcessor.from_pretrained("Jac-Zac/thesis_test_donut",  revision="ba396d4b3d39a4eaf7c8d4919b384ebcf6f0360f")
model = VisionEncoderDecoderModel.from_pretrained("Jac-Zac/thesis_test_donut",  revision="ba396d4b3d39a4eaf7c8d4919b384ebcf6f0360f")

Downloading (…)f6f0360f/config.json:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

In [8]:
import re
import json
import torch
from tqdm.auto import tqdm
import numpy as np
import random
from PIL import Image

from donut import JSONParseEvaluator
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"

model.eval()
model.to(device)

output_list = []
accs = []

images_path = "img_resized"

# Loop through all the image files in the input directory
for filename in os.listdir(images_path):
    if filename.endswith(".jpg"):
        # Load the image
        image = Image.open(os.path.join(images_path, filename))
        # Prepare encoder inputs
        pixel_values = processor(image.convert("RGB"), return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)
        # prepare decoder inputs
        task_prompt = "<s_herbarium>"
        decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
        decoder_input_ids = decoder_input_ids.to(device)

        # autoregressively generate sequence
        outputs = model.generate(
                pixel_values,
                decoder_input_ids=decoder_input_ids,
                max_length=model.decoder.config.max_position_embeddings,
    #            early_stopping=True,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1,
                bad_words_ids=[[processor.tokenizer.unk_token_id]],
                return_dict_in_generate=True,
            )

        # turn into JSON
        seq = processor.batch_decode(outputs.sequences)[0]
        seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
        seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
        seq = processor.token2json(seq)

        output_list.append({"sample_id": idx, "filename": filename ,"prediction": seq})
    
# Save output to JSON file
output_file_path = "../output.json"  # Replace with your desired output file path
with open(output_file_path, "w") as f:
    json.dump(output_list, f)

Resolving data files:   0%|          | 0/1553 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/138 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/139 [00:00<?, ?it/s]

Found cached dataset imagefolder (/Users/jaczac/.cache/huggingface/datasets/imagefolder/img_resized-7f5590504a871c24/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/137 [00:00<?, ?it/s]