In [None]:
# !pip install -q transformers datasets sentencepiece
#!pip install -q pytorch-lightning wandb

In [None]:
import os

# Import dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_path = '/content/drive/MyDrive/Donut/synthtiger_files/outputs/SynthDoG_he'
#data_path = '/content/drive/MyDrive/Donut/SynthDoG_he'
os.listdir(data_path)

['validation', 'test', 'train']

# Load model and processor

In [None]:
from transformers import VisionEncoderDecoderConfig

# Notebook-wide settings: decoder token budget and fine-tuning image resolution.
max_length = 768
image_size = [1280, 960]  # (height, width)

# Load the pre-trained Donut configuration, then override the values that differ here.
config = VisionEncoderDecoderConfig.from_pretrained("naver-clova-ix/donut-base")
# max_length of the decoder (used for generation)
config.decoder.max_length = max_length
# image_size of the encoder; during pre-training, a larger image size was used
config.encoder.image_size = image_size
# TODO we should actually update max_position_embeddings and interpolate the pre-trained ones:
# https://github.com/clovaai/donut/blob/0acc65a85d140852b8d9928565f0f6b2d98dc088/donut/model.py#L602

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel, AutoTokenizer

# Pre-trained Donut processor and model; the model uses the overridden `config` defined earlier.
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base", config=config)

# Vocabulary size of the tokenizer that ships with donut-base (printed for comparison).
print(len(processor.tokenizer))

# Swap in the Hebrew tokenizer and register "<s>" as its beginning-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-he')
tokenizer.add_special_tokens({'bos_token' : '<s>'})
processor.tokenizer = tokenizer

# Resize the decoder's token embeddings to the new vocabulary size and print it.
model.decoder.resize_token_embeddings(len(tokenizer))
print(len(tokenizer))

57525




65840


In [None]:
model.decoder.model.decoder

MBartDecoder(
  (embed_tokens): Embedding(65840, 1024)
  (embed_positions): MBartLearnedPositionalEmbedding(1538, 1024)
  (layers): ModuleList(
    (0-3): 4 x MBartDecoderLayer(
      (self_attn): MBartAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): MBartAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (encoder_attn_

# Create PyTorch dataset
Here we create a regular PyTorch dataset.<br>

The model doesn't directly take the (image, JSON) pairs as input and labels. Rather, we create pixel_values and labels. Both are PyTorch tensors. The pixel_values are the input images (resized, padded and normalized), and the labels are the input_ids of the target sequence (which is a flattened version of the JSON), with padding tokens replaced by -100 (to make sure these are ignored by the loss function). Both are created using DonutProcessor (which internally combines an image processor, for the image modality, and a tokenizer, for the text modality).

Note that, for JSON targets, one also adds tokens to the vocabulary of the decoder (and corresponding tokenizer) for all keys of the dictionaries in the dataset, like "<s_menu>". This makes sure the model learns an embedding vector for them. Without doing this, some keys might get split up into multiple subword tokens, in which case the model just learns an embedding for the subword tokens, rather than a direct embedding for these keys. In this notebook the targets are plain text sequences rather than JSON, so the only token added is the `<s>` start-of-sequence token.

In [None]:
import json
import random
from typing import Any, List, Tuple
import torch
from torch.utils.data import Dataset
from PIL import Image

added_tokens = []

class DonutDataset(Dataset):
    """
    PyTorch Dataset for Donut text-recognition fine-tuning, read from a local folder.

    The folder must contain a ``metadata.jsonl`` file. Each non-empty line is a JSON object with
    ``file_name`` (image path relative to the folder) and ``ground_truth`` (a JSON string whose
    ``gt_parse.text_sequence`` entry holds the target text).
    Each item is converted into pixel_values (preprocessed image) and labels (input_ids of the tokenized text).

    The notebook-level ``processor`` (image processor + tokenizer) is used in ``__getitem__``.

    Args:
        dataset_name_or_path: path of the directory containing the image files and metadata.jsonl
        max_length: the max number of tokens for the target sequences (longer targets are truncated)
        split: name of the split; random padding of the image is only enabled for "train"
        ignore_id: ignore_index for torch.nn.CrossEntropyLoss, written over padding positions
        task_start_token: stored on the instance; not otherwise used by this class
        prompt_end_token: stored on the instance (falls back to task_start_token); not otherwise used
        sort_json_key: stored on the instance; not otherwise used by this class
    """

    def __init__(
        self,
        dataset_name_or_path: str,
        max_length: int,
        split: str = "train",
        ignore_id: int = -100,
        task_start_token: str = "",
        prompt_end_token: str = None,
        sort_json_key: bool = True,
    ):
        super().__init__()

        self.max_length = max_length
        self.split = split
        self.ignore_id = ignore_id
        self.task_start_token = task_start_token
        self.prompt_end_token = prompt_end_token if prompt_end_token else task_start_token
        self.sort_json_key = sort_json_key

        self.dataset = self.load_dataset(dataset_name_or_path)
        self.dataset_length = len(self.dataset)

    def load_dataset(self, dataset_name_or_path):
        """
        Parse ``metadata.jsonl`` inside ``dataset_name_or_path``.

        Returns:
            list of dicts with keys ``img_path`` (folder joined with ``file_name``) and
            ``text_sequence`` (the target text), in file order.
        """
        dataset = []
        metadata_path = os.path.join(dataset_name_or_path, 'metadata.jsonl')
        # Explicit UTF-8: the targets are Hebrew text, so decoding must not depend on the platform default.
        with open(metadata_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
                if not line.strip():
                    continue
                data_point = json.loads(line)
                img_path = os.path.join(dataset_name_or_path, data_point['file_name'])
                # ground_truth is itself a JSON-encoded string, hence the second json.loads.
                text_sequence = json.loads(data_point['ground_truth'])['gt_parse']['text_sequence']
                dataset.append({'img_path': img_path, 'text_sequence': text_sequence})
        return dataset

    def __len__(self) -> int:
        return self.dataset_length

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, str]:
        """
        Load the image of sample ``idx`` and tokenize its target text.

        Returns:
            pixel_values : preprocessed image tensor (batch dimension removed)
            labels : token ids of "<bos> text <eos>", padded/truncated to max_length, with pad
                positions replaced by ignore_id so the loss skips them
            target_sequence : the raw target text (without any special tokens)
        """
        sample = self.dataset[idx]
        img = Image.open(sample["img_path"])

        # inputs
        pixel_values = processor(img, random_padding=self.split == "train", return_tensors="pt").pixel_values
        # Drop only the batch dimension added by the processor.
        pixel_values = pixel_values.squeeze(0)

        # targets
        target_sequence = sample['text_sequence']
        tokenizer = processor.tokenizer
        # add_special_tokens=False means the tokenizer adds nothing itself, so BOS and EOS are
        # written into the text explicitly. EOS is needed so the model is trained to end the
        # sequence; generate() in this notebook stops on eos_token_id. Targets longer than
        # max_length are truncated and therefore lose the EOS.
        eos_token = tokenizer.eos_token or ""
        input_ids = tokenizer(
            tokenizer.bos_token + " " + target_sequence + eos_token,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = self.ignore_id  # model doesn't need to predict pad token
        return pixel_values, labels, target_sequence

Instantiate the datasets:

In [None]:

# Processor settings that differ from pre-training: the image size, and no long-axis alignment (rotation).
# source: https://github.com/clovaai/donut/blob/master/config/train_cord.yaml
processor.image_processor.do_align_long_axis = False
processor.image_processor.size = image_size[::-1] # should be (width, height)

# One DonutDataset per split; each split lives in a sub-folder of data_path named after the split.
train_dataset, val_dataset, test_dataset = (
    DonutDataset(
        os.path.join(data_path, split_name),
        max_length=max_length,
        split=split_name,
        task_start_token="",
        prompt_end_token="",
        sort_json_key=False,
    )
    for split_name in ("train", "validation", "test")
)

In [None]:
len(processor.tokenizer)

65840

In [None]:
processor.tokenizer.decode([3])

','

In [None]:
pixel_values, labels, target_sequence = train_dataset[0]

In [None]:
decoded_seq = processor.tokenizer.decode(labels.tolist())#, skip_special_tokens=True
print(decoded_seq[0:100])
print(target_sequence)

<s> ימניה חום ויוב ש, בלי רוח כמעט.ב ישראל, מגדירים יום שרבי כיום שבו הלחו ת היחסית המ מו<unk> פחותה
ימניה חום ויוב ש, בלי רוח כמעט.ב ישראל, מגדירים יום שרבי כיום שבו הלחו ת היחסית המ מוצעת פחותה מ־05


test datasets

Another important thing is that we need to set 2 additional attributes in the configuration of the model. This is not required, but will allow us to train the model by only providing the decoder targets, without having to provide any decoder inputs.

The model will automatically create the decoder_input_ids (the decoder inputs) based on the labels, by shifting them one position to the right and prepending the decoder_start_token_id. I recommend checking this video if you want to understand how models like Donut automatically create decoder_input_ids - and more broadly how Donut works

In [None]:
# The decoder is started with the tokenizer's bos_token ("<s>", registered to mark the start of a sequence)
# and padded with the tokenizer's pad token.
model.config.decoder_start_token_id = processor.tokenizer.bos_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

In [None]:
processor.tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-he', vocab_size=65839, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	65838: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	65839: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# sanity check
print("Pad token ID:", processor.decode([model.config.pad_token_id]))
print("Decoder start token ID:", processor.decode([model.config.decoder_start_token_id])) # start_token!!!!!!!!

Pad token ID: <pad>
Decoder start token ID: <s>


# Create PyTorch DataLoaders

Next, we create corresponding PyTorch DataLoaders, which allow us to loop over the dataset in batches:

In [None]:
from torch.utils.data import DataLoader

# Both loaders share the batch size and the number of worker processes.
# Feel free to change the batch size to fit your GPU memory (the images are large).
loader_kwargs = dict(batch_size=4, num_workers=4)
# Training batches are shuffled; validation batches keep the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
#Let's verify a batch:
batch = next(iter(train_dataloader))
pixel_values, labels, target_sequences = batch
print(pixel_values.shape)

  self.pid = os.fork()


torch.Size([4, 3, 1280, 960])


In [None]:
print(processor.tokenizer.decode(labels[0].tolist(), skip_special_tokens=True))

ניהם היו מוע מדים בעיתיים ינת הגוש הגדו ל של מצבי עים רפוב ליקנים מהימין ה שמרני- דתי: ג' וליאני, ש עמד  ש אחת ה ערים הליבר ליות ביות ר ות- הברית, גי לה עמדות ליברליות מדי ל בנושאים הפלות מל ותיות, הגבלות על נשק, ויחס למהגרים, וגם חייו האי שיים הוו בשערוריות והיו רחוקים ממודל איש ה משפחה המסור.מקיין נחשב למי שגילה עצמ אות יתר מהקו המפלגתי, ונזקפו נ<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <u

In [None]:
# for id in labels.squeeze().tolist()[:30]:
#   if id != -100:
#     print(processor.tokenizer.decode([id]))
#   else:
#     print(id)

In [None]:
target_sequences

("ניהם היו מוע מדים בעיתיים מבחינת הגוש הגדו ל של מצבי עים רפוב ליקנים מהימין ה שמרני- דתי: ג' וליאני, ש עמד ברא ש אחת ה ערים הליבר ליות ביות ר בארצות- הברית, גי לה עמדות ליברליות מדי לטעמם בנושאים הפלות מלאכ ותיות, הגבלות על נשק, ויחס למהגרים, וגם חייו האי שיים הוכתמו בשערוריות והיו רחוקים ממודל איש ה משפחה המסור.מקיין נחשב למי שגילה עצמ אות יתר מהקו המפלגתי, ונזקפו נ",
 'מותו של הסב והאבוב מחקה את הברווז.באופן דומ ה מתאר החליל את הציפור, ה קלרינט משויך לחתו ל ותופי הדוד מבשרים את בוא הציידים.הדמויו ת מזוהות עם הכלים המוזיקליים, מה שיוצר בקרב ה צופים ציפייה להופעת הדמות עם השמעת הנעימה המתאימה לה.ד"ר אח מד נזי ף ) בע רבית: أحمد نظيف\u200e; נולד ב-8 ביולי 2591( הוא ראש המ משלה של מצרים בעבר, שכיהן בתפקידו מאז ה-41 ביולי 2 400 ו עד 92 בינוא',
 'יברסיטא ות מעמיד ות לרשותם א ת מ יטב המ אמנים וה מורי ם.אס" א מ רח יב מש נה ל שנה את היק ף התח רויות ה בינל אומיות, בהן',
 "'C' -זכה ה ykS eht ni ralleC 800, בתחרות cnalB letsaC ud , יחד עם ח ברת אל על , כיין ה לבן הטוב ביותר המוגש ב מחלקה ראשונה

In [None]:
print(len(train_dataset))
print(len(val_dataset))

32668
3953


In [None]:
# let's check the first validation batch
batch = next(iter(val_dataloader))
pixel_values, labels, target_sequences = batch
print(pixel_values.shape)

print(target_sequences[0])

torch.Size([4, 3, 1280, 960])
רביי י שראל ה ם עקור ים או צאצ אי עקו רים כ תוצאה ממלחמ ת העצ מאות, כ לומר אנש ים שע זבו את מקו ם יישו בם ול א הורש ו לחזור אל יו.לעתים האנשים עקרו ליישוב אחר בתחילת המלח מה וכ שמקו ם מו שבם השני נכבש הם נשארו תח ת ש


# Define LightningModule
Next, we define a LightningModule, which is the standard way to train a model in PyTorch Lightning. A LightningModule is an nn.Module with some additional functionality.

Basically, PyTorch Lightning will take care of all device placements (.to(device)) for us, as well as the backward pass, putting the model in training mode, etc.

In [None]:
from pathlib import Path
import re
from nltk import edit_distance
import numpy as np
import math

from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import LambdaLR

import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only


class DonutModelPLModule(pl.LightningModule):
    """
    LightningModule that trains the model on (pixel_values, labels) and validates it by
    generating text and scoring it with a normalized edit distance.

    Args:
        config: dict of hyper-parameters; this module reads "lr" and "verbose"
        processor: processor whose tokenizer decodes the generated ids and supplies pad/eos/unk ids
        model: model called as ``model(pixel_values, labels=labels)`` (must expose ``.loss``)
            and used through ``model.generate(...)``
    """

    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model

    def training_step(self, batch, batch_idx):
        """Return the model loss for one batch of (pixel_values, labels, text)."""
        pixel_values, labels, _ = batch

        outputs = self.model(pixel_values, labels=labels)
        loss = outputs.loss
        # Show the loss in the progress bar instead of printing one line per training step.
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        """
        Generate text for one batch and log the mean normalized edit distance.

        Returns:
            list with one normalized edit distance (0.0 = identical) per sample in the batch
        """
        pixel_values, labels, answers = batch
        batch_size = pixel_values.shape[0]
        tokenizer = self.processor.tokenizer
        # we feed the prompt to the model
        decoder_input_ids = torch.full((batch_size, 1), self.model.config.decoder_start_token_id, device=self.device)

        # max_length is the notebook-level generation budget.
        outputs = self.model.generate(pixel_values,
                                   decoder_input_ids=decoder_input_ids,
                                   max_length=max_length,
                                   early_stopping=True,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id,
                                   use_cache=True,
                                   num_beams=1,
                                   bad_words_ids=[[tokenizer.unk_token_id]],
                                   return_dict_in_generate=True,)

        predictions = []
        for seq in tokenizer.batch_decode(outputs.sequences):
            seq = seq.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
            seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
            predictions.append(seq)

        scores = []
        for pred, answer in zip(predictions, answers):
            answer = answer.replace(tokenizer.eos_token, "")
            longest = max(len(pred), len(answer))
            # Two empty strings are identical; score 0.0 instead of dividing by zero.
            scores.append(edit_distance(pred, answer) / longest if longest else 0.0)

            # In verbose mode, show only the first sample of the batch.
            if self.config.get("verbose", False) and len(scores) == 1:
                print(f"Prediction: {pred}")
                print(f"    Answer: {answer}")
                print(f" Normed ED: {scores[0]}")

        self.log("val_edit_distance", np.mean(scores))

        return scores

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the learning rate from config["lr"]."""
        # you could also add a learning rate scheduler if you want
        optimizer = torch.optim.Adam(self.parameters(), lr=self.config.get("lr"))

        return optimizer

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return val_dataloader

In [None]:
# Hyper-parameters for the Lightning run.
# NOTE: this rebinds `config`, which earlier in the notebook held the VisionEncoderDecoderConfig
# passed to from_pretrained; re-running that earlier cell after this one needs the original object.
config = {"max_epochs":100,
          # Float 1.0 = validate once per epoch, at its end. An int is interpreted by Lightning as
          # "every N training batches", so the previous value 1 validated after every single batch.
          "val_check_interval":1.0,
          "check_val_every_n_epoch":10,
          "gradient_clip_val":1.0,
          # The keys below, apart from "lr" and "verbose", are not read by the code visible in
          # this notebook (the DataLoaders set their own batch size).
          "num_training_samples_per_epoch": 44,
          "lr":3e-5,
          "train_batch_sizes": [8],
          "val_batch_sizes": [1],
          # "seed":2022,
          "num_nodes": 1,
          "warmup_steps": 300, # 800/8*30/10, 10%
          "result_path": "./result",
          "verbose": True,
          }

model_module = DonutModelPLModule(config, processor, model)

In [None]:
import os
# Debugging aid: asks CUDA to launch kernels synchronously.
# NOTE(review): this is expected to slow training down; consider removing it once debugging is done.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import Callback, EarlyStopping

# Early stopping watches the logged validation edit distance (lower is better).
early_stop_callback = EarlyStopping(monitor="val_edit_distance", patience=100, verbose=False, mode="min")

# Trainer arguments that are taken verbatim from the hyper-parameter dict.
trainer_kwargs = {
    key: config.get(key)
    for key in ("max_epochs", "val_check_interval", "check_val_every_n_epoch", "gradient_clip_val")
}

trainer = pl.Trainer(
    accelerator="gpu",
    precision='16-mixed', # we'll use mixed precision
    num_sanity_val_steps=0,
    callbacks=[early_stop_callback],
    **trainer_kwargs,
)

# The module supplies its own train/val dataloaders.
trainer.fit(model_module)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:


Training: |          | 0/? [00:00<?, ?it/s]

train loss: 21.131343841552734
train loss: 20.54745864868164
train loss: 20.863561630249023
train loss: 20.867813110351562
train loss: 21.50075912475586
train loss: 20.30603790283203
train loss: 14.674810409545898
train loss: 13.685200691223145
train loss: 12.17957592010498
train loss: 11.411356925964355
train loss: 11.215615272521973
train loss: 9.365970611572266
train loss: 10.317680358886719
train loss: 10.664036750793457
train loss: 9.447834014892578
train loss: 8.53312873840332
train loss: 9.072164535522461
train loss: 8.35280704498291
train loss: 8.414700508117676
train loss: 8.42129135131836
train loss: 7.576664924621582
train loss: 7.337076187133789
train loss: 7.344854831695557
train loss: 7.3509111404418945
train loss: 7.231627941131592
train loss: 7.046614170074463
train loss: 6.748353958129883
train loss: 6.662676811218262
train loss: 6.0243449211120605
train loss: 6.2848591804504395
train loss: 5.834998607635498
train loss: 5.856175422668457
train loss: 5.849306583404541
t

In [None]:
processor.tokenizer.eos_token

In [None]:
pixel_values, labels, answers = test_dataset[0]

In [None]:
pixel_values.unsqueeze(0).shape

In [None]:
# Qualitative check: greedy-decode one training sample with the fine-tuned model.
pixel_values, labels, answers = train_dataset[3]

pixel_values = pixel_values.unsqueeze(0)
labels = labels.unsqueeze(0)

# Put the model and the inputs on the same device explicitly rather than assuming the model is
# still on the GPU (NOTE(review): Lightning may move the module back to CPU when fit() ends).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inference_model = trainer.model.model.to(device)
# Evaluation mode: disable dropout so the generated text is deterministic.
inference_model.eval()

batch_size = pixel_values.shape[0]
decoder_input_ids = torch.full((batch_size, 1), model.config.decoder_start_token_id, device=device)

outputs = inference_model.generate(pixel_values.to(device),
                                   decoder_input_ids=decoder_input_ids,
                                   max_length=max_length,
                                   early_stopping=True,
                                   pad_token_id=processor.tokenizer.pad_token_id,
                                   eos_token_id=processor.tokenizer.eos_token_id,
                                   use_cache=True,
                                   num_beams=1,
                                   bad_words_ids=[[processor.tokenizer.unk_token_id]],
                                   return_dict_in_generate=True,)

In [None]:
# Decode the generated ids, drop every eos/pad token, then remove the first "<...>" token
# (the task start token) and surrounding whitespace.
predictions = [
    re.sub(
        r"<.*?>",
        "",
        decoded.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, ""),
        count=1,
    ).strip()
    for decoded in processor.tokenizer.batch_decode(outputs.sequences)
]

predictions

In [None]:
answers