In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from pathlib import Path
from PIL import Image
from transformers import TrOCRProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from typing import Dict, Tuple


class PriceTagDataset(Dataset):
    """Image/text pairs of price tags prepared for TrOCR fine-tuning.

    The metadata file is a headerless, space-separated text file with exactly
    two columns: an image file name (relative to ``dataset_root_dir``) and the
    text shown on that image.

    Every item is a ``(pixel_values, labels)`` tuple. ``labels`` holds the
    token ids of the text padded to one common length, with padding positions
    set to ``-100`` (the default ``ignore_index`` of PyTorch's cross-entropy
    loss).
    """

    def __init__(
        self,
        dataset_root_dir: Path,
        path_for_metadata_file: Path,
        processor: TrOCRProcessor,
    ) -> None:
        """Load the metadata and determine the common label length.

        Args:
            dataset_root_dir: Directory the image file names are relative to.
            path_for_metadata_file: Space-separated ``file_name text`` file.
            processor: Processor used for both image preprocessing and text
                tokenization.

        Raises:
            AssertionError: If either path does not exist.
        """
        assert dataset_root_dir.exists(), f"{dataset_root_dir} does not exists"
        assert path_for_metadata_file.exists(), f"{path_for_metadata_file} does not exists"

        self.__dataset_root_dir = dataset_root_dir
        self.__metadata = path_for_metadata_file
        self.__processor = processor
        self.__df = self.__read_txt_metadata()
        self.__max_target_length = self.__compute_max_target_length()

    def __read_txt_metadata(self) -> pd.DataFrame:
        """Read the metadata file into a ``file_name`` / ``text`` frame of strings."""
        # dtype=str keeps labels verbatim: numeric inference would turn
        # "00123" into 123 and "12.50" into 12.5 before any cast back to str.
        # keep_default_na=False stops literal texts such as "NA" becoming NaN.
        df = pd.read_table(
            self.__metadata,
            encoding="utf8",
            header=None,
            sep=" ",
            dtype=str,
            keep_default_na=False,
        )
        df.columns = ["file_name", "text"]
        return df

    def __compute_max_target_length(self) -> int:
        """Return the longest label length measured in tokens (0 if no rows)."""
        # The padding length must be counted in tokens, special tokens
        # included. A character count can be shorter than the tokenized text,
        # which would yield label tensors of different sizes that the default
        # DataLoader collation cannot stack.
        texts = self.__df["text"].tolist()
        if not texts:
            return 0
        token_ids = self.__processor.tokenizer(texts).input_ids
        return max(len(ids) for ids in token_ids)

    def __len__(self) -> int:
        """Return the number of annotated images."""
        return len(self.__df)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(pixel_values, labels)`` for the sample at position ``idx``."""
        # Positional lookup, independent of whatever index the frame carries.
        row = self.__df.iloc[idx]

        # The context manager releases the file handle deterministically;
        # convert() returns an independent, fully loaded copy.
        with Image.open(self.__dataset_root_dir / row["file_name"]) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.__processor(image, return_tensors="pt").pixel_values

        tokenizer = self.__processor.tokenizer
        label_ids = tokenizer(
            row["text"],
            padding="max_length",
            max_length=self.__max_target_length,
            truncation=True,  # never triggers for known rows; guarantees a fixed size
        ).input_ids
        labels = torch.tensor(label_ids)
        # Padding positions must not contribute to the loss.
        labels[labels == tokenizer.pad_token_id] = -100

        # Drop only the leading batch axis added by return_tensors="pt".
        return pixel_values.squeeze(0), labels

In [15]:
from model import TrOCRModel
from config import TransfomerOCRConfig
from pathlib import Path
# Build the TrOCR wrapper from its YAML configuration.
yaml_config = TransfomerOCRConfig(
    Path("/home/research/NapoleonPractice/trocr/configs/trocr_printed.yaml")
)
model = TrOCRModel(yaml_config)

# Special-token ids and generation settings, applied to the wrapped model's
# config in the order listed.
config_overrides = {
    "decoder_start_token_id": model.processor.tokenizer.cls_token_id,
    "pad_token_id": model.processor.tokenizer.pad_token_id,
    # make sure vocab size is set correctly
    "vocab_size": model.model.config.decoder.vocab_size,
    # set beam search parameters
    "eos_token_id": model.processor.tokenizer.sep_token_id,
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
}
for option_name, option_value in config_overrides.items():
    setattr(model.model.config, option_name, option_value)

# Small training subset, served in shuffled batches of two.
dataset = PriceTagDataset(
    dataset_root_dir=Path("/home/research/NapoleonPractice/data/train_limited_50"),
    path_for_metadata_file=Path(
        "/home/research/NapoleonPractice/data/train_limited_50/annotations_train_limited_50.txt"
    ),
    processor=model.processor,
)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "deco

In [18]:
# Pull a single batch from the loader and unpack it into images and labels.
first_batch = next(iter(dataloader))
imgs, labels = first_batch

In [19]:
# Run one forward pass of the wrapped model on the sampled batch after
# placing it on the CPU. Passing `labels` presumably makes the output in
# `res` include a loss — TODO confirm against TrOCRModel.
model.model.to("cpu")
res = model.model(imgs,labels=labels)

In [20]:
# Generate output token ids for the sampled images and decode them into
# strings with special tokens removed; the bare last line displays the list.
preds = model.model.generate(imgs)
predictions = model.processor.batch_decode(preds, skip_special_tokens=True)
predictions



[' . .', '29499']

In [24]:
# Restore the pad id wherever the labels carry the -100 loss mask so they can
# be decoded. NOTE: this overwrites `labels` in place.
labels.masked_fill_(labels == -100, model.processor.tokenizer.pad_token_id)
model.processor.batch_decode(labels, skip_special_tokens=True)


['13129', '29499']

In [30]:
import numpy as np

# Build a decodable copy of the labels here instead of relying on an earlier
# cell having overwritten the -100 mask in `labels`: the cell then works in
# any execution order and leaves `labels` untouched. If the mask was already
# replaced, masked_fill simply finds nothing to change.
label_ids = labels.masked_fill(labels == -100, model.processor.tokenizer.pad_token_id)
references = model.processor.batch_decode(label_ids, skip_special_tokens=True)

# Number of predictions that match their reference text exactly.
np.sum(np.array(predictions) == np.array(references))

1