# Setup

In [None]:
# Installs

!pip install transformers
!pip install -q datasets jiwer
!pip install accelerate nvidia-ml-py3

# Imports

from transformers import (
    TrOCRConfig,
    TrOCRProcessor,
    TrOCRForCausalLM,
    ViTConfig,
    ViTModel,
    VisionEncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
    default_data_collator
)

from PIL import Image
from torchvision import transforms

import csv
import fnmatch
from math import isclose
import os
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from datasets import load_metric
import shutil
import pandas as pd

# Mount gdrive

from google.colab import drive
content_root = '/content/'
mount_root = '/content/gdrive/'
drive_root = f'{mount_root}MyDrive/'
drive.mount(mount_root)

# Processor
# TODO: should this be trained?
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

# Model Name
model_name = "oc-carb-fine-tuning-10k"
prediction_name = f"{model_name}-raw-predictions"
split = 0
total_splits = 5
threshold = -0.0772

!unzip '/content/gdrive/MyDrive/ErukaTraining/OC/images.zip' -d '/content/'


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.26.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

Downloading (…)rocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/images/0340001000900.jpg  
  inflating: /content/images/1740009001800.jpg  
  inflating: /content/images/0970002019800.jpg  
  inflating: /content/images/0400004011800.jpg  
  inflating: /content/images/0680001006000.jpg  
  inflating: /content/images/1120001002700.jpg  
  inflating: /content/images/2380001001500.jpg  
  inflating: /content/images/5950009001900.jpg  
  inflating: /content/images/6510051012400.jpg  
  inflating: /content/images/5200171011400.jpg  
  inflating: /content/images/2160047006000.jpg  
  inflating: /content/images/6410010001300.jpg  
  inflating: /content/images/6090006002900.jpg  
  inflating: /content/images/2100077011800.jpg  
  inflating: /content/images/2010038005300.jpg  
  inflating: /content/images/0380A03003000.jpg  
  inflating: /content/images/2030028011500.jpg  
  inflating: /content/images/5710003006000.jpg  
  inflating: /content/images/5950010004700.jpg  
  in

# Inference using trained model

In [None]:
class OCInferenceDataset(torch.utils.data.Dataset):
    """Inference dataset over the ``*.jpg`` files of one directory.

    The file name without its extension is treated as the parcel id. It is
    tokenized once at construction time so that it can travel through the
    DataLoader alongside the image and be decoded back to text afterwards.
    Images themselves are opened lazily in ``__getitem__``.
    """

    def __init__(self, processor, dir, max_target_length=16) -> None:
        """Index the images found in ``dir``.

        Args:
            processor: object exposing ``tokenizer`` (used here) and being
                callable on a PIL image (used in ``__getitem__``).
            dir: directory containing the ``*.jpg`` files; a trailing slash
                is optional.
            max_target_length: length the parcel-id token sequence is padded to.
        """
        # Keep the processor that was passed in so __getitem__ does not depend
        # on a module-level variable of the same name.
        self.processor = processor

        # os.listdir returns entries in arbitrary order. Sorting makes the
        # index -> file mapping stable, which matters because callers slice
        # this dataset by index range.
        image_files = sorted(fnmatch.filter(os.listdir(dir), "*.jpg"))

        self.X = []

        for image_file in image_files:
            file_path = os.path.join(dir, image_file)
            # Parcel id = file name without the extension
            parcel = os.path.splitext(image_file)[0]

            # Resolve parcel values
            parcel_tokens = processor.tokenizer(
                parcel,
                padding="max_length",
                max_length=max_target_length,
            ).input_ids
            parcel_tokens = torch.tensor(parcel_tokens)

            self.X.append({"file_path": file_path, "parcel_tokens": parcel_tokens})

    def __len__(self):
        """Number of indexed image files."""
        return len(self.X)

    def __getitem__(self, ind):
        """Load and preprocess one image.

        Returns:
            A dict with ``pixel_values`` and ``parcel_tokens``, or ``None``
            when the image cannot be read or preprocessed (callers are
            expected to drop ``None`` entries when batching).
        """
        x = self.X[ind]
        try:
            # The context manager closes the underlying file handle once the
            # converted copy has been produced.
            with Image.open(x["file_path"]) as raw_image:
                image = raw_image.convert("RGB")
            pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)
        except Exception as exc:
            # Best effort: a single bad image must not abort the whole run, but
            # unlike a bare `except:` this lets KeyboardInterrupt/SystemExit
            # through and reports which file was skipped.
            import warnings

            warnings.warn(f"Skipping unreadable image {x['file_path']}: {exc!r}")
            return None

        return {"pixel_values": pixel_values, "parcel_tokens": x["parcel_tokens"]}

def collate_fn(batch):
    """Collate a batch after discarding the samples that are ``None``.

    NOTE(review): when every sample in the batch is ``None`` an empty list is
    handed to ``default_collate`` - confirm how that case should be handled.
    """
    usable_samples = [sample for sample in batch if sample is not None]
    return torch.utils.data.dataloader.default_collate(usable_samples)

# Index every extracted image, then keep only the slice assigned to this run.
oc_inference_dataset = OCInferenceDataset(processor, f"{content_root}images/")

# Slice `split` covers the index range [split*N//total_splits, (split+1)*N//total_splits).
dataset_size = len(oc_inference_dataset)
subset_start = split * dataset_size // total_splits
subset_stop = (split + 1) * dataset_size // total_splits
oc_inference_subset = torch.utils.data.Subset(
    oc_inference_dataset,
    list(range(subset_start, subset_stop)),
)

print(f"Inference subset - {len(oc_inference_subset)}")

Inference subset - 9969


In [None]:
# Instantiate the architecture from the stage-1 checkpoint; its weights are
# replaced below by the fine-tuned state dict stored on Drive.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1").to("cuda")

# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# load model
model.load_state_dict(torch.load(f"{drive_root}ErukaTraining/models/{model_name}.pt"))
model.eval()

# return_dict_in_generate/output_scores are required to read `sequences_scores` below
generate_args = {"max_length": 16, "num_beams": 4, "return_dict_in_generate": True, "output_scores": True}
result_columns = ["parcelid", "prediction", "score"]
# Rows are accumulated in a plain list and turned into a DataFrame once after
# the loop. Appending to a DataFrame per batch copies the whole frame every
# time, and DataFrame.append is deprecated (removed in pandas 2.0).
result_rows = []
test_loader = DataLoader(oc_inference_subset, batch_size=16, shuffle=False, collate_fn=collate_fn)
batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, leave=False, position=0, desc='Eval')

for batch in test_loader:
    pixel_values = batch["pixel_values"].to("cuda")
    outputs = model.generate(pixel_values, **generate_args)
    generated_ids = outputs.sequences
    scores = outputs.sequences_scores
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # The parcel id was tokenized by the dataset; decode it back to text here
    parcelids = processor.batch_decode(batch["parcel_tokens"], skip_special_tokens=True)

    result_rows.extend(zip(parcelids, generated_text, scores.to("cpu").tolist()))
    batch_bar.update()

batch_bar.close()

# Built in one go, so the index written to the CSV is a single 0..N-1 range
# (the per-batch append used to repeat 0..batch_size-1 for every batch).
result_df = pd.DataFrame(result_rows, columns=result_columns)
result_df.to_csv(f"{drive_root}ErukaTraining/predictions/{prediction_name}_{split}.csv")

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, generated_text, scores.to("cpu").tolist())), columns=["parcelid", "prediction", "score"]))
  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, generated_text, scores.to("cpu").tolist())), columns=["parcelid", "prediction", "score"]))
  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, generated_text, scores.to("cpu").tolist())), columns=["parcelid", "prediction", "score"]))
  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, generated_text, scores.to("cpu").tolist())), columns=["parcelid", "prediction", "score"]))
  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, generated_text, scores.to("cpu").tolist())), columns=["parcelid", "prediction", "score"]))
  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, generated_text, scores.to("cpu").tolist())), columns=["parcelid", "prediction", "score"]))
  result_df = result_df.append(pd.DataFrame(list(zip(parcelids, genera