# Import libraries

In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from PIL import Image
from transformers import TrOCRProcessor,  VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
from datasets import load_metric

# Load and prepare data

In [6]:
# Read file names and text labels, and put them in a dataframe
df = pd.read_fwf('IAM-data/IAM-data/iam_lines_gt.txt', header=None)
file_names = df.iloc[::2]
text = df.iloc[1::2]
del df
file_names.columns = ['file_name']
text.columns = ['text']
file_names.reset_index(drop=True, inplace=True)
text.reset_index(drop=True, inplace=True)

data = pd.concat([file_names, text], axis=1)
data.head()


Unnamed: 0,file_name,text
0,a03-017-07.png,into the pro-communist north and the
1,a03-017-05.png,"to 1958 kept the kingdom in peace, though"
2,a03-017-08.png,pro-western centre and south.
3,a03-017-02.png,in Phnom Penh indicate that he still regards
4,a03-017-06.png,at the cost of virtual partition of the country


In [7]:
# Check longest amount of text 
print(data['text'].apply(len).max())

54


In [8]:
# split train and validation data in 80% train 10% val and 10% test
train_data, val_data = train_test_split(data, test_size=0.2, random_state=0)
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

val_data, test_data = train_test_split(val_data, test_size=0.5, random_state=0)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [9]:
# Create regular Pytorch dataset to load images and process them during training

class Task3IAM(Dataset):
    def __init__(self, img_dir, data, processor, max_target_length = 64):
        self.img_dir = img_dir
        self.data = data
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_name = self.data['file_name'][idx]
        text = self.data['text'][idx]
        # Load image and get image ready for model 
        image = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")
        pixel_values = self.processor(image, return_tensors= "pt").pixel_values

        labels = self.processor.tokenizer(text, padding= "max_length", max_length = self.max_target_length).input_ids   # tokenize text label
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]    # Ignore pad tokens for the loss function
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        
        return encoding

In [10]:
# load processor (resize/normalize images and tokenizer), and load pytorch dataset
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
train_dataset = Task3IAM(img_dir='IAM-data/IAM-data/img/', data = train_data, processor = processor)
eval_dataset = Task3IAM(img_dir='IAM-data/IAM-data/img/', data = val_data, processor = processor)
test_dataset = Task3IAM(img_dir='IAM-data/IAM-data/img/', data = test_data, processor = processor)

In [11]:
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(eval_dataset))
print("Number of test examples:", len(test_dataset))

Number of training examples: 5966
Number of validation examples: 746
Number of test examples: 746


# Fine tune model

In [15]:
# Import pretrained model
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-stage1")

# Change settings for pretrained model
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id ### This statement is maybe not needed, not sure if required to work. If inference does not work maybe delete this
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters of model for generating text: increases performance of model
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.num_beams = 5

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-stage1 and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Count parameters in model
print(sum([param.nelement() for param in model.parameters()]))

61596672


In [17]:
# set training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="models/trocr-small-5beam/",
    num_train_epochs = 10,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    evaluation_strategy = "epoch",
    logging_steps = 20,
    save_strategy = "epoch",
    predict_with_generate = True, 
)

In [18]:
# create function to calculate cer between label text and generated text, to evaluate during during training
cer_metric = load_metric("cer")
def compute_cer(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer * 100}

# initialize trainer
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = processor.feature_extractor,
    compute_metrics = compute_cer,
    data_collator = default_data_collator,
)

In [19]:
# finetine the pretrained model
trainer.train()

***** Running training *****
  Num examples = 5966
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3730


Epoch,Training Loss,Validation Loss,Cer
1,1.0378,0.952534,28.83285
2,0.7611,0.810736,29.285555
3,0.5411,0.772294,18.602353
4,0.4635,0.722455,25.093251
5,0.3705,0.696841,21.917939
6,0.3071,0.691399,22.150668
7,0.0811,0.528217,6.704498
8,0.0761,0.525633,6.140211
9,0.0398,0.506572,5.859661
10,0.0217,0.505554,5.738515


***** Running Evaluation *****
  Num examples = 746
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/task3_model_small_beam5/checkpoint-373
Configuration saved in /content/drive/MyDrive/task3_model_small_beam5/checkpoint-373/config.json
Model weights saved in /content/drive/MyDrive/task3_model_small_beam5/checkpoint-373/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/task3_model_small_beam5/checkpoint-373/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 746
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/task3_model_small_beam5/checkpoint-746
Configuration saved in /content/drive/MyDrive/task3_model_small_beam5/checkpoint-746/config.json
Model weights saved in /content/drive/MyDrive/task3_model_small_beam5/checkpoint-746/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/task3_model_small_beam5/checkpoint-746/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 746
  

TrainOutput(global_step=3730, training_loss=0.4198709288487166, metrics={'train_runtime': 5786.5227, 'train_samples_per_second': 10.31, 'train_steps_per_second': 0.645, 'total_flos': 7.136808851440927e+18, 'train_loss': 0.4198709288487166, 'epoch': 10.0})

In [20]:
# load finetuned model
model = VisionEncoderDecoderModel.from_pretrained("models/trocr-small-5beam/checkpoint-3730")

loading configuration file /content/drive/MyDrive/task3_model_small_beam5/checkpoint-3730/config.json
Model config VisionEncoderDecoderConfig {
  "_name_or_path": "microsoft/trocr-small-stage1",
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "",
    "activation_dropout": 0.0,
    "activation_function": "relu",
    "add_cross_attention": true,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": 384,
    "d_model": 256,
    "decoder_attention_heads": 8,
    "decoder_ffn_dim": 1024,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 6,
    "decoder_start_token_id": 2,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.1,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetunin

In [21]:
# Evaluate model on test data and report CER
img_dir = 'IAM-data/IAM-data/img/'
generated_texts = []
for index, row in test_data.iterrows():
    file = row["file_name"] 
    image = Image.open(os.path.join(img_dir, file)).convert("RGB")

    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    generated_texts.append(generated_text)

In [23]:
cer = cer_metric.compute(predictions=generated_texts, references=test_data['text'].tolist())
cer = cer * 100
print("CER on test data:", cer)

CER on test data: 6.1003861003861
