# Fine-tune LayoutLMv3 on a custom dataset

## Load data
### Import relevant modules

In [None]:
import pyarrow_hotfix

# SECURITY NOTE(review): this removes the pyarrow_hotfix guard for the rest of the
# session. The hotfix package presumably exists to block unsafe deserialisation of
# pyarrow extension types, so only run this cell when every dataset loaded
# afterwards comes from a trusted source — TODO confirm it is still required.
pyarrow_hotfix.uninstall()

In [None]:
from datasets import load_from_disk
import pyarrow
# Switch auto-loading of PyExtensionType on before the dataset is read from disk.
# SECURITY NOTE(review): presumably needed because the saved dataset contains
# extension-typed columns; auto-loading should only be enabled for trusted files.
pyarrow.PyExtensionType.set_auto_load(True)

In [None]:

# Load the prepared dataset from a local, machine-specific path. The rest of the
# notebook reads its "train" and "test" splits.
dataset = load_from_disk("C:/Projects/IDP/watercare/dataset/23_11_03_01")


In [None]:
# Peek at one raw training record (index 1) to inspect its fields before encoding.
example = dataset['train'][1]
example

## Set up the processor

In [None]:
from transformers import AutoProcessor

# We use the Auto API here - it loads LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub.
# apply_ocr=False: words and boxes are passed in explicitly from the dataset
# when the processor is called, so no OCR is run on the images.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

# LayoutLMv3ImageProcessor - handles resize, normalize, change mode, rescale of the image before passing it to the model
# LayoutLMv3TokenizerFast - tokenizes the input words
# Sequence: input data -> LayoutLMv3ImageProcessor -> LayoutLMv3TokenizerFast
# The encoding used in this notebook holds pixel_values, input_ids,
# attention_mask and bbox, plus labels when word_labels are passed.

In [None]:
from datasets.features import ClassLabel
from idp.annotations.annotation_utils import get_label_list

# Names of the raw dataset columns consumed by the processor.
image_column_name, text_column_name = "image", "tokens"
boxes_column_name, label_column_name = "bboxes", "ner_tags"

features = dataset["train"].features
column_names = dataset["train"].column_names

# Take the label names from the schema when the tag column is a
# `Sequence[ClassLabel]` (its values are already ints); otherwise collect the
# unique labels by going through the training split.
if isinstance(features[label_column_name].feature, ClassLabel):
    label_list = features[label_column_name].feature.names
else:
    label_list = get_label_list(dataset["train"][label_column_name])

# Both directions of the label <-> id mapping follow the order of label_list.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(label_list)

print(label_list)
print(id2label)

## Batch encode examples

In [None]:
def prepare_examples(examples):
    """Encode a batch of raw examples with the module-level ``processor``.

    Args:
        examples: Mapping indexed by the configured column names (image,
            tokens, bboxes and ner_tags columns).

    Returns:
        The processor's encoding, truncated and padded to the maximum length.
    """
    return processor(
        examples[image_column_name],
        examples[text_column_name],
        boxes=examples[boxes_column_name],
        word_labels=examples[label_column_name],
        truncation=True,
        padding="max_length",
    )

In [None]:
# Sanity check: number of bounding boxes in the first training example.
len(dataset['train'][boxes_column_name][0])

In [None]:
# Dry run: encode the whole train split in a single call and display the result.
encodings = prepare_examples(dataset['train'])
encodings

In [None]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D


# Explicit schema for the encoded columns; `set_format` (used later on) needs
# these custom features to work properly.
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

# Encode both splits the same way: batch through prepare_examples, drop the raw
# columns and cast the result to the schema above. "train" is processed first.
train_dataset, eval_dataset = (
    dataset[split_name].map(
        prepare_examples,
        batched=True,
        remove_columns=column_names,
        features=features,
    )
    for split_name in ("train", "test")
)

In [None]:
# Make the training split return torch tensors. Only train_dataset is switched
# here; eval_dataset keeps its current format.
train_dataset.set_format("torch")

In [None]:
import torch

# Print the shape of every field of the first encoded training example.
# Note: this rebinds `example` to an encoded row.
example = train_dataset[0]
for k,v in example.items():
    print(k,v.shape)

In [None]:
# Inspect the encoded label ids of the first evaluation example.
eval_dataset['labels'][0]

In [None]:
# Decode the input ids of evaluation example 1 back to text to check the encoding.
processor.tokenizer.decode(eval_dataset[1]["input_ids"])

In [None]:
# Print each token of the first training example next to its label id. Ids are
# decoded one at a time so every printed token lines up with exactly one label.
# The loop variable is `token_id` rather than `id` so that the builtin `id()` is
# not shadowed for the rest of the notebook session.
for token_id, label in zip(train_dataset[0]["input_ids"], train_dataset[0]["labels"]):
    print(processor.tokenizer.decode([token_id]), label.item())

## Set up evaluation metrics

In [None]:
from datasets import load_metric
from idp.evaluate.evaluate_utils import compute_metrics_builder

# Load the seqeval metric used to score the token-classification output.
# NOTE(review): `datasets.load_metric` is reportedly deprecated upstream in favour
# of the separate `evaluate` package — confirm the pinned `datasets` version still
# provides it.
METRIC = load_metric("seqeval")

# Build the metrics callback passed to the Trainer from the metric and label names.
# NOTE(review): the TrainingArguments select the best model by "f1", so this
# project helper presumably reports an "f1" key — verify against its implementation.
compute_metrics = compute_metrics_builder(METRIC, label_list, entity_level_metrics=False)

In [None]:
from transformers import LayoutLMv3ForTokenClassification

# Start from the pretrained base checkpoint and attach the dataset's label
# mappings to the model config.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Checkpoints are written under this local, machine-specific directory.
    output_dir="C:/Projects/IDP/watercare/model_output/23_11_03_03_01",
    # Short run: 150 optimisation steps with a batch of 2 per device.
    max_steps=150,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-5,
    # Evaluate and save on the same 50-step cadence so the checkpoint with the
    # best "f1" can be restored when training finishes.
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy='steps',
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
from transformers.data.data_collator import default_data_collator

# Wire the model, the encoded splits and the metrics callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: encoded splits, batched by the default collator.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
    # The processor is handed over in the `tokenizer` slot.
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above (150 steps, evaluating
# and checkpointing every 50 steps).
trainer.train()

In [None]:
# Score the current model on eval_dataset and display the metrics.
trainer.evaluate()

In [None]:
# Save best model & processor to local directory
# NOTE(review): this writes to ".../23_11_03/best", whereas the TrainingArguments
# output_dir is ".../23_11_03_03_01" — confirm the run directory is the intended one.
# `model` is the object handed to the Trainer; with load_best_model_at_end=True it
# presumably holds the best checkpoint's weights after training — verify.
model.save_pretrained("C:/Projects/IDP/watercare/model_output/23_11_03/best")
processor.save_pretrained("C:/Projects/IDP/watercare/model_output/23_11_03/best")

In [None]:

from transformers import AutoModelForTokenClassification

# Will load LayoutLMv3ForTokenClassification
# NOTE(review): this rebinds `model` to checkpoint-150 under ".../23_11_03_03", a
# directory that is neither the output_dir configured for training
# (".../23_11_03_03_01") nor the ".../23_11_03/best" directory saved earlier —
# confirm this is the checkpoint intended for inference.
model = AutoModelForTokenClassification.from_pretrained("C:/Projects/IDP/watercare/model_output/23_11_03_03/checkpoint-150")

## Test Inference

In [None]:
# Optional reload of the dataset when running this section on its own (disabled):
# from datasets import load_from_disk
# dataset = load_from_disk("datasets/watercare/train_test")
# Pick one raw (un-encoded) test record for the inference walkthrough.
example = dataset['test'][7]


In [None]:
# Pull the raw fields of the selected test example.
image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]
word_labels = example["ner_tags"]

# Encode the single example as PyTorch tensors. truncation=True mirrors the
# training-time encoding in prepare_examples, so an over-long document is cut
# to the tokenizer's maximum length instead of being sent to the model at full
# length. Passing word_labels also yields `labels`, used further down to pick
# out the labelled positions.
encoding = processor(image, words, boxes=boxes, word_labels=word_labels,
                     truncation=True, return_tensors="pt")
for k,v in encoding.items():
  print(k,v.shape)

print(encoding['input_ids'])
print(words)

In [None]:
import torch

# Forward pass only: gradients are not needed for inference.
with torch.no_grad():
  outputs = model(**encoding)

In [None]:
# Raw per-token scores from the model; display their shape as a sanity check.
logits = outputs.logits
logits.shape

In [None]:
# Take the highest-scoring label id along the last axis, then squeeze away
# size-1 dimensions (presumably the batch axis, since a single example was
# encoded) and convert to a plain Python list.
predictions = logits.argmax(-1).squeeze().tolist()
print(predictions)

In [None]:
# Show the predicted label names instead of ids.
[id2label[prediction] for prediction in predictions]

In [None]:
# Reference label ids from the encoding, as a flat list; the next cell skips
# positions whose label is -100.
labels = encoding.labels.squeeze().tolist()

In [None]:
from idp.annotations.bbox_utils import unnormalize_box

# Page size and per-token boxes, needed to map boxes back to pixel coordinates.
width, height = image.size
token_boxes = encoding.bbox.squeeze().tolist()

# Keep only positions whose reference label is not -100 (presumably the ignore
# index given to special / sub-word tokens), applying the same filter to all
# three lists so they stay index-aligned.
true_predictions = [
    model.config.id2label[pred_id]
    for pred_id, gold_id in zip(predictions, labels)
    if gold_id != -100
]
true_labels = [
    model.config.id2label[gold_id]
    for _, gold_id in zip(predictions, labels)
    if gold_id != -100
]
true_boxes = [
    unnormalize_box(token_box, width, height)
    for token_box, gold_id in zip(token_boxes, labels)
    if gold_id != -100
]

In [None]:
# Score this single example. zero_division must be the integer 0 (documented
# values are 0, 1 or "warn"); the string '0' is not one of them and, as far as
# the seqeval implementation goes, would not be treated as zero.
METRIC.compute(predictions=[true_predictions], references=[true_labels], zero_division=0)

In [None]:
from PIL import ImageDraw, ImageFont

# Visualise prediction
# Draw on an RGB copy rather than on example["image"] itself: ImageDraw paints
# in place, so drawing on the original would leave the prediction overlay baked
# into the image that the ground-truth cell reads from `example` afterwards.
image = example["image"].convert("RGB")
draw = ImageDraw.Draw(image)

font = ImageFont.load_default()

def iob_to_label(label):
    """Drop the first two characters of a tag (e.g. ``B-``) and return the rest.

    Returns ``'other'`` when nothing is left, as for a bare ``O`` tag.
    """
    entity = label[2:]
    return entity if entity else 'other'

# Outline / text colour for each lower-cased entity name drawn on the page.
label2color = {
    'other': 'pink',
    'balance_still_owing': 'red',
    'water_consumption': 'purple',
    'wastewater_consumption': 'green',
    'wastewater_fixed': 'orange',
    'balance_current_charges': 'violet',
    "total_due": "grey",
    'water_consumption_details': 'red',
    'wastewater_consumption_details': 'purple',
    'wastewater_fixed_details': 'green',
    'this_reading': 'black',
    'last_reading': 'black',
}

# Outline every kept token box and write its predicted entity name just above
# the box, both in the colour assigned to that entity.
for prediction, box in zip(true_predictions, true_boxes):
    predicted_label = iob_to_label(prediction).lower()
    colour = label2color[predicted_label]
    text_origin = (box[0] + 10, box[1] - 10)
    draw.rectangle(box, outline=colour)
    draw.text(text_origin, text=predicted_label, fill=colour, font=font)
image

In [None]:
# Compare with ground truth
# Work on an RGB conversion of the example's image.
image = example["image"].convert("RGB")
draw = ImageDraw.Draw(image)

# Outline every annotated word with its reference label; boxes are mapped back
# to pixel coordinates using the page width/height computed earlier.
for word, box, label in zip(example['tokens'], example['bboxes'], example['ner_tags']):
    actual_label = iob_to_label(id2label[label]).lower()
    box = unnormalize_box(box, width, height)
    colour = label2color[actual_label]
    draw.rectangle(box, outline=colour, width=2)
    draw.text((box[0] + 10, box[1] - 10), actual_label, fill=colour, font=font)

image