# üë©üèΩ‚Äçüíª Setup

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install --upgrade modelbit

In [None]:
from datasets import load_dataset
from datasets.features import ClassLabel
from transformers import AutoProcessor, AutoModelForTokenClassification
from PIL import Image, ImageDraw, ImageFont
import requests
import torch

In [None]:
import modelbit
# Log in to Modelbit; `mb` is the handle the rest of the notebook uses for
# branch switching, image logging and deployment.
mb = modelbit.login()

In [None]:
# Select the "development" branch for this session.
# NOTE(review): presumably the deployment at the end of the notebook lands on
# this branch rather than the default one — confirm against Modelbit docs.
mb.switch_branch("development")

# 📜 Document Processing Helpers and Pre-Configuration


In [None]:
# Column names of the dataset, kept in one place so the example lookup and the
# label-map construction refer to the same keys.
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

dataset = load_dataset("nielsr/funsd-layoutlmv3")

# Schema information taken from the training split.
features = dataset["train"].features
column_names = dataset["train"].column_names

# One fixed example from the test split. `words`, `boxes` and `word_labels`
# are read as module-level globals by classify_document_text.
example = dataset["test"][1]
image = example[image_column_name]
words = example[text_column_name]
boxes = example[boxes_column_name]
word_labels = example[label_column_name]

def unnormalize_box(bbox, width, height):
    """Rescale the first four entries of ``bbox`` from thousandths to pixels.

    Each coordinate is divided by 1000 and multiplied by an image dimension:
    entries 0 and 2 by ``width``, entries 1 and 3 by ``height``.

    Returns:
        A new four-element list of floats; ``bbox`` itself is not modified.
    """
    # Dimension that applies to each of the four coordinates, in order.
    dimensions = (width, height, width, height)
    return [dim * (bbox[idx] / 1000) for idx, dim in enumerate(dimensions)]

def get_label_list(labels):
    """Return every distinct label found in ``labels``, in sorted order.

    Args:
        labels: An iterable of label sequences (one sequence per example).

    Returns:
        A sorted list with each label appearing exactly once.
    """
    distinct = {tag for sequence in labels for tag in sequence}
    return sorted(distinct)

def iob_to_label(label):
    """Drop the first two characters of ``label`` (e.g. ``"B-"`` / ``"I-"``).

    When nothing is left after the prefix is removed — as for the bare
    ``"O"`` tag — the string ``'other'`` is returned instead.
    """
    return label[2:] or 'other'

# Work out the tag vocabulary. When the column is a ClassLabel feature the
# names come straight from the schema (the stored labels are already ints);
# otherwise the distinct values are collected from the training split.
label_list = (
    features[label_column_name].feature.names
    if isinstance(features[label_column_name].feature, ClassLabel)
    else get_label_list(dataset["train"][label_column_name])
)

# Index <-> name lookups, both derived from the position in label_list.
id2label = dict(enumerate(label_list))
label2id = {name: index for index, name in enumerate(label_list)}

num_labels = len(label_list)

# üí™üèæ Building the Model

In [None]:
# apply_ocr=False: words and boxes are handed to the processor explicitly
# (see classify_document_text) rather than being extracted by OCR.
processor = AutoProcessor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False,
)

In [None]:
# Token-classification model loaded from the base checkpoint; the label maps
# are passed through so predictions can be decoded via model.config.id2label.
# NOTE(review): this is the base checkpoint, not one fine-tuned on FUNSD —
# presumably the classification head starts untrained; confirm before relying
# on the predicted tags.
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id,
)

# 🧠 Inference

In [None]:
def classify_document_text(image_url):
    """Download a document image, tag its tokens and log an annotated copy.

    The image at ``image_url`` is fetched, run through the module-level
    ``processor`` and ``model``, and the predicted tag of every labelled token
    is drawn onto the image as a coloured box with its label. The annotated
    image is displayed when running outside Modelbit and always logged with
    ``mb.log_image``.

    NOTE(review): ``words``, ``boxes`` and ``word_labels`` are module-level
    globals taken from a fixed dataset example, not derived from the
    downloaded image — confirm this pairing is intended.

    Args:
        image_url: URL of the document image to download.

    Returns:
        The list of predicted label ids for every position of the encoded
        sequence (including positions that are skipped when drawing).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    # Download image from image_url. A timeout stops a stalled server from
    # hanging the call, and an HTTP error is raised here instead of surfacing
    # later as an opaque image-decoding failure.
    with requests.get(image_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        # Have urllib3 undo any gzip/deflate content encoding before PIL
        # reads from the raw stream.
        response.raw.decode_content = True
        # convert() loads all pixel data while the connection is still open,
        # and yields an RGB image whatever mode the file was stored in (the
        # Hugging Face LayoutLMv3 examples feed RGB images to the processor).
        image = Image.open(response.raw).convert("RGB")

    # Build image encoding
    encoding = processor(image, words, word_labels=word_labels, boxes=boxes, return_tensors="pt")

    # Forward pass only; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**encoding)

    # Get model predictions
    predictions = outputs.logits.argmax(-1).squeeze().tolist()

    # Map predictions to image document. Positions whose label is -100 are
    # skipped (presumably special / sub-word positions the processor masks
    # out — confirm against the processor documentation).
    token_boxes = encoding.bbox.squeeze().tolist()
    labels = encoding.labels.squeeze().tolist()
    width, height = image.size
    annotations = [
        (model.config.id2label[pred], unnormalize_box(box, width, height))
        for pred, box, label in zip(predictions, token_boxes, labels)
        if label != -100
    ]

    # Draw predictions on image document
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}

    for prediction, box in annotations:
        predicted_label = iob_to_label(prediction).lower()
        color = label2color[predicted_label]
        draw.rectangle(box, outline=color)
        draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill=color, font=font)

    # `display` only exists in the notebook environment, so skip it when
    # running inside Modelbit.
    if not mb.in_modelbit():
        display(image)
    mb.log_image(image)

    # Return raw predictions for computational use
    return predictions

# Run the function once locally on a sample memo image before deploying it.
classify_document_text("https://doc.modelbit.com/img/memo.png")

# 🚀 Deployment

In [None]:
# Hand the inference function to Modelbit for deployment.
mb.deploy(classify_document_text)