### Libraries :

In [None]:
!pip install -q transformers==4.39.2

In [None]:
# CUDA 11.8
#!pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118
# CPU only
!pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu

In [None]:
## CPU :
!pip install paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple
## GPU :
#!pip install paddlepaddle-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
!pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+

### Inference :

#### Working with Paddle :

In [None]:
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw, ImageFont,ImageEnhance

In [None]:
# Build the shared PaddleOCR engine used by Preprocess() and get_word().
# NOTE(review): `rec=False` is passed here, yet every ocr.ocr(...) call in this
# notebook passes rec=True explicitly — confirm which setting is intended.
# NOTE(review): `use_tensorrt = True` is requested while the install cell above
# installs the CPU build of paddlepaddle — confirm the flag is wanted on CPU.
ocr = PaddleOCR(use_angle_cls=False,
                lang='fr',
                  rec=False,
                  use_tensorrt = True
                ) # only needs to run once: downloads the model and loads it into memory


In [None]:
# Placeholder label names: replace them with the label set of the fine-tuned model.
# The position of each name in this list becomes its class id (id2label is built
# by enumerating this list) — presumably it must match the order used at training
# time; TODO confirm. get_Finale_results() discards the label named "Autre".
labels = ['label1', 'label2', 'label3','...']
labels

In [None]:
# Class index -> label name, following the order of `labels`.
id2label = dict(enumerate(labels))
# Inverse lookup: label name -> class index.
label2id = {name: index for index, name in id2label.items()}
label2id

In [None]:
def processbbox(BBOX, width, height):
  """Turn an OCR quadrilateral into an axis-aligned box on a 0-1000 grid.

  Args:
    BBOX: sequence of (x, y) corner points in pixel coordinates.
    width: image width in pixels.
    height: image height in pixels.

  Returns:
    [x1, y1, x2, y2] as integers, scaled so the full image spans 0..1000
    and clamped to that range.
  """
  xs = [point[0] for point in BBOX]
  ys = [point[1] for point in BBOX]
  # Use the extremes of every corner rather than only corners 0 and 2, so a
  # skewed or rotated quadrilateral is still fully enclosed by the box.
  scaled = (
    1000 * min(xs) / width,   # X1
    1000 * min(ys) / height,  # Y1
    1000 * max(xs) / width,   # X2
    1000 * max(ys) / height,  # Y2
  )
  # OCR corners can fall slightly outside the image; keep the result on the
  # 0..1000 grid the layout model expects.
  return [max(0, min(1000, int(value))) for value in scaled]
def enhance_image(image,brightness_factor, contrast_factor):
    """Apply a brightness adjustment, then a contrast adjustment, to `image`.

    Args:
        image: PIL image to enhance.
        brightness_factor: factor handed to ImageEnhance.Brightness.
        contrast_factor: factor handed to ImageEnhance.Contrast.

    Returns:
        The image produced by the contrast step, which runs on the
        brightness-adjusted image.
    """
    after_brightness = ImageEnhance.Brightness(image).enhance(brightness_factor)
    return ImageEnhance.Contrast(after_brightness).enhance(contrast_factor)

def Preprocess(Image_path):
    """Load an image, enhance it, and run OCR to collect words and their boxes.

    Args:
        Image_path: path of the image file; it is both opened with PIL and
            handed to the OCR engine.

    Returns:
        dict with keys 'image' (enhanced RGB PIL image), 'tokens' (recognised
        strings) and 'bboxes' (boxes on the 0-1000 grid from processbbox).
        Both lists are empty when OCR finds no text.
    """
    # Load inside a context manager so the file handle is released, and
    # normalise to RGB before enhancing so the enhancers always work on a
    # plain RGB image regardless of the file's original mode.
    with Image.open(Image_path) as source:
        image = source.convert("RGB")
    image = enhance_image(image,1.3,1.7)
    width, height = image.size

    results = ocr.ocr(Image_path, cls=False,rec = True)
    # The per-image entry is None when nothing was detected (get_word checks
    # for the same `[None]` shape); iterate over an empty list in that case.
    page_lines = results[0] if results else None

    test_dict = {'image': image ,'tokens':[], "bboxes":[]}
    for item in page_lines or []:
        # item[0] holds the corner points, item[1][0] the recognised text.
        test_dict['tokens'].append(item[1][0])
        test_dict['bboxes'].append(processbbox(item[0], width, height))

    print(test_dict['bboxes'])
    print(test_dict['tokens'])
    return test_dict


In [None]:
# Placeholder: set this to the path of the document image to analyse.
Image_path = 'PATH_TO_IMAGE'
# `example` is a dict with the keys 'image', 'tokens' and 'bboxes' built by Preprocess().
example = Preprocess(Image_path)

In [None]:
#example

In [None]:
from transformers import AutoModelForTokenClassification
from transformers import AutoProcessor
# Placeholder identifier of the fine-tuned checkpoint; the processor is loaded from it too.
model_Hugging_path = "MODEL_REPO_ID"
# NOTE(review): `model` is assigned again further down via
# LayoutLMv3ForTokenClassification.from_pretrained(model_Hugging_path), so this
# first load looks redundant — confirm before removing.
model = AutoModelForTokenClassification.from_pretrained(model_Hugging_path)

In [None]:
# Unpack the OCR output produced by Preprocess().
image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]

# apply_ocr=False: the words and boxes are supplied explicitly below instead of
# being produced by the processor.
processor = AutoProcessor.from_pretrained(model_Hugging_path, apply_ocr=False)
# Sequences are truncated / padded to exactly 512 positions and returned as PyTorch tensors.
encoding = processor(image, words, boxes=boxes,return_offsets_mapping=True,truncation=True, max_length=512, padding="max_length", return_tensors="pt")
# The offsets are kept separately for get_Finale_results() (sub-word detection) and
# removed from `encoding` before it is unpacked into the model call.
offset_mapping = encoding.pop('offset_mapping')

In [None]:
#encoding['input_ids']  # IDs of the tokens in the model's vocabulary

In [None]:
import torch

# Use the GPU when torch reports CUDA as available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move every entry of the encoding to the chosen device; only the values are
# replaced, the set of keys is unchanged while iterating.
for k,v in encoding.items():
  encoding[k] = v.to(device)

In [None]:
from transformers import LayoutLMv3ForTokenClassification

# load the fine-tuned model from the hub
model = LayoutLMv3ForTokenClassification.from_pretrained(model_Hugging_path)
model.to(device)
# Make inference mode explicit (dropout off) rather than relying on loader defaults.
model.eval()

# forward pass — inference only, so no autograd graph needs to be recorded
with torch.no_grad():
    outputs = model(**encoding)
print(outputs.logits.shape)

In [None]:
def unnormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 grid back to pixel coordinates.

    Args:
        bbox: [x1, y1, x2, y2] on the 0-1000 grid.
        width: image width in pixels (applied to the x values).
        height: image height in pixels (applied to the y values).

    Returns:
        List of four numbers [x1, y1, x2, y2] in pixels.
    """
    # x coordinates sit at even positions, y coordinates at odd positions.
    axis_sizes = (width, height, width, height)
    return [size * (bbox[position] / 1000) for position, size in enumerate(axis_sizes)]


**Autre** : c'est la classe « Autre », qui correspond aux prédictions jugées non intéressantes par le modèle. Vous devez bien sûr l'utiliser comme label lors de la labellisation, à côté des classes qui vous intéressent.

In [None]:
import numpy as np

def drop_null_bbox(dictionary):
    """Remove, in place, every entry whose bounding box contains only zeros.

    Args:
        dictionary: mapping whose values are 3-item sequences with the
            bounding box in the last position.

    Returns:
        None; `dictionary` itself is modified.
    """
    # Collect the keys first: a dict cannot change size while being iterated.
    empty_keys = [
        key
        for key, (_, _, box) in dictionary.items()
        if not any(coordinate != 0.0 for coordinate in box)
    ]
    for key in empty_keys:
        dictionary.pop(key)

def get_word(bboxes,image):
    """Re-run text recognition on one region of `image` and return its text.

    Args:
        bboxes: (x_min, y_min, x_max, y_max) of the region, in pixels.
        image: PIL image to crop from.

    Returns:
        The recognised string, or "" when the OCR engine returns nothing usable.
    """
    x_min, y_min, x_max, y_max = bboxes
    roi = image.crop((x_min, y_min, x_max, y_max))  # region of interest
    # Detection is disabled: the crop is passed directly to the recogniser.
    result = ocr.ocr(np.array(roi), cls=False,det = False,rec = True)
    # The text is read from result[0][0][0]; check each level so an empty or
    # None entry (e.g. `[None]`) yields "" instead of raising.
    if not result or not result[0] or not result[0][0]:
        return ""
    return result[0][0][0]
#############################################################################
#############################################################################
def get_Finale_results(offset_mapping,id2label,image,prediction_scores,predictions,token_boxes):
    """Keep the most confident token for each predicted label and read its text.

    Args:
        offset_mapping: tensor of per-token character offsets from the processor.
        id2label: mapping from class index to label name.
        image: PIL image the tokens were extracted from.
        prediction_scores: per-token list of class probabilities.
        predictions: per-token predicted class index.
        token_boxes: per-token box on the 0-1000 grid.

    Returns:
        dict mapping each retained label to a tuple (text, score, pixel_box),
        where text comes from get_word() on the token's box. Tokens labelled
        "Autre", sub-word continuation tokens and tokens whose box is all
        zeros are ignored.
    """
    width, height = image.size
    # A token whose first character offset is not 0 continues the previous word.
    is_subword = np.array(offset_mapping.squeeze().tolist())[:,0] != 0

    best_per_label = {}
    for idx, (pred, score, box) in enumerate(zip(predictions, prediction_scores, token_boxes)):
        if is_subword[idx]:
            continue
        label = id2label[pred]
        if label == "Autre":
            continue
        pixel_box = unnormalize_box(box, width, height)
        # Skip all-zero boxes (presumably special / padding tokens) BEFORE
        # picking the best candidate. Otherwise such a token with the highest
        # score would win the label and then be dropped, losing the label
        # even though a real token carried it.
        if all(value == 0.0 for value in pixel_box):
            continue
        confidence = score[pred]
        if label not in best_per_label or confidence > best_per_label[label][0]:
            best_per_label[label] = (confidence, pixel_box)

    # Replace the token index by the text recognised inside the winning box.
    return {
        label: (get_word(pixel_box, image), confidence, pixel_box)
        for label, (confidence, pixel_box) in best_per_label.items()
    }

In [None]:
# Per-token class probabilities (softmax over the last axis) and predicted class ids.
# squeeze() drops size-1 axes — assumes a single document in the batch; TODO confirm.
prediction_scores = outputs.logits.softmax(-1).squeeze().tolist()
predictions = outputs.logits.argmax(-1).squeeze().tolist()
# Boxes as fed to the model, i.e. still on the 0-1000 grid produced by processbbox().
token_boxes = encoding.bbox.squeeze().tolist()

# One entry per retained label: (text, score, pixel box).
Finale_results=get_Finale_results(offset_mapping,id2label,image,prediction_scores,predictions,token_boxes)

In [None]:
Finale_results

In [None]:
import time

def Draw(image, results=None):
    """Draw the predicted boxes and their labels on an enhanced copy of `image`.

    Args:
        image: PIL image to annotate; it is passed through enhance_image first
            and the drawing happens on the image that call returns.
        results: mapping label -> (text, score, pixel_box). Defaults to the
            notebook-level `Finale_results` when omitted.

    Returns:
        (annotated_image, execution_time) where execution_time is in seconds.
    """
    start_time = time.perf_counter()

    if results is None:
        results = Finale_results

    image = enhance_image(image,1.3,1.7)

    draw = ImageDraw.Draw(image)
    label2color = {
        'label1': 'blue',
        'label2': 'green',
        'label3': 'orange',
    }
    # Labels without a dedicated colour are still drawn instead of raising KeyError.
    fallback_color = 'red'

    # Adjust the thickness of the rectangle outline and label text position
    rectangle_thickness = 4
    label_x_offset = 20
    label_y_offset = -30
    # Custom font size
    custom_font_size = 25

    # Load a font with the custom size; fall back to Pillow's built-in font
    # when the font file is not available on this machine.
    font_path = "arial.ttf"  # Specify the path to your font file
    try:
        custom_font = ImageFont.truetype(font_path, custom_font_size)
    except OSError:
        custom_font = ImageFont.load_default()

    for predicted_label, entry in results.items():
        box = entry[2]
        color = label2color.get(predicted_label, fallback_color)
        draw.rectangle(box, outline=color, width=rectangle_thickness)
        # Label banner: a strip sitting directly above the box (label_y_offset
        # is negative), so the recognised text inside the box stays visible.
        draw.rectangle((box[0], box[1] + label_y_offset, box[2], box[1]), fill=color)
        draw.text((box[0] + label_x_offset, box[1] + label_y_offset), text=predicted_label, fill='white', font=custom_font)

    execution_time = time.perf_counter() - start_time

    return image,execution_time

In [None]:
# Store the result under its own name: binding it to `Image` would shadow the
# PIL `Image` module imported at the top, breaking Image.open() in Preprocess()
# on any later re-run.
annotated_image, runtime = Draw(image)
annotated_image