In [1]:
from transformers import BertTokenizer


tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import LayoutLMv3Processor

lmv3_processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
lmv3_processor.feature_extractor.apply_ocr = False



In [3]:
from transformers import LayoutLMv3Tokenizer

lmv3_tok = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")

In [4]:
from pathlib import Path
from os import listdir

main_path = Path("/home/luckagianvechio/Documents/Material Estudo TCC/IIT CDIP/images.a.a/imagesa/a/a")
a_path = Path("a/")
a_img_folder_path = [main_path / a_path / Path(pt) for pt in listdir(main_path / a_path)]
a_img_path = []
for pt in a_img_folder_path:
    files = listdir(pt)
    for file in files:
        if not file.split(".")[1] == "xml":
            a_img_path.append(pt / file)

from ocr_tools import get_ocr_word_box_list, preprocess_image, resize_image, read_image, normalize_bbox

image = read_image(a_img_path[0])
text_boxes, shape = get_ocr_word_box_list(a_img_path[0])
words = [k["text"] for k in text_boxes]
boxes = [normalize_bbox(k["bbox"], shape[0], shape[1]) for k in text_boxes]

In [5]:
processed = lmv3_processor(
    image,
    words,
    boxes=boxes,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

In [6]:
for id, box in zip(processed['input_ids'][0][:20], processed['bbox'][0][:20]):
    print(lmv3_tok.decode([id]),"\t\t" ,box)

<s> 		 tensor([0, 0, 0, 0])
 Alt 		 tensor([13, 12, 37, 30])
empt 		 tensor([13, 12, 37, 30])
 do 		 tensor([38, 12, 48, 30])
e 		 tensor([38, 12, 48, 30])
 em 		 tensor([84, 15, 97, 16])
 age 		 tensor([340,  12, 353,  30])
� 		 tensor([340,  12, 353,  30])
� 		 tensor([340,  12, 353,  30])
 7 		 tensor([356,  14, 362,  27])
} 		 tensor([356,  14, 362,  27])
 RJ 		 tensor([ 98, 126, 113, 135])
. 		 tensor([ 98, 126, 113, 135])
 RE 		 tensor([123, 126, 128, 135])
 ER 		 tensor([260, 126, 277, 135])
T 		 tensor([260, 126, 277, 135])
 Alert 		 tensor([ 71, 154,  86, 161])
 Date 		 tensor([ 89, 154, 104, 161])
: 		 tensor([ 89, 154, 104, 161])
 December 		 tensor([130, 154, 161, 161])


In [7]:
words[:10]

['Altempt', 'doe', 'em', 'age’', '7}', 'RJ.', 'RE', 'ERT', 'Alert', 'Date:']

In [8]:
for tk in tokenizer.encode(words[:10]): print(tokenizer.decode(tk))

[ C L S ]
[ U N K ]
[ U N K ]
e m
[ U N K ]
[ U N K ]
[ U N K ]
[ U N K ]
[ U N K ]
[ U N K ]
[ U N K ]
[ S E P ]


In [9]:
for wd in words[:10]:
    print(tokenizer.tokenize(wd))
    print(tokenizer.encode(wd))
    print(tokenizer.decode(tokenizer.encode(wd)))
    print()

['Al', '##tem', '##pt']
[101, 476, 2097, 5131, 102]
[CLS] Altempt [SEP]

['do', '##e']
[101, 171, 22279, 102]
[CLS] doe [SEP]

['em']
[101, 173, 102]
[CLS] em [SEP]

['age', '’']
[101, 9174, 22361, 102]
[CLS] age ’ [SEP]

['7', '}']
[101, 977, 197, 102]
[CLS] 7 } [SEP]

['RJ', '.']
[101, 19647, 119, 102]
[CLS] RJ. [SEP]

['R', '##E']
[101, 257, 22309, 102]
[CLS] RE [SEP]

['E', '##RT']
[101, 192, 20257, 102]
[CLS] ERT [SEP]

['Ale', '##r', '##t']
[101, 1043, 22282, 22286, 102]
[CLS] Alert [SEP]

['Da', '##te', ':']
[101, 1292, 185, 131, 102]
[CLS] Date : [SEP]



In [10]:
tokenizer.cls_token_id

101

In [11]:
tokenizer.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [12]:
tokenizer.encode("Altempt")[1:-1]

[476, 2097, 5131]

In [13]:
special2bbox = {
    tokenizer.cls_token_id : lmv3_tok.cls_token_box,
    tokenizer.sep_token_id : lmv3_tok.sep_token_box,
    tokenizer.pad_token_id : lmv3_tok.pad_token_box
}

special2bbox

{101: [0, 0, 0, 0], 102: [0, 0, 0, 0], 0: [0, 0, 0, 0]}

In [14]:
def tokenize_with_bbox(
    words, 
    bboxs, 
    tokenizer,
    labels = [], 
    max_length = 512):
    assert len(words) == len(bboxs)

    input_ids = []
    tokenized_boxes = []
    tokenized_labels = []

    if labels == []:
        for wd, box in zip(words, bboxs):
            tokenized_word_with_cls_sep = tokenizer.encode(wd)
            tokenized_word = tokenized_word_with_cls_sep[1:-1]
            tokenized_box = [box for k in tokenized_word]

            input_ids += tokenized_word
            tokenized_boxes += tokenized_box
        
        if len(input_ids) >= max_length - 2:
            input_ids = [tokenizer.cls_token_id] + input_ids[:max_length - 2] + [tokenizer.sep_token_id]
            tokenized_boxes = [special2bbox[tokenizer.cls_token_id]] + \
                    tokenized_boxes[:max_length - 2] + \
                    [special2bbox[tokenizer.sep_token_id]]

        return {"input_ids": input_ids, "bbox": tokenized_boxes}
    else:
        assert len(labels) == len(words)

        for wd, box, label in zip(words, bboxs, labels):
            tokenized_word_with_cls_sep = tokenizer.encode(wd)
            tokenized_word = tokenized_word_with_cls_sep[1:-1]
            tokenized_box = [box for k in tokenized_word]
            tokenized_label = [-100 for k in tokenized_word]
            tokenized_label[0] = label

            input_ids += tokenized_word
            tokenized_boxes += tokenized_box
            tokenized_labels += tokenized_label
        
        if len(input_ids) >= max_length - 2:
            input_ids = [tokenizer.cls_token_id] + input_ids[:max_length - 2] + [tokenizer.sep_token_id]
            tokenized_boxes = [special2bbox[tokenizer.cls_token_id]] + \
                    tokenized_boxes[:max_length - 2] + \
                    [special2bbox[tokenizer.sep_token_id]]
            tokenized_labels = [-100] + tokenized_labels[:max_length - 2] + [-100]
        

        return {"input_ids": input_ids, "bbox": tokenized_boxes, "labels": tokenized_labels}

In [15]:
lmv3_tok.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>', '<mask>']

In [16]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [17]:
len(tokenize_with_bbox(words, boxes, tokenizer, max_length=100)['input_ids']), len(tokenize_with_bbox(words, boxes, tokenizer, max_length=100)['bbox'])

(100, 100)

In [18]:
pad_obj = {
    "input_ids": tokenizer.pad_token_id,
    "bbox": special2bbox[tokenizer.pad_token_id],
    "label": -100
}

pad_obj

{'input_ids': 0, 'bbox': [0, 0, 0, 0], 'label': -100}

Os tokens CLS, SEP e PAD terão [0,0,0,0] como bounding box

O token de mask mantém a mesma bounding box

O token unk mantém a mesma bounding box

In [19]:
tokenizer.tokenize("[MASK]"), tokenizer.encode("[MASK]"), tokenizer.decode(tokenizer.encode("[MASK]"))

(['[MASK]'], [101, 103, 102], '[CLS] [MASK] [SEP]')

In [20]:
def pad_tokenized(tokenized_dict, max_length):

    paded_dict = tokenized_dict.copy()

    for key, val in tokenized_dict.items():
        if len(val) >= max_length:
            return tokenized_dict
        
        paded_dict[key] = val + [pad_obj[key] for k in range(max_length - len(val))]
    
    return paded_dict
        

In [21]:
pad_tokenized(
    tokenize_with_bbox(words, boxes, tokenizer, max_length=100),
    max_length= 150
)["input_ids"][-70:]

[9109,
 22284,
 438,
 416,
 1814,
 156,
 4267,
 12893,
 12230,
 352,
 11760,
 4530,
 143,
 14883,
 823,
 6060,
 22281,
 438,
 15460,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [22]:
len(pad_tokenized(
    tokenize_with_bbox(words, boxes, tokenizer, max_length=100),
    max_length= 150
)["bbox"])

150

In [23]:
len(pad_tokenized(
    tokenize_with_bbox(words, boxes, tokenizer, max_length=100),
    max_length= 150
)["input_ids"])

150

In [24]:
image = []
boxes = []
words = []
word_labels = []

for i in range(1):
    image.append(read_image(a_img_path[i]))
    text_boxes, shape = get_ocr_word_box_list(a_img_path[i])
    words.append([k["text"] for k in text_boxes])
    boxes.append([normalize_bbox(k["bbox"], shape[0], shape[1]) for k in text_boxes])
    word_labels.append([len(word) for word in [k["text"] for k in text_boxes]])


processed = lmv3_processor(
    image,
    words,
    boxes=boxes,
    word_labels=word_labels,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
for k, v in processed.items():
    print(k, v.shape, len(v.shape))

input_ids torch.Size([1, 512]) 2
attention_mask torch.Size([1, 512]) 2
bbox torch.Size([1, 512, 4]) 3
labels torch.Size([1, 512]) 2
pixel_values torch.Size([1, 3, 224, 224]) 4


In [26]:
processed["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0

In [27]:
from new_processor import BertimbauLayoutLMv3Processor

In [28]:
BLprocessor = BertimbauLayoutLMv3Processor(
    layoutlmv3_processor=lmv3_processor,
    bertimbau_tokenizer=tokenizer
)

In [29]:
processed_BL = BLprocessor(
    images=image,
    words=words,
    boxes=boxes,
    word_labels=word_labels,
    max_length=512
)

In [30]:
for k, v in processed_BL.items():
    print(k, v.shape, len(v.shape))

pixel_values torch.Size([1, 3, 224, 224]) 4
input_ids torch.Size([1, 512]) 2
bbox torch.Size([1, 512, 4]) 3
labels torch.Size([1, 512]) 2
attention_mask torch.Size([1, 512]) 2


In [31]:
processed_BL_1 = BLprocessor(
    images=image[0],
    words=words[0],
    boxes=boxes[0],
    word_labels=word_labels[0],
    max_length=512
)

for k, v in processed_BL_1.items():
    print(k, v.shape, len(v.shape))

pixel_values torch.Size([1, 3, 224, 224]) 4
input_ids torch.Size([1, 512]) 2
bbox torch.Size([1, 512, 4]) 3
labels torch.Size([1, 512]) 2
attention_mask torch.Size([1, 512]) 2
