In [4]:
from transformers import LayoutLMv3Config, LayoutLMv3Model
import json

# Build a LayoutLMv3 configuration from the project's JSON file of overrides.
# NOTE(review): the path is absolute and machine-specific; consider making it
# configurable so the notebook can run on another machine.
# JSON is UTF-8 by specification, so the encoding is pinned rather than left
# to the locale default.
with open("/home/luckagianvechio/Documents/Material Estudo TCC/code/layoutlmv3/config_alterations.json", "r", encoding="utf-8") as config_file:
    configuration = LayoutLMv3Config(**json.load(config_file))

# Instantiate the model straight from the configuration object; no checkpoint
# is loaded here.
model = LayoutLMv3Model(configuration)

# Re-bind `configuration` to the config object held by the model.
configuration = model.config

In [5]:
# Inspect the word-embedding table of the freshly constructed LayoutLMv3 model.
model.embeddings.word_embeddings

Embedding(29794, 768, padding_idx=0)

In [6]:
# Keep a reference to the model's current word-embedding module before it is
# replaced further down in the notebook.
lmv3_embeddings = model.embeddings.word_embeddings

In [7]:
# Display the full module tree of the LayoutLMv3 model.
model

LayoutLMv3Model(
  (embeddings): LayoutLMv3TextEmbeddings(
    (word_embeddings): Embedding(29794, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(512, 768, padding_idx=0)
    (x_position_embeddings): Embedding(1024, 128)
    (y_position_embeddings): Embedding(1024, 128)
    (h_position_embeddings): Embedding(1024, 128)
    (w_position_embeddings): Embedding(1024, 128)
  )
  (patch_embed): LayoutLMv3PatchEmbeddings(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
  (encoder): LayoutLMv3Encoder(
    (layer): ModuleList(
      (0-11): 12 x LayoutLMv3Layer(
        (attention): Layo

In [8]:
from transformers import BertModel  # BertModel: BERT without the pretraining heads

# Load the BERT checkpoint stored in the local `bertimbau` directory.
# NOTE(review): absolute, machine-specific path.
bertimbau_model = BertModel.from_pretrained('/home/luckagianvechio/Documents/Material Estudo TCC/code/bertimbau')


In [9]:
# Display the module tree of the loaded BERT model.
bertimbau_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29794, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [10]:
# Inspect the BERT word-embedding table to compare its size with LayoutLMv3's.
bertimbau_model.embeddings.word_embeddings

Embedding(29794, 768, padding_idx=0)

In [11]:
# Replace LayoutLMv3's word-embedding module with the BERT model's module.
# NOTE(review): this assigns the same module object to both models, so they
# share one weight tensor and an update made through either model is visible in
# the other. Confirm that sharing (rather than a copy) is intended.
model.embeddings.word_embeddings = bertimbau_model.embeddings.word_embeddings

In [12]:
# Show the embedding module now attached to the LayoutLMv3 model.
model.embeddings.word_embeddings

Embedding(29794, 768, padding_idx=0)

In [13]:
from transformers import LayoutLMv3Processor, LayoutLMv3ImageProcessor
from transformers import BertTokenizer


# BERT tokenizer for the cased Portuguese checkpoint, loaded by its hub
# identifier with lower-casing disabled.
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
# Image processor taken from the stock LayoutLMv3 base checkpoint.
image_processor = LayoutLMv3ImageProcessor.from_pretrained("microsoft/layoutlmv3-base")

In [14]:
from transformers import LayoutLMv3Tokenizer

# Stock LayoutLMv3 tokenizer, used in the following cells to inspect how it
# encodes and decodes text.
lmv3_tok = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")


In [12]:
# Encode a sample Portuguese sentence with the BERT tokenizer, returning the
# ids as a PyTorch tensor.
tokenizer.encode('Tinha uma pedra no meio do caminho.', return_tensors='pt')

tensor([[  101, 14478,   230,  5028,   202,  1423,   171,  3420,   119,   102]])

In [13]:
# Encode an un-split string together with seven boxes.
# The decoded result shown later in the notebook ('<s> O i  t u d o</s>')
# indicates that a plain string is consumed one character per box here.
lmv3_tok.encode(
    text='Oi tudo tres quatro cinco seis sete.',
    boxes=[
        [0, 1, 2, 3], [2, 3, 4, 5], [0, 1, 2, 3], [2, 3, 4, 5],
        [0, 1, 2, 3], [2, 3, 4, 5], [0, 1, 2, 3],
    ],
    return_tensors='pt',
)

tensor([[   0,  384,  939, 1437,  326, 1717,  385, 1021,    2]])

In [14]:
# Same sentence, this time pre-split on whitespace so that each of the seven
# words is paired with one of the seven boxes.
lmv3_tok.encode(
    text='Oi tudo tres quatro cinco seis sete.'.split(),
    boxes=[
        [0, 1, 2, 3], [2, 3, 4, 5], [0, 1, 2, 3], [2, 3, 4, 5],
        [0, 1, 2, 3], [2, 3, 4, 5], [0, 1, 2, 3],
    ],
    return_tensors='pt',
)

tensor([[    0,   384,   118,   326, 23259,   326,  1535,  2677, 25357,   740,
           179,   876,   842,   354,   278,   242,     4,     2]])

In [15]:
# Un-split string again, now with ten generated boxes
# ([0, 1, 2, 3], [4, 5, 6, 7], ...), to see how many characters get encoded.
lmv3_tok.encode(
    text='Oi tudo tres quatro cinco seis sete.',
    boxes=[[start, start + 1, start + 2, start + 3] for start in range(0, 40, 4)],
    return_tensors='pt',
)

tensor([[   0,  384,  939, 1437,  326, 1717,  385, 1021, 1437,  326,  910,    2]])

In [16]:
# Decode the ids obtained from the un-split string with seven boxes.
lmv3_tok.decode([0, 384, 939, 1437, 326, 1717, 385, 1021, 2])

'<s> O i  t u d o</s>'

In [17]:
# Decode the ids obtained from the un-split string with ten boxes.
lmv3_tok.decode([0, 384, 939, 1437, 326, 1717, 385, 1021, 1437, 326, 910, 2])

'<s> O i  t u d o  t r</s>'

In [18]:
# Decode the ids obtained from the pre-split (word list) input.
lmv3_tok.decode([
    0, 384, 118, 326, 23259, 326, 1535, 2677, 25357,
    740, 179, 876, 842, 354, 278, 242, 4, 2,
])

'<s> Oi tudo tres quatro cinco seis sete.</s>'

In [19]:
# Ids produced by encoding the pre-split sentence.
toks = [
    0, 384, 118, 326, 23259, 326, 1535, 2677, 25357,
    740, 179, 876, 842, 354, 278, 242, 4, 2,
]

# Decode every id on its own, separated by "--", to expose the sub-word
# boundaries chosen by the tokenizer.
for tk in toks:
    print(lmv3_tok.decode([tk]), end="--")

<s>-- O--i-- t--udo-- t--res-- qu--atro-- c--in--co-- se--is-- set--e--.--</s>--

In [20]:
# Encode the un-split string with ten overlapping boxes
# ([0, 1, 2, 3], [1, 2, 3, 4], ...) and decode the result right away.
res = lmv3_tok.encode(
    text='Oi tudo tres quatro cinco seis sete.',
    boxes=[[start, start + 1, start + 2, start + 3] for start in range(10)],
    return_tensors='pt',
)
lmv3_tok.decode(res[0])

'<s> O i  t u d o  t r</s>'

In [21]:
# Show the raw id tensor behind the decoded string above.
res

tensor([[   0,  384,  939, 1437,  326, 1717,  385, 1021, 1437,  326,  910,    2]])

In [15]:
from transformers import LayoutLMv3Processor

# Load the stock LayoutLMv3 processor.
lmv3_processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
# Words and boxes are passed in explicitly further down, so the processor's
# own OCR step is switched off. The flag is set on `image_processor`;
# `feature_extractor` is a deprecated alias for the same component.
lmv3_processor.image_processor.apply_ocr = False




In [16]:
from pathlib import Path
from os import listdir

# Root of the local image sample.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
main_path = Path("/home/luckagianvechio/Documents/Material Estudo TCC/IIT CDIP/images.a.a/imagesa/a/a")
a_path = Path("a/")

# Sub-folders of `main_path / a_path`. Sorted because directory listing order
# is arbitrary and later cells index into the result (`a_img_path[0]`).
# Non-directory entries are skipped so that listing them cannot fail.
a_img_folder_path = sorted(
    folder for folder in (main_path / a_path).iterdir() if folder.is_dir()
)

# Collect every regular file that is not an XML file. `Path.suffix` replaces
# `name.split(".")[1]`, which raised IndexError for names without a dot and
# picked the wrong segment for names containing several dots.
a_img_path = [
    file_path
    for folder in a_img_folder_path
    for file_path in sorted(folder.iterdir())
    if file_path.is_file() and file_path.suffix != ".xml"
]

In [17]:
from ocr_tools import get_ocr_word_box_list, preprocess_image, resize_image, read_image, normalize_bbox

# Load the first collected image and fetch its OCR word records from the same
# file.
image = read_image(a_img_path[0])
text_boxes, shape = get_ocr_word_box_list(a_img_path[0])

# Split the OCR records into two parallel lists: word strings and boxes.
words = [entry["text"] for entry in text_boxes]
# NOTE(review): assumes `normalize_bbox` expects the two image dimensions in
# the order shape[0], shape[1]; confirm against `ocr_tools`.
boxes = [
    normalize_bbox(entry["bbox"], shape[0], shape[1])
    for entry in text_boxes
]

In [18]:
# Run the stock LayoutLMv3 processor on the image plus the OCR words and
# boxes, padding/truncating the token sequence to 512 and returning PyTorch
# tensors.
processed = lmv3_processor(
    image,
    words,
    boxes=boxes,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

In [26]:
# List the fields returned by the processor.
processed.keys()

dict_keys(['input_ids', 'attention_mask', 'bbox', 'pixel_values'])

In [27]:
# Show a 50-row slice of the first sample's bounding boxes, indexed from the
# end of the sequence.
processed['bbox'][0][-350:-300]

tensor([[127, 368, 149, 377],
        [151, 368, 174, 375],
        [ 71, 389,  98, 398],
        [ 71, 389,  98, 398],
        [100, 389, 110, 396],
        [113, 389, 133, 396],
        [113, 389, 133, 396],
        [113, 389, 133, 396],
        [113, 389, 133, 396],
        [135, 389, 179, 396],
        [181, 389, 192, 396],
        [181, 389, 192, 396],
        [194, 391, 206, 396],
        [208, 389, 221, 396],
        [223, 389, 233, 396],
        [236, 390, 245, 396],
        [247, 388, 275, 397],
        [278, 388, 285, 397],
        [286, 388, 297, 395],
        [ 71, 399, 100, 407],
        [102, 400, 116, 407],
        [118, 399, 125, 407],
        [127, 399, 152, 407],
        [154, 399, 171, 407],
        [173, 399, 176, 407],
        [178, 401, 194, 409],
        [196, 399, 202, 407],
        [203, 399, 212, 407],
        [214, 399, 238, 407],
        [240, 399, 253, 406],
        [255, 399, 261, 406],
        [262, 399, 271, 406],
        [273, 399, 301, 406],
        [ 

In [28]:
# Forward pass using the ids produced by the stock LayoutLMv3 processor.
# This call fails with the IndexError recorded in the output: the printed ids
# include values such as 37317 and 40950, while the embedding table printed
# alongside them has only 29794 rows.
model(**processed)

LayoutLMv3TextEmbeddings(
  (word_embeddings): Embedding(29794, 768, padding_idx=0)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (position_embeddings): Embedding(512, 768, padding_idx=1)
  (x_position_embeddings): Embedding(1024, 128)
  (y_position_embeddings): Embedding(1024, 128)
  (h_position_embeddings): Embedding(1024, 128)
  (w_position_embeddings): Embedding(1024, 128)
) tensor([[    0,  7330, 15318,   109,   242,  2841,  1046,    17,    27,   262,
         24303, 37317,     4,  4979, 13895,   565, 14718, 10566,    35,   719,
           753,     6,  7528, 33355, 10566,    35,  5457,   759,   733,     6,
          7528, 40950,  1499,  4177, 10486,    35, 30627,     6,   886, 40950,
          1499, 11185,   337,  1872,    44,   711,   713, 11020, 20026, 13133,
             9,   143,  1203,     6, 11566,     6, 15155,  1120,    50, 11267,
          2332,    14, 26238,  

IndexError: index out of range in self

In [None]:
#model.embeddings.word_embeddings = lmv3_embeddings

In [None]:
# Decode the first sample's ids back to text with the LayoutLMv3 tokenizer to
# check what the processor encoded.
lmv3_tok.decode(processed['input_ids'][0])

'<s> Altempt doe em age’ 7} RJ. RE ERT Alert Date: December 19, 1997 Effective Date: = November 21, 1997 Restriction Geography: Compton, California Restriction Detalts ‘This ordinance prohibits placement of any sign, poster, placard or graphic display that advertises cigarettes or alcoholic beverages in a publicly visible location. Publicly visible tocation includes outdoor billboards, sides of buildings and freestanding signboards. The ordinance DOES NOT APPLY to the following: + Signs inside the premises that are licensed to sell cigarettes. « Commercial vehicles used to transport cigarettes «Signs an Metropolitan Transit Aulhorily vehicles ¢ Signs on property adjacent to an interstate highway + Contracts executed before the effective date of this ordnance or to any renewal term of an existing contract Cigarette and alcoho! advertisements thal now exist and are prohibited by this ordinance must be removed within 2 years of the effective date of the ordinance (November 21, 1999). w 8 

In [None]:
# Same processor call as for `processed`, with a dummy label of 1 for every
# word, to inspect the "labels" field the processor returns.
lmv3_processor(
    image,
    words,
    boxes=boxes,
    word_labels=[1] * len(words),
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)["labels"]

tensor([[-100,    1, -100,    1, -100,    1,    1, -100, -100,    1, -100,    1,
         -100,    1,    1, -100,    1,    1, -100,    1,    1, -100,    1,    1,
            1, -100,    1,    1,    1, -100,    1,    1, -100,    1, -100, -100,
            1, -100,    1,    1, -100,    1, -100, -100,    1,    1, -100,    1,
            1,    1,    1,    1,    1, -100,    1, -100,    1, -100,    1,    1,
            1,    1,    1, -100,    1,    1,    1,    1,    1,    1,    1,    1,
            1, -100,    1, -100,    1,    1, -100,    1,    1,    1, -100,    1,
            1,    1,    1,    1, -100, -100,    1, -100, -100,    1,    1,    1,
            1,    1, -100, -100,    1,    1,    1, -100,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1, -100,    1,    1,    1,    1,
            1,    1,    1,    1, -100, -100,    1,    1,    1,    1, -100, -100,
         -100,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,  

In [19]:
from new_processor import tokenize_with_bbox, pad_tokenized

# Tokenize the OCR words and boxes with the BERT tokenizer through the
# project's helper, then pad the result to 512 with the companion helper.
new_processed = tokenize_with_bbox(words=words, bboxs=boxes, tokenizer=tokenizer, max_length=512)
padded_new_processed = pad_tokenized(new_processed, max_length=512)

In [None]:
# Alias for the BERT model's word-embedding module.
bertimbau_embeddings = bertimbau_model.embeddings.word_embeddings
# Attach that same module object to the LayoutLMv3 model (shared, not copied).
model.embeddings.word_embeddings = bertimbau_embeddings

In [20]:
from torch import Tensor

import torch

# Start from a copy of the processor output so the entries that are not
# replaced below are carried over as they are.
processed_new_tokenizer = processed.copy()

# `.copy()` is shallow: the copy holds the very same tensor objects as
# `processed`, so assigning into a row of them would also overwrite
# `processed`. Each replaced tensor is therefore cloned first.
for field in ("input_ids", "bbox"):
    replaced = processed[field].clone()
    # Convert with the dtype of the tensor being replaced instead of going
    # through the float32 default of `Tensor(...)`.
    replaced[0] = torch.as_tensor(padded_new_processed[field], dtype=replaced.dtype)
    processed_new_tokenizer[field] = replaced

# NOTE(review): `attention_mask` still comes from the LayoutLMv3 tokenization.
# If the new tokenization pads at different positions it needs replacing too;
# confirm against `pad_tokenized`.

In [21]:
# Forward pass with the ids produced by the BERT tokenizer.
# NOTE(review): no `model.eval()` or `torch.no_grad()` is visible in this
# notebook, and the output carries a grad_fn, so autograd is tracking this call
# and dropout is presumably active. Confirm that this is intended.
model(**processed_new_tokenizer)

LayoutLMv3TextEmbeddings(
  (word_embeddings): Embedding(29794, 768, padding_idx=0)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (position_embeddings): Embedding(512, 768, padding_idx=0)
  (x_position_embeddings): Embedding(1024, 128)
  (y_position_embeddings): Embedding(1024, 128)
  (h_position_embeddings): Embedding(1024, 128)
  (w_position_embeddings): Embedding(1024, 128)
) tensor([[  476,  2097,  5131,   171, 22279,   173,  9174, 22361,   977,   197,
         19647,   119,   257, 22309,   192, 20257,  1043, 22282, 22286,  1292,
           185,   131,  4534,   210,   604,   275,   117,  6827,   192, 14782,
          5965,  1086,  1292,   185,   131,   134,  4424,   210,   604,  2250,
           117,  6827,  4151,  9581,  1131,  1992,  4009,  9558,   131,  2174,
           897,   117, 19921,  4151,  9581,  1131,   540,   477,  1355,  8586,
         16989,  4189,   387,  



BaseModelOutput(last_hidden_state=tensor([[[-0.1568, -0.2598, -0.2371,  ..., -1.0681,  0.5948,  0.8614],
         [-0.0803, -2.0187, -1.3858,  ...,  0.1783, -1.8378,  1.4976],
         [-0.3958,  0.6073, -0.3292,  ..., -1.5268, -0.1432, -0.4828],
         ...,
         [ 1.0745,  1.1467, -0.3228,  ..., -0.8103,  0.1111,  0.6372],
         [ 1.1398,  1.4831,  0.6356,  ..., -0.8528, -0.7138,  0.1530],
         [ 0.6453,  0.7477,  1.2914,  ..., -0.9224,  0.0406,  0.0370]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [None]:
# For every field, print the squeezed shape in the re-tokenized batch next to
# the squeezed shape in the original batch.
for key in processed.keys():
    print(
        processed_new_tokenizer[key].squeeze().shape,
        processed[key].squeeze().shape,
    )

torch.Size([512]) torch.Size([512])
torch.Size([512]) torch.Size([512])
torch.Size([512, 4]) torch.Size([512, 4])
torch.Size([3, 224, 224]) torch.Size([3, 224, 224])


In [None]:
# Decode the first sample's ids in `processed_new_tokenizer` with the BERT
# tokenizer.
tokenizer.decode(processed_new_tokenizer['input_ids'][0])

'Altempt doe em age ’ 7 } RJ. RE ERT Alert Date : December 19, 1997 Effective Date : = November 21, 1997 Restriction Geography : Compton, California Restriction Detalts ‘ This ordinance prohibits placement of any sign, poster, placard or graphic display that advertises cigarettes or alcoholic beverages in a publicly visible location. Publicly visible tocation includes outdoor billboards, sides of buildings and freestanding signboards. The ordinance DOES NOT APPLY to the following : + Signs inside the premises that are licensed to sell cigarettes. « Commercial vehicles used to transport cigarettes « Signs an Metropolitan Transit Aulhorily vehicles [UNK] Signs on property adjacent to an interstate highway + Contracts executed before the effective date of this ordnance or to any renewal term of an existing contract Cigarette and alcoho! advertisements thal now exist and are prohibited by this ordinance must be removed within 2 years of the effective date of the ordinance ( November 21, 19

In [None]:
# Show the raw id tensor stored in `processed_new_tokenizer`.
processed_new_tokenizer['input_ids']

tensor([[  476,  2097,  5131,   171, 22279,   173,  9174, 22361,   977,   197,
         19647,   119,   257, 22309,   192, 20257,  1043, 22282, 22286,  1292,
           185,   131,  4534,   210,   604,   275,   117,  6827,   192, 14782,
          5965,  1086,  1292,   185,   131,   134,  4424,   210,   604,  2250,
           117,  6827,  4151,  9581,  1131,  1992,  4009,  9558,   131,  2174,
           897,   117, 19921,  4151,  9581,  1131,   540,   477,  1355,  8586,
         16989,  4189,   387,  1710,   258,  4095,  5257, 22281,  1269,  2812,
           265,   586,   360, 22326,  1493,   117, 13608,   140,   117,  9109,
         22284,   438,   416,  1814,   156,  4267, 12893, 12230,   352, 11760,
          4530,   143, 14883,   823,  6060, 22281,   438, 15460, 15203,   156,
          5294,  5042,  4119,   238,   123,  1336,  2021,  1003, 20900,   817,
           352,   484,   119, 10927,  2021,  1003, 20900,  7384,  1131,   868,
          1111,   291, 22286,   243,   141,  3425, 2

In [None]:
# Largest token id in the first sample. `Tensor.max()` does the reduction
# inside torch instead of iterating over the tensor element by element with
# the Python builtin `max`.
processed_new_tokenizer['input_ids'][0].max()

tensor(22361)

In [None]:
# Alias for the BERT model's word-embedding module.
bertimbau_embeddings = bertimbau_model.embeddings.word_embeddings

In [None]:
# Look up the embedding of the largest token id in the first sample, to
# confirm that it is inside the table. `Tensor.max()` replaces the Python
# builtin `max`, which iterates over the tensor element by element.
bertimbau_embeddings(processed_new_tokenizer['input_ids'][0].max())

tensor([ 5.1749e-02, -7.5483e-03, -5.3265e-02,  7.9719e-03, -2.2911e-02,
        -3.7143e-02,  4.8871e-03, -1.9706e-02, -2.4039e-04, -4.5227e-02,
        -1.8988e-02,  1.3754e-02, -2.5060e-02, -3.9911e-02, -3.8198e-02,
         7.9675e-03,  1.0591e-02, -1.5516e-02, -1.9410e-02, -9.3070e-02,
         3.8822e-02,  2.4400e-02, -4.6231e-02, -3.9224e-02, -4.9267e-02,
        -5.1528e-02,  2.4972e-03, -4.0830e-02, -1.8358e-02, -3.7181e-02,
        -3.4045e-02, -2.0300e-02,  4.2643e-02, -4.8424e-02,  3.0170e-02,
         3.0366e-02, -2.8620e-02, -6.3595e-02,  1.1377e-03, -4.3281e-02,
         2.6231e-02, -4.0770e-02,  6.7154e-02,  2.0910e-02,  4.2354e-02,
        -2.5098e-02, -2.5217e-02, -6.4154e-02,  1.2690e-02, -1.4113e-02,
         3.6339e-02, -1.0496e-02, -6.5917e-02,  5.9714e-02,  3.0761e-02,
        -2.3547e-01, -2.7711e-02,  1.0110e-02, -8.5547e-03, -1.9117e-02,
        -3.2750e-02, -1.3825e-02,  6.4325e-03, -1.0901e-02, -3.0468e-02,
        -1.7198e-02, -1.7289e-02, -1.8503e-02, -7.0

In [None]:
# Pass the whole id tensor through the BERT embedding module and show the
# shape of the result after dropping size-1 dimensions.
bertimbau_embeddings(processed_new_tokenizer['input_ids']).squeeze().shape

torch.Size([512, 768])

In [None]:
# Print (key, BERT value, LayoutLMv3 value) for every config attribute that
# both models have, in the order of the LayoutLMv3 config.
bert_config_attrs = bertimbau_model.config.__dict__
for key, lmv3_value in model.config.__dict__.items():
    # Attributes present only on the LayoutLMv3 config are skipped explicitly.
    # The previous bare `except: pass` hid every other error as well,
    # including KeyboardInterrupt.
    if key not in bert_config_attrs:
        continue
    print((key, bert_config_attrs[key], lmv3_value), end = "  -- ")

('return_dict', True, True)  -- ('output_hidden_states', False, False)  -- ('output_attentions', False, False)  -- ('torchscript', False, False)  -- ('torch_dtype', None, None)  -- ('use_bfloat16', False, False)  -- ('tf_legacy_loss', False, False)  -- ('pruned_heads', {}, {})  -- ('tie_word_embeddings', True, True)  -- ('chunk_size_feed_forward', 0, 0)  -- ('is_encoder_decoder', False, False)  -- ('is_decoder', False, False)  -- ('cross_attention_hidden_size', None, None)  -- ('add_cross_attention', False, False)  -- ('tie_encoder_decoder', False, False)  -- ('max_length', 20, 20)  -- ('min_length', 0, 0)  -- ('do_sample', False, False)  -- ('early_stopping', False, False)  -- ('num_beams', 1, 1)  -- ('num_beam_groups', 1, 1)  -- ('diversity_penalty', 0.0, 0.0)  -- ('temperature', 1.0, 1.0)  -- ('top_k', 50, 50)  -- ('top_p', 1.0, 1.0)  -- ('typical_p', 1.0, 1.0)  -- ('repetition_penalty', 1.0, 1.0)  -- ('length_penalty', 1.0, 1.0)  -- ('no_repeat_ngram_size', 0, 0)  -- ('encoder_no_r