## 0. Importing libraries and data

In [2]:
import json
import torch
from torch import nn
from transformers import AutoTokenizer, BertModel
from training_utils import tokenize_and_position_sequence
from relation_extraction_encoders_modeling import Classifier_MLP, Classifier_MLP_with_AvgPooling


In [2]:
# Location of the JSON file holding the prompted relation-extraction sentences.
data_path = (
    '/home/tensorboard/Documentos/1. D4R/Relation extraction/'
    'prompted_RE_senteces_19-06-24.json'
)

In [3]:
# Load the dataset records from disk. The context manager closes the file on
# exit, so no explicit close() call is needed. The encoding is pinned because
# the platform default varies and the labels contain accented characters
# (e.g. 'Materia Teológica').
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

## 1. Adding the Special Tokens to the Model

This part is quite tricky, so I will explain it step by step:

1. First, load the normal tokenizer.

2. Add the unused tokens. Include as many unused tokens as special tokens you intend to add (e.g., if you have 6 special tokens, add 6 unused tokens).

3. Save the normal tokenizer with the new tokens to access the `vocab.txt` file. You will find [unused*] tokens listed there.

4. Follow the instructions below, extracted from [this GitHub issue](https://github.com/huggingface/transformers/issues/27974):




    hey! You should modify manually both the **added_tokens_decoder field** (saved in the `tokenizer_config.json` ) and the **added_tokens field** (saved in the `tokenizer.json`). We don't really support this manually, but that is the recommended way to do it! (If the reserved tokens were already part of the vocab, so not AddedTokens, then you have to overwrite the vocab as well, the `vocab files`, to make sure they are removed) that would be hard than if it's just the content of the added tokens that you are trying to modify 😉



This involves modifying both the `added_tokens_decoder` field in `tokenizer_config.json` and the `added_tokens` field in `tokenizer.json`. If the reserved tokens were already in the vocab and not AddedTokens, you may need to overwrite the vocab files to ensure they are correctly managed.



In [4]:
# Step 0: load the stock tokenizer that will be customised in the next steps.
base_checkpoint = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)


In [5]:
# Step 1: register the reserved [unused*] placeholders as additional special
# tokens, one placeholder per custom token that will be needed later.
special_tokens_list = [
    '[unused0]',
    '[unused1]',
    '[unused2]',
    '[unused3]',
    '[unused4]',
]
special_tokens_update = {'additional_special_tokens': special_tokens_list}
tokenizer.add_special_tokens(special_tokens_update)  # type: ignore
# Display the resulting id -> AddedToken mapping.
tokenizer.added_tokens_decoder

{0: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 4: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 5: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 6: AddedToken("[unused0]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 7: AddedToken("[unused1]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 8: AddedToken("[unused2]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 9: AddedToken("[unused3]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 10: AddedToken("[unused4]", rstrip=False, lstr

In [6]:
# Step 2: write the tokenizer files to disk so that they can be edited by hand.
# The cell output is the tuple of file paths that were written.
path_of_custom_tokenizer = '/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer'
tokenizer.save_pretrained(path_of_custom_tokenizer)

('/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer/tokenizer_config.json',
 '/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer/special_tokens_map.json',
 '/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer/vocab.txt',
 '/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer/added_tokens.json',
 '/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer/tokenizer.json')

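Step 3 (item 4 in the list above) is done by hand: follow the quoted instructions and replace each `[unused*]` entry with the corresponding custom special token in the saved tokenizer files (`tokenizer_config.json`, `tokenizer.json` and, if needed, `vocab.txt`).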

In [3]:
# Reload the tokenizer from the hand-edited files. If the manual edits were
# applied correctly, the ids that previously held the [unused*] placeholders
# now map to the custom special tokens (see the displayed mapping).
path_of_custom_tokenizer = '/home/tensorboard/Documentos/1. D4R/Relation extraction/custom basic model/custom tokenizer'
tokenizer = AutoTokenizer.from_pretrained(path_of_custom_tokenizer)
tokenizer.added_tokens_decoder

{0: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 4: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 5: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 6: AddedToken("[PERSON]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 7: AddedToken("[PERSON_REFERENCE]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 8: AddedToken("[ORG]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 9: AddedToken("[PLACE]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 10: AddedToken("[CONTEXT]", rstrip=False, ls

In [4]:
# Compare the full token count (added tokens included) with the base
# vocabulary size; equal values mean that no new ids were appended.
len(tokenizer), tokenizer.vocab_size

(31002, 31002)

## 2. Tokenizing inputs

In [None]:
# Relation classes, listed in class-id order (the list index is the class id).
labels_list = [
    'Actos Procesales',
    'Circunstancial',
    'Connivencia',
    'Materia Teológica',
    'Pertenencia',
    'Roles Procesales',
]
# Forward (name -> id) and reverse (id -> name) lookup tables.
labels_to_ids = {label: index for index, label in enumerate(labels_list)}
ids_to_labels = dict(enumerate(labels_list))

In [None]:
# Pull the model inputs and their gold labels out of the loaded records.
prompted_sentence = [record['Prompted_sentence'] for record in data]
labels = [record['Label'] for record in data]

# Encode each label as a single-element list holding its class id. A label
# that is missing from the mapping raises here instead of being silently
# dropped, which would shift every later label onto the wrong sentence.
unknown_labels = {label for label in labels if label not in labels_to_ids}
if unknown_labels:
    raise ValueError(f'Labels missing from labels_to_ids: {sorted(unknown_labels)}')
encoded_labels = [[labels_to_ids[label]] for label in labels]

In [None]:
# Sanity check: the two counts should match, i.e. every label was encoded.
len(labels), len(encoded_labels)

In [None]:
# Tokenize the whole dataset with the project helper, passing the class ids along.
tokenized_inputs = tokenize_and_position_sequence(
    sequences=prompted_sentence,
    tokenizer=tokenizer,
    labels=encoded_labels,
)

In [None]:
# Display the shapes of the token ids, the labels and the sequence positions.
tokenized_inputs['input_ids'].shape, tokenized_inputs['labels'].shape, tokenized_inputs['sequence_positions'].shape

## 3. Checking the model can handle the computation with Batches

In [None]:
# Tokenize a single example (index 100) for the shape checks in this section.
# The result is bound to text_sample only, so that the full-dataset
# tokenized_inputs built earlier is not overwritten by this one sample.
text_sample = tokenize_and_position_sequence(
    sequences=prompted_sentence[100],
    tokenizer=tokenizer,
    labels=encoded_labels[100],
)

In [None]:

# The encoder must exist before it is called; without this definition the cell
# raises NameError on a fresh kernel.
# NOTE(review): the checkpoint is assumed to be the one the tokenizer was built
# from (the vocabulary size is unchanged, so no embedding resize is needed) —
# confirm this is the intended model.
bert_test_model = BertModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Forward pass of the single tokenized sample through the encoder.
outputs = bert_test_model(
    text_sample['input_ids'],
    attention_mask=text_sample.attention_mask,
    token_type_ids=text_sample.token_type_ids,
)  # type: ignore

In [None]:
# Display the shape of the first model output next to the sequence-position shape.
outputs[0].shape, text_sample['sequence_positions'].shape

In [None]:
# Ask the tokenizer for the [SEP] id instead of hard-coding it (it is 5 in this
# vocabulary), so the cell keeps working if the vocabulary changes.
sep_token_id = tokenizer.sep_token_id
# nonzero(as_tuple=True) yields one index tensor per dimension; [1] selects the
# indices along dimension 1 and [0] takes the first match.
sep_indices = (text_sample['input_ids'] == sep_token_id).nonzero(as_tuple=True)[1][0]

In [None]:
# Keep every position up to and including the one stored in sep_indices.
keep_until = sep_indices + 1
sequence_positions = text_sample['sequence_positions']

rest_tensor = outputs[0][:, :keep_until]
# Stack the two position channels (index 0 and 1 of dimension 1) on a new
# leading axis.
rest_positions = torch.stack(
    [sequence_positions[:, channel, :keep_until] for channel in (0, 1)]
)

In [None]:
# Display the shapes of the truncated position indices and encoder states.
rest_positions.shape, rest_tensor.shape

In [None]:
# Lookup table for position indices: 500 entries of width 20, with index 499
# used as the padding index.
position_embedding = nn.Embedding(500, 20, padding_idx=499)

In [None]:
# Display the shape of the stacked position indices.
rest_positions.shape

In [None]:
# Embed the first set of position indices (rest_positions[0]).
post_1 = position_embedding(rest_positions[0])


In [None]:
# Embed the second set of position indices (rest_positions[1]) with the same table.
post_2 = position_embedding(rest_positions[1])

In [None]:
# Display the shapes of the three tensors that are concatenated next.
rest_tensor.shape, post_1.shape, post_2.shape

In [None]:
# Append both position embeddings to the encoder states along the last axis,
# then display the resulting shape.
recurrent_input_parts = (rest_tensor, post_1, post_2)
full_tensor = torch.cat(recurrent_input_parts, dim=-1)
full_tensor.shape

In [None]:
# Width of the concatenated features: the encoder width plus twice the
# position-embedding width. The same expression is used as the recurrent
# layers' input_size.
rest_tensor.shape[2] +(post_1.shape[2]*2)

In [None]:
# The LSTM and the GRU are built from one shared configuration so that the two
# variants differ only in the cell type.
recurrent_config = dict(
    input_size=rest_tensor.shape[2] + (post_1.shape[2] * 2),
    hidden_size=rest_tensor.shape[2],
    num_layers=1,
    batch_first=True,
    bidirectional=True,
)

bidirectional_stack = nn.LSTM(**recurrent_config)
bidirectional_stack_GRU = nn.GRU(**recurrent_config)

In [None]:
# Run the same input through both recurrent stacks. Each call returns the
# per-token outputs plus the final state; for nn.LSTM that final state is an
# (h_n, c_n) tuple, whereas nn.GRU returns h_n alone.
token_hidden_states, final_hidden_state = bidirectional_stack(full_tensor)
token_hidden_states_GRU, final_hidden_state_GRU = bidirectional_stack_GRU(full_tensor)

In [None]:
# Display the shapes of the per-token outputs and final hidden states of both
# stacks (final_hidden_state[0] is the LSTM's h_n).
token_hidden_states.shape, final_hidden_state[0].shape, token_hidden_states_GRU.shape, final_hidden_state_GRU.shape


In [None]:
# Build one summary vector per sequence by joining the final hidden states of
# the two directions along the feature axis.
# nn.LSTM returns its final state as (h_n, c_n). Both halves must come from
# h_n (index 0 = forward, index 1 = backward for this single-layer
# bidirectional stack); taking the second half from c_n would mix a hidden
# state with a cell state and would not mirror the GRU line below.
lstm_h_n, _lstm_c_n = final_hidden_state
final_hiddenconcat = torch.cat(tensors=(lstm_h_n[0], lstm_h_n[1]), dim=-1)
# nn.GRU returns h_n directly, so it is indexed by direction as is.
final_hiddenconcat_GRU = torch.cat(tensors=(final_hidden_state_GRU[0], final_hidden_state_GRU[1]), dim=-1)

In [None]:
# Display the shapes of the concatenated final hidden states (LSTM, GRU).
final_hiddenconcat.shape, final_hiddenconcat_GRU.shape

In [None]:
# Both heads share one configuration.
# NOTE(review): input_dim=768*2 assumes the recurrent hidden size is 768
# (two directions concatenated) — confirm against rest_tensor.shape[2].
# NOTE(review): output_dim=1 although labels_to_ids defines six classes —
# confirm this is intended for the shape check.
classifier_config = dict(
    input_dim=768*2,
    hidden_dim=300,
    output_dim=1,
    dropout_rate=0.5,
    Gelu_aproximation='none',
)

# Head applied to the concatenated final hidden states.
classifier_hidden_state = Classifier_MLP(**classifier_config)

# Head applied to the per-token outputs. The class name matches the one that
# the notebook imports (Classifier_MLP_with_AvgPooling); the name
# Classifier_MLP_with_Pooling is not defined and raised NameError.
classificer_pooling = Classifier_MLP_with_AvgPooling(**classifier_config)

In [None]:
# Apply each head to its matching representation, for the LSTM and then the
# GRU: concatenated final hidden states go to classifier_hidden_state,
# per-token outputs go to classificer_pooling.
out_final = classifier_hidden_state(final_hiddenconcat)
out_pooling = classificer_pooling(token_hidden_states)
out_final_GRU = classifier_hidden_state(final_hiddenconcat_GRU)
out_pooling_GRU = classificer_pooling(token_hidden_states_GRU)

In [None]:
# Display the output shapes of the four head/representation combinations.
out_final.shape, out_pooling.shape, out_final_GRU.shape, out_pooling_GRU.shape