# Ajuste Fino do Modelo Pré-Treinado bert-base-NER no Hugging Face para Reconhecimento de Entidades Nomeadas (NER).

Baseado no [artigo](https://medium.com/@anyuanay/working-with-hugging-face-lesson-2-3-885abc410c2d) de Yuan An

## Instalação de bibliotecas e recursos

Install Transformers and Datasets from Hugging Face

In [None]:
! pip install transformers[torch] datasets

Load the Tokenizer

In [None]:
from transformers import AutoTokenizer

# Instantiate the tokenizer for the "dslim/bert-base-NER" checkpoint; it is reused by every later cell.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

Load the WNUT 2017 Dataset

In [None]:
from datasets import load_dataset

# Load the 'wnut_17' dataset; the returned object is inspected in the cells that follow.
wnut = load_dataset('wnut_17')

## Análise exploratória de dados

In [None]:
# Display the dataset object to see its splits, their columns and their row counts.
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [None]:
# Word-level tokens of the training example at index 1 (select the row first, then the column).
wnut['train'][1]['tokens']

['From',
 'Green',
 'Newsfeed',
 ':',
 'AHFA',
 'extends',
 'deadline',
 'for',
 'Sage',
 'Award',
 'to',
 'Nov',
 '.',
 '5',
 'http://tinyurl.com/24agj38']

In [None]:
# Integer tag ids of the same training example, one per word-level token.
wnut['train'][1]['ner_tags']

[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# List the tag names; a name's position in this list is the integer id stored in 'ner_tags'.
wnut["train"].features["ner_tags"].feature.names

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

## Tokenização

Tokenize the Words into Subword Tokens with the bert-base-NER Tokenizer

In [None]:
# Tokenize the already word-split sample (training example 1); is_split_into_words=True tells the
# tokenizer that the input is a list of words rather than a raw string.
tokenized_result = tokenizer(wnut['train'][1]['tokens'], is_split_into_words=True)

In [None]:
# Vocabulary ids produced for the sample, one per subword token.
tokenized_result['input_ids']

[101,
 1622,
 2565,
 3128,
 8124,
 1174,
 131,
 138,
 13561,
 1592,
 8559,
 17638,
 1111,
 15204,
 1698,
 1106,
 14152,
 119,
 126,
 8413,
 131,
 120,
 120,
 4296,
 2149,
 1233,
 119,
 3254,
 120,
 1572,
 8517,
 3361,
 23249,
 102]

In [None]:
# Convert the ids back to token strings to see how the words were split into subword pieces.
tokens = tokenizer.convert_ids_to_tokens(tokenized_result["input_ids"])
tokens

['[CLS]',
 'From',
 'Green',
 'News',
 '##fe',
 '##ed',
 ':',
 'A',
 '##HF',
 '##A',
 'extends',
 'deadline',
 'for',
 'Sage',
 'Award',
 'to',
 'Nov',
 '.',
 '5',
 'http',
 ':',
 '/',
 '/',
 'tiny',
 '##ur',
 '##l',
 '.',
 'com',
 '/',
 '24',
 '##ag',
 '##j',
 '##38',
 '[SEP]']

In [None]:
# Lengths of the token list and of each tokenizer output field, in that order.
(len(tokens), *(len(tokenized_result[field]) for field in ('input_ids', 'token_type_ids', 'attention_mask')))


(34, 34, 34, 34)

In [None]:
# Number of word-level tags in the sample, for comparison with the subword token count above.
len(wnut['train'][1]['ner_tags'])

15

In [None]:
# To re-assign the tags to the new tokens, map each token to the index of the input word it came from.
# Tokens that do not come from any word (the special tokens) are mapped to None.
word_ids = tokenized_result.word_ids()
# Count the distinct source words, excluding None explicitly instead of assuming it is always present.
len({wid for wid in word_ids if wid is not None})

15

In [None]:
# Re-assign tags to the new tokens.
# Fetch the sample's word-level tags once; indexing the dataset inside the loop would repeat
# the same lookup for every token.
sample_tags = wnut['train'][1]['ner_tags']
input_tags = []
previous_wid = None
for wid in word_ids:
    if wid is None or wid == previous_wid:
        # Special tokens (no source word) and non-initial pieces of a word get -100,
        # the label value that PyTorch's cross-entropy loss ignores by default.
        input_tags.append(-100)
    else:
        # First piece of a new word: inherit that word's tag.
        input_tags.append(sample_tags[wid])
    previous_wid = wid


In [None]:
# The subword tokens again, printed on a single line for comparison with the aligned tags printed next.
tokens_new = tokenizer.convert_ids_to_tokens(tokenized_result['input_ids'])
print(tokens_new)

['[CLS]', 'From', 'Green', 'News', '##fe', '##ed', ':', 'A', '##HF', '##A', 'extends', 'deadline', 'for', 'Sage', 'Award', 'to', 'Nov', '.', '5', 'http', ':', '/', '/', 'tiny', '##ur', '##l', '.', 'com', '/', '24', '##ag', '##j', '##38', '[SEP]']


In [None]:
# The tags assigned to the new tokens; -100 marks special tokens and non-initial subword pieces.
print(input_tags)

[-100, 0, 0, 0, -100, -100, 0, 5, -100, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [None]:
# Keep the tag-name list for later use and look up id 5, the only non-zero tag in the sample's ner_tags.
# NOTE(review): the names are read from the "test" split here but from "train" earlier —
# presumably identical across splits; confirm.
tag_names = wnut["test"].features["ner_tags"].feature.names
tag_names[5]

'B-group'

In [None]:
def tokenize_and_align_tags(records):
    """Tokenize a batch of word-split examples and align their tags to the subword tokens.

    Args:
        records: A batch mapping with a "tokens" entry (one list of words per example)
            and a "ner_tags" entry (one list of word-level tag ids per example).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry holding, per example,
        one label per subword token: the word's tag on the first piece of each word and -100
        on special tokens and on the remaining pieces of a word.
    """
    # Words may be split into several pieces, e.g. "ChatGPT" might become ["Chat", "##G", "##PT"].
    encoded = tokenizer(records["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(records["ner_tags"]):
        # Index of the source word for every token of this example (None for special tokens).
        token_word_ids = encoded.word_ids(batch_index=row)
        # The same sequence shifted right by one, so each token can be compared with its predecessor.
        preceding_word_ids = [None, *token_word_ids[:-1]]

        aligned_labels.append([
            # Only the first piece of a real word carries that word's tag.
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(token_word_ids, preceding_word_ids)
        ])

    # Hugging Face models read the "labels" entry to compute the loss alongside the logits.
    encoded["labels"] = aligned_labels
    return encoded


Tokenize the Dataset and Assign Tags to the New Subword Tokens

In [None]:
# Apply the alignment function to every split; batched=True hands it batches of examples
# rather than one example at a time.
tokenized_wnut = wnut.map(tokenize_and_align_tags, batched=True)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [None]:
# Print every field of the first raw training record.
for key, value in wnut['train'][0].items():
    print(key, ":", value)

id : 0
tokens : ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
ner_tags : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Print every field of the first tokenized training record, including the new 'labels' column.
for key, value in tokenized_wnut['train'][0].items():
    print(key, ":", value)

id : 0
tokens : ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
ner_tags : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]
input_ids : [101, 137, 185, 18318, 13868, 1135, 112, 188, 1103, 2458, 1121, 1187, 146, 112, 182, 1690, 1111, 1160, 2277, 119, 2813, 1426, 4334, 134, 142, 19117, 119, 12004, 2213, 4162, 1303, 1314, 3440, 119, 102]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels : [-100, 0, -100, -100, -100, 0, 0, -100, 0, 0, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, -100, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
from transformers import DataCollatorForTokenClassification

# Collator for token-classification batches, built around the same tokenizer; it is passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Treinamento e validação

In [None]:
# Lookup tables passed to the model below: class index -> tag name, and the inverse.
id2label = {index: name for index, name in enumerate(tag_names)}
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Inspect the tag-name -> id mapping.
label2id

{'O': 0,
 'B-corporation': 1,
 'I-corporation': 2,
 'B-creative-work': 3,
 'I-creative-work': 4,
 'B-group': 5,
 'I-group': 6,
 'B-location': 7,
 'I-location': 8,
 'B-person': 9,
 'I-person': 10,
 'B-product': 11,
 'I-product': 12}

Load the Pre-Trained Model, Passing It the Mappings between Label Ids and Label Names

In [None]:
from transformers import AutoModelForTokenClassification

# Start from the "dslim/bert-base-NER" weights but size the classification head for this dataset's labels.
# ignore_mismatched_sizes=True lets loading proceed although the checkpoint's classifier has a different
# shape; the warning printed below reports 9 outputs in the checkpoint versus 13 in the new model.
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER", num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated

In [None]:
from transformers import Trainer, TrainingArguments

# Only the output directory is set here; every other training option keeps its library default.
training_args = TrainingArguments(
    output_dir="my_finetuned_wnut_model",
)

In [None]:
# Wire the model, training arguments, tokenized splits, tokenizer and collator into a Trainer.
# NOTE(review): eval_dataset is the "test" split although the dataset also has a "validation" split;
# confirm this is intended before using evaluation results for model selection.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning with the configuration above.
trainer.train()

Step,Training Loss
500,0.1424
1000,0.0447


TrainOutput(global_step=1275, training_loss=0.07848994049371458, metrics={'train_runtime': 7157.9758, 'train_samples_per_second': 1.422, 'train_steps_per_second': 0.178, 'total_flos': 289679322751956.0, 'train_loss': 0.07848994049371458, 'epoch': 3.0})