In [None]:
!pip install transformers datasets tokenizers seqeval -q

In [None]:
import datasets 
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer 

# Load the CoNLL-2003 dataset by name through the `datasets` library.
conll2003 = datasets.load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Inspect the loaded DatasetDict: its splits, column names and row counts.
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
conll2003.shape  # (rows, columns) of each split

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [None]:
conll2003['train'][0]  # first training example: the words plus integer-encoded POS, chunk and NER tags

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# NER tags are stored as integer class ids; this shows the tag name behind each id.
conll2003['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

From the ner_tags we can see the possibilities in the dataset being the following:

- O: the word does not belong to any entity
- B-PER / I-PER: the word is at the beginning of / inside a person entity
- B-ORG / I-ORG: the word is at the beginning of / inside an organization entity
- B-LOC / I-LOC: the word is at the beginning of / inside a location entity
- B-MISC / I-MISC: the word is at the beginning of / inside a miscellaneous entity

In [None]:
conll2003['train'].description  # description text shipped with the dataset metadata

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [None]:
# Fast BERT tokenizer; its encodings expose word_ids(), which the label alignment below relies on.
# The "uncased" checkpoint lower-cases the text (visible in the tokens printed further down).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Here we run into a problem that needs to be explained very precisely.

In [None]:
example_text = conll2003['train'][0]

# The "tokens" column holds the sentence already split into words, hence is_split_into_words=True.
tokenized_input = tokenizer(example_text['tokens'],is_split_into_words=True)

# Convert the input ids back to token strings so they can be inspected.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

# For every token, the index of the word it came from (None for special tokens).
words_ids = tokenized_input.word_ids()

print(tokens)
# The printed tokens start with [CLS] and end with [SEP]: two special tokens added by the tokenizer.
words_ids

['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']


[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

In [None]:
# Compare the number of word-level labels with the number of model input tokens.
len(example_text['ner_tags']), len(tokenized_input['input_ids'])

(9, 11)

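As you can see, we have a problem here: the dataset provides 9 labels, but the tokenizer produced 11 tokens. In this example the two extra tokens are the special tokens [CLS] and [SEP]. In addition, the BertTokenizer (like the tokenizers of many LLMs) may split a single word into several subtokens, so in general the number of tokens seen by the model differs from the number of words labelled in the dataset. This is known as the subtoken problem.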

This means we need to preprocess the labels using the word ids. The special tokens are mapped to None, and we need to give them the label -100. Why? Because labels equal to -100 are ignored by the PyTorch loss during training. Let's do it.


In [None]:
def tokenize_and_align_labels(example, label_all_tokens = True):
    """Tokenize a batch of pre-split sentences and align the NER tags with the tokens.

    Two things happen to the labels:
      * special tokens (word id ``None``) receive -100 so the loss ignores them;
      * tokens after the first one of a word either repeat the word's label
        (``label_all_tokens=True``) or are masked with -100 (``False``).

    Note: this notebook defines a function with the same name again in the
    following cell; after "Run All" the later definition is the one in effect.

    Parameters:
    example (dict): a batch with
                    - "tokens": list of sentences, each a list of words;
                    - "ner_tags": list of per-word tag-id lists, one per sentence.
    label_all_tokens (bool): whether continuation tokens keep their word's label.

    Returns:
    The tokenizer output for the batch with an added "labels" entry holding one
    aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []  # one aligned label list per sentence

    for i, label in enumerate(example["ner_tags"]):
        # Word ids must come from THIS call's encoding (tokenized_inputs), not from
        # the `tokenized_input` variable left in the kernel by an earlier cell.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # special token -> ignored by the loss
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # first token of a new word -> the word's own label
                label_ids.append(label[word_idx])
            else:
                # further token of the same word -> repeat the label or mask it
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    # Attach the labels and return only after every sentence has been processed.
    tokenized_inputs["labels"] = labels
    return tokenized_inputs






In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Tokenize a batch of pre-split sentences and line the NER tags up with the
    resulting tokens, as needed for token-classification training.

    Parameters:
    examples (dict): A batch containing
                     - "tokens": list of sentences, each given as a list of words.
                     - "ner_tags": list of tag-id lists, one tag per word.

    label_all_tokens (bool): If True, every token of a word carries that word's tag.
                             If False, only the first token of a word is labelled and
                             the remaining tokens of the same word get -100.

    Returns:
    tokenized_inputs (dict): The tokenizer output with an extra "labels" entry that
                             holds one aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_pos, word_tags in enumerate(examples["ner_tags"]):
        # word_ids() tells, for each token, which word of the sentence it came from;
        # special tokens are reported as None.
        token_to_word = tokenized_inputs.word_ids(batch_index=batch_pos)

        row = []
        last_word = None
        for current_word in token_to_word:
            if current_word is None:
                # special token: -100 makes the loss function skip it
                row.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # continuation token of the same word, masking requested
                row.append(-100)
            else:
                # first token of a word, or a continuation token that keeps the word's tag
                row.append(word_tags[current_word])
            last_word = current_word

        aligned_labels.append(row)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Run the alignment on a one-example batch (the slice keeps the batched dict-of-lists layout).
q = tokenize_and_align_labels(conll2003['train'][4:5])
print(q)
# The result now carries a "labels" key aligned with the tokens, ready for training. Let's see them side by side.

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [None]:
# Print every token of the example next to the label id it was aligned to.
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]): 
    print(f"{token:_<40} {label}") 
# [CLS] gets -100 and is therefore ignored during training. Because label_all_tokens
# defaults to True, word pieces such as "##wing" repeat their word's label rather than being set to -100.

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [None]:
# Apply the alignment function to every split of the dataset.
# batched=True makes map() pass batches of examples (a dict of lists) to the function.
tokenized_dataset = conll2003.map(tokenize_and_align_labels, batched = True)



In [None]:
# Build the token-classification model. The number of labels is read from the
# dataset instead of being hard-coded, and the id <-> tag-name mappings are stored
# in the model config so saved checkpoints and pipelines can report tag names.
ner_label_names = conll2003["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(ner_label_names),
    id2label=dict(enumerate(ner_label_names)),
    label2id={name: idx for idx, name in enumerate(ner_label_names)},
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [None]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Training hyper-parameters. "test-ner" is the output directory and
# evaluation is run once per epoch.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# The data collator organises the tokenized examples into batches for training.
# (It is expected to pad inputs and labels to a common length per batch - see the transformers docs.)
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval reports precision / recall / F1 per entity type plus overall scores.
# NOTE(review): datasets.load_metric is deprecated in recent `datasets` releases
# (the separate `evaluate` package replaces it) - confirm against the installed version.
metric = datasets.load_metric('seqeval')

# Keep the first training example at hand for the metric sanity check below.
example = conll2003['train'][0]

  metric = datasets.load_metric('seqeval')


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Tag names ordered by class id, so label_list[i] is the name of tag id i.
label_list = conll2003['train'].features['ner_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

Now let's compute the metric on the first example to check that the metric itself behaves as expected before we use it to evaluate the fine-tuned model.


In [None]:
# Translate the example's integer NER tags into their string names.
labels = [label_list[i] for i in example['ner_tags']]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
# Sanity check of the metric only: the gold labels are passed as both predictions and
# references, so every score is trivially 1.0. The real metric function is defined next.
metric.compute(predictions=[labels],references=[labels])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
def compute_metrics(eval_preds):
    """
    Convert an evaluation result into seqeval scores for the NER task.

    Parameters:
    eval_preds (tuple): A pair (logits, label ids). The logits are reduced with
                        an argmax over axis 2 to obtain one class id per token.

    Returns:
    A dictionary with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, gold_ids = eval_preds

    # argmax over the logits picks the same class as argmax over the probabilities,
    # so no softmax is needed here
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # drop every position whose gold label is -100 (ignored tokens)
        kept_pairs = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept_pairs])
        true_labels.append([label_list[g] for _, g in kept_pairs])

    results = metric.compute(predictions=predictions, references=true_labels)

    scores = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        scores[name] = results["overall_" + name]
    return scores

In [None]:
# Wire the model, the hyper-parameters, the tokenized splits, the collator and
# the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train() 


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Save the fine-tuned model into the "ner_model" directory (its config.json is edited further down).
model.save_pretrained("ner_model")

In [None]:
# Save the tokenizer files into the "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Mappings between class ids and tag names for the model config.
# Ids are kept as integers, so label2id stores real ids rather than strings;
# json.dump turns the integer id2label keys into strings when the config is written.
id2label = dict(enumerate(label_list))
label2id = {label: i for i, label in enumerate(label_list)}

In [None]:
import json

# Read the saved model config so the label mappings can be added to it.
# The context manager closes the file handle as soon as parsing is done.
with open("ner_model/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)


In [None]:
# Attach the label mappings to the loaded config dictionary.
config["id2label"] = id2label
config["label2id"] = label2id

In [None]:
# Write the updated config back. The context manager guarantees the file is
# flushed and closed before the model is reloaded from this directory.
with open("ner_model/config.json", "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)


In [None]:
# Reload the model from disk so that it picks up the edited config.json.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")


In [None]:
from transformers import pipeline


In [37]:
# Token-level NER pipeline built from the reloaded model and the tokenizer used for training.
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


# NOTE(review): this sentence is Spanish, while the model was fine-tuned from the
# bert-base-uncased checkpoint on CoNLL-2003 (English sentences in the samples above) -
# presumably why several of the tags printed below look off; confirm.
example = "Juan es el fundador de Galletas para todos en Valencia"

ner_results = nlp(example)

print(ner_results)

[{'entity': 'B-PER', 'score': 0.8335036, 'index': 1, 'word': 'juan', 'start': 0, 'end': 4}, {'entity': 'I-PER', 'score': 0.4437575, 'index': 2, 'word': 'es', 'start': 5, 'end': 7}, {'entity': 'I-ORG', 'score': 0.7028736, 'index': 4, 'word': 'fund', 'start': 11, 'end': 15}, {'entity': 'I-ORG', 'score': 0.9459004, 'index': 5, 'word': '##ador', 'start': 15, 'end': 19}, {'entity': 'I-ORG', 'score': 0.96764964, 'index': 6, 'word': 'de', 'start': 20, 'end': 22}, {'entity': 'I-ORG', 'score': 0.949765, 'index': 7, 'word': 'gall', 'start': 23, 'end': 27}, {'entity': 'I-ORG', 'score': 0.9747286, 'index': 8, 'word': '##eta', 'start': 27, 'end': 30}, {'entity': 'I-ORG', 'score': 0.9883681, 'index': 9, 'word': '##s', 'start': 30, 'end': 31}, {'entity': 'B-LOC', 'score': 0.8175368, 'index': 14, 'word': 'valencia', 'start': 46, 'end': 54}]
