In [1]:
!pip install -q transformers datasets seqeval evaluate transformers[torch]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.0 MB/s[0m et

**Named entity recognition (NER)**: Find the entities (such as persons, locations, or organizations) in a sentence. This can be formulated as attributing a label to each token by having one class per entity and one class for “no entity.”

**Part-of-speech tagging (POS)**: Mark each word in a sentence as corresponding to a particular part of speech (such as noun, verb, adjective, etc.).

**Chunking**: Find the tokens that belong to the same entity. This task (which can be combined with POS or NER) can be formulated as attributing one label (usually B-) to any tokens that are at the beginning of a chunk, another label (usually I-) to tokens that are inside a chunk, and a third label (usually O) to tokens that don’t belong to any chunk.

In [2]:
from datasets import load_dataset

# Load the "conll2003" dataset. In the recorded output it comes back as a DatasetDict
# with train/validation/test splits, each holding 'tokens' plus POS, chunk and NER tags.
raw_datasets = load_dataset("conll2003")
# Bare last expression so the notebook displays the split summary.
raw_datasets

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Inspect the first training example: a list of words plus integer-encoded tag lists.
raw_datasets["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [4]:
# Feature metadata of the "ner_tags" column; it holds the ClassLabel names that the
# integer tags refer to (see the displayed Sequence(ClassLabel(...)) output).
ner_features = raw_datasets["train"].features["ner_tags"]
ner_features

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

    O means the word doesn’t correspond to any entity.
    B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.
    B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.
    B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.
    B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.

In [5]:
# List of tag names; a tag's integer id is its position in this list (e.g. 3 -> 'B-ORG').
label_names = ner_features.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [6]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]

# Pair every word with its tag name. Each column is as wide as the longer of the two,
# plus one separating space, so the words and tags line up when printed on two rows.
columns = [(word, label_names[label]) for word, label in zip(words, labels)]
line1 = "".join(word.ljust(max(len(word), len(tag)) + 1) for word, tag in columns)
line2 = "".join(tag.ljust(max(len(word), len(tag)) + 1) for word, tag in columns)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


And for an example mixing B- and I- labels, here’s what the same code gives us on the element of the training set at index 4:

In [7]:
words = raw_datasets["train"][4]["tokens"]
labels = raw_datasets["train"][4]["ner_tags"]

# Pair every word with its tag name. Each column is as wide as the longer of the two,
# plus one separating space, so the words and tags line up when printed on two rows.
columns = [(word, label_names[label]) for word, label in zip(words, labels)]
line1 = "".join(word.ljust(max(len(word), len(tag)) + 1) for word, tag in columns)
line2 = "".join(tag.ljust(max(len(word), len(tag)) + 1) for word, tag in columns)

print(line1)
print(line2)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


As we can see, entities spanning two words, like “European Union” and “Werner Zwingmann,” are attributed a B- label for the first word and an I- label for the second.

In [8]:
from transformers import AutoTokenizer

# Name of the base checkpoint; reused further down when the model itself is loaded.
model_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Displayed as the cell result (True in the recorded run). The word_ids() calls made on
# tokenizer outputs below presumably rely on a fast tokenizer — confirm in the library docs.
tokenizer.is_fast

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

True

In [9]:
# The example is already a list of words, hence is_split_into_words=True.
inputs = tokenizer(raw_datasets["train"][0]["tokens"],is_split_into_words=True)
print(inputs.tokens())
# word_ids() gives, per token, the index of the word it came from
# (None for the [CLS]/[SEP] tokens, as the printed output shows).
print(inputs.word_ids())

['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [10]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level tag ids into one tag id per token.

  Args:
    labels: integer tag ids, one per word.
    word_ids: for each token, the index of the word it belongs to, or None
      for tokens that belong to no word (special tokens).

  Returns:
    A list with one entry per element of `word_ids`: -100 where the word id is
    None, the word's tag for the first token of a word, and for every further
    token of the same word the word's tag with odd ids bumped by one.

  Note:
    The "odd id + 1" step turns B-XXX into I-XXX. It relies on a label scheme
    where each B- tag has an odd id and the matching I- tag is the next id.
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    starts_new_word = word_id != previous_word
    previous_word = word_id

    # Tokens without a source word never carry a real label.
    if word_id is None:
      aligned.append(-100)
      continue

    tag = labels[word_id]
    # Continuation token of the same word: a B-XXX tag becomes I-XXX.
    if not starts_new_word and tag % 2 == 1:
      tag += 1
    aligned.append(tag)

  return aligned



In [11]:
# Sanity-check the alignment on the first training example, using the `inputs`
# encoding of that same example produced earlier in the notebook.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()

new_labels = align_labels_with_tokens(labels,word_ids)
# Word-level tags first, then the token-level tags (-100 at the special-token positions).
print(labels)
print(new_labels)

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]


To preprocess our whole dataset, we need to tokenize all the inputs and apply align_labels_with_tokens() on all the labels. To take advantage of the speed of our fast tokenizer, it’s best to tokenize lots of texts at the same time, so we’ll write a function that processes a list of examples and use the Dataset.map() method with the option batched=True. The only thing that is different from our previous example is that the word_ids() function needs to get the index of the example we want the word IDs of when the inputs to the tokenizer are lists of texts (or in our case, list of lists of words), so we add that too:

In [12]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split examples and attach token-level labels.

  Args:
    examples: a batch with a "tokens" entry (one word list per example) and an
      "ner_tags" entry (one tag-id list per example).

  Returns:
    The tokenizer output for the batch, with an added "labels" entry holding
    one aligned label list per example.
  """
  tokenized_inputs = tokenizer(
      examples["tokens"],
      truncation=True,
      is_split_into_words=True,
  )
  # word_ids(i) selects the word ids of the i-th example in the batch.
  tokenized_inputs["labels"] = [
      align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(i))
      for i, word_labels in enumerate(examples["ner_tags"])
  ]
  return tokenized_inputs



In [13]:
# Try the batch function on a one-example batch. The slice 4:5 (rather than index 4)
# is used so each column arrives as a list of examples, which the function iterates over.
q = tokenize_and_align_labels(raw_datasets['train'][4:5])
print(q)

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [14]:
# Show each token of the single example in `q` next to its aligned label id.
tokens = tokenizer.convert_ids_to_tokens(q["input_ids"][0])
token_labels = q["labels"][0]
for token, label in zip(tokens, token_labels):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

Note that we haven’t padded our inputs yet; we’ll do that later, when creating the batches with a data collator.

In [15]:
# Apply the tokenize-and-align function to every split, in batches. The original
# columns are removed so only what the function returns is kept.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

We can’t just use a DataCollatorWithPadding like in Chapter 3 because that only pads the inputs (input IDs, attention mask, and token type IDs). Here our labels should be padded the exact same way as the inputs so that they stay the same size, using -100 as a value so that the corresponding predictions are ignored in the loss computation.

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Collate the first two training examples to see how their labels are padded.
first_two_examples = [tokenized_datasets["train"][idx] for idx in range(2)]
batch = data_collator(first_two_examples)
batch["labels"]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100]])

In [17]:
# For comparison: the labels of the same two examples as stored in the dataset,
# i.e. before the collator padded them to a common length.
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]
[-100, 1, 2, -100]


In [18]:
import evaluate

metrics = evaluate.load("seqeval")

# Translate the first example's integer tags to tag-name strings for the metric.
first_example_tags = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[tag_id] for tag_id in first_example_tags]
print(labels)

# Fake a prediction that misses one entity (position 2 set to "O") and score it.
predictions = list(labels)
predictions[2] = "O"
metrics.compute(predictions=[predictions], references=[labels])

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [19]:
import numpy as np

def compute_metrics(eval_preds):
  """Turn raw evaluation outputs into overall seqeval scores.

  Args:
    eval_preds: a (logits, labels) pair. The class scores are assumed to sit on
      the last axis of `logits`, since the argmax is taken over axis=-1.

  Returns:
    A dict with "precision", "recall", "f1" and "accuracy", taken from the
    overall_* entries computed by the metric.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Gold tag names, skipping positions marked with the ignore index -100.
  true_labels = []
  for label_row in labels:
    true_labels.append([label_names[tag] for tag in label_row if tag != -100])

  # Predicted tag names at exactly the positions that were kept above.
  true_predictions = []
  for label_row, prediction_row in zip(labels, predictions):
    kept = []
    for tag, predicted in zip(label_row, prediction_row):
      if tag != -100:
        kept.append(label_names[predicted])
    true_predictions.append(kept)

  all_metrics = metrics.compute(
      predictions=true_predictions,
      references=true_labels,
  )
  summary = {
      "precision": all_metrics["overall_precision"],
      "recall": all_metrics["overall_recall"],
      "f1": all_metrics["overall_f1"],
      "accuracy": all_metrics["overall_accuracy"],
  }
  return summary



In [20]:
# Two-way mapping between integer ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [21]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint for token classification and hand it the id <-> tag-name
# mappings. The recorded warning shows the classifier weights are newly initialized,
# so the model needs the fine-tuning that follows.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Sanity check: expected to match the number of tag names (9 in the recorded run).
model.config.num_labels

9

In [28]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; presumably needed because the training
# arguments below set push_to_hub=True — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
args = TrainingArguments(
    "distilbert-finetuned-ner",  # also appears as the Hub repo name in the training log
    evaluation_strategy="epoch",  # the recorded results table has one evaluation row per epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [30]:
from transformers import Trainer

# Wire together the model, the tokenized train/validation splits, the padding
# collator and the seqeval-based metric function.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Runs the whole fine-tuning (train_runtime was about 400 s in the recorded output).
trainer.train()

Cloning https://huggingface.co/tchoud8/distilbert-finetuned-ner into local empty directory.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0785,0.064924,0.903664,0.929822,0.916556,0.982271
2,0.0403,0.057179,0.916089,0.933356,0.924642,0.983987
3,0.024,0.059911,0.929795,0.940592,0.935163,0.985273


TrainOutput(global_step=5268, training_loss=0.06755219668082062, metrics={'train_runtime': 399.9642, 'train_samples_per_second': 105.317, 'train_steps_per_second': 13.171, 'total_flos': 446702830025004.0, 'train_loss': 0.06755219668082062, 'epoch': 3.0})

In [31]:
# Push the trained model to the Hub with an explicit commit message; the cell
# result is the commit URL shown in the recorded output.
trainer.push_to_hub(commit_message="Training complete")

To https://huggingface.co/tchoud8/distilbert-finetuned-ner
   18f2c88..c2b33de  main -> main

   18f2c88..c2b33de  main -> main

To https://huggingface.co/tchoud8/distilbert-finetuned-ner
   c2b33de..ed18f0c  main -> main

   c2b33de..ed18f0c  main -> main



'https://huggingface.co/tchoud8/distilbert-finetuned-ner/commit/c2b33def402da9bf6e12ba2810a57a262fcabccc'

https://huggingface.co/tchoud8/distilbert-finetuned-ner/commit/c2b33def402da9bf6e12ba2810a57a262fcabccc

In [32]:
from transformers import pipeline

# Replace this with your own checkpoint
# NOTE(review): this rebinds `model_checkpoint`, which earlier held the base
# "distilbert-base-uncased" name. Re-running the tokenizer/model cells after this
# one would load the fine-tuned checkpoint instead — consider a separate name.
model_checkpoint = "tchoud8/distilbert-finetuned-ner"
# With aggregation_strategy="simple" the recorded output reports whole entity groups
# (e.g. 'hugging face' as a single ORG) rather than one entry per token.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

Downloading (…)lve/main/config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/266M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'entity_group': 'PER',
  'score': 0.9980798,
  'word': 'sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.83914745,
  'word': 'hugging face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.997795,
  'word': 'brooklyn',
  'start': 49,
  'end': 57}]

In [None]:
# Disabled scratch code: save the model and tokenizer locally, rewrite the label
# maps inside ner_model/config.json, then reload the model from that directory.
# NOTE(review): as written, the json.dump line uses `config` before the json.load
# line defines it. If this cell is ever re-enabled, load and edit the config
# first and dump it last. Otherwise consider deleting the cell.
# model.save_pretrained("ner_model")
# tokenizer.save_pretrained("tokenizer")
# id2label = {str(i): label for i,label in enumerate(label_names)}
# label2id = {label: str(i) for i,label in enumerate(label_names)}
# import json
# json.dump(config, open("ner_model/config.json","w"))
# config = json.load(open("ner_model/config.json"))
# config["id2label"] = id2label
# config["label2id"] = label2id
# model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

# A custom training loop