In [None]:
# https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/

In [None]:
!pip install datasets
!pip install tokenizers
!pip install transformers



In [None]:
from datasets import load_dataset

# Load the English ("en") configuration of the "wikiann" dataset.
en_dataset = load_dataset("wikiann", "en")

In [None]:
# Display the loaded splits with their features and row counts.
en_dataset

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [None]:
# Feature descriptor of the "ner_tags" column, taken from the train split.
ner_feature = en_dataset["train"].features["ner_tags"]

In [None]:
# Tag names of the NER feature; a tag's position in this list is the integer id
# used in "ner_tags" (label_names is indexed by those ids later on).
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [None]:
# Name of the pretrained checkpoint that is fine-tuned in this notebook.
model_checkpoint = "bert-base-multilingual-cased"

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from `model_checkpoint` (rather than a repeated string
# literal) so the tokenizer and the model can never come from different checkpoints.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Peek at the pre-split words of the first training example.
en_dataset["train"][0]["tokens"]

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']

In [None]:
# Tokenize the first training example. The input is already a list of words,
# hence is_split_into_words=True.
inputs = tokenizer(en_dataset["train"][0]["tokens"], is_split_into_words=True)
# Show the resulting tokens: words may be split into several tokens and the
# sequence is wrapped in [CLS] / [SEP] (see the output below).
inputs.tokens()

['[CLS]',
 'R',
 '.',
 'H',
 '.',
 'Saunders',
 '(',
 'St',
 '.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')',
 '[SEP]']

In [None]:
def align_labels_with_tokens(labels, word_ids, label_all_tokens=False):
    """Expand word-level NER labels to one label per tokenizer token.

    Args:
        labels: Sequence of integer tag ids, one per word.
        word_ids: For each token, the index of the word it was produced from,
            or ``None`` for special tokens (e.g. ``[CLS]`` / ``[SEP]``).
        label_all_tokens: Controls how the 2nd, 3rd, ... token of a word that
            was split into several tokens is labelled.
            ``False`` (default, the behaviour this notebook trains with): those
            continuation tokens get ``-100``, so only the first token of each
            word carries a label.
            ``True``: continuation tokens inherit the word's label, with a
            ``B-XXX`` tag turned into the matching ``I-XXX`` tag. This assumes
            the tag layout used in this notebook, where every odd id is a
            ``B-`` tag and the next even id is its ``I-`` counterpart.

    Returns:
        A list of integer labels with the same length as ``word_ids``.
        Special tokens are always labelled ``-100``, the index that PyTorch's
        cross-entropy loss ignores by default.
    """
    ignore_index = -100
    new_labels = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: never contributes to the loss.
            new_labels.append(ignore_index)
        elif word_id != previous_word:
            # First token of a new word keeps the word's own label.
            new_labels.append(labels[word_id])
        elif label_all_tokens:
            # Continuation token: reuse the word's label, mapping B-XXX -> I-XXX.
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
        else:
            # Continuation token: masked out.
            new_labels.append(ignore_index)
        previous_word = word_id

    return new_labels

In [None]:
# Sanity-check the alignment on the first training example.
labels = en_dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(word_ids)  # word index of every token; None marks special tokens
print(labels)  # word-level tag ids
print(align_labels_with_tokens(labels, word_ids))  # token-level labels

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, None]
[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]
[-100, 3, -100, -100, -100, 4, 0, 3, -100, 4, 4, 0, 0, 0, 0, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            example) and a "ner_tags" entry (one list of word-level tag ids
            per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds, for each example, the result of ``align_labels_with_tokens``.
    """
    encodings = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(row) maps every token of example `row` back to its source word.
    encodings["labels"] = [
        align_labels_with_tokens(word_tags, encodings.word_ids(row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encodings

In [None]:
# Run tokenize_and_align_labels over the dataset in batches. The original
# columns are removed so only the tokenizer outputs and "labels" remain.
tokenized_datasets = en_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=en_dataset["train"].column_names,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Columns of the raw dataset, i.e. the ones passed as remove_columns to map().
en_dataset["train"].column_names

['tokens', 'ner_tags', 'langs', 'spans']

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
!pip install seqeval



In [None]:
!pip install evaluate



In [None]:
import evaluate

# Load the "seqeval" metric; compute_metrics calls metric.compute() on it.
metric = evaluate.load("seqeval")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall seqeval scores.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each
            token is the argmax of ``logits`` over the last axis; ``labels``
            holds integer tag ids, with ``-100`` marking positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", filled
        from the corresponding ``overall_*`` values returned by the metric.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping every position labelled -100.
    true_labels = []
    for label_row in labels:
        true_labels.append(
            [label_names[tag] for tag in label_row if tag != -100]
        )

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [
            label_names[pred]
            for pred, tag in zip(pred_row, label_row)
            if tag != -100
        ]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    results = {}
    results["precision"] = scores["overall_precision"]
    results["recall"] = scores["overall_recall"]
    results["f1"] = scores["overall_f1"]
    results["accuracy"] = scores["overall_accuracy"]
    return results

In [None]:
# Two-way mapping between integer tag ids and tag names, passed to the model.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint for token classification, passing the tag
# id <-> name mappings. As the message below notes, the classifier weights are
# newly initialized and have to be trained.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity check: the model should report one label per entry in label_names (7).
model.config.num_labels

7

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training arguments below set
# push_to_hub=True and trainer.push_to_hub() is called at the end.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install transformers[torch]



In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
args = TrainingArguments(
    "bert-finetuned-ner-english",  # output directory; change it to save elsewhere
    # Evaluate and save a checkpoint once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # requires being logged in to the Hub
)

In [None]:
#!huggingface-cli login

In [None]:
from transformers import Trainer

# Wire the model, training arguments, tokenized splits, collator and metric
# function into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # evaluated with compute_metrics
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Progress is reported per epoch (see the loss / precision / recall / F1 table below).
trainer.train()

Cloning https://huggingface.co/JoannaAndrews/bert-finetuned-ner-english into local empty directory.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.281,0.263013,0.811261,0.833168,0.822069,0.92458


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.281,0.263013,0.811261,0.833168,0.822069,0.92458
2,0.1959,0.248862,0.838488,0.854729,0.846531,0.934812


In [None]:
# Push the trainer's results to the Hub with this commit message.
trainer.push_to_hub(commit_message="Training complete")