In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from tqdm import tqdm
import torch

In [5]:
# Checkpoint to evaluate; the tokenizer and the weights are loaded from the same name.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# loading script run locally -- confirm it is still required by the installed
# `datasets` version before keeping it.
dataset = load_dataset(
    "conll2003",
    trust_remote_code=True,
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Token-classification pipeline built from the objects loaded above, with
# aggregation disabled via aggregation_strategy="none".
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="none",
)

Device set to use cpu


In [7]:
def tokenize_and_align_labels(example, label_names=None):
    """Tokenize one pre-split example and produce one tag string per sub-token.

    Args:
        example: Mapping with a "tokens" entry (list of words) and a
            "ner_tags" entry (list of integer tag ids, one per word).
        label_names: Optional sequence mapping a tag id to its tag string.
            When omitted, the names are read from the "ner_tags" feature of
            the global ``dataset["train"]`` split, as before.

    Returns:
        A ``(tokenized_inputs, aligned_labels)`` tuple, where
        ``tokenized_inputs`` is whatever the global ``tokenizer`` returned and
        ``aligned_labels`` holds one tag string per entry of
        ``tokenized_inputs.word_ids()``.

    Labelling rules:
        * Positions whose word id is ``None`` are tagged "O".
        * The first sub-token of a word gets the word's tag.
        * Later sub-tokens of the same word get the word's tag with a leading
          "B-" turned into "I-".
    """
    tokens = example["tokens"]
    labels = example["ner_tags"]

    # Resolve the id -> name table once instead of once per sub-token.
    if label_names is None:
        label_names = dataset["train"].features["ner_tags"].feature.names

    tokenized_inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        return_offsets_mapping=True,
    )

    aligned_labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            label = "O"
        else:
            label = label_names[labels[word_idx]]
            is_continuation = word_idx == previous_word_idx
            if is_continuation and label.startswith("B-"):
                # Rewrite only the prefix: str.replace would also touch a "B-"
                # that appears later inside the entity type (e.g. "B-SUB-ORG").
                label = "I-" + label[len("B-"):]
        aligned_labels.append(label)
        previous_word_idx = word_idx

    return tokenized_inputs, aligned_labels

In [10]:
# Word-level evaluation on the validation split: each word is scored by the
# prediction made for its first sub-token, and special tokens are skipped.
true_labels = []
pred_labels = []

# Model output id -> tag string.
label_list = model.config.id2label
# Gold tag id -> tag string; resolved once instead of once per word.
gold_label_names = dataset["validation"].features["ner_tags"].feature.names

# Do not rely on earlier cells having left the model in inference mode.
model.eval()

for example in tqdm(dataset["validation"]):
    tokens = example["tokens"]
    labels = example["ner_tags"]

    # truncation=True means words cut off by the tokenizer's length limit never
    # reach the inner loop, so they drop out of both the predictions and the
    # gold labels.
    tokenized_inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = model(**tokenized_inputs).logits

    # One example per forward pass: take batch row 0 explicitly rather than
    # squeeze(), which would collapse every size-1 dimension.
    predictions = output[0].argmax(dim=-1).tolist()
    word_ids = tokenized_inputs.word_ids()

    aligned_preds = []
    aligned_labels = []

    prev_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            # Position not tied to any input word; nothing to score.
            continue
        if word_idx != prev_word_idx:
            # First sub-token of a new word: record prediction and gold tag.
            aligned_preds.append(label_list[predictions[idx]])
            aligned_labels.append(gold_label_names[labels[word_idx]])
        prev_word_idx = word_idx

    pred_labels.append(aligned_preds)
    true_labels.append(aligned_labels)


100%|██████████| 3250/3250 [03:54<00:00, 13.87it/s]


In [11]:
# Per-entity-type breakdown first, then the three overall scores at four decimals.
print("Classification report:")
print(classification_report(true_labels, pred_labels))

overall_metrics = (
    ("F1 Score", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)
for metric_name, metric_fn in overall_metrics:
    print(f"{metric_name}: {metric_fn(true_labels, pred_labels):.4f}")

Classification report:
              precision    recall  f1-score   support

         LOC       0.98      0.97      0.97      1837
        MISC       0.90      0.91      0.91       922
         ORG       0.93      0.93      0.93      1341
         PER       0.96      0.98      0.97      1842

   micro avg       0.95      0.95      0.95      5942
   macro avg       0.94      0.95      0.94      5942
weighted avg       0.95      0.95      0.95      5942

F1 Score: 0.9515
Precision: 0.9494
Recall: 0.9536
