Adapted in part from the Hugging Face NLP course, chapter 7, section 2: https://huggingface.co/learn/nlp-course/en/chapter7/2

In [1]:
from datasets import Dataset, Value, ClassLabel, Features, Sequence, Array2D, Features

# Tag inventory. A tag's position in the list below is its integer id, so the
# order of the entries must stay exactly as written.
tag_to_id = {
    tag: index
    for index, tag in enumerate([
        "O",
        "B-PER", "I-PER",
        "B-ORG", "I-ORG",
        "B-LOC", "I-LOC",
        "B-MISC", "I-MISC",
        "-",
    ])
}
# Reverse lookup: integer id -> tag string.
id_to_tag = dict(zip(tag_to_id.values(), tag_to_id.keys()))

def iob2_to_dataset(fp):
    """Read an IOB2 file and return it as a `datasets.Dataset`, one row per sentence.

    Lines starting with "#" are skipped. A blank (or whitespace-only) line ends
    the current sentence. Every other line must split into exactly five
    whitespace-separated fields; the first three are used as token index,
    word and NER tag, the last two are ignored.

    Args:
        fp: Path of the IOB2 file; it is read as UTF-8.

    Returns:
        A Dataset with the columns "id" (running sentence number, as a
        string, starting at "0"), "tokens", "ner_tags", "ner_tags_id" and
        "index", typed by the `Features` declared at the end of the function.

    Raises:
        ValueError: If a token line does not have exactly five fields.
        KeyError: If a tag is not a key of `tag_to_id`.
    """
    columns = ("tokens", "ner_tags", "ner_tags_id", "index")
    data = {name: [] for name in columns}
    data["id"] = []

    def new_sentence():
        # Fresh per-sentence buffers, one list per column.
        return {name: [] for name in columns}

    def store(sentence):
        # Skip empty buffers so leading or repeated blank lines do not
        # produce empty rows.
        if not sentence["tokens"]:
            return
        for name in columns:
            data[name].append(sentence[name])
        # The id is the sentence's position in the dataset. It is kept
        # separate from the token index read from each line, so the two
        # can no longer overwrite each other.
        data["id"].append(str(len(data["id"])))

    current = new_sentence()
    with open(fp, encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            if not line.strip():
                store(current)
                current = new_sentence()
                continue
            token_index, word, ner_tag, _, _ = line.split()
            current["tokens"].append(word)
            current["ner_tags"].append(ner_tag)
            current["ner_tags_id"].append(tag_to_id[ner_tag])
            # Kept as the raw field text, exactly as read from the file.
            current["index"].append(token_index)
    # Flush the last sentence when the file does not end with a blank line.
    store(current)

    features = Features({
        "id": Value("string"),
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=list(tag_to_id.keys()))),
        "ner_tags_id": Sequence(Value("int32")),
        "index": Sequence(Value("int32"))
    })
    return Dataset.from_dict(data, features=features)

In [2]:
# Parse the training split first, then the development split.
dataset_raw, dataset_raw_val = map(
    iob2_to_dataset,
    (
        "../project_description/en_ewt-ud-train.iob2",
        "../project_description/en_ewt-ud-dev.iob2",
    ),
)

# The "ner_tags" feature carries the tag names; they are indexed by label id
# elsewhere in the notebook.
ner_feature = dataset_raw.features["ner_tags"]
label_names = ner_feature.feature.names
print(ner_feature, label_names, sep="\n")

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', '-'], id=None), length=-1, id=None)
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', '-']


In [3]:
def decode(words, labels):
    """Print the words and, aligned on the line below, their label names.

    Each word/label pair shares a column as wide as the longer of the two
    strings plus one trailing space. Label ids are turned into names through
    the module-level `label_names`.

    Args:
        words: The word strings of one sentence.
        labels: The label ids for those words, in the same order.
    """
    word_cells = []
    label_cells = []
    for word, label in zip(words, labels):
        name = label_names[label]
        width = max(len(word), len(name)) + 1
        word_cells.append(word.ljust(width))
        label_cells.append(name.ljust(width))

    print("".join(word_cells))
    print("".join(label_cells))

# Show the first training sentence with its tags lined up beneath the words.
words, labels = (dataset_raw[0][column] for column in ("tokens", "ner_tags"))
decode(words, labels)

Where in the world is Iguazu ? 
O     O  O   O     O  B-LOC  O 


In [4]:
from transformers import AutoTokenizer
# Hub id of the checkpoint; the same id is used again when the model is loaded.
model_id = "google-bert/bert-base-multilingual-cased"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
def align_labels_with_tokens(labels, word_ids, num_iob_labels=9):
    """Expand word-level label ids to one label id per tokenizer token.

    Args:
        labels: Label ids, one per word.
        word_ids: For each token, the index of the word it was cut from, or
            None for a special token.
        num_iob_labels: Number of leading label ids that follow the
            "O, B-X, I-X, B-Y, I-Y, ..." layout, in which an odd id is a
            B- tag and the next id is the matching I- tag. The default of 9
            matches `tag_to_id` in this notebook: ids 0-8 follow that layout,
            while id 9 ("-") does not and must never be bumped to 10.

    Returns:
        A list as long as `word_ids`: -100 for special tokens, the word's own
        label for its first token, and for each further token of the same
        word that label with B-X replaced by I-X.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: marked with -100 and it ends the current word.
            current_word = None
            new_labels.append(-100)
            continue

        label = labels[word_id]
        is_continuation = word_id == current_word
        # Only ids inside the IOB range are B-/I- pairs; an odd id outside
        # that range (such as 9 for "-") has no I- counterpart.
        if is_continuation and label % 2 == 1 and label < num_iob_labels:
            label += 1
        current_word = word_id
        new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: A batch with the columns "tokens" (one word list per
            sentence) and "ner_tags" (one label-id list per sentence).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding, per sentence, the result of `align_labels_with_tokens`.
    """
    encoding = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encoding["labels"] = [
        align_labels_with_tokens(sentence_labels, encoding.word_ids(position))
        for position, sentence_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

In [6]:
# Tokenize both splits the same way (training split first). The original
# columns are dropped, so only the columns returned by
# tokenize_and_align_labels remain.
tokenized_datasets, tokenized_datasets_val = (
    split.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (dataset_raw, dataset_raw_val)
)

Map:   0%|          | 0/12543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2001 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForTokenClassification
# Collator built from the tokenizer; it turns a list of examples into one batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Sanity check on the first two training examples: in the output recorded for
# this cell, the shorter label row is filled up with -100 to the longer length.
batch = data_collator([tokenized_datasets[i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    5,    6,    6,    0, -100],
        [-100,    5,    6,    6,    6, -100, -100, -100, -100, -100, -100]])

In [8]:
# Print the labels of the same two examples straight from the dataset, i.e.
# before the collator has batched them.
for i in range(2):
    print(tokenized_datasets[i]["labels"])

[-100, 0, 0, 0, 0, 0, 5, 6, 6, 0, -100]
[-100, 5, 6, 6, 6, -100]


In [9]:
import evaluate

# seqeval metric object; compute_metrics calls its .compute() method.
metric = evaluate.load("seqeval")

In [10]:
import numpy as np


def compute_metrics(eval_preds):
    """Score predictions against gold labels with the module-level `metric`.

    Args:
        eval_preds: A pair `(logits, labels)`. The predicted label id of each
            position is the argmax of `logits` over its last axis.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the metric's corresponding "overall_*" entries.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Gold label names, skipping every position marked with the ignored index -100.
    references = []
    for gold_row in labels:
        references.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted label names at exactly the positions kept above.
    hypotheses = []
    for predicted_row, gold_row in zip(predictions, labels):
        hypotheses.append([
            label_names[predicted]
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ])

    scores = metric.compute(predictions=hypotheses, references=references)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: scores[metric_key] for short_name, metric_key in reported}

In [11]:
from transformers import AutoModelForTokenClassification
# Token-classification model loaded from the same checkpoint as the tokenizer.
# Both id<->tag mappings are passed in; the output recorded for this cell shows
# num_labels == 10, i.e. one class per entry of tag_to_id.
model = AutoModelForTokenClassification.from_pretrained (
    model_id,
    id2label=id_to_tag,
    label2id=tag_to_id,
)
model.config.num_labels

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


10

In [12]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; it renders a widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from transformers import TrainingArguments

# Training configuration.
# The first positional argument is presumably the output directory — TODO
# confirm against the TrainingArguments documentation.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before changing it.
args = TrainingArguments(
    "mbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [18]:
from transformers import Trainer

# Wire the model, both tokenized splits, the collator and the metric function
# into a Trainer, then start fine-tuning.
# NOTE(review): the run recorded below this cell stopped at step 0 of 4704
# with a CUDA out-of-memory error on a GPU reporting 1.95 GiB total capacity,
# so this notebook contains no training results. A lower-memory configuration
# or a larger GPU is presumably needed — TODO confirm on the target hardware.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  0%|          | 0/4704 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 352.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 111.62 MiB is free. Including non-PyTorch memory, this process has 1.83 GiB memory in use. Of the allocated memory 1.69 GiB is allocated by PyTorch, and 95.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)