In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate



In [2]:
import os

import evaluate
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, AutoTokenizer, DataCollatorForTokenClassification, \
    AutoModelForTokenClassification, TrainingArguments, Trainer


In [3]:
model_checkpoint: str = "dbmdz/bert-base-historic-multilingual-cased"


def print_aligned(
        list1: list,
        list2: list
):
    line1 = ""
    line2 = ""
    for item1, item2 in zip(list1, list2):
        max_length = max(len(item1), len(item2))
        line1 += item1 + " " * (max_length - len(item1) + 1)
        line2 += item2 + " " * (max_length - len(item2) + 1)
    print(line1)
    print(line2)

In [4]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = os.path.join('drive', 'MyDrive', 'KEDiff', 'data')
except:
    DATA_DIR = os.path.join('data')
    pass
DATA_DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'drive/MyDrive/KEDiff/data'

In [5]:
BILOUs_hug = Dataset.load_from_disk(dataset_path=os.path.join(DATA_DIR, 'BILOUs_hf'))
print("Dataset:", BILOUs_hug, sep='\n')
print("Features:", BILOUs_hug.features, sep='\n')


Dataset:
Dataset({
    features: ['Text', 'EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs'],
    num_rows: 13928
})
Features:
{'Text': Value(dtype='string', id=None), 'EVENT-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-TIME', 'I-TIME', 'L-TIME', 'U-TIME'], id=None), length=-1, id=None), 'LOC-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-TIME', 'I-TIME', 'L-TIME', 'U-TIME'], id=None), length=-1, id=None), 'MISC-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L

In [6]:
train_testvalid = BILOUs_hug.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)
# gather everyone if you want to have a single DatasetDict
BILOUs_hug = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']}
)
del train_testvalid, test_valid
print(BILOUs_hug)


DatasetDict({
    train: Dataset({
        features: ['Text', 'EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs'],
        num_rows: 11142
    })
    test: Dataset({
        features: ['Text', 'EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs'],
        num_rows: 1393
    })
    validation: Dataset({
        features: ['Text', 'EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs'],
        num_rows: 1393
    })
})


In [7]:
tokeniser: BertTokenizerFast = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"Is '{model_checkpoint}' a fast tokeniser?", tokeniser.is_fast)


def batch_embed(batch):
    # align annotation with added [CLS] and [SEP]
    for BILOU_column in ['EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs']:
        all_labels = batch[BILOU_column]
        new_labels = [[-100, *labels[1:-1], -100] for labels in all_labels]
        batch[BILOU_column] = new_labels
    return batch


BILOUs_hug = BILOUs_hug.map(batch_embed, batched=True)


def batch_tokenise(batch):
    # tokenise
    tokenised_inputs = tokeniser(batch['Text'], truncation=True)
    tokenised_inputs["labels"] = batch['PER-BILOUs']
    return tokenised_inputs


BILOUs_hug_tokenised = BILOUs_hug.map(
    batch_tokenise,
    batched=True,
    remove_columns=BILOUs_hug["train"].column_names
)
print(BILOUs_hug_tokenised)


Is 'dbmdz/bert-base-historic-multilingual-cased' a fast tokeniser? True


Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11142
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
})


In [8]:
sample = BILOUs_hug_tokenised["train"][1]
sample


{'input_ids': [2,
  14331,
  16,
  7098,
  3616,
  9042,
  10976,
  405,
  11928,
  1080,
  18,
  7241,
  430,
  18,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 0, 17, 18, 18, 18, 18, 18, 19, 0, 0, 0, -100]}

In [9]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokeniser, padding=True)
batch = data_collator([BILOUs_hug_tokenised["train"][i] for i in range(2)])
print(batch)
print(batch['labels'])

for i in range(2):
    print(BILOUs_hug_tokenised["train"][i]["labels"])


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    2,   964, 26753,   377,  1172,  3288, 10883,    18,    13,     3,
             0,     0,     0,     0,     0],
        [    2, 14331,    16,  7098,  3616,  9042, 10976,   405, 11928,  1080,
            18,  7241,   430,    18,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100],
        [-100,    0,    0,    0,   17,   18,   18,   18,   18,   18,   19,    0,
            0,    0, -100]])}
tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100],
        [-100,    0,    0,    0,   17,   18,   18,   18,   18,   18,   19,    0,
            0,    0, -100]])
[-100, 0, 0, 0, 0, 0, 0, 0,

In [10]:
metric = evaluate.load("seqeval")

label_names = BILOUs_hug["train"].features["PER-BILOUs"].feature.names

labels = BILOUs_hug["train"][1]["PER-BILOUs"]
labels[10] = 18  # todo UserWarning: L-PER seems not to be NE tag. mimimi
labels = [label_names[i] for i in labels[1:-1]]

# fake predictions
predictions = labels.copy()
predictions[2] = "B-PER"
predictions[3] = "I-PER"

# todo allow BILOUs not just IOBs (warning AND WRONG SCORES when using U & L)
print_aligned(labels, predictions)
metric.compute(predictions=[predictions], references=[labels])



O O O     B-PER I-PER I-PER I-PER I-PER I-PER I-PER O O O 
O O B-PER I-PER I-PER I-PER I-PER I-PER I-PER I-PER O O O 


{'PER': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.8461538461538461}

In [11]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }



In [12]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
model.config.num_labels



Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


25

In [13]:
trained_model_name = "oalz-1788-q1-ner-PER"
args = TrainingArguments(
    trained_model_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

# todo store checkpoints on drive as well not just final model
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=BILOUs_hug_tokenised["train"],
    eval_dataset=BILOUs_hug_tokenised["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokeniser,
)
trainer.train()
trainer.save_model(os.path.join(DATA_DIR, trained_model_name))



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1049,0.065425,0.708943,0.662614,0.684996,0.98101
2,0.0598,0.068877,0.746711,0.68997,0.71722,0.981838
3,0.0414,0.072988,0.742812,0.706687,0.724299,0.981624




In [None]:
pass
