In [1]:
!pip install datasets evaluate transformers[sentencepiece] seqeval
!pip install accelerate


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
 

In [2]:
import os

import evaluate
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, AutoTokenizer, DataCollatorForTokenClassification, \
    AutoModelForTokenClassification, TrainingArguments, Trainer


In [3]:
model_checkpoint: str = "dbmdz/bert-base-historic-multilingual-cased"



In [4]:
def print_aligned(
        list1: list,
        list2: list
):
    line1 = ""
    line2 = ""
    for item1, item2 in zip(list1, list2):
        max_length = max(len(item1), len(item2))
        line1 += item1 + " " * (max_length - len(item1) + 1)
        line2 += item2 + " " * (max_length - len(item2) + 1)
    print(line1)
    print(line2)



In [5]:
try:
    from google.colab import drive

    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    drive.mount('/content/drive')
    DATA_DIR = os.path.join('drive', 'MyDrive', 'KEDiff', 'data')
except:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
    pass

print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))


You work on Colab. Gentle as we are, we will mount Drive for you. It'd help if you allowed this in the popup that opens.
Mounted at /content/drive
DATA_DIR='drive/MyDrive/KEDiff/data' --> /content/drive/MyDrive/KEDiff/data


In [6]:
BILOUs_hug = Dataset.load_from_disk(dataset_path=os.path.join(DATA_DIR, 'BILOUs_hf'))
print("Dataset:", BILOUs_hug, sep='\n')
print("Features:", BILOUs_hug.features, sep='\n')


Dataset:
Dataset({
    features: ['Text', 'EVENT-BILOUs', 'EVENT-IOBs', 'LOC-BILOUs', 'LOC-IOBs', 'MISC-BILOUs', 'MISC-IOBs', 'ORG-BILOUs', 'ORG-IOBs', 'PER-BILOUs', 'PER-IOBs', 'TIME-BILOUs', 'TIME-IOBs'],
    num_rows: 13928
})
Features:
{'Text': Value(dtype='string', id=None), 'EVENT-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-TIME', 'I-TIME', 'L-TIME', 'U-TIME'], id=None), length=-1, id=None), 'EVENT-IOBs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-TIME', 'I-TIME', 'L-TIME', 'U-TIME'], id=None), length=-1, id=None), 'LOC-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EV

In [7]:
train_testvalid = BILOUs_hug.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

# gather everyone if you want to have a single DatasetDict
BILOUs_hug = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']}
)
del train_testvalid, test_valid
print(BILOUs_hug)


DatasetDict({
    train: Dataset({
        features: ['Text', 'EVENT-BILOUs', 'EVENT-IOBs', 'LOC-BILOUs', 'LOC-IOBs', 'MISC-BILOUs', 'MISC-IOBs', 'ORG-BILOUs', 'ORG-IOBs', 'PER-BILOUs', 'PER-IOBs', 'TIME-BILOUs', 'TIME-IOBs'],
        num_rows: 11142
    })
    test: Dataset({
        features: ['Text', 'EVENT-BILOUs', 'EVENT-IOBs', 'LOC-BILOUs', 'LOC-IOBs', 'MISC-BILOUs', 'MISC-IOBs', 'ORG-BILOUs', 'ORG-IOBs', 'PER-BILOUs', 'PER-IOBs', 'TIME-BILOUs', 'TIME-IOBs'],
        num_rows: 1393
    })
    validation: Dataset({
        features: ['Text', 'EVENT-BILOUs', 'EVENT-IOBs', 'LOC-BILOUs', 'LOC-IOBs', 'MISC-BILOUs', 'MISC-IOBs', 'ORG-BILOUs', 'ORG-IOBs', 'PER-BILOUs', 'PER-IOBs', 'TIME-BILOUs', 'TIME-IOBs'],
        num_rows: 1393
    })
})


In [8]:
tokeniser: BertTokenizerFast = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"Is '{model_checkpoint}' a fast tokeniser?", tokeniser.is_fast)


def batch_embed(batch):
    # align annotation with added [CLS] and [SEP]
    for column in [
        'EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs',
        'EVENT-IOBs', 'LOC-IOBs', 'MISC-IOBs', 'ORG-IOBs', 'PER-IOBs', 'TIME-IOBs'
    ]:
        all_labels = batch[column]
        new_labels = [[-100, *labels[1:-1], -100] for labels in all_labels]
        batch[column] = new_labels
    return batch


BILOUs_hug = BILOUs_hug.map(batch_embed, batched=True)


def batch_tokenise(batch):
    # tokenise
    tokenised_inputs = tokeniser(batch['Text'], truncation=True)
    tokenised_inputs["labels"] = batch['PER-IOBs']
    return tokenised_inputs


BILOUs_hug_tokenised = BILOUs_hug.map(
    batch_tokenise,
    batched=True,
    remove_columns=BILOUs_hug["train"].column_names
)
print(BILOUs_hug_tokenised)


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/212k [00:00<?, ?B/s]

Is 'dbmdz/bert-base-historic-multilingual-cased' a fast tokeniser? True
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11142
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
})


In [9]:
sample = BILOUs_hug_tokenised["train"][1]
print(sample)
del sample


{'input_ids': [2, 14331, 16, 7098, 3616, 9042, 10976, 405, 11928, 1080, 18, 7241, 430, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 17, 18, 18, 18, 18, 18, 18, 0, 0, 0, -100]}


In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokeniser, padding=True)
batch = data_collator([BILOUs_hug_tokenised["train"][i] for i in range(2)])
print(batch)
print(batch['labels'])

for i in range(2):
    print(BILOUs_hug_tokenised["train"][i]["labels"])
del i


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    2,   964, 26753,   377,  1172,  3288, 10883,    18,    13,     3,
             0,     0,     0,     0,     0],
        [    2, 14331,    16,  7098,  3616,  9042, 10976,   405, 11928,  1080,
            18,  7241,   430,    18,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100],
        [-100,    0,    0,    0,   17,   18,   18,   18,   18,   18,   18,    0,
            0,    0, -100]])}
tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100],
        [-100,    0,    0,    0,   17,   18,   18,   18,   18,   18,   18,    0,
            0,    0, -100]])
[-100, 0, 0, 0, 0, 0, 0, 0,

In [11]:
label_names = BILOUs_hug["train"].features["PER-IOBs"].feature.names

batch = {'references': [], 'predictions': []}
for i in [0, 1]:
    labels = BILOUs_hug["train"][i]["PER-IOBs"]
    labels = [label_names[i] for i in labels[1:-1]]
    # fake predictions
    predictions = labels.copy()
    predictions[2] = "B-PER"
    predictions[3] = "I-PER"

    print_aligned(labels, predictions)

    batch['references'] += [labels]
    batch['predictions'] += [predictions]
del i, labels, predictions

# calculate metrics
for metric_name in ["seqeval", "poseval"]:
    print(f"Now evaluating using {metric_name=}")
    metric = evaluate.load(metric_name)
    metric_result = metric.compute(predictions=batch['predictions'], references=batch['references'])
    print(metric_result)
# del batch, metric, metric_name, metric_result


O O O     O     O O O O 
O O B-PER I-PER O O O O 
O O O     B-PER I-PER I-PER I-PER I-PER I-PER I-PER O O O 
O O B-PER I-PER I-PER I-PER I-PER I-PER I-PER I-PER O O O 
Now evaluating using metric_name='seqeval'


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

{'PER': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.8095238095238095}
Now evaluating using metric_name='poseval'


Downloading builder script:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

{'B-PER': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}, 'I-PER': {'precision': 0.75, 'recall': 1.0, 'f1-score': 0.8571428571428571, 'support': 6}, 'O': {'precision': 1.0, 'recall': 0.7857142857142857, 'f1-score': 0.88, 'support': 14}, 'accuracy': 0.8095238095238095, 'macro avg': {'precision': 0.5833333333333334, 'recall': 0.5952380952380952, 'f1-score': 0.579047619047619, 'support': 21}, 'weighted avg': {'precision': 0.8809523809523809, 'recall': 0.8095238095238095, 'f1-score': 0.8315646258503401, 'support': 21}}


In [12]:
metric = evaluate.load('poseval')


In [13]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    metric_result = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "macro precision": metric_result["macro avg"]["precision"],
        "macro recall": metric_result["macro avg"]["recall"],
        "macro f1": metric_result["macro avg"]["f1-score"],
        "macro support": metric_result["macro avg"]["support"],

        "weighted precision": metric_result["weighted avg"]["precision"],
        "weighted recall": metric_result["weighted avg"]["recall"],
        "weighted f1": metric_result["weighted avg"]["f1-score"],
        "weighted support": metric_result["weighted avg"]["support"],

        "accuracy": metric_result["accuracy"],
    }



In [14]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}



In [15]:
for label_type in ['EVENT', 'LOC', 'MISC', 'ORG', 'PER', 'TIME']:
    trained_model_name = f"oalz-1788-q1-ner-{label_type}"

    print(f"Now training '{trained_model_name}'")

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        id2label=id2label,
        label2id=label2id,
    )
    model.config.num_labels

    args = TrainingArguments(
        output_dir = os.path.join(DATA_DIR, trained_model_name),
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=BILOUs_hug_tokenised["train"],
        eval_dataset=BILOUs_hug_tokenised["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokeniser,
    )
    trainer.train()
    trainer.save_model(os.path.join(DATA_DIR, trained_model_name))


Now training 'oalz-1788-q1-ner-EVENT'


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro precision,Macro recall,Macro f1,Macro support,Weighted precision,Weighted recall,Weighted f1,Weighted support,Accuracy
1,0.0877,0.067817,0.841211,0.867993,0.852233,37440,0.978756,0.97711,0.977698,37440,0.97711
2,0.0562,0.065261,0.886517,0.864127,0.874006,37440,0.981889,0.981838,0.981786,37440,0.981838
3,0.035,0.071427,0.874872,0.879132,0.876142,37440,0.982274,0.981838,0.981984,37440,0.981838
4,0.0195,0.072964,0.883278,0.879927,0.880697,37440,0.982962,0.982666,0.982746,37440,0.982666
5,0.0153,0.084804,0.86999,0.883857,0.876592,37440,0.982343,0.981944,0.982109,37440,0.981944


Now training 'oalz-1788-q1-ner-LOC'


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro precision,Macro recall,Macro f1,Macro support,Weighted precision,Weighted recall,Weighted f1,Weighted support,Accuracy
1,0.0892,0.078254,0.810216,0.869441,0.836378,37440,0.976486,0.973558,0.974648,37440,0.973558
2,0.0559,0.063268,0.888975,0.866466,0.876557,37440,0.982176,0.982158,0.9821,37440,0.982158
3,0.0345,0.072262,0.872073,0.876018,0.873683,37440,0.981976,0.981704,0.981809,37440,0.981704
4,0.0204,0.074967,0.881002,0.877823,0.878305,37440,0.982719,0.982345,0.982449,37440,0.982345
5,0.0145,0.086087,0.867018,0.885541,0.875892,37440,0.982311,0.981811,0.98202,37440,0.981811


Now training 'oalz-1788-q1-ner-MISC'


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro precision,Macro recall,Macro f1,Macro support,Weighted precision,Weighted recall,Weighted f1,Weighted support,Accuracy
1,0.0892,0.078254,0.810216,0.869441,0.836378,37440,0.976486,0.973558,0.974648,37440,0.973558
2,0.0559,0.063268,0.888975,0.866466,0.876557,37440,0.982176,0.982158,0.9821,37440,0.982158
3,0.0345,0.072262,0.872073,0.876018,0.873683,37440,0.981976,0.981704,0.981809,37440,0.981704
4,0.0204,0.074967,0.881002,0.877823,0.878305,37440,0.982719,0.982345,0.982449,37440,0.982345
5,0.0145,0.086087,0.867018,0.885541,0.875892,37440,0.982311,0.981811,0.98202,37440,0.981811


Now training 'oalz-1788-q1-ner-ORG'


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro precision,Macro recall,Macro f1,Macro support,Weighted precision,Weighted recall,Weighted f1,Weighted support,Accuracy
1,0.0892,0.078254,0.810216,0.869441,0.836378,37440,0.976486,0.973558,0.974648,37440,0.973558
2,0.0559,0.063268,0.888975,0.866466,0.876557,37440,0.982176,0.982158,0.9821,37440,0.982158
3,0.0345,0.072262,0.872073,0.876018,0.873683,37440,0.981976,0.981704,0.981809,37440,0.981704
4,0.0204,0.074967,0.881002,0.877823,0.878305,37440,0.982719,0.982345,0.982449,37440,0.982345
5,0.0145,0.086087,0.867018,0.885541,0.875892,37440,0.982311,0.981811,0.98202,37440,0.981811


Now training 'oalz-1788-q1-ner-PER'


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro precision,Macro recall,Macro f1,Macro support,Weighted precision,Weighted recall,Weighted f1,Weighted support,Accuracy
1,0.0892,0.078254,0.810216,0.869441,0.836378,37440,0.976486,0.973558,0.974648,37440,0.973558
2,0.0559,0.063268,0.888975,0.866466,0.876557,37440,0.982176,0.982158,0.9821,37440,0.982158
3,0.0345,0.072262,0.872073,0.876018,0.873683,37440,0.981976,0.981704,0.981809,37440,0.981704
4,0.0204,0.074967,0.881002,0.877823,0.878305,37440,0.982719,0.982345,0.982449,37440,0.982345
5,0.0145,0.086087,0.867018,0.885541,0.875892,37440,0.982311,0.981811,0.98202,37440,0.981811


Now training 'oalz-1788-q1-ner-TIME'


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-historic-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro precision,Macro recall,Macro f1,Macro support,Weighted precision,Weighted recall,Weighted f1,Weighted support,Accuracy
1,0.0892,0.078254,0.810216,0.869441,0.836378,37440,0.976486,0.973558,0.974648,37440,0.973558
2,0.0559,0.063268,0.888975,0.866466,0.876557,37440,0.982176,0.982158,0.9821,37440,0.982158
3,0.0345,0.072262,0.872073,0.876018,0.873683,37440,0.981976,0.981704,0.981809,37440,0.981704
4,0.0204,0.074967,0.881002,0.877823,0.878305,37440,0.982719,0.982345,0.982449,37440,0.982345
5,0.0145,0.086087,0.867018,0.885541,0.875892,37440,0.982311,0.981811,0.98202,37440,0.981811


In [16]:
pass
