In [1]:
import os

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, BertTokenizerFast

In [2]:
# Checkpoint identifier handed to `AutoTokenizer.from_pretrained` further down.
TOKENISER_CHECKPOINT: str = "dbmdz/bert-base-historic-multilingual-cased"

In [3]:
# Decide where the data lives. Being able to import `google.colab` is the
# signal that we run on Colab; only a failed import means "local machine".
# Anything that goes wrong while mounting Drive is deliberately NOT caught, so
# a declined or failed mount surfaces as an error instead of being reported as
# "You do not work on Colab" and silently falling back to the local folder.
try:
    from google.colab import drive
except ImportError:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
else:
    print("You work on Colab. Gentle as we are, we will mount Drive for you. It'd help if you allowed this in the popup that opens.")
    drive.mount('/content/drive')
    # Anchor the path at the mount point used above so it does not depend on
    # the kernel's current working directory.
    DATA_DIR = os.path.join('/content', 'drive', 'MyDrive', 'KEDiff', 'data')

print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

You do not work on Colab
DATA_DIR='data' --> /Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data


In [4]:
# Read the saved dataset from <DATA_DIR>/BILOUs_hf, then print its summary and
# its feature schema, each under its own heading.
annotations = Dataset.load_from_disk(
    dataset_path=os.path.join(DATA_DIR, 'BILOUs_hf'),
)
for heading, value in (("Dataset:", annotations), ("Features:", annotations.features)):
    print(heading, value, sep='\n')

Dataset:
Dataset({
    features: ['Text', 'EVENT-BILOUs', 'EVENT-IOBs', 'LOC-BILOUs', 'LOC-IOBs', 'MISC-BILOUs', 'MISC-IOBs', 'ORG-BILOUs', 'ORG-IOBs', 'PER-BILOUs', 'PER-IOBs', 'TIME-BILOUs', 'TIME-IOBs'],
    num_rows: 13928
})
Features:
{'Text': Value(dtype='string', id=None), 'EVENT-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-TIME', 'I-TIME', 'L-TIME', 'U-TIME'], id=None), length=-1, id=None), 'EVENT-IOBs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EVENT', 'U-EVENT', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-TIME', 'I-TIME', 'L-TIME', 'U-TIME'], id=None), length=-1, id=None), 'LOC-BILOUs': Sequence(feature=ClassLabel(names=['O', 'B-EVENT', 'I-EVENT', 'L-EV

In [5]:
# First cut: 80 % train, 20 % held out (fixed seed for reproducibility).
train_testvalid = annotations.train_test_split(test_size=0.2, seed=42)
# Second cut: halve the held-out part into validation and test.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

# Bundle the three parts into one DatasetDict. Note that the 'train' half of
# the second cut is what becomes the validation split.
annotations_split = DatasetDict(
    train=train_testvalid['train'],
    test=test_valid['test'],
    validation=test_valid['train'],
)

Loading cached split indices for dataset at /Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/BILOUs_hf/cache-1ce91eaf5ffca8bc.arrow and /Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/BILOUs_hf/cache-73f151b2a64230fe.arrow
Loading cached split indices for dataset at /Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/BILOUs_hf/cache-9bed8d6c9fe2a48d.arrow and /Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/BILOUs_hf/cache-edb26d3b40b7c818.arrow


In [6]:
# Load the tokeniser for the configured checkpoint and report whether the
# "fast" implementation was returned.
tokeniser: BertTokenizerFast = AutoTokenizer.from_pretrained(TOKENISER_CHECKPOINT)
print(
    f"Is '{TOKENISER_CHECKPOINT}' a fast tokeniser?",
    tokeniser.is_fast,
)

Is 'dbmdz/bert-base-historic-multilingual-cased' a fast tokeniser? True


In [7]:
# Label columns rewritten by `batch_embed_labels` when no explicit list is given.
_LABEL_COLUMNS = (
    'EVENT-BILOUs', 'LOC-BILOUs', 'MISC-BILOUs', 'ORG-BILOUs', 'PER-BILOUs', 'TIME-BILOUs',
    'EVENT-IOBs', 'LOC-IOBs', 'MISC-IOBs', 'ORG-IOBs', 'PER-IOBs', 'TIME-IOBs',
)


def batch_embed_labels(batch, columns=_LABEL_COLUMNS, ignore_index: int = -100):
    """Mask the first and last label of every sequence in a batch.

    The two outer positions of each label sequence are overwritten with
    ``ignore_index`` so that they line up with the [CLS] and [SEP] tokens the
    tokeniser adds. Sequence length is always preserved.

    NOTE(review): this assumes every label sequence already contains one slot
    for [CLS] and one for [SEP] -- confirm against the code that produced the
    dataset.

    Args:
        batch: Mapping from column name to a list of label sequences (one
            sequence per example). It is modified in place.
        columns: Names of the label columns to rewrite. Defaults to all
            BILOU and IOB columns of the six entity types.
        ignore_index: Value written at the two outer positions.

    Returns:
        The same ``batch`` object, with the selected columns replaced.

    Raises:
        KeyError: If one of ``columns`` is missing from ``batch``.
    """
    def mask_ends(labels):
        # Sequences with fewer than two entries have no distinct first/last
        # slot; mask what is there instead of growing them to two elements.
        if len(labels) < 2:
            return [ignore_index] * len(labels)
        return [ignore_index, *labels[1:-1], ignore_index]

    for column in columns:
        batch[column] = [mask_ends(labels) for labels in batch[column]]
    return batch

In [8]:
# Keep one training example (index 6) as it looks before the label columns
# are rewritten, so it can be compared with the mapped version afterwards.
before = annotations_split['train'][6]

In [9]:
# Run `batch_embed_labels` over the data in batches. The name is rebound to
# the mapped result, so `before` has to be captured ahead of this cell.
annotations_split = annotations_split.map(batch_embed_labels, batched=True)

In [10]:
# Same training example (index 6), now taken from the mapped dataset.
after = annotations_split['train'][6]

In [11]:
# Print one label column of the sample before and after mapping, followed by
# whether both versions have the same length.
tag = 'EVENT-IOBs'
labels_before, labels_after = before[tag], after[tag]
print(labels_before, labels_after, len(labels_before) == len(labels_after), sep='\n')

In [ ]:
def batch_tokenise(batch, label_column: str = 'PER-IOBs'):
    """Tokenise a batch of texts and attach the matching label sequences.

    The texts are encoded with the module-level ``tokeniser`` using
    ``truncation=True``. Because truncation can shorten ``input_ids``, each
    label sequence is clipped to the length of its encoded text; when that
    happens the last kept label is set to -100, since that position now holds
    the closing special token.

    NOTE(review): this assumes the label columns hold one label per token of
    this tokeniser, special tokens included -- confirm against the code that
    produced the dataset. Label sequences that are shorter than their
    ``input_ids`` are passed through unchanged.

    Args:
        batch: Mapping with a ``'Text'`` column (list of strings) and the
            column named by ``label_column`` (list of label sequences).
        label_column: Name of the column to expose as ``"labels"``.

    Returns:
        The tokeniser output with an added ``"labels"`` entry.
    """
    tokenised_inputs = tokeniser(batch['Text'], truncation=True)

    aligned_labels = []
    for input_ids, labels in zip(tokenised_inputs["input_ids"], batch[label_column]):
        labels = list(labels)
        if len(labels) > len(input_ids):
            # The text was truncated: drop the labels of the removed tokens.
            labels = labels[:len(input_ids)]
            if labels:
                labels[-1] = -100
        aligned_labels.append(labels)

    tokenised_inputs["labels"] = aligned_labels
    return tokenised_inputs