In [None]:
import os.path

import json
from tqdm import tqdm

# Load things

## Define `DATA_DIR`

In [None]:
# Resolve DATA_DIR, the folder all data files of this notebook are read from and written to.
# On Colab the data lives on Google Drive, which must be mounted first; everywhere else a
# local `data` folder (relative to the working directory) is used.
try:
    # Only the import is probed here, so that errors raised while mounting are not
    # mistaken for "not running on Colab".
    from google.colab import drive
except ModuleNotFoundError:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
else:
    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    mount_point = '/content/drive'
    drive.mount(mount_point)
    # Build the path from the absolute mount point so that it stays valid even if the
    # working directory is no longer `/content`.
    DATA_DIR = os.path.join(mount_point, 'MyDrive', 'KEDiff', 'data')
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

You work on Colab. Gentle as we are, we will mount Drive for you. It'd help if you allowed this in the popup that opens.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATA_DIR='drive/MyDrive/KEDiff/data' --> /content/drive/MyDrive/KEDiff/data


## Import NER Ensemble _`kners`_

In [None]:
# Make `kediff_ner_system.py` importable on Colab by copying it from the mounted Drive
# into the working directory. Outside Colab nothing is copied.
try:
    # Imported only to detect whether the notebook runs on Colab.
    from google.colab import drive
except ModuleNotFoundError:
    # Not on Colab: presumably the module is already importable from the local checkout.
    pass
else:
    print("You work on Colab, so 'kediff_ner_system.py' needs to be copied over before importing it.")

    import shutil
    # Use the absolute Drive mount point so the copy does not depend on the current
    # working directory being `/content`.
    shutil.copy(
        src=os.path.join('/content/drive', 'MyDrive', 'KEDiff', 'kediff_ner_system.py'),
        dst='.'
    )

You work on Colab, so 'kediff_ner_system.py' needs to be copied over before importing it.


In [None]:
import kediff_ner_system as kners

In [None]:
# Folder below DATA_DIR from which the classifier checkpoint paths are built.
MODELS_DIR = os.path.join(DATA_DIR, 'trained_models')
# Echo the resulting path as the cell output.
MODELS_DIR

'drive/MyDrive/KEDiff/data/trained_models'

In [None]:
# Map every entity type to the checkpoint folder of its classifier:
#   MODELS_DIR / <model version> / <name base + entity type> / <selected epoch>
classifier_paths = dict(
    (
        entity_type,
        os.path.join(
            MODELS_DIR,
            kners.CLASSIFIER_MODEL_VERSION,
            kners.CLASSIFIER_NAME_BASE + entity_type,
            kners.SELECTED_EPOCHS[entity_type],
        ),
    )
    for entity_type in kners.ENTITY_TYPES
)
classifier_paths

{'EVENT': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [None]:
# Display the same mapping with absolute paths, as a check of where the checkpoints
# are actually looked up.
dict(zip(classifier_paths.keys(), map(os.path.abspath, classifier_paths.values())))

{'EVENT': '/content/drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': '/content/drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': '/content/drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': '/content/drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': '/content/drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': '/content/drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [None]:
# Instantiate the NER system from the per-entity-type checkpoint paths.
# `print_debug_messages_to_console=True` presumably makes the class report its loading
# progress on the console -- confirm against `kediff_ner_system.py`.
ner = kners.KediffNerSystem(classifier_paths, print_debug_messages_to_console=True)

Loading tokeniser 'dbmdz/bert-base-historic-multilingual-cased'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Initialising models. Received paths to 6 classifiers


100%|██████████| 6/6 [00:27<00:00,  4.58s/it]

Class initialised





## Import Text and Ground Truth

In [None]:
# Read the dataset: a JSON Lines file with one JSON object per line.
# The encoding is given explicitly so the German text is decoded the same way on every
# platform instead of depending on the locale default.
with open(os.path.join(DATA_DIR, 'union_dataset.jsonl'), encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing empty line), which `json.loads` would reject.
    lines = [json.loads(line) for line in f if line.strip()]
# Number of records loaded.
len(lines)

13928

In [None]:
# Pick a single record (index 42) as a sample for trying out the NER system.
line = lines[42]
# Show the sample record.
line

{'id': 43,
 'text': 'der gemeine Mann z. B., wie seine Lage bioher beschaffen war, noch itzt beschaffen ist, und wahrscheinlicher Weise noch mehrere Jahrhunderte hindurch beschaffen seyn wird, fühlt gewiß mehr Ruhe dabey, wenn er gewisse Wahrheiten auf das Ansehen seiner vernünftigern, oder vernünftiger seyn sollenden Lehrer bloß glaubt, als wenn es ihm einfiele, sie ohne alle Hülfskenntnisse selbst zu prüfen.)',
 'label': []}

In [None]:
# Run the NER system on the sample text and keep its result.
# `print_table_to_console=True` presumably prints the found entities as a table -- confirm
# against `kediff_ner_system.py`.
predicted = ner.ner(line['text'], print_table_to_console=True)

(no entities found)


# Apply kNERs to entire file

## Generate Predictions

In [None]:
# Preview of one record that combines the sample's id and text with its annotated
# labels (as `ground_truth`) and the prediction computed for it.
dict(
    id=line['id'],
    text=line['text'],
    ground_truth=line['label'],
    predicted=predicted,
)

{'id': 43,
 'text': 'der gemeine Mann z. B., wie seine Lage bioher beschaffen war, noch itzt beschaffen ist, und wahrscheinlicher Weise noch mehrere Jahrhunderte hindurch beschaffen seyn wird, fühlt gewiß mehr Ruhe dabey, wenn er gewisse Wahrheiten auf das Ansehen seiner vernünftigern, oder vernünftiger seyn sollenden Lehrer bloß glaubt, als wenn es ihm einfiele, sie ohne alle Hülfskenntnisse selbst zu prüfen.)',
 'ground_truth': [],
 'predicted': []}

In [None]:
# TODO: remove this cell later. While debugging, uncomment the next line to restrict the
# run to the first 50 records.
# lines = lines[:50]

In [None]:
# Run the NER system on every record and store its result on the record itself under
# the key 'predicted'. This is the long-running step of the notebook.
for line in tqdm(lines):
    line.update(predicted=ner(line['text']))

100%|██████████| 13928/13928 [3:58:56<00:00,  1.03s/it]


## Materialise as JSONL file

The `json` module cannot serialise the `numpy.float32` data type, which is used for the predictions' `score` values. Thus, convert each score to the built-in data type `float`.

In [None]:
# Inspect one record to see what the stored predictions look like.
lines[2]

{'id': 3,
 'text': 'Zur Beherzigung für Regenten, Censoren, und Schriftsteller.',
 'label': [],
 'predicted': [{'entity_group': 'ORG',
   'score': 0.58743024,
   'word': 'Regenten',
   'start': 20,
   'end': 28}]}

In [None]:
# Print the Python type of each field of the first prediction of record 2, to find the
# field(s) that `json` cannot handle.
for value in lines[2]['predicted'][0].values():
    print(type(value))

<class 'str'>
<class 'numpy.float32'>
<class 'str'>
<class 'int'>
<class 'int'>


In [None]:
# Try the conversion on a single score first: `.astype(float)` turns the score into a
# double-precision float.
lines[2]['predicted'][0]['score'].astype(float)

0.5874302387237549

In [None]:
# Convert every prediction score to a built-in `float` so that `json` can serialise it.
# `float()` is used instead of `.astype(float)` because it also accepts scores that are
# already plain Python floats, which have no `.astype` method.
for line in tqdm(lines):
    for prediction in line['predicted']:
        prediction['score'] = float(prediction['score'])

100%|██████████| 13928/13928 [00:00<00:00, 401502.87it/s]


In [None]:
# Check that a record containing a prediction can now be serialised.
json.dumps(lines[2])

'{"id": 3, "text": "Zur Beherzigung f\\u00fcr Regenten, Censoren, und Schriftsteller.", "label": [], "predicted": [{"entity_group": "ORG", "score": 0.5874302387237549, "word": "Regenten", "start": 20, "end": 28}]}'

In [None]:
# Write all records, predictions included, to `predictions.jsonl` in DATA_DIR:
# one JSON object per line.
predictions_path = os.path.join(DATA_DIR, 'predictions.jsonl')
with open(predictions_path, "w") as f:
    for line in tqdm(lines):
        print(json.dumps(line), file=f)

100%|██████████| 13928/13928 [00:00<00:00, 81923.97it/s]
