In [27]:
import json
import os.path
from typing import Union

from tqdm import tqdm

from kediff_ner_system import KediffNerSystem

# Load things

## Define `DATA_DIR`

In [28]:
# Resolve the data directory depending on where the notebook runs.
# On Colab, Google Drive is mounted and the data is read from there;
# everywhere else a local `data` folder relative to the working directory is used.
DATA_DIR: str
try:
    from google.colab import drive

    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    drive.mount('/content/drive')
    # Anchor the path at the absolute mount point used above, so it stays valid
    # even if the working directory is not '/content'.
    DATA_DIR = os.path.join('/content/drive', 'MyDrive', 'KEDiff', 'data')
except ModuleNotFoundError:
    # `google.colab` is only importable inside Colab.
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

You do not work on Colab
DATA_DIR='data' --> /Users/daniel/source/repos/lelvilamp/kediff-ner-training/data


## Import NER Ensemble _`kners`_

In [29]:
# On Colab the module file lives on the mounted Drive and has to be copied into
# the working directory so that `import kediff_ner_system` can find it.
# NOTE(review): the notebook's first cell already imports from `kediff_ner_system`;
# on a fresh Colab kernel that import runs before this copy — confirm the intended order.
try:
    # The import doubles as the "are we on Colab?" check.
    from google.colab import drive

    print("You work on Colab, so 'kediff_ner_system.py' needs to be copied over before importing it.")

    import shutil
    shutil.copy(
        # Absolute path: Drive is mounted at '/content/drive', so this does not
        # depend on the current working directory.
        src=os.path.join('/content/drive', 'MyDrive', 'KEDiff', 'kediff_ner_system.py'),
        dst='.'
    )
except ModuleNotFoundError:
    # Not on Colab: the module is expected to be importable already.
    pass

In [30]:
import kediff_ner_system as kners

In [31]:
# Folder below the data directory that holds the trained classifier checkpoints.
MODELS_DIR: str = os.path.join(
    DATA_DIR,
    'trained_models',
)
MODELS_DIR

'data/trained_models'

In [32]:
# One checkpoint path per entity type, laid out as
#   <MODELS_DIR>/<model version>/<name base + entity type>/<selected epoch>
classifier_paths: dict[str, str] = {
    etype: os.path.join(
        MODELS_DIR,
        kners.CLASSIFIER_MODEL_VERSION,
        kners.CLASSIFIER_NAME_BASE + etype,
        kners.SELECTED_EPOCHS[etype],
    )
    for etype in kners.ENTITY_TYPES
}
classifier_paths

{'EVENT': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [33]:
# Show the same mapping with absolute paths to verify where the checkpoints resolve to.
{
    entity_type: os.path.abspath(checkpoint)
    for entity_type, checkpoint in classifier_paths.items()
}

{'EVENT': '/Users/daniel/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': '/Users/daniel/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': '/Users/daniel/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': '/Users/daniel/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': '/Users/daniel/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': '/Users/daniel/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [34]:
# Instantiate the NER system from the per-entity-type classifier paths;
# debug messages are printed to the console while it initialises.
ner: KediffNerSystem = kners.KediffNerSystem(
    classifier_paths,
    print_debug_messages_to_console=True,
)

Loading tokeniser 'dbmdz/bert-base-historic-multilingual-cased'
Initialising models. Received paths to 6 classifiers


100%|██████████| 6/6 [00:03<00:00,  1.62it/s]

Class initialised





## Import Text and Ground Truth

In [35]:
# Read the JSONL dataset: every non-empty line of the file is one JSON object.
# The encoding is fixed to UTF-8 because the texts contain non-ASCII characters
# and the platform default encoding is not UTF-8 everywhere.
with open(os.path.join(DATA_DIR, 'union_dataset.jsonl'), encoding='utf-8') as file:
    lines: list[dict[str, Union[int, list[Union[int, str]], str]]] = [
        json.loads(line)
        for line in file
        if line.strip()  # tolerate blank lines, e.g. a trailing newline at the end of the file
    ]
len(lines)

13928

In [36]:
# Pick one record (index 42) as a sample and display it.
line: dict[
    str,
    Union[int, list[Union[int, str]], str],
] = lines[42]
line

{'id': 43,
 'text': 'der gemeine Mann z. B., wie seine Lage bioher beschaffen war, noch itzt beschaffen ist, und wahrscheinlicher Weise noch mehrere Jahrhunderte hindurch beschaffen seyn wird, fühlt gewiß mehr Ruhe dabey, wenn er gewisse Wahrheiten auf das Ansehen seiner vernünftigern, oder vernünftiger seyn sollenden Lehrer bloß glaubt, als wenn es ihm einfiele, sie ohne alle Hülfskenntnisse selbst zu prüfen.)',
 'label': []}

In [37]:
# Run the NER system on the sample record's text and print the result table to the console.
predicted: list[dict[str, Union[float, int, str]]] = ner.ner(
    line['text'],
    print_table_to_console=True,
)

(no entities found)


# Apply kNERs to the entire file

## Generate Predictions

In [38]:
# Preview the sample record with ground truth and prediction side by side.
dict(
    id=line['id'],
    text=line['text'],
    ground_truth=line['label'],
    predicted=predicted,
)

{'id': 43,
 'text': 'der gemeine Mann z. B., wie seine Lage bioher beschaffen war, noch itzt beschaffen ist, und wahrscheinlicher Weise noch mehrere Jahrhunderte hindurch beschaffen seyn wird, fühlt gewiß mehr Ruhe dabey, wenn er gewisse Wahrheiten auf das Ansehen seiner vernünftigern, oder vernünftiger seyn sollenden Lehrer bloß glaubt, als wenn es ihm einfiele, sie ohne alle Hülfskenntnisse selbst zu prüfen.)',
 'ground_truth': [],
 'predicted': []}

In [39]:
# TODO: remove this later (uncomment the next line to restrict the run to the first 50 records)
# lines = lines[:50]

In [None]:
# Run the NER system on every record and store the result on the record itself
# under the 'predicted' key.
for line in tqdm(lines):
    line.update(predicted=ner(line['text']))

  0%|          | 9/13928 [00:01<48:02,  4.83it/s]

## Materialise as JSONL file

The `json` module cannot serialise the `numpy.float32` data type, which is used for the predictions' `score` values. Thus, convert each score to the base data type `float`.

In [None]:
# Display the record at index 2, including the 'predicted' entry attached by the loop above.
lines[2]

In [None]:
# Print the Python type of every field of the first prediction of record 2,
# to find out which of them the `json` module cannot handle.
for key in lines[2]['predicted'][0]:
    print(type(lines[2]['predicted'][0][key]))

In [None]:
# Try the conversion on a single score first; the converted value is shown as the cell output.
# Assumes record 2 has at least one prediction — TODO confirm for other datasets.
lines[2]['predicted'][0]['score'].astype(float)

In [None]:
# Convert every prediction score to the built-in `float` so that `json` can encode it.
# `float(...)` is used instead of `.astype(float)` because it also accepts values
# that already are plain floats, so this cell can safely be re-run.
for line in tqdm(lines):
    for prediction in line['predicted']:
        prediction['score'] = float(prediction['score'])

In [None]:
# Check that a record with predictions can now be encoded as a JSON string
# (relies on the score conversion in the previous cell).
json.dumps(lines[2])

In [None]:
# Write all records, now including their predictions, as JSONL: one JSON object per line.
with open(os.path.join(DATA_DIR, 'predictions.jsonl'), "w") as file:
    for line in tqdm(lines):
        print(json.dumps(line), file=file)