In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os.path

import kediff_ner_system as kners
import json
from tqdm import tqdm

# Load models and data

## Define `DATA_DIR`

In [3]:
# Resolve DATA_DIR.
# - On Google Colab: mount Google Drive and point at the project folder on it.
# - Anywhere else: use the `data` folder relative to the working directory.
# Only the import is guarded, so an unrelated error while mounting is not
# mistaken for "not running on Colab".
try:
    from google.colab import drive
except ModuleNotFoundError:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
else:
    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    # Build DATA_DIR from the very mount point passed to drive.mount(); a
    # relative 'drive/...' path only resolves while the cwd is /content.
    COLAB_MOUNT_POINT = '/content/drive'
    drive.mount(COLAB_MOUNT_POINT)
    DATA_DIR = os.path.join(COLAB_MOUNT_POINT, 'MyDrive', 'KEDiff', 'data')
# Show both the configured value and the absolute path it resolves to.
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

You do not work on Colab
DATA_DIR='data' --> /Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data


## Import NER Ensemble _`kners`_

In [4]:
# Trained classifier checkpoints are looked up in a sub-folder of DATA_DIR.
MODELS_DIR = os.path.join(DATA_DIR, 'trained_models')
# Echo the value as the cell output.
MODELS_DIR

'data/trained_models'

In [5]:
# One checkpoint directory per entity type, laid out as
#   <MODELS_DIR>/<model version>/<classifier name base + entity type>/<selected checkpoint>
# where version, name base, entity types and selected checkpoints all come
# from the `kners` module.
classifier_paths = {
    entity_type: os.path.join(
        MODELS_DIR,
        kners.CLASSIFIER_MODEL_VERSION,
        f"{kners.CLASSIFIER_NAME_BASE}{entity_type}",
        kners.SELECTED_EPOCHS[entity_type],
    )
    for entity_type in kners.ENTITY_TYPES
}
# Echo the mapping as the cell output.
classifier_paths

{'EVENT': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [6]:
# Same mapping with absolute paths, to check where the checkpoints are expected on disk.
{entity_type: os.path.abspath(path) for entity_type, path in classifier_paths.items()}

{'EVENT': '/Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': '/Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': '/Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': '/Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': '/Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': '/Users/lelvilamp/source/repos/lelvilamp/kediff-ner-training/data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [7]:
# Build the NER system from the per-entity-type checkpoint paths.
# With print_debug_messages_to_console=True it reports loading the tokeniser
# and initialising the classifiers; the recorded run took about 46 s.
ner = kners.KediffNerSystem(classifier_paths, print_debug_messages_to_console=True)

Loading tokeniser 'dbmdz/bert-base-historic-multilingual-cased'
Initialising models. Received paths to 6 classifiers


100%|██████████| 6/6 [00:46<00:00,  7.67s/it]

Class initialised





## Import Text and Ground Truth

In [8]:
# Read the JSON Lines dataset: one JSON object per line of the file.
# The encoding is set explicitly because the texts contain non-ASCII German
# characters and the platform default is not UTF-8 everywhere.
# Blank lines are skipped; json.loads would raise on them.
with open(os.path.join(DATA_DIR, 'union_dataset.jsonl'), encoding='utf-8') as f:
    lines = [json.loads(line) for line in f if line.strip()]
# Number of records loaded.
len(lines)

13928

In [9]:
# Pick a single record (index 42, chosen arbitrarily) to inspect the data
# layout and to try the NER system on before processing the whole file.
line = lines[42]
# Echo the record as the cell output.
line

{'id': 43,
 'text': 'der gemeine Mann z. B., wie seine Lage bioher beschaffen war, noch itzt beschaffen ist, und wahrscheinlicher Weise noch mehrere Jahrhunderte hindurch beschaffen seyn wird, fühlt gewiß mehr Ruhe dabey, wenn er gewisse Wahrheiten auf das Ansehen seiner vernünftigern, oder vernünftiger seyn sollenden Lehrer bloß glaubt, als wenn es ihm einfiele, sie ohne alle Hülfskenntnisse selbst zu prüfen.)',
 'label': []}

In [10]:
# Call the NER system on the raw text of the sample record.
# NOTE(review): presumably returns a list of entity annotations - confirm
# the exact item format in kediff_ner_system.
predicted = ner(line['text'])
# Echo the prediction as the cell output.
predicted

[]

# Apply to entire file

In [11]:
# Preview of the per-record result layout for the sample record: the source
# `label` field is shown as `ground_truth`, next to the system's prediction.
dict(
    id=line['id'],
    text=line['text'],
    ground_truth=line['label'],
    predicted=predicted,
)

{'id': 43,
 'text': 'der gemeine Mann z. B., wie seine Lage bioher beschaffen war, noch itzt beschaffen ist, und wahrscheinlicher Weise noch mehrere Jahrhunderte hindurch beschaffen seyn wird, fühlt gewiß mehr Ruhe dabey, wenn er gewisse Wahrheiten auf das Ansehen seiner vernünftigern, oder vernünftiger seyn sollenden Lehrer bloß glaubt, als wenn es ihm einfiele, sie ohne alle Hülfskenntnisse selbst zu prüfen.)',
 'ground_truth': [],
 'predicted': []}

In [12]:
# TODO: remove this cap later.
# Limits how many records the cells below process, which keeps trial runs
# short. Set MAX_LINES = None to keep the whole file (`lines[:None]` is a
# full copy).
MAX_LINES = 1_111
lines = lines[:MAX_LINES]

In [13]:
# Run the NER system on every record and store the result on the record
# itself under 'predicted'.
# The loop variable is deliberately not named `line`: that name holds the
# sample record inspected earlier, and overwriting it here would silently
# change what the earlier cells show when they are re-run.
for record in tqdm(lines):
    record['predicted'] = ner(record['text'])

  2%|▏         | 23/1111 [01:06<52:21,  2.89s/it]  
