In [None]:
import os.path

import json
from tqdm import tqdm

# Load models and data

## Define `DATA_DIR`

In [None]:
# Resolve DATA_DIR: on Google Colab the data is read from the mounted Drive,
# otherwise from a local "data" folder relative to the working directory.
try:
    from google.colab import drive

    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    # Build the data path from the very mount point handed to drive.mount(),
    # so it stays valid even when the working directory is not /content.
    DRIVE_MOUNT_POINT = '/content/drive'
    drive.mount(DRIVE_MOUNT_POINT)
    DATA_DIR = os.path.join(DRIVE_MOUNT_POINT, 'MyDrive', 'KEDiff', 'data')
except ModuleNotFoundError:
    # google.colab could not be imported; treated as "not running on Colab".
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
# Echo both the configured value and the absolute path it resolves to.
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

## Import NER Ensemble _`kners`_

In [None]:
# On Colab, 'kediff_ner_system.py' is copied from Drive into the working
# directory so that it can be imported afterwards.
try:
    from google.colab import drive  # used here only to detect the Colab runtime

    print("You work on Colab, so 'kediff_ner_system.py' needs to be copied over before importing it.")

    import shutil
    # Absolute source path (Drive is mounted at /content/drive), so the copy
    # does not depend on the current working directory being /content.
    shutil.copy(
        src = os.path.join('/content', 'drive', 'MyDrive', 'KEDiff', 'kediff_ner_system.py'),
        dst = '.'
    )
except ModuleNotFoundError:
    # Not on Colab: nothing is copied.
    # NOTE(review): presumably the module is already importable locally — confirm.
    pass

In [None]:
import kediff_ner_system as kners

In [None]:
# Folder 'trained_models' inside DATA_DIR.
MODELS_DIR = os.path.join(DATA_DIR, 'trained_models')
# Bare expression: echoes the path as the cell output.
MODELS_DIR

In [None]:
# Map every entity type to the path of its classifier:
#   MODELS_DIR / <model version> / <name base + entity type> / <selected epoch>
classifier_paths = dict(
    (
        entity_type,
        os.path.join(
            MODELS_DIR,
            kners.CLASSIFIER_MODEL_VERSION,
            kners.CLASSIFIER_NAME_BASE + entity_type,
            kners.SELECTED_EPOCHS[entity_type],
        ),
    )
    for entity_type in kners.ENTITY_TYPES
)
# Bare expression: echoes the mapping as the cell output.
classifier_paths

In [None]:
# Same mapping with every path made absolute; displayed only, not stored.
{k: os.path.abspath(v) for k, v in classifier_paths.items()}

In [None]:
# Instantiate the NER system from the per-entity-type classifier paths,
# with debug messages on the console requested via the keyword flag.
ner = kners.KediffNerSystem(classifier_paths, print_debug_messages_to_console=True)

## Import Text and Ground Truth

In [None]:
# Read the JSON Lines dataset: one JSON document per line.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default encoding, and blank lines (e.g. a trailing empty line)
# are skipped instead of crashing json.loads.
with open(os.path.join(DATA_DIR, 'union_dataset.jsonl'), encoding='utf-8') as f:
    lines = [json.loads(line) for line in f if line.strip()]
# Number of records loaded.
len(lines)

In [None]:
# Pick a single record (index 42) as a sample and display it.
line = lines[42]
line

In [None]:
# Run the NER system on the sample record's text; the keyword flag requests
# a table printed to the console.
predicted = ner.ner(line['text'], print_table_to_console=True)

# Apply kNERs to entire file

In [None]:
# Preview of one combined record: the sample's id and text, its 'label' field
# under the name 'ground_truth', and the prediction computed for it.
dict(
    id=line['id'],
    text=line['text'],
    ground_truth=line['label'],
    predicted=predicted,
)

In [None]:
# TODO: remove this later.
# Temporary cap: keeps only the first 50 records, so every cell that uses
# `lines` afterwards (including the export to file) covers just this subset.
lines = lines[:50]

In [None]:
# Store the prediction for each record's text under its 'predicted' key
# (records are modified in place); tqdm shows progress.
# NOTE(review): the system object is called directly here (`ner(...)`), while
# the single-sample cell uses `ner.ner(...)` — confirm KediffNerSystem is callable.
for line in tqdm(lines):
    line['predicted'] = ner(line['text'])

In [None]:
# Spot-check one record (index 2) after the prediction loop.
lines[2]

In [None]:
# Print the Python type of every value in the first prediction of record 2.
for k, value in lines[2]['predicted'][0].items():
    print(type(value))

In [None]:
# Try the float conversion on a single score; the result is displayed, not stored.
# NOTE(review): `.astype` suggests the score is a NumPy scalar — confirm.
lines[2]['predicted'][0]['score'].astype(float)

In [None]:
# Replace every prediction's 'score' with its .astype(float) conversion,
# record by record, updating the prediction dicts in place.
for line in tqdm(lines):
    for prediction in line['predicted']:
        prediction.update(score=prediction['score'].astype(float))

In [None]:
# Serialize one record (index 2) to a JSON string and display it.
json.dumps(lines[2])

In [None]:
# Export all records as JSON Lines: one serialized record per line,
# written to 'predictions.jsonl' inside DATA_DIR.
with open(os.path.join(DATA_DIR, 'predictions.jsonl'), "w") as f:
    for line in lines:
        f.write(f"{json.dumps(line)}\n")