In [1]:
import os
import tabulate

from transformers import AutoModel, AutoTokenizer, pipeline

In [2]:
try:
    from google.colab import drive

    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    drive.mount('/content/drive')
    DATA_DIR = os.path.join('drive', 'MyDrive', 'KEDiff', 'data')
except ModuleNotFoundError:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

You work on Colab. Gentle as we are, we will mount Drive for you. It'd help if you allowed this in the popup that opens.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATA_DIR='drive/MyDrive/KEDiff/data' --> /content/drive/MyDrive/KEDiff/data


In [3]:
TOKENISER_CHECKPOINT = "dbmdz/bert-base-historic-multilingual-cased"
CHECKPOINT_NAME_BASE = "oalz-1788-q1-ner-"
TRAINED_DIR = os.path.join(DATA_DIR, 'trained_models', '2024-01-15')

In [4]:
tokeniser = AutoTokenizer.from_pretrained(TOKENISER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
sample = {
    "id": 1003,
    "text": "Unter die Gegenden, die den Borkenk\u00e4fer n\u00e4hren S. 13 \u2014 14. geh\u00f6rt leider auch das Erzstift Salzburg.",
    "label": [[82, 99, "ORG"], [82, 99, "LOC"], [28, 39, "MISC"]]
}
sample

{'id': 1003,
 'text': 'Unter die Gegenden, die den Borkenkäfer nähren S. 13 — 14. gehört leider auch das Erzstift Salzburg.',
 'label': [[82, 99, 'ORG'], [82, 99, 'LOC'], [28, 39, 'MISC']]}

In [6]:
x = tokeniser(sample['text'])
x

{'input_ids': [2, 2558, 788, 25854, 668, 16, 788, 767, 3383, 1199, 1176, 1582, 3835, 2221, 55, 18, 1097, 338, 1136, 18, 16130, 24914, 1532, 1190, 7840, 14369, 18843, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
sample_texts = [
    "(Das hei\u00dft ab ovo anfangen, wie's jener that, der vom deutschen Gleichgewichte handeln wollte, und von Adam anfieng.)",
    "Daniel Göller ist der beste Masterstudent Christian Borgelts und sollte eine Ehrenmedaille… medallje… medallie… wie schreibt man das????… sowie eine saftige Sonderzahlung von Herrn Prof. Lehnert, Rektor der Universität zu Salzburg, erhalten, sodass er endlich nach Island reisen und dort in einer Kirche zu Gott beten kann.",
    "Nun ist die Frage, ob das Modell auch mit Frauennamen umgehen kann, da beim Lesen der Originaltexte ein deutlicher Bias zu Männernamen aufgefallen ist. Und wie es dann wohl mit geschlechtsneutralen Namen aussieht?",
    "Bundeskanzlerin Brigitte Bierlein führte bis zur Angelobung der Bundesregierung Kurz II nach der vorgezogenen Nationalratswahl im Herbst 2019 die Amtsgeschäfte der Bundesministerien weiter. Vielleicht kennt sie ja auch Angela Merkel?",
    "Test in Salzburg während der Österreichsichen Aufklärung. In Paris wurden mehrere Menschen aus Deutschland gesichtet.",
    "den meisten Lesern durch eine ausführliche Beschreibung und Beurtheilung des Wirtembergischen, im katholischen Deutschlande noch immer nicht genug belannten Gesangbuches, einen Gefallen zu erzeigen."
]

In [8]:
label_types = ["EVENT", "LOC", "MISC", "ORG", "PER", "TIME"]
ner_model_paths = {
    label_type: os.path.join(TRAINED_DIR,
                             "".join([CHECKPOINT_NAME_BASE, label_type]))
    for label_type in label_types
}
ner_model_paths

{'EVENT': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT',
 'LOC': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC',
 'MISC': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC',
 'ORG': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG',
 'PER': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-PER',
 'TIME': 'drive/MyDrive/KEDiff/data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME'}

In [9]:
token_classifiers = {
    label_type : pipeline("token-classification",
                          model=ner_model_paths[label_type],
                          aggregation_strategy="simple")
    for label_type in label_types
}

In [10]:
def recognise_entites(text: str):
    found_annotations = {
        label_type : token_classifiers[label_type](text)
        for label_type in label_types
    }
    return found_annotations

In [11]:
entities = recognise_entites(sample_texts[0])
entities

{'EVENT': [],
 'LOC': [{'entity_group': 'LOC',
   'score': 0.54463327,
   'word': 'deutschen',
   'start': 54,
   'end': 63}],
 'MISC': [],
 'ORG': [],
 'PER': [{'entity_group': 'PER',
   'score': 0.99405813,
   'word': 'Adam',
   'start': 103,
   'end': 107}],
 'TIME': []}

In [12]:
text = sample_texts[1]
entities_dict = recognise_entites(text)
entities_dict

{'EVENT': [],
 'LOC': [{'entity_group': 'LOC',
   'score': 0.8783617,
   'word': 'Universität zu Salzburg',
   'start': 207,
   'end': 230},
  {'entity_group': 'LOC',
   'score': 0.9809841,
   'word': 'Island',
   'start': 265,
   'end': 271}],
 'MISC': [],
 'ORG': [{'entity_group': 'ORG',
   'score': 0.99762946,
   'word': 'Universität zu Salzburg',
   'start': 207,
   'end': 230},
  {'entity_group': 'ORG',
   'score': 0.8928384,
   'word': 'Kirche',
   'start': 297,
   'end': 303}],
 'PER': [{'entity_group': 'PER',
   'score': 0.9990125,
   'word': 'Daniel Göller',
   'start': 0,
   'end': 13},
  {'entity_group': 'PER',
   'score': 0.9947775,
   'word': 'Christian Borgelts',
   'start': 42,
   'end': 60},
  {'entity_group': 'PER',
   'score': 0.9557042,
   'word': 'Herrn Prof. Lehnert,',
   'start': 175,
   'end': 195},
  {'entity_group': 'PER',
   'score': 0.58198434,
   'word': '##ktor',
   'start': 198,
   'end': 202},
  {'entity_group': 'PER',
   'score': 0.6620957,
   'word': 'G

In [13]:
def entities_dict_to_list(entities_dict):
    entities_list = []
    for label_type in entities_dict:
        entities_list += entities_dict[label_type]
    return entities_list

In [14]:
entities_list = entities_dict_to_list(entities_dict)
entities_list

[{'entity_group': 'LOC',
  'score': 0.8783617,
  'word': 'Universität zu Salzburg',
  'start': 207,
  'end': 230},
 {'entity_group': 'LOC',
  'score': 0.9809841,
  'word': 'Island',
  'start': 265,
  'end': 271},
 {'entity_group': 'ORG',
  'score': 0.99762946,
  'word': 'Universität zu Salzburg',
  'start': 207,
  'end': 230},
 {'entity_group': 'ORG',
  'score': 0.8928384,
  'word': 'Kirche',
  'start': 297,
  'end': 303},
 {'entity_group': 'PER',
  'score': 0.9990125,
  'word': 'Daniel Göller',
  'start': 0,
  'end': 13},
 {'entity_group': 'PER',
  'score': 0.9947775,
  'word': 'Christian Borgelts',
  'start': 42,
  'end': 60},
 {'entity_group': 'PER',
  'score': 0.9557042,
  'word': 'Herrn Prof. Lehnert,',
  'start': 175,
  'end': 195},
 {'entity_group': 'PER',
  'score': 0.58198434,
  'word': '##ktor',
  'start': 198,
  'end': 202},
 {'entity_group': 'PER',
  'score': 0.6620957,
  'word': 'Gott',
  'start': 307,
  'end': 311}]

In [15]:
def sort_entity_list(entities_list):
    sorted_entities_list = sorted(entities_list, key=lambda d: (d['start'], d['score'], d['entity_group']))
    return sorted_entities_list

In [16]:
sorted_entities_list = sort_entity_list(entities_list)
sorted_entities_list

[{'entity_group': 'PER',
  'score': 0.9990125,
  'word': 'Daniel Göller',
  'start': 0,
  'end': 13},
 {'entity_group': 'PER',
  'score': 0.9947775,
  'word': 'Christian Borgelts',
  'start': 42,
  'end': 60},
 {'entity_group': 'PER',
  'score': 0.9557042,
  'word': 'Herrn Prof. Lehnert,',
  'start': 175,
  'end': 195},
 {'entity_group': 'PER',
  'score': 0.58198434,
  'word': '##ktor',
  'start': 198,
  'end': 202},
 {'entity_group': 'LOC',
  'score': 0.8783617,
  'word': 'Universität zu Salzburg',
  'start': 207,
  'end': 230},
 {'entity_group': 'ORG',
  'score': 0.99762946,
  'word': 'Universität zu Salzburg',
  'start': 207,
  'end': 230},
 {'entity_group': 'LOC',
  'score': 0.9809841,
  'word': 'Island',
  'start': 265,
  'end': 271},
 {'entity_group': 'ORG',
  'score': 0.8928384,
  'word': 'Kirche',
  'start': 297,
  'end': 303},
 {'entity_group': 'PER',
  'score': 0.6620957,
  'word': 'Gott',
  'start': 307,
  'end': 311}]

In [17]:
def ner(text: str):
    entity_dict = recognise_entites(text)
    entity_list = entities_dict_to_list(entity_dict)
    sorted_entity_list = sort_entity_list(entity_list)
    return sorted_entity_list

In [18]:
entities = [ner(text) for text in sample_texts]
entities

[[{'entity_group': 'LOC',
   'score': 0.54463327,
   'word': 'deutschen',
   'start': 54,
   'end': 63},
  {'entity_group': 'PER',
   'score': 0.99405813,
   'word': 'Adam',
   'start': 103,
   'end': 107}],
 [{'entity_group': 'PER',
   'score': 0.9990125,
   'word': 'Daniel Göller',
   'start': 0,
   'end': 13},
  {'entity_group': 'PER',
   'score': 0.9947775,
   'word': 'Christian Borgelts',
   'start': 42,
   'end': 60},
  {'entity_group': 'PER',
   'score': 0.9557042,
   'word': 'Herrn Prof. Lehnert,',
   'start': 175,
   'end': 195},
  {'entity_group': 'PER',
   'score': 0.58198434,
   'word': '##ktor',
   'start': 198,
   'end': 202},
  {'entity_group': 'LOC',
   'score': 0.8783617,
   'word': 'Universität zu Salzburg',
   'start': 207,
   'end': 230},
  {'entity_group': 'ORG',
   'score': 0.99762946,
   'word': 'Universität zu Salzburg',
   'start': 207,
   'end': 230},
  {'entity_group': 'LOC',
   'score': 0.9809841,
   'word': 'Island',
   'start': 265,
   'end': 271},
  {'ent

In [19]:
i = 1
print(sample_texts[i])
print("")
x = ner(sample_texts[i])
x

Daniel Göller ist der beste Masterstudent Christian Borgelts und sollte eine Ehrenmedaille… medallje… medallie… wie schreibt man das????… sowie eine saftige Sonderzahlung von Herrn Prof. Lehnert, Rektor der Universität zu Salzburg, erhalten, sodass er endlich nach Island reisen und dort in einer Kirche zu Gott beten kann.



[{'entity_group': 'PER',
  'score': 0.9990125,
  'word': 'Daniel Göller',
  'start': 0,
  'end': 13},
 {'entity_group': 'PER',
  'score': 0.9947775,
  'word': 'Christian Borgelts',
  'start': 42,
  'end': 60},
 {'entity_group': 'PER',
  'score': 0.9557042,
  'word': 'Herrn Prof. Lehnert,',
  'start': 175,
  'end': 195},
 {'entity_group': 'PER',
  'score': 0.58198434,
  'word': '##ktor',
  'start': 198,
  'end': 202},
 {'entity_group': 'LOC',
  'score': 0.8783617,
  'word': 'Universität zu Salzburg',
  'start': 207,
  'end': 230},
 {'entity_group': 'ORG',
  'score': 0.99762946,
  'word': 'Universität zu Salzburg',
  'start': 207,
  'end': 230},
 {'entity_group': 'LOC',
  'score': 0.9809841,
  'word': 'Island',
  'start': 265,
  'end': 271},
 {'entity_group': 'ORG',
  'score': 0.8928384,
  'word': 'Kirche',
  'start': 297,
  'end': 303},
 {'entity_group': 'PER',
  'score': 0.6620957,
  'word': 'Gott',
  'start': 307,
  'end': 311}]

In [20]:
def print_entities_as_table(entity_list, tablefmt: str = "simple_outline"):
    if entity_list is None or len(entity_list) == 0:
        print("No entities found")
        return
    header = list(entity_list[0].keys())
    header[0] = "type"
    rows = [entity.values() for entity in entity_list]
    print(tabulate.tabulate(rows, header, tablefmt=tablefmt))

In [21]:
print_entities_as_table(x)

┌────────┬──────────┬─────────────────────────┬─────────┬───────┐
│ type   │    score │ word                    │   start │   end │
├────────┼──────────┼─────────────────────────┼─────────┼───────┤
│ PER    │ 0.999012 │ Daniel Göller           │       0 │    13 │
│ PER    │ 0.994778 │ Christian Borgelts      │      42 │    60 │
│ PER    │ 0.955704 │ Herrn Prof. Lehnert,    │     175 │   195 │
│ PER    │ 0.581984 │ ##ktor                  │     198 │   202 │
│ LOC    │ 0.878362 │ Universität zu Salzburg │     207 │   230 │
│ ORG    │ 0.997629 │ Universität zu Salzburg │     207 │   230 │
│ LOC    │ 0.980984 │ Island                  │     265 │   271 │
│ ORG    │ 0.892838 │ Kirche                  │     297 │   303 │
│ PER    │ 0.662096 │ Gott                    │     307 │   311 │
└────────┴──────────┴─────────────────────────┴─────────┴───────┘
