In [None]:
!pip install huggingface huggingface-cli huggingface_hub torch torchvision torchaudio transformers

In [None]:
import os

from typing import List, Dict, Union
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, BertForTokenClassification, BertTokenizerFast, pipeline, Pipeline

from kediff_ner_system import KediffNerSystem

In [None]:
# !huggingface-cli login # do this in the terminal

In [None]:
# Resolve DATA_DIR: the Drive-backed folder on Google Colab, a local `data`
# folder everywhere else.
DATA_DIR: str
COLAB_MOUNT_POINT: str = '/content/drive'
try:
    # Only the import is probed here, so that a ModuleNotFoundError raised
    # while mounting is not mistaken for "not running on Colab".
    from google.colab import drive
except ModuleNotFoundError:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
else:
    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    drive.mount(COLAB_MOUNT_POINT)
    # Build the path from the mount point itself so it stays valid even if the
    # working directory is not the parent of the mount point.
    DATA_DIR = os.path.join(COLAB_MOUNT_POINT, 'MyDrive', 'KEDiff', 'data')
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

In [None]:
# Checkpoint identifier handed to `AutoTokenizer.from_pretrained` below.
TOKENISER_CHECKPOINT: str = "dbmdz/bert-base-historic-multilingual-cased"

# Prefix of every per-label model directory; the label type is appended to it.
CHECKPOINT_NAME_BASE: str = "oalz-1788-q1-ner-"

# Root folder of the trained models inside DATA_DIR
# (the "2024-01-15" sub-folder is presumably named after the training date).
TRAINED_DIR: str = os.path.join(
    DATA_DIR,
    "trained_models",
    "2024-01-15",
)

In [None]:
# Load the tokeniser that belongs to the checkpoint named in TOKENISER_CHECKPOINT.
tokeniser: BertTokenizerFast = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=TOKENISER_CHECKPOINT,
)

In [None]:
# Entity label types; a separate classifier is loaded for each of them below.
label_types: list[str] = ["EVENT", "LOC", "MISC", "ORG", "PER", "TIME"]

# Name of the checkpoint sub-directory to load for each label type.
selected_epochs: dict[str, str] = dict(
    EVENT="checkpoint-1393",
    LOC="checkpoint-1393",
    MISC="checkpoint-2786",
    ORG="checkpoint-1393",
    PER="checkpoint-2786",
    TIME="checkpoint-1393",
)

# Layout: <TRAINED_DIR>/<CHECKPOINT_NAME_BASE><label>/<selected checkpoint>
ner_model_paths: dict[str, str] = {
    label: os.path.join(
        TRAINED_DIR,
        f"{CHECKPOINT_NAME_BASE}{label}",
        selected_epochs[label],
    )
    for label in label_types
}
ner_model_paths

In [None]:
# Build one token-classification pipeline per label type from its local
# checkpoint directory; tqdm shows progress while the models are loaded.
token_classifiers: dict[str, Pipeline] = {
    label_type: pipeline(
        task="token-classification",
        model=os.path.abspath(ner_model_paths[label_type]),
        aggregation_strategy="simple",
    )
    for label_type in tqdm(label_types)
}
list(token_classifiers)

In [None]:
# Collect each trained classifier's tokeniser and model under the name it is
# to be published with. The actual upload is left commented out on purpose.
#
# The loop uses its own names (`classifier`, `trained_tokeniser`): binding the
# loop values to `pipeline` / `tokeniser` would overwrite the imported
# `transformers.pipeline` factory and the base tokeniser, both of which are
# used again by later cells.
label_type: str
for label_type in label_types:
    classifier: Pipeline = token_classifiers[label_type]
    trained_tokeniser: BertTokenizerFast = classifier.tokenizer
    model: BertForTokenClassification = classifier.model
    model_name: str = f"OALZ-1788-Q1-NER-{label_type}"

    # trained_tokeniser.push_to_hub(repo_id=model_name)
    # model.push_to_hub(repo_id=model_name)  # todo uncomment to actually publish

In [None]:
# German sample passage used to try out the classifier below.
# Adjacent string literals are concatenated into one single-line string.
sample_text: str = (
    "Dieses Modell soll nur eine der folgenden Kategorien erkennen: "
    "Personen-, Orts-, Organisations-, Ereignis-, Artefaktnamen oder Datumsangaben. "
    "Daher sprechen wir nun über Wilhelm Thell, der am 1. Jänner 1788, "
    "also noch vor der französischen Revolution und vor Vorstellung des ersten iPhones "
    "durch Apple im Jahre 2008, in Salzburg geboren wurde. "
    "Er studierte Pharmakologie an der Medizinischen Universität Wien."
)

In [None]:
# Load the person (PER) model by its repository id.
person_model_hf: BertForTokenClassification = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path="LelViLamp/OALZ-1788-Q1-NER-PER",
)
# Reuse the tokeniser that is already in the notebook namespace.
person_tokeniser: BertTokenizerFast = tokeniser

In [None]:
# Build an NER pipeline from the person model and tokeniser and apply it
# directly to the sample text; the result is shown as the cell output.
found_entities: list[dict[str, Union[float, int, str]]] = pipeline(
    task="ner",
    model=person_model_hf,
    tokenizer=person_tokeniser,
)(sample_text)
found_entities

In [None]:
# Hand the entities found above to the project helper for display.
# NOTE(review): the output format is defined in `kediff_ner_system`, which is
# not visible from this notebook; presumably it prints a table, going by its name.
KediffNerSystem.print_entities_as_table(found_entities)