In [1]:
!pip install huggingface huggingface-cli huggingface_hub torch torchvision torchaudio transformers



In [2]:
import os

from tqdm import tqdm
from transformers import AutoTokenizer, pipeline, AutoModelForTokenClassification, Pipeline

In [3]:
# Prerequisite: run `huggingface-cli login` in a terminal before executing this notebook.

In [4]:
# Resolve DATA_DIR depending on whether the notebook runs on Google Colab
# (data lives on the mounted Google Drive) or locally (data lives in ./data).
COLAB_MOUNT_POINT = '/content/drive'

try:
    # Only the import decides whether we are on Colab; keeping the try body
    # minimal avoids masking unrelated errors raised while mounting.
    from google.colab import drive
except ModuleNotFoundError:
    print("You do not work on Colab")
    DATA_DIR = os.path.join('data')
else:
    print(
        "You work on Colab. Gentle as we are, we will mount Drive for you. "
        "It'd help if you allowed this in the popup that opens."
    )
    drive.mount(COLAB_MOUNT_POINT)
    # Build the path from the same mount point that was just mounted, so it
    # stays valid even if the working directory is not /content.
    DATA_DIR = os.path.join(COLAB_MOUNT_POINT, 'MyDrive', 'KEDiff', 'data')

# Show both the configured value and the absolute path it resolves to.
print(f"{DATA_DIR=}", '-->', os.path.abspath(DATA_DIR))

You do not work on Colab
DATA_DIR='data' --> /Users/daniel/source/repos/lelvilamp/kediff-ner-training/data


In [5]:
# Identifier of the tokeniser checkpoint passed to AutoTokenizer.from_pretrained.
TOKENISER_CHECKPOINT = "dbmdz/bert-base-historic-multilingual-cased"

# Shared prefix of the per-label model directory names; the label type is appended to it.
CHECKPOINT_NAME_BASE = "oalz-1788-q1-ner-"

# Folder below DATA_DIR that contains the trained model directories.
TRAINED_DIR = os.path.join(DATA_DIR, "trained_models", "2024-01-15")

In [6]:
# Load the tokeniser that belongs to the configured checkpoint.
tokeniser = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=TOKENISER_CHECKPOINT,
)

In [7]:
# Label types handled by this notebook; one model path is built per entry.
label_types = ["EVENT", "LOC", "MISC", "ORG", "PER", "TIME"]

# Checkpoint sub-directory selected for each label type.
selected_epochs = dict(
    EVENT="checkpoint-1393",
    LOC="checkpoint-1393",
    MISC="checkpoint-2786",
    ORG="checkpoint-1393",
    PER="checkpoint-2786",
    TIME="checkpoint-1393",
)

# Map label type -> <TRAINED_DIR>/<CHECKPOINT_NAME_BASE><label>/<selected checkpoint>.
ner_model_paths = {
    label: os.path.join(
        TRAINED_DIR,
        f"{CHECKPOINT_NAME_BASE}{label}",
        selected_epochs[label],
    )
    for label in label_types
}
ner_model_paths

{'EVENT': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-EVENT/checkpoint-1393',
 'LOC': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-LOC/checkpoint-1393',
 'MISC': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-MISC/checkpoint-2786',
 'ORG': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-ORG/checkpoint-1393',
 'PER': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-PER/checkpoint-2786',
 'TIME': 'data/trained_models/2024-01-15/oalz-1788-q1-ner-TIME/checkpoint-1393'}

In [8]:
# Build one token-classification pipeline per label type from its local
# checkpoint directory; the result is keyed by label type.
token_classifiers = {}
for label_type in tqdm(label_types):
    token_classifiers[label_type] = pipeline(
        task="token-classification",
        # Pass an absolute path to the checkpoint directory.
        model=os.path.abspath(ner_model_paths[label_type]),
        aggregation_strategy="simple",
    )

# Display the label types for which a pipeline was created.
list(token_classifiers)

100%|██████████| 6/6 [00:02<00:00,  2.37it/s]


['EVENT', 'LOC', 'MISC', 'ORG', 'PER', 'TIME']

In [9]:
# Upload the model of every label type to the Hugging Face Hub under the
# repository name "OALZ-1788-Q1-NER-<label type>".
#
# The loop variable is deliberately NOT called `pipeline`: that name is the
# factory function imported from `transformers`, and rebinding it here would
# break every later `pipeline(...)` call in the notebook. Likewise, the global
# `tokeniser` is left untouched.
for label_type in tqdm(label_types):
    classifier = token_classifiers[label_type]
    model_name: str = f"OALZ-1788-Q1-NER-{label_type}"

    # NOTE(review): the tokeniser upload is disabled; confirm whether this is
    # intentional, as only the model weights/config are pushed.
    # classifier.tokenizer.push_to_hub(repo_id=model_name)
    classifier.model.push_to_hub(repo_id=model_name)

  0%|          | 0/6 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

 17%|█▋        | 1/6 [00:51<04:19, 51.98s/it]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

 33%|███▎      | 2/6 [01:38<03:15, 48.81s/it]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

 50%|█████     | 3/6 [02:32<02:33, 51.30s/it]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

 83%|████████▎ | 5/6 [03:27<00:33, 33.87s/it]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 6/6 [04:15<00:00, 42.54s/it]


In [None]:
# German sample passage used as input for the NER pipeline; adjacent string
# literals are concatenated into one single-line string.
sample_text = (
    "Dieses Modell soll nur eine der folgenden Kategorien erkennen: "
    "Personen-, Orts-, Organisations-, Ereignis-, Artefaktnamen oder Datumsangaben. "
    "Daher sprechen wir nun über Wilhelm Thell, der am 1. Jänner 1788, "
    "also noch vor der französischen Revolution und vor Vorstellung des ersten iPhones "
    "durch Apple im Jahre 2008, in Salzburg geboren wurde. "
    "Er studierte Pharmakologie an der Medizinischen Universität Wien."
)

In [None]:
# Load the PER model from the Hugging Face Hub repository.
person_model_hf = AutoModelForTokenClassification.from_pretrained("LelViLamp/OALZ-1788-Q1-NER-PER")

# Take the tokeniser explicitly from the local PER pipeline instead of relying
# on whatever the global `tokeniser` variable currently points to.
person_tokeniser = token_classifiers["PER"].tokenizer

In [None]:
# Build a NER pipeline from the Hub model and run it on the sample text.
# `pipeline` must be the factory function imported from `transformers` here,
# not an already constructed pipeline object.
person_classifier = pipeline(
    task="ner",
    model=person_model_hf,
    tokenizer=person_tokeniser,
)
person_classifier(sample_text)