In [11]:
from datasets import load_from_disk
ds = load_from_disk("lang-sci-press_ft_debug_dataset")
ds

['O', 'I-BLL-DE', 'I-BLL-EN', 'B-BLL-DE', 'B-BLL-EN']

In [12]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-german-cased", num_labels=len(ds.features["all_ner_tags"].feature.names)
)


loading configuration file https://huggingface.co/bert-base-german-cased/resolve/main/config.json from cache at /home/jens/.cache/huggingface/transformers/98877e98ee76b3977d326fe4f54bc29f10b486c317a70b6445ac19a0603b00f0.1f2afedb22f9784795ae3a26fe20713637c93f50e2c99101d952ea6476087e5e
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
 

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

In [43]:
testing = tokenizer("Testing")
[ (t,tokenizer.decode(t)) for t in testing["input_ids"]]

[(3, '[CLS]'), (4496, 'Test'), (270, '##ing'), (4, '[SEP]')]

In [4]:
# https://huggingface.co/docs/transformers/tasks/token_classification
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["all_tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"all_ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_ds = ds.map(tokenize_and_align_labels, batched = True )
tokenized_ds 

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['id_', 'id', 'identifiers', 'relations', 'all_tokens', 'all_ner_tags', 'all_ner_links', 'title_tokens', 'title_ner_tags', 'title_ner_links', 'subject_tokens', 'subject_ner_tags', 'subject_ner_links', 'description_tokens', 'description_ner_tags', 'description_ner_links', 'text_tokens', 'text_ner_tags', 'text_ner_links', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 16
})

In [5]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [21]:
# https://huggingface.co/docs/transformers/training
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [13]:
import torch

torch.cuda.empty_cache()

In [49]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    # optim="adafactor",
    optim="adamw_torch",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    do_eval=False,
    output_dir="lang-sci-press_ft_debug_training",
    no_cuda=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    #eval_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
result = trainer.train()
result


The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text_ner_tags, relations, description_ner_tags, title_ner_links, title_ner_tags, text_tokens, id, all_tokens, description_ner_links, identifiers, description_tokens, subject_ner_links, subject_tokens, all_ner_links, id_, all_ner_tags, title_tokens, subject_ner_tags, text_ner_links. If text_ner_tags, relations, description_ner_tags, title_ner_links, title_ner_tags, text_tokens, id, all_tokens, description_ner_links, identifiers, description_tokens, subject_ner_links, subject_tokens, all_ner_links, id_, all_ner_tags, title_tokens, subject_ner_tags, text_ner_links are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumu

{'train_runtime': 211.4026, 'train_samples_per_second': 0.227, 'train_steps_per_second': 0.057, 'train_loss': 0.0006478881696239114, 'epoch': 3.0}


TrainOutput(global_step=12, training_loss=0.0006478881696239114, metrics={'train_runtime': 211.4026, 'train_samples_per_second': 0.227, 'train_steps_per_second': 0.057, 'train_loss': 0.0006478881696239114, 'epoch': 3.0})

In [48]:
trainer.predict(tokenized_ds)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text_ner_tags, relations, description_ner_tags, title_ner_links, title_ner_tags, text_tokens, id, all_tokens, description_ner_links, identifiers, description_tokens, subject_ner_links, subject_tokens, all_ner_links, id_, all_ner_tags, title_tokens, subject_ner_tags, text_ner_links. If text_ner_tags, relations, description_ner_tags, title_ner_links, title_ner_tags, text_tokens, id, all_tokens, description_ner_links, identifiers, description_tokens, subject_ner_links, subject_tokens, all_ner_links, id_, all_ner_tags, title_tokens, subject_ner_tags, text_ner_links are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 16
  Batch size = 8


ValueError: Module inputs don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]],
Input references: [[-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]
 [-100    0 -100 ...    0    0 -100]
 ...
 [-100    0 -100 ... -100 -100 -100]
 [-100    0 -100 ... -100 -100 -100]
 [-100    0 -100 ... -100 -100 -100]]

In [46]:
from tokenizers import pipelines
ner_pipe = pipelines("ner", model=model, tokenizer=tokenizer)

ner_pipe("Dieser Text enthält Wörter, wie z.B. Abkürzung")

ImportError: cannot import name 'pipelines' from 'tokenizers' (/home/jens/.cache/pypoetry/virtualenvs/bachelor-inf-sW_MISUD-py3.10/lib/python3.10/site-packages/tokenizers/__init__.py)