In [1]:
import os
import torch
import evaluate
import transformers
import numpy as np

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


In [2]:
# Download the "kosta-naumenko/medflex" dataset and keep only its 'train' split.
raw_dataset = load_dataset("kosta-naumenko/medflex")['train']
# Bare expression so the notebook renders the dataset summary (feature names and row count).
raw_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 294
})

In [3]:
# Checkpoint identifier; reused later when the token-classification model is loaded.
model_name = "alexyalunin/RuBioRoBERTa"
# NOTE(review): add_prefix_space=True is presumably here because the tokenizer is always
# called on pre-split word lists (is_split_into_words=True) and RoBERTa-style tokenizers
# require it in that mode — confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [4]:
# Sanity check: tokenize the first example, whose "tokens" field is already a list of words.
inputs = tokenizer(raw_dataset[0]["tokens"], is_split_into_words=True)
# Decode the ids back to text to see what the model will read, special tokens included.
tokenizer.decode(inputs['input_ids'])

'<s> Отмечает постепенный набор массы тела с 30 лет, в настоящее время вес максимальный -102кг ( ИМТ=32,19 кг/м 2). Неоднократно предпринимал попытки снижения веса с помощью диет и физических нагрузок с положительным временным эффектом.\nВ 1999г. при плановом обследовании выявлено повышение гликемии до 12 ммоль/л натощак. Диагностирован СД2 типа, назначен Сиофор 1500мг вечером. В 2018г. амбулаторно проведена коррекция терапии: ЯнуМет 1000+50мг утром и вечером, Сиофор 1000мг вечером. Контроль гликемии не проводит.\n</s>'

In [5]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per sub-word token.

    Args:
        labels: Sequence of integer tag ids, indexed by word position.
        word_ids: For every token, the index of the word it belongs to, or
            None for a special token.

    Returns:
        A list as long as ``word_ids``. Special tokens get -100. The first
        token of a word gets the word's tag. Every further token of the same
        word gets the word's tag, bumped by one when that tag is odd (the
        B-XXX -> I-XXX switch).
    """
    aligned = []
    previous_word = None
    for wid in word_ids:
        if wid is None:
            # Special token: no tag, and it breaks the current word.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[wid]
        # A continuation piece of the previous word must not open a new entity.
        if wid == previous_word and tag % 2 == 1:
            tag += 1
        previous_word = wid
        aligned.append(tag)

    return aligned

In [6]:
# Compare the word-level tags of the first example with their token-level alignment.
labels = raw_dataset[0]["ner_tags"]
# Per-token word indices for the encoding produced above (`inputs`).
word_ids = inputs.word_ids()
# First line: one tag per word. Second line: one tag per token, -100 on special tokens.
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [7]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: Batch mapping with a "tokens" entry (one word list per
            example) and a "ner_tags" entry (one tag-id list per example).

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry
        holding, for each example, the tags aligned to its tokens by
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) gives the token -> word mapping of the idx-th example in the batch.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(idx))
        for idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [8]:
# Tokenize the whole dataset in batches. Every original column is removed, so the result
# holds only what tokenize_and_align_labels returns (tokenizer outputs plus "labels").
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

In [9]:
# Inspect one tokenized example: print every token whose label is B (1) or I (2),
# starting a new output line at each B so that every printed line is one tagged span.
example_idx = 4  # not named `id`, so the builtin id() stays usable in later cells
example = tokenized_dataset[example_idx]
# Read the columns by name; unpacking .values() would silently depend on column order.
input_ids = example["input_ids"]
attention_mask = example["attention_mask"]
labels = example["labels"]
for token_id, label in zip(input_ids, labels):
    if label > 0:
        if label == 1:
            print(" ")
        print(tokenizer.decode(token_id), end='')

 
 гликемия 12-13 ммоль/л, 
 ИМТ 26,53 кг/м 2 
 сухость во рту, 
 жажды, 
 учащённого мочеиспускания. 
 гликемия при контроле 1 раз в день натощак 12-13 ммоль/л,

In [10]:
# Metric object loaded through `evaluate`; compute_metrics calls its .compute().
seqeval = evaluate.load("seqeval")
# Tag names indexed by integer tag id.
label_list = ['O', 'B', 'I']

# String tags of the first example (plain string key: the f-prefix had no placeholders).
labels = [label_list[i] for i in raw_dataset[0]["ner_tags"]]


def compute_metrics(p):
    """Turn raw model outputs into seqeval precision / recall / F1 / accuracy.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 to one label id per position; ``labels``
            holds the reference ids, where -100 marks positions that are
            left out of scoring.

    Returns:
        dict with keys "precision", "recall", "f1" and "accuracy", taken from
        the corresponding "overall_*" entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose reference label is a real tag.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [11]:
# Integer tag id -> tag name, handed to the model configuration.
id2label = dict(enumerate(("O", "B", "I")))
# Tag name -> integer tag id: the inverse of the mapping above.
label2id = {tag: idx for idx, tag in id2label.items()}

In [12]:
# Seed the RNGs before loading: the checkpoint ships without a token-classification head,
# so the classifier layer is freshly (randomly) initialized at load time and would
# otherwise differ on every kernel restart. 42 is also TrainingArguments' default seed.
transformers.set_seed(42)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    # Size the head from the label mapping instead of repeating the literal 3.
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioRoBERTa and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Batch collator built from the tokenizer; passed to the Trainer as `data_collator`.
# NOTE(review): presumably pads input ids and labels to a common length per batch —
# confirm against the DataCollatorForTokenClassification documentation.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
# Select GPU 7 only when that device exists, so the cell also runs on machines without
# CUDA or with fewer than 8 GPUs instead of raising on an invalid device ordinal.
# NOTE(review): the Trainer performs its own device placement; confirm that pinning the
# device here has the intended effect, otherwise restrict visibility with
# CUDA_VISIBLE_DEVICES before CUDA is initialized.
gpu_index = 7
if torch.cuda.is_available() and torch.cuda.device_count() > gpu_index:
    torch.cuda.set_device(gpu_index)

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    disable_tqdm=False,
    output_dir="test_model",
    learning_rate=2e-5,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): eval_dataset is the same object as train_dataset, so the reported metrics
# (and the "best model" choice) are measured on the training data, not on held-out
# examples. Fine for a smoke test; split off a validation set before trusting the numbers.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()