In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import torch
import evaluate
import transformers
import numpy as np

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

In [3]:
raw_dataset = load_dataset("kosta-naumenko/medflex", split='train', download_mode='force_redownload', verification_mode='no_checks')
raw_dataset

Downloading readme:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/198k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/394 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 394
})

In [4]:
model_name = "alexyalunin/RuBioRoBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [5]:
inputs = tokenizer(raw_dataset[0]["tokens"], is_split_into_words=True)
tokenizer.decode(inputs['input_ids'])

'<s> Отмечает постепенный набор массы тела с 30 лет, в настоящее время вес максимальный -102кг ( ИМТ=32,19 кг/м 2). Неоднократно предпринимал попытки снижения веса с помощью диет и физических нагрузок с положительным временным эффектом.\nВ 1999г. при плановом обследовании выявлено повышение гликемии до 12 ммоль/л натощак. Диагностирован СД2 типа, назначен Сиофор 1500мг вечером. В 2018г. амбулаторно проведена коррекция терапии: ЯнуМет 1000+50мг утром и вечером, Сиофор 1000мг вечером. Контроль гликемии не проводит.\n</s>'

In [6]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [7]:
labels = raw_dataset[0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, 
        max_length=513, padding=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [9]:
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

Map:   0%|          | 0/394 [00:00<?, ? examples/s]

In [10]:
id = 4
input_ids, attention_mask, labels = list(tokenized_dataset[id].values())
for i in range(len(input_ids)):
    if labels[i] > 0:
        if labels[i] == 1:
            print(" ")
        print(tokenizer.decode(input_ids[i]), end='')

 
 гликемия 12-13 ммоль/л, 
 ИМТ 26,53 кг/м 2 
 сухость во рту, 
 жажды, 
 учащённого мочеиспускания. 
 гликемия при контроле 1 раз в день натощак 12-13 ммоль/л,

In [11]:
seqeval = evaluate.load("seqeval")
label_list = ['O', 'B', 'I']


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [12]:
id2label = {
    0: "O",
    1: "B",
    2: "I",
}
label2id = {
    "O": 0,
    "B": 1,
    "I": 2,
}

In [72]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    device_map={'': torch.cuda.current_device()},
    cache_dir='.cache',
    num_labels=3,
    id2label=id2label,
    label2id=label2id
    )

# for param in model.roberta.parameters():
#     param.requires_grad = False

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioRoBERTa and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
max_len = 514
num = 0
for row in tokenized_dataset['labels']:
    if len(row) > max_len:
        num += 1
        # max_len = len(row)
print(max_len)
print(num)


514
0


In [52]:
model.device

device(type='cuda', index=0)

In [53]:
tokens = tokenized_dataset['input_ids'][0]
inputs = torch.Tensor([tokens]).long()
inputs = inputs.to(model.device)
model(inputs)[:10]

(tensor([[[ 0.8966, -0.8131,  1.1221],
          [ 0.6292,  0.3066, -0.0585],
          [ 0.4295, -0.1999,  0.2521],
          ...,
          [ 0.4705, -0.0586,  1.0367],
          [ 0.2763, -0.3491,  1.2069],
          [ 0.2444, -0.3749,  1.3511]]], device='cuda:0',
        grad_fn=<ViewBackward0>),)

In [54]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [73]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-4,
    num_train_epochs=10,
    weight_decay=0.5,
    logging_steps=20,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss
20,1.2937
40,0.6457
60,0.7254
80,0.6797
100,0.6464
120,0.6367
140,0.6273
160,0.6447
180,0.6415
200,0.6297


TrainOutput(global_step=500, training_loss=0.6671845321655273, metrics={'train_runtime': 336.9499, 'train_samples_per_second': 11.693, 'train_steps_per_second': 1.484, 'total_flos': 3666264670012680.0, 'train_loss': 0.6671845321655273, 'epoch': 10.0})

In [75]:
preds = model(torch.LongTensor(tokenized_dataset['input_ids']).to(model.device))
p = [preds['logits'].detach().cpu(), tokenized_dataset['labels']]
compute_metrics(p)

OutOfMemoryError: CUDA out of memory. Tried to allocate 790.00 MiB. GPU 0 has a total capacty of 39.39 GiB of which 457.94 MiB is free. Process 911713 has 38.93 GiB memory in use. Of the allocated memory 37.37 GiB is allocated by PyTorch, and 1.06 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
{'precision': 0.00025539522410930913,
 'recall': 0.0009062075215224287,
 'f1': 0.00039848575413428965,
 'accuracy': 0.7268569749791901}

{'precision': 0.006167613979925021,
 'recall': 0.023108291798821932,
 'f1': 0.009736540664375716,
 'accuracy': 0.7841453263477451}

{'precision': 0.007338551859099804,
 'recall': 0.02718622564567286,
 'f1': 0.011557353366079168,
 'accuracy': 0.7893355530529306}

{'precision': 0.008235145065247688,
 'recall': 0.02945174444947893,
 'f1': 0.012871287128712872,
 'accuracy': 0.7918033589580376}

{'precision': 0.008580441640378548,
 'recall': 0.030811055731762575,
 'f1': 0.013422818791946308,
 'accuracy': 0.7921167311364637}