In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.insert(1, '../../src')

import torch
import evaluate
import mlflow
import numpy as np

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

from TextProcessor import TextProcessor
from update_dataset import update_dataset


In [4]:
mlflow.set_tracking_uri("http://mlflow:5000")
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))

os.environ["MLFLOW_EXPERIMENT_NAME"] = "NER"
# os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "True"

# os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

# os.environ["MLFLOW_TRACKING_URI"] = tracking_uri

Current tracking uri: http://mlflow:5000


In [5]:
# raw_dataset = load_dataset("kosta-naumenko/medflex", split='train', download_mode='force_redownload', verification_mode='no_checks')
raw_dataset = update_dataset(False, True)
raw_dataset

Total raws: 1608


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1608
})

In [6]:
model_name = "alexyalunin/RuBioRoBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [7]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, 
        max_length=512, padding=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [9]:
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

Map:   0%|          | 0/1608 [00:00<?, ? examples/s]

In [10]:
id = 4
input_ids, attention_mask, labels = list(tokenized_dataset[id].values())
for i in range(len(input_ids)):
    if labels[i] > 0:
        if labels[i] == 1:
            print(" ")
        print(tokenizer.decode(input_ids[i]), end='')

 
 состояние тяжелое 
 Sat О 2 92% 
 пастозность стоп. 
 Sat О 2 92% 
 притупление в правой нижней доле 
 при аускультации - ослабленное дыхание, крепитация инспираторная в нижних отделах правого легкого 
 ЧДД - 22 в мин 
 Акцент II тона над Аортой 
 Артериальное давление 110/70 мм. рт. ст.

In [32]:
seqeval = evaluate.load("seqeval")
label_list = ['O', 'B', 'I']


def find_symtoms(result):
    text = result['text'].lower()
    symptoms = result['symptoms']
    for symptom in symptoms:
        text = text[:symptom[0]] + text[symptom[0]:symptom[1]].upper() + text[symptom[1]:]
    return text


def compute_metrics(p):
    global model, tokenizer, cur_texts, text_log_ids, processor
    model.eval()
    model.cpu()

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    for log_id in text_log_ids:

        if len(cur_texts[log_id]) == 0:
            result = processor.process_labeled_texts(
                raw_dataset[log_id]['tokens'],
                preds=torch.LongTensor([tokenized_dataset[log_id]['labels']]),
                is_split=True
                )
            true_text = find_symtoms(result[0])
            cur_texts[log_id].append(true_text)

        result = processor.process_text(raw_dataset[log_id]['tokens'], model, is_split=True)
        cur_text = find_symtoms(result)
        cur_texts[log_id].append(cur_text)
        mlflow.log_text('\n'.join(cur_texts[log_id]), f'example_{log_id}.txt')

    model.train()
    model.to(torch.cuda.current_device())

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [33]:
id2label = {
    0: "O",
    1: "B",
    2: "I",
}
label2id = {
    "O": 0,
    "B": 1,
    "I": 2,
}

In [34]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
eval_dataset = tokenized_dataset.train_test_split(test_size=0.1)['test']

In [35]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    device_map={'': torch.cuda.current_device()},
    cache_dir='.cache',
    num_labels=3,
    id2label=id2label,
    label2id=label2id
    )

for param in model.roberta.parameters():
    param.requires_grad = False

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioRoBERTa and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
text_log_ids = [0, 1, 2, 3, 4, 5]
cur_texts = [[] for _ in text_log_ids]
processor = TextProcessor()

cur_run_id = 6
num_train_epochs = 15

name = "RuBioRoBERTa-finetune-head"
run_name = f'{name}-{cur_run_id:02}'
output_dir = f'./logs/{run_name}'

training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_steps=1,
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to="mlflow",
    run_name=run_name
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
mlflow.end_run()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5585,0.58629,0.003185,0.005059,0.003909,0.743196
2,0.5355,0.554895,0.015385,0.026981,0.019596,0.758505
3,0.5176,0.541742,0.028,0.047218,0.035154,0.761439
4,0.4758,0.531387,0.02551,0.042159,0.031786,0.764628
5,0.5089,0.52212,0.032312,0.065767,0.043333,0.771177
6,0.494,0.519107,0.035524,0.067454,0.046539,0.769689
7,0.5785,0.519185,0.037074,0.062395,0.046512,0.768881
8,0.575,0.517468,0.038382,0.062395,0.047527,0.769476
9,0.554,0.516973,0.037696,0.060708,0.046512,0.769051
10,0.4922,0.508128,0.038367,0.079258,0.051705,0.775132


In [41]:
i = 0
for row in raw_dataset:
    tokenized_row = tokenizer(row['tokens'], truncation=True, is_split_into_words=True,return_tensors='pt')
    tokens = tokenized_row['input_ids']
    if tokens.shape[1] > 512:

        i += 1
        print(tokens.shape[1])
        break
print(i)

1533
1


tensor([   1, 1088,  338,  ..., 4011,   18,    2])

In [84]:
tokenizer.decode(tokenized_row['input_ids'][0], skip_special_tokens=True)

' Анамнез жизни 7 лет назад установлен диагноз Аутоиммунный тиреоидит, по поводу которого получает Эутирокс 75 мг 1 раз в день.Особенности развития отсутствуют. Наследственность не отягощена. Профессиональные вредности: отсутствуют. Вредные привычки: не курит. Аллергологический анамнез: лекарственную аллергию отрицает. Эпид. анамнез: тифы, малярию, гепатит, венерические заболевания отрицает. Наличие тесных контактов за последние 14 дней с лицами, находящимися под наблюдением по COVID 19 отрицает. Наличие тесных контактов за последние 14 дней с лицами, которые заболели COVID - 19: отрицает. Наличие тесных контактов за последние 14 дней с лицами, у которых лабораторно подтвержден диагноз COVID - 19: отрицает. ОИМ, ОНМК, СД, онкологические заболевания - отрицает. Анамнез заболевания С 1997 г. был установлен диагноз Бронхиальная астма средней степени тяжести инфекционно-аллергического генеза. До 2003 г. перидически возникали обострения, требующие лечения в стационарных условиях. В 2003г. п

0
1
2


In [76]:
len(preds_full)

1533

In [70]:
text_batch[2]['input_ids'].shape

torch.Size([1, 509])

In [65]:
model.cpu()
model.eval();

In [69]:
model(**text_batch[2]).logits.argmax(axis=2).shape

torch.Size([1, 509])

In [None]:
path_to_model = f'../../models/{run_name}.pt'
torch.save(model, path_to_model)

In [45]:
model2 = torch.load('../../models/rubio_frozen.pt')

In [46]:
input_ids = torch.LongTensor(tokenized_dataset['input_ids']).to(model.device)
attention_mask = torch.LongTensor(tokenized_dataset['attention_mask']).to(model.device)

preds = model2(input_ids=input_ids, attention_mask=attention_mask)
p = [preds['logits'].detach().cpu(), tokenized_dataset['labels']]
compute_metrics(p)

{'precision': 0.0017035775127768314,
 'recall': 0.003173164097914778,
 'f1': 0.002216943784639747,
 'accuracy': 0.7840474930935167}

In [26]:
input_ids = torch.LongTensor(tokenized_dataset['input_ids']).to(model.device)
attention_mask = torch.LongTensor(tokenized_dataset['attention_mask']).to(model.device)

preds = model(input_ids=input_ids, attention_mask=attention_mask)
p = [preds['logits'].detach().cpu(), tokenized_dataset['labels']]
compute_metrics(p)

{'precision': 0.013938594838952722,
 'recall': 0.033544877606527655,
 'f1': 0.019693945442448436,
 'accuracy': 0.8131428907306177}

In [33]:
logits = preds.logits.argmax(axis=2).detach().cpu()

In [38]:
(logits == 1).sum()

tensor(2697)