In [46]:
!pip install datasets transformers seqeval
!pip install corus razdel
!pip install accelerate -U
!pip install transformers[torch]



In [47]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict

from datasets import Dataset, DatasetDict
from datasets import load_dataset, load_metric
from corus import load_rudrec
from razdel import tokenize

In [48]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import logging
from transformers.trainer import logger as noisy_logger

In [49]:
from google.colab import files
uploaded = files.upload()

Saving rudrec_annotated.json to rudrec_annotated (1).json


In [50]:
model_checkpoint = "cointegrated/rubert-tiny"
batch_size = 16

In [51]:
drugs = list(load_rudrec('rudrec_annotated.json'))
drugs[2]

RuDReCRecord(
    file_name='592814.tsv',
    text='Пила этот препарат для повышения иммунитета 5 лет назад.\n',
    sentence_id=0,
    entities=[RuDReCEntity(
         entity_id='*[1]',
         entity_text='повышения иммунитета',
         entity_type='DI',
         start=23,
         end=43,
         concept_id='C0020971',
         concept_name=nan
     )]
)

In [52]:
type2text = defaultdict(Counter)
ents = Counter()
for item in drugs:
    for e in item.entities:
        ents[e.entity_type] += 1
        type2text[e.entity_type][e.entity_text] += 1

for k, v in ents.most_common():
    print(k, v)
    print(type2text[k].most_common(3))

DI 1401
[('простуды', 64), ('ОРВИ', 47), ('профилактики', 42)]
Drugname 1043
[('Виферон', 33), ('Анаферон', 25), ('Циклоферон', 24)]
Drugform 836
[('таблетки', 154), ('таблеток', 79), ('свечи', 63)]
ADR 720
[('аллергия', 16), ('слабость', 13), ('диарея', 12)]
Drugclass 330
[('противовирусный', 21), ('противовирусное', 18), ('противовирусных', 13)]
Finding 236
[('аллергии', 12), ('температуры', 6), ('сонливости', 5)]


In [53]:
drugs[2].text

'Пила этот препарат для повышения иммунитета 5 лет назад.\n'

Напишем функцию, перекладывающую разметку сущностей на уровень слов. Будем использовать [IOB](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging))-нотацию, чтобы разделять несколько сущностей одного типа, идущих подряд.

In [54]:
def extract_labels(item):
    raw_toks = list(tokenize(item.text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)
    char2word = [None] * len(item.text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    for e in item.entities:
        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})
        word_labels[e_words[0]] = 'B-' + e.entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e.entity_type

    return {'tokens': words, 'tags': word_labels}

In [55]:
print(extract_labels(drugs[2]))

{'tokens': ['Пила', 'этот', 'препарат', 'для', 'повышения', 'иммунитета', '5', 'лет', 'назад', '.'], 'tags': ['O', 'O', 'O', 'O', 'B-DI', 'I-DI', 'O', 'O', 'O', 'O']}


In [56]:
ner_data = [extract_labels(item) for item in drugs]
ner_train, ner_test = train_test_split(ner_data, test_size=0.1, random_state=16)

Пример данных

In [57]:
pd.options.display.max_colwidth = 300
pd.DataFrame(ner_train).sample(3)

Unnamed: 0,tokens,tags
463,"[Здраствуйте, !]","[O, O]"
1744,"[Думайте, принимать, его, или, нет, .]","[O, O, O, O, O, O]"
1502,"[Ведь, обычно, все, препараты, только, до, 14, детские, ,, а, дальше, надо, брать, обычные, ,, но, именно, эти, таблетки, подходят, с, 1, месяца, жизни, до, 18, лет, !]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-Drugform, O, O, O, O, O, O, O, O, O]"


Соберём все виды меток в список.

In [58]:
label_list = sorted({label for item in ner_train for label in item['tags']})
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
label_list

['O',
 'B-ADR',
 'B-DI',
 'B-Drugclass',
 'B-Drugform',
 'B-Drugname',
 'B-Finding',
 'I-ADR',
 'I-DI',
 'I-Drugclass',
 'I-Drugform',
 'I-Drugname',
 'I-Finding']

In [59]:
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 4328
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 481
    })
})

## Preprocessing the data

In [60]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [61]:
example = ner_train[2]
print(example["tokens"])

['Это', 'средство', 'дополняет', 'основное', 'лечение', '.']


In [63]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'Это', 'средство', 'доп', '##ол', '##няет', 'основно', '##е', 'ле', '##чение', '.', '[SEP]']


In [64]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example["tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

12 12


In [65]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])

            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [66]:
tokenize_and_align_labels(ner_data['train'][22:23])

{'input_ids': [[2, 16365, 1172, 15846, 991, 5215, 25397, 24133, 3136, 12383, 21352, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}

In [67]:
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4328 [00:00<?, ? examples/s]

Map:   0%|          | 0/481 [00:00<?, ? examples/s]

## Fine-tuning the model

In [68]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none'
)

In [70]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [71]:
metric = load_metric("seqeval")

In [72]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [73]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [74]:
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.8059113025665283,
 'eval_precision': 0.02112676056338028,
 'eval_recall': 0.13872832369942195,
 'eval_f1': 0.03666921313980137,
 'eval_accuracy': 0.02525169169830005,
 'eval_runtime': 0.6207,
 'eval_samples_per_second': 774.881,
 'eval_steps_per_second': 49.94}

In [75]:
noisy_logger.setLevel(logging.WARNING)

In [76]:
for param in model.parameters():
    param.requires_grad = True

In [77]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.514994,0.61361,0.341683,0.438944,0.872421
2,0.714400,0.426636,0.600309,0.499679,0.545391,0.890659
3,0.714400,0.380244,0.605702,0.559409,0.581636,0.897838
4,0.409600,0.355115,0.601013,0.609505,0.60523,0.902212
5,0.409600,0.338598,0.61125,0.628131,0.619576,0.905595
6,0.339800,0.327525,0.609697,0.646114,0.627378,0.906338
7,0.339800,0.318625,0.622688,0.648683,0.63542,0.908153
8,0.302800,0.321416,0.581006,0.667951,0.621452,0.904192
9,0.302800,0.312857,0.612426,0.66474,0.637512,0.908566
10,0.285400,0.311928,0.611834,0.664098,0.636896,0.908566


TrainOutput(global_step=2710, training_loss=0.40048212846706716, metrics={'train_runtime': 61.3947, 'train_samples_per_second': 704.946, 'train_steps_per_second': 44.141, 'total_flos': 40693453175520.0, 'train_loss': 0.40048212846706716, 'epoch': 10.0})

In [78]:
trainer.evaluate()

{'eval_loss': 0.31192803382873535,
 'eval_precision': 0.6118343195266273,
 'eval_recall': 0.6640976236351959,
 'eval_f1': 0.6368955959347089,
 'eval_accuracy': 0.9085657699290312,
 'eval_runtime': 0.5283,
 'eval_samples_per_second': 910.433,
 'eval_steps_per_second': 58.677,
 'epoch': 10.0}

In [79]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

  _warn_prf(average, modifier, msg_start, len(result))


{'ADR': {'precision': 0.15422885572139303,
  'recall': 0.2,
  'f1': 0.1741573033707865,
  'number': 155},
 'DI': {'precision': 0.43344155844155846,
  'recall': 0.5959821428571429,
  'f1': 0.5018796992481204,
  'number': 448},
 'Drugclass': {'precision': 0.8581081081081081,
  'recall': 0.8819444444444444,
  'f1': 0.8698630136986302,
  'number': 144},
 'Drugform': {'precision': 0.8664122137404581,
  'recall': 0.850187265917603,
  'f1': 0.8582230623818525,
  'number': 267},
 'Drugname': {'precision': 0.8250539956803455,
  'recall': 0.9030732860520094,
  'f1': 0.8623024830699774,
  'number': 423},
 'Finding': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 120},
 'overall_precision': 0.6118343195266273,
 'overall_recall': 0.6640976236351959,
 'overall_f1': 0.6368955959347089,
 'overall_accuracy': 0.9085657699290312}

In [80]:
model.save_pretrained('ner_bert.bin')
tokenizer.save_pretrained('ner_bert.bin')

('ner_bert.bin/tokenizer_config.json',
 'ner_bert.bin/special_tokens_map.json',
 'ner_bert.bin/vocab.txt',
 'ner_bert.bin/added_tokens.json',
 'ner_bert.bin/tokenizer.json')

# Применение модели

In [81]:
text = ' '.join(ner_test[22]['tokens'])
text

'Но лимфоузлы продолжались расти и тогда нам прописали гель кетопрофен , которую нужно было Осторожно применять в течение 2 недель .'

In [82]:
tokens = tokenizer(text, return_tensors='pt')
tokens = {k: v.to(model.device) for k, v in tokens.items()}

with torch.no_grad():
    pred = model(**tokens)
pred.logits.shape

torch.Size([1, 44, 13])

In [83]:
from transformers import pipeline

In [84]:
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)

In [85]:
print(text)
print(pipe(text))

Но лимфоузлы продолжались расти и тогда нам прописали гель кетопрофен , которую нужно было Осторожно применять в течение 2 недель .
[{'entity_group': 'Drugname', 'score': 0.7288921, 'word': 'лимфоузлы', 'start': 3, 'end': 12}, {'entity_group': 'DI', 'score': 0.26982975, 'word': 'расти', 'start': 26, 'end': 31}, {'entity_group': 'Drugform', 'score': 0.30260277, 'word': 'гель', 'start': 54, 'end': 58}, {'entity_group': 'Drugname', 'score': 0.5645015, 'word': 'кетопрофен', 'start': 59, 'end': 69}]
