In [1]:
model_checkpoint = "cointegrated/rubert-tiny2"
batch_size=16

In [2]:
from datasets import load_dataset, load_metric
from corus import load_rudrec
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
from razdel import tokenize
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
drugs = list(load_rudrec('rudrec_annotated.json'))
print(len(drugs))
print(drugs[10].text)

4809
Доброй ночи дорогие друзья и просто читатели.



In [4]:
type2text = defaultdict(Counter)
ents = Counter()
for item in drugs:
    for e in item.entities:
        ents[e.entity_type] += 1
        type2text[e.entity_type][e.entity_text] += 1

for k, v in ents.most_common():
    print(k, v)
    print(type2text[k].most_common(3))

DI 1401
[('простуды', 64), ('ОРВИ', 47), ('профилактики', 42)]
Drugname 1043
[('Виферон', 33), ('Анаферон', 25), ('Циклоферон', 24)]
Drugform 836
[('таблетки', 154), ('таблеток', 79), ('свечи', 63)]
ADR 720
[('аллергия', 16), ('слабость', 13), ('диарея', 12)]
Drugclass 330
[('противовирусный', 21), ('противовирусное', 18), ('противовирусных', 13)]
Finding 236
[('аллергии', 12), ('температуры', 6), ('сонливости', 5)]


In [5]:
def extract_labels(item):
    raw_toks = list(tokenize(item.text))
    words = [tok.text for tok in raw_toks]
    word_labels = ["O"] * len(raw_toks)
    char2word = [None] * len(item.text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)


    for e in item.entities:
        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})
        word_labels[e_words[0]] = "B-" + e.entity_type
        for idx in e_words[1:]:
            word_labels[idx] = "I-" + e.entity_type

    return {"tokens": words,"tags": word_labels}

In [6]:
extract_labels(drugs[0])

{'tokens': ['нам',
  'прописали',
  ',',
  'так',
  'мой',
  'ребенок',
  'сыпью',
  'покрылся',
  ',',
  'глаза',
  'опухли',
  ',',
  'сверху',
  'и',
  'снизу',
  'на',
  'веках',
  'высыпала',
  'сыпь',
  ',',
  '(',
  '8',
  'месяцев',
  'сыну',
  ')',
  'А',
  'от',
  'виферона',
  'такого',
  'не',
  'было',
  '...',
  'У',
  'кого',
  'ещё',
  'такие',
  'побочки',
  ',',
  'отзовитесь',
  '!',
  '1',
  'Чем',
  'спасались',
  '?'],
 'tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ADR',
  'I-ADR',
  'O',
  'B-ADR',
  'I-ADR',
  'O',
  'O',
  'O',
  'O',
  'B-ADR',
  'I-ADR',
  'I-ADR',
  'I-ADR',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Drugform',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']}

In [7]:
ner_data = [extract_labels(item) for item in drugs]
ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)

In [8]:
pd.DataFrame(ner_train).sample(3)

Unnamed: 0,tokens,tags
2590,"[На, следующую, ночь, свечки, уже, никакие, не...","[O, O, O, B-Drugform, O, O, O, O, O, O, O, O]"
1491,"[ни, слова, сказать, не, мог, ,, даже, пошевел...","[B-ADR, I-ADR, I-ADR, I-ADR, I-ADR, O, O, O, O]"
740,"[Стоят, таблетки, недорого, ,, но, препарат, н...","[O, B-Drugform, O, O, O, O, O, O, O]"


In [9]:
label_list = sorted({label for item in ner_train for label in item['tags']})
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
print(label_list)

['O', 'B-ADR', 'B-DI', 'B-Drugclass', 'B-Drugform', 'B-Drugname', 'B-Finding', 'I-ADR', 'I-DI', 'I-Drugclass', 'I-Drugform', 'I-Drugname', 'I-Finding']


In [10]:
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))    
})
print(ner_data)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 3847
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 962
    })
})


  if _pandas_api.is_sparse(col):


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
tokenizer("Hello world")

{'input_ids': [2, 9944, 1419, 3], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [13]:
sent = ner_train[5]

In [14]:
tokenized_input = tokenizer(sent["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'Мы', 'поменяли', 'место', 'жительства', 'и', 'перевели', 'дочь', 'в', 'школу', ',', 'которая', 'находится', 'ближе', 'к', 'дому', '.', '[SEP]']


In [15]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids =  []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
    
        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [16]:
tokenize_and_align_labels(ner_data['train'][22:23])

{'input_ids': [[2, 1041, 37038, 33265, 19106, 40305, 22018, 548, 22276, 320, 21538, 16, 47886, 548, 59614, 11137, 626, 56606, 700, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 1, 1, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}

In [17]:
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3847/3847 [00:00<00:00, 14181.13 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 962/962 [00:00<00:00, 26485.59 examples/s]


In [18]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = 'epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [21]:
metric.compute(predictions=[ner_train[4]['tags']], references=[ner_train[4]['tags']])

{'DI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Drugform': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [22]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.6721551418304443,
 'eval_precision': 0.016947875866224767,
 'eval_recall': 0.11375126390293225,
 'eval_f1': 0.029500458896027273,
 'eval_accuracy': 0.06463173504695996,
 'eval_runtime': 0.6238,
 'eval_samples_per_second': 1542.049,
 'eval_steps_per_second': 97.781}

In [25]:
for param in model.bert.parameters():
    param.requires_grad = True

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.437829,0.641841,0.387765,0.483454,0.889212
2,No log,0.352718,0.592048,0.549545,0.570005,0.905648
3,0.562500,0.311709,0.64561,0.609707,0.627145,0.915843
4,0.562500,0.29005,0.657375,0.655713,0.656543,0.920539
5,0.291400,0.276298,0.631989,0.697169,0.662981,0.923443
6,0.291400,0.265435,0.666027,0.701719,0.683407,0.927706
7,0.239300,0.261459,0.657534,0.728008,0.690979,0.929004
8,0.239300,0.256533,0.670093,0.724975,0.696455,0.930672
9,0.206800,0.256405,0.657596,0.733064,0.693282,0.929869
10,0.206800,0.255454,0.66682,0.730536,0.697226,0.931105


TrainOutput(global_step=2410, training_loss=0.3037856200918617, metrics={'train_runtime': 45.2163, 'train_samples_per_second': 850.799, 'train_steps_per_second': 53.299, 'total_flos': 24794241657930.0, 'train_loss': 0.3037856200918617, 'epoch': 10.0})

In [27]:
trainer.evaluate()

{'eval_loss': 0.25545403361320496,
 'eval_precision': 0.6668204891555145,
 'eval_recall': 0.7305358948432761,
 'eval_f1': 0.6972255729794934,
 'eval_accuracy': 0.9311047948591201,
 'eval_runtime': 0.3374,
 'eval_samples_per_second': 2851.596,
 'eval_steps_per_second': 180.818,
 'epoch': 10.0}

In [28]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)

{'ADR': {'precision': 0.2736318407960199, 'recall': 0.22727272727272727, 'f1': 0.24830699774266363, 'number': 242}, 'DI': {'precision': 0.4041916167664671, 'recall': 0.6367924528301887, 'f1': 0.49450549450549447, 'number': 424}, 'Drugclass': {'precision': 0.8260869565217391, 'recall': 0.8769230769230769, 'f1': 0.8507462686567164, 'number': 195}, 'Drugform': {'precision': 0.8794326241134752, 'recall': 0.8888888888888888, 'f1': 0.8841354723707666, 'number': 279}, 'Drugname': {'precision': 0.8665018541409147, 'recall': 0.9422043010752689, 'f1': 0.9027688345138442, 'number': 744}, 'Finding': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 94}, 'overall_precision': 0.6668204891555145, 'overall_recall': 0.7305358948432761, 'overall_f1': 0.6972255729794934, 'overall_accuracy': 0.9311047948591201}


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
from sklearn.metrics import confusion_matrix
cm = pd.DataFrame(
    confusion_matrix(sum(true_labels, []), sum(true_predictions, []), labels=label_list),
    index=label_list,
    columns=label_list
)
cm

Unnamed: 0,O,B-ADR,B-DI,B-Drugclass,B-Drugform,B-Drugname,B-Finding,I-ADR,I-DI,I-Drugclass,I-Drugform,I-Drugname,I-Finding
O,13478,18,85,20,22,51,0,7,18,0,0,0,0
B-ADR,70,77,82,1,0,5,0,4,3,0,0,0,0
B-DI,76,19,301,3,6,9,0,0,10,0,0,0,0
B-Drugclass,19,0,4,171,1,0,0,0,0,0,0,0,0
B-Drugform,29,0,1,0,249,0,0,0,0,0,0,0,0
B-Drugname,17,0,6,2,3,715,0,0,1,0,0,0,0
B-Finding,23,16,51,3,0,1,0,0,0,0,0,0,0
I-ADR,93,23,22,0,0,0,0,39,32,0,0,0,0
I-DI,107,10,63,0,0,2,0,3,39,0,0,0,0
I-Drugclass,0,0,0,2,0,0,0,0,0,0,0,0,0


In [30]:
model.save_pretrained('ner_bert.bin')
tokenizer.save_pretrained('ner_bert.bin')

('ner_bert.bin/tokenizer_config.json',
 'ner_bert.bin/special_tokens_map.json',
 'ner_bert.bin/vocab.txt',
 'ner_bert.bin/added_tokens.json',
 'ner_bert.bin/tokenizer.json')

In [31]:
import torch

text = ' '.join(ner_test[7]["tokens"])
text

'Хочу поделиться с вами отзывом о снотворных таблетках " Красная звезда " Сондокс , которые я приобрела по рекомендации моей подруги .'

In [35]:
def predict_ents(text):
    tokens = tokenizer(text, return_tensors='pt')
    tokens = {k: v.to(model.device) for k, v in tokens.items()}
    with torch.no_grad():
        pred = model(**tokens)
    indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
    token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
    for t, idx in zip(token_text, indices):
        print(f"{t:15s} {label_list[idx]:10s}")

In [42]:
predict_ents("Словил тут Миша на неделе гастрит. Миша сильно пострадал, ноги и руки болели даже. Таблетки и успокоительные пришлось пить.")

[CLS]           O         
Слов            O         
##ил            O         
тут             O         
Миша            O         
на              O         
неделе          O         
гастр           B-DI      
##ит            B-DI      
.               O         
Миша            O         
сильно          O         
пострадал       O         
,               O         
ноги            O         
и               O         
руки            O         
болели          O         
даже            O         
.               O         
Табл            B-Drugform
##етки          B-Drugform
и               O         
успоко          B-Drugclass
##ительные      B-Drugclass
пришлось        O         
пить            O         
.               O         
[SEP]           O         
